In [1]:
!pip install transformers datasets pdfplumber rouge_score

import pdfplumber
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_metric


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfplumber
  Downloading pdfplumber-0.11.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collect

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Extract text content from a PDF file.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text



In [3]:
def generate_questions_from_text(text, model, tokenizer, num_questions=5, start_phrases=["how", "which", "do"]):
    """
    Generate questions from input text using a T5 model.
    """
    generated_questions = []
    for start_phrase in start_phrases:
        input_text = f"{start_phrase}: {text}"
        inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
        outputs = model.generate(inputs['input_ids'],
                                 max_length=100,
                                 num_return_sequences=num_questions,
                                 num_beams=5,  # Adjust beam width for diversity
                                 no_repeat_ngram_size=2,  # Avoid repeating n-grams
                                 early_stopping=True  # Stop generation when the model has finished generating sequences
                                 )
        questions = list(set([tokenizer.decode(output, skip_special_tokens=True).strip() for output in outputs]))  # Convert outputs to a set to ensure unique questions and strip whitespace
        generated_questions.extend(questions)
    return generated_questions[:num_questions]



In [4]:
pdf_path = "/content/bel.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

In [5]:
model_name = 'valhalla/t5-small-e2e-qg'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
generated_questions = generate_questions_from_text(pdf_text, model, tokenizer, num_questions=5)

for i, question in enumerate(generated_questions):
    print(f"Question {i+1}: {question.strip()}")


Question 1: What is immortality within human reach? <sep> By what century is the prospect of living up to 5000 years might well become reality? What does the idea that death is a key to life are at best based on dubious science? What do Chipko activists in TehriGarhwal sing praising their hills as paradise, the place of Gods, where the mountains bloom with rare plants and dense cedars? What was the name of
Question 2: What is immortality within human reach? <sep> By what century is the prospect of living up to 5000 years might well become reality? What does the idea that death is a key to life are at best based on dubious science? What do Chipko activists in TehriGarhwal sing praising their hills as paradise, the place of Gods, where the mountains bloom with rare plants and dense cedars?
Question 3: What is immortality within human reach? <sep> By what century is the prospect of living up to 5000 years based on dubious science? What does the scientific fraternity rarely take seriously?

In [7]:
def format_and_remove_duplicates(generated_questions):
    seen_questions_global = set()
    formatted_questions = []
    question_counter = 1

    for i, question_set in enumerate(generated_questions, start=1):
        question_segments = question_set.split('<sep>')
        questions = []
        for segment in question_segments:
            questions.extend([q.strip() + '?' for q in segment.split('?') if q.strip()])

        unique_questions_local = []
        seen_questions_local = set()

        for q in questions:
            if q not in seen_questions_local:
                unique_questions_local.append(q)
                seen_questions_local.add(q)

        # Add unique questions from the local set to the global set
        for q in unique_questions_local:
            if q not in seen_questions_global:
                formatted_questions.append(f"Question {question_counter}: {q}")
                seen_questions_global.add(q)
                question_counter += 1

    return formatted_questions

# Example usage with your generated questions
generated_questions = generate_questions_from_text(pdf_text, model, tokenizer, num_questions=5)

formatted_questions = format_and_remove_duplicates(generated_questions)

# Print formatted questions
for question in formatted_questions:
    print(question)



Question 1: What is immortality within human reach?
Question 2: By what century is the prospect of living up to 5000 years might well become reality?
Question 3: What does the idea that death is a key to life are at best based on dubious science?
Question 4: What do Chipko activists in TehriGarhwal sing praising their hills as paradise, the place of Gods, where the mountains bloom with rare plants and dense cedars?
Question 5: What was the name of?
Question 6: By what century is the prospect of living up to 5000 years based on dubious science?
Question 7: What does the scientific fraternity rarely take seriously?
Question 8: What did Chipko activists sing in the 1970s?
Question 9: What was the name of the movement to save the indigenous forests of oak and rhododendron from being felled by the Forest Department?
Question 10: What was the name of the movement to save the indigenous forests of oak and rhododendron?
Question 11: What does the idea that death is key to life are at best base

In [8]:
pip install nltk




In [9]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [10]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Reference questions
reference_questions = [
    ["What is immortality within human reach?"],
    ["By what century is the prospect of living up to 5000 years might well become reality?"],
    ["What does the idea that death is a key to life are at best based on dubious science?"],
    ["What do Chipko activists in TehriGarhwal sing praising their hills as paradise, the place of Gods, where the mountains bloom with rare plants and dense cedars?"],
    ["What was the name of?"],
    ["By what century is the prospect of living up to 5000 years based on dubious science?"],
    ["What does the scientific fraternity rarely take seriously?"],
    ["What did Chipko activists sing in the 1970s?"],
    ["What was the name of the movement to save the indigenous forests of oak and rhododendron from being felled by the Forest Department?"],
    ["What was the name of the movement to save the indigenous forests of oak and rhododendron?"],
    ["What does the idea that death is key to life are at best based on?"],
    ["What do Chipko activists in TehriGarhwal sing?"],
    ["What did ChipKo protest against?"]
]

# Generated questions (example)
generated_questions = generate_questions_from_text(pdf_text, model, tokenizer, num_questions=5)
formatted_questions = format_and_remove_duplicates(generated_questions)
# Calculate BLEU score for each generated question
smooth = SmoothingFunction().method4

bleu_scores = []

for gen_q in formatted_questions:
    gen_tokens = gen_q.split()
    ref_tokens = [ref_q[0].split() for ref_q in reference_questions]
    score = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smooth)
    bleu_scores.append(score)

# Print BLEU scores
for i, score in enumerate(bleu_scores, 1):
    print(f"Question {i}: BLEU score = {score:.4f}")



Question 1: BLEU score = 0.6804
Question 2: BLEU score = 0.8782
Question 3: BLEU score = 0.8915
Question 4: BLEU score = 0.9244
Question 5: BLEU score = 0.6148
Question 6: BLEU score = 0.8782
Question 7: BLEU score = 0.7598
Question 8: BLEU score = 0.7598
Question 9: BLEU score = 0.8788
Question 10: BLEU score = 0.8782
Question 11: BLEU score = 0.8702
Question 12: BLEU score = 0.7260
Question 13: BLEU score = 0.6148


In [11]:
from rouge import Rouge

# Function to calculate ROUGE scores
def calculate_rouge(reference_questions, formatted_questions):
    rouge = Rouge()
    scores = []

    for i, gen_q in enumerate(formatted_questions):
        # Join reference questions for comparison
        reference = ' '.join(reference_questions[i]) if i < len(reference_questions) else ' '.join(reference_questions[-1])
        # Calculate ROUGE scores
        score = rouge.get_scores(gen_q, reference)[0]
        scores.append(score)

    return scores

rouge_scores = calculate_rouge(reference_questions, formatted_questions)

# Print ROUGE scores for each question
for i, score in enumerate(rouge_scores, start=1):
    print(f"ROUGE score for Question {i}: {score}")


ROUGE score for Question 1: {'rouge-1': {'r': 1.0, 'p': 0.75, 'f': 0.8571428522448981}, 'rouge-2': {'r': 1.0, 'p': 0.7142857142857143, 'f': 0.8333333284722222}, 'rouge-l': {'r': 1.0, 'p': 0.75, 'f': 0.8571428522448981}}
ROUGE score for Question 2: {'rouge-1': {'r': 1.0, 'p': 0.8888888888888888, 'f': 0.9411764656055364}, 'rouge-2': {'r': 1.0, 'p': 0.8823529411764706, 'f': 0.9374999950195313}, 'rouge-l': {'r': 1.0, 'p': 0.8888888888888888, 'f': 0.9411764656055364}}
ROUGE score for Question 3: {'rouge-1': {'r': 1.0, 'p': 0.9, 'f': 0.9473684160664821}, 'rouge-2': {'r': 1.0, 'p': 0.8947368421052632, 'f': 0.9444444394598766}, 'rouge-l': {'r': 1.0, 'p': 0.9, 'f': 0.9473684160664821}}
ROUGE score for Question 4: {'rouge-1': {'r': 1.0, 'p': 0.9259259259259259, 'f': 0.961538456545858}, 'rouge-2': {'r': 1.0, 'p': 0.9259259259259259, 'f': 0.961538456545858}, 'rouge-l': {'r': 1.0, 'p': 0.9259259259259259, 'f': 0.961538456545858}}
ROUGE score for Question 5: {'rouge-1': {'r': 1.0, 'p': 0.71428571428

In [20]:
import spacy
import re
nlp = spacy.load("en_core_web_sm")


In [15]:
def split_into_sentences(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

In [16]:
def extract_answers(question, sentences):
    doc = nlp(question)
    keywords = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b', re.IGNORECASE)

    for sentence in sentences:
        if pattern.search(sentence):
            return sentence

    return "Answer not found in the text."

In [17]:
sentences = split_into_sentences(pdf_text)

In [18]:
generated_questions = generate_questions_from_text(pdf_text, model, tokenizer, num_questions=5)
formatted_questions = format_and_remove_duplicates(generated_questions)

In [21]:
for i, question in enumerate(formatted_questions, 1):
    answer = extract_answers(question, sentences)
    print(f"Question {i}: {question}")
    print(f"Answer: {answer}\n")

Question 1: Question 1: What is immortality within human reach?
Answer: If we are to believe the latest reports from medical laboratories, immortality is within human reach.


Question 2: Question 2: By what century is the prospect of living up to 5000 years might well become reality?
Answer: By the next century, the prospect of living up to 5000 years might well become reality.

Question 3: Question 3: What does the idea that death is a key to life are at best based on dubious science?
Answer: By the next century, the prospect of living up to 5000 years might well become reality.

Question 4: Question 4: What do Chipko activists in TehriGarhwal sing praising their hills as paradise, the place of Gods, where the mountains bloom with rare plants and dense cedars?
Answer: In the 1970s, Chipko activists in TehriGarhwal used to sing a song praising their hills as paradise, the
place of Gods, where the mountains bloom with rare plants and dense cedars.

Question 5: Question 5: What was the 