In [1]:
!pip install pdfplumber

import pandas as pd
from transformers import pipeline
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
def extract_clean_text_chunks_from_pdf(pdf_path, chunk_size=1000):
    """Read a PDF page by page and return its cleaned text as fixed-size chunks.

    For each page that yields text, URL-like tokens are stripped, every run of
    whitespace is collapsed to a single space, and the result is sliced into
    consecutive pieces of at most ``chunk_size`` characters. Chunking restarts
    on every page, so a chunk never spans two pages.

    Args:
        pdf_path: Path passed straight to ``pdfplumber.open``.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        list[str]: Chunks in page order; empty when no page produced any text.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            raw_text = page.extract_text()
            if not raw_text:
                # Pages without extractable text contribute nothing.
                continue
            # Drop links / html file names first, then normalise whitespace.
            without_links = re.sub(r'http\S+|www\S+|file:\S+|\S+\.html', '', raw_text)
            normalised = re.sub(r'\s+', ' ', without_links).strip()
            chunks.extend(
                normalised[start:start + chunk_size]
                for start in range(0, len(normalised), chunk_size)
            )
    return chunks

# Shared `transformers` text2text-generation pipeline backed by the
# "valhalla/t5-small-qa-qg-hl" checkpoint. It is built once at module level
# and reused by generate_questions() on every call.
question_generator = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

def generate_questions(text, num_questions=15):
    """Generate candidate questions for a passage with the shared pipeline.

    Args:
        text: Passage to ask questions about. It is prefixed with the
            ``"generate question: "`` task tag before being sent to the model.
        num_questions: Number of beam-search candidates to return.

    Returns:
        The raw output of ``question_generator``. The caller in this file
        treats it as an iterable of dicts with a ``'generated_text'`` entry.
    """
    formatted_text = "generate question: " + text
    # Beam search cannot return more sequences than it has beams: Hugging Face
    # raises a ValueError when num_return_sequences > num_beams. A fixed beam
    # width of 5 therefore failed for the default num_questions=15, so the
    # beam width is widened whenever more candidates are requested.
    num_beams = max(5, num_questions)
    return question_generator(
        formatted_text,
        max_length=100,
        num_beams=num_beams,
        num_return_sequences=num_questions,
    )

def retrieve_relevant_chunks(prompt, text_chunks, top_n=5):
    """Return the chunks most similar to ``prompt`` under TF-IDF cosine similarity.

    Args:
        prompt: Query text.
        text_chunks: Candidate text chunks to rank.
        top_n: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_n`` chunks, most similar first. Chunks whose
        similarity to the prompt is zero are left out, so the result is empty
        when nothing matches, when ``text_chunks`` is empty, or when
        ``top_n`` is not positive.
    """
    # Nothing to rank / nothing requested. This also avoids two pitfalls of
    # the unguarded code: cosine_similarity() rejects an empty chunk matrix,
    # and a slice of the form [-0:] selects *every* element rather than none.
    if not text_chunks or top_n <= 0:
        return []

    # Row 0 is the prompt; rows 1.. line up with text_chunks.
    tfidf_matrix = TfidfVectorizer().fit_transform([prompt] + list(text_chunks))
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Indices sorted by descending similarity, truncated to the best top_n.
    ranked_indices = similarities.argsort()[::-1][:top_n]

    # A zero score means the chunk shares no weighted term with the prompt;
    # returning it would make the caller's "no relevant chunks" check useless.
    return [text_chunks[i] for i in ranked_indices if similarities[i] > 0]

def generate_questions_from_prompt_with_rag(prompt, pdf_text, total_questions=20, top_n_chunks=5, questions_per_chunk=2):
    """Generate unique questions from the PDF chunks most relevant to a prompt.

    The chunks returned by ``retrieve_relevant_chunks`` are passed one by one
    to ``generate_questions``. Each generated text has its whitespace
    normalised, is kept only if longer than 15 characters, and is
    de-duplicated.

    Args:
        prompt: Query used to select relevant chunks.
        pdf_text: List of text chunks extracted from the PDF.
        total_questions: Upper bound on the number of questions returned.
        top_n_chunks: Number of chunks to retrieve for the prompt.
        questions_per_chunk: Questions requested from the model per chunk.

    Returns:
        list[str]: At most ``total_questions`` unique questions, in the order
        they were produced (most relevant chunk first). No more than
        ``top_n_chunks * questions_per_chunk`` questions can be produced, so
        with the defaults (5 * 2) the result holds at most 10 questions even
        though ``total_questions`` defaults to 20.
    """
    if total_questions <= 0:
        # A non-positive budget would otherwise turn the final slice into
        # "drop from the end" (negative) or do model work for nothing (zero).
        return []

    relevant_chunks = retrieve_relevant_chunks(prompt, pdf_text, top_n=top_n_chunks)

    if not relevant_chunks:
        print("No relevant chunks found for the prompt.")
        return []

    # A dict de-duplicates while preserving insertion order. A set would make
    # both the output order and the questions that survive truncation
    # arbitrary and different from run to run.
    unique_questions = {}
    for chunk in relevant_chunks:
        for candidate in generate_questions(chunk, num_questions=questions_per_chunk):
            question = re.sub(r'\s+', ' ', candidate['generated_text']).strip()
            # Very short outputs are treated as degenerate and skipped.
            if len(question) > 15:
                unique_questions.setdefault(question, None)

        # Stop calling the model once enough questions are collected.
        if len(unique_questions) >= total_questions:
            break

    return list(unique_questions)[:total_questions]

if __name__ == "__main__":
    # Interactive driver: ask for a PDF and a prompt, generate questions from
    # the most relevant parts of the PDF, then print them and save them to a
    # text file.
    pdf_path = input("Enter the PDF file path :")
    prompt = input("Enter a prompt (a complete sentence or a word): ")

    text_chunks = extract_clean_text_chunks_from_pdf(pdf_path)

    if text_chunks:
        generated_questions = generate_questions_from_prompt_with_rag(
            prompt,
            text_chunks,
            total_questions=20,
            top_n_chunks=5,
            questions_per_chunk=2,
        )

        if generated_questions:
            output_file = "generated_questions.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                # Echo each numbered question to stdout as it is written.
                for idx, question in enumerate(generated_questions, start=1):
                    numbered_line = f"{idx}. {question}"
                    print(numbered_line)
                    f.write(numbered_line + "\n")

            print(f"Generated {len(generated_questions)} questions and saved to '{output_file}'.")
        else:
            print("No questions were generated. Please check the prompt or PDF content.")
    else:
        print("No text found in the PDF.")


Enter the PDF file path :/content/KarmaYoga.pdf
Enter a prompt (a complete sentence or a word): Sannyasin
1. What did the Sannyasin think the Vyadha gave him?
2. What did the Sannyasin say to the Sannyasin?
3. What was the name of the young Sannyasin who refused to marry the princess?
4. What did the young Sannyasin threw the garland over the Sannyasin?
5. What did many wise men seek to solve the problem of Sannyasin?
6. What happened to the young Sannyasin who refused to marry the princess?
7. What did the young Sannyasin threw the garland over the princess?
8. What was the name of the king who followed the Sannyasin out of his territory?
9. What did many wise men seek to solve the problem of the Sannyasin?
10. What was the name of the king who followed the Sannyasin out of his own territory?
Generated 10 questions and saved to 'generated_questions.txt'.
