In [2]:
!pip install pdfplumber
!pip install googletrans==4.0.0-rc1
!pip install pymupdf
import pandas as pd
from transformers import pipeline
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import fitz
from tqdm import tqdm
from googletrans import Translator
import warnings
warnings.filterwarnings("ignore")



In [12]:
import fitz  # PyMuPDF
import re
import pandas as pd
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            page_text = page.get_text()
            clean_text = page_text
            text += clean_text + "\n"
    return text

def split_text(text, max_length=1000):
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

def generate_question(sentence):
    location_rule = re.search(r'(.+?) (में|से|पर) (.+?) है।', sentence)
    if location_rule:
        location = location_rule.group(1)
        location_without_last_word = ' '.join(location.split()[:-1])
        return f"{location_without_last_word} कहाँ {location_rule.group(3)} है?"

    who_rule = re.search(r'(.+?) ने (.+?)।', sentence)
    if who_rule:
        return f"किसने {who_rule.group(2)}?"

    ki_rule = re.search(r'(.+?) की (.+?)।', sentence)
    if ki_rule:
        location = ki_rule.group(1)
        location_without_last_word = ' '.join(location.split()[:-1])
        return f"{location_without_last_word} किसकी {ki_rule.group(2)}?"

    return "Question generation rule not found."

def process_book(sentences):
    data = []

    for sentence in tqdm(sentences, desc="Generating Questions"):
        question = generate_question(sentence)
        if question:
            data.append({'Sentence': sentence, 'Question': question})

    df = pd.DataFrame(data, columns=['Sentence', 'Question'])

    return df

path = input("Enter the path of the PDF file: ")
text = extract_text_from_pdf(path)
sentences = re.split(r'(?<=।)\s+', text)

df = process_book(sentences)

answers = df[df.Question != "Question generation rule not found."]


# Save the filtered results to a TXT file
output_file_txt = "generated_questions.txt"
with open(output_file_txt, "w", encoding="utf-8") as f:
    for index, row in answers.iterrows():
        f.write(f"Sentence: {row['Sentence']}\n")
        f.write(f"Question: {row['Question']}\n\n")

print(f"Questions and sentences saved to '{output_file_txt}'.")


Enter the path of the PDF file: /content/godan.pdf


Generating Questions: 100%|██████████| 10378/10378 [00:02<00:00, 4837.49it/s]


Questions and sentences saved to 'generated_questions.txt'.


In [15]:
#model 2
translator = Translator()

def extract_clean_text_chunks_from_pdf(pdf_path, chunk_size=1000):
    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_text = re.sub(r'http\S+|www\S+|file:\S+|\S+\.html', '', page_text)
                page_text = re.sub(r'\s+', ' ', page_text).strip()
                for i in range(0, len(page_text), chunk_size):
                    chunk = page_text[i:i + chunk_size]
                    if chunk:
                        text_chunks.append(chunk)
    return text_chunks

question_generator = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

def generate_questions(text, num_questions=5):
    if not text:
        return []
    formatted_text = "generate question: " + text
    questions = question_generator(formatted_text, max_length=100, num_beams=5, num_return_sequences=num_questions)
    return questions

def retrieve_relevant_chunks(prompt, text_chunks, top_n=5):
    if not text_chunks:
        return []
    vectorizer = TfidfVectorizer().fit_transform([prompt] + text_chunks)
    cosine_similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
    top_n_indices = cosine_similarities.argsort()[-top_n:][::-1]
    relevant_chunks = [text_chunks[i] for i in top_n_indices]
    return relevant_chunks

def safe_translate(text, src, dest):
    try:
        if text is None or text.strip() == "":
            print(f"No text provided for translation from {src} to {dest}.")
            return ""
        translated_text = translator.translate(text, src=src, dest=dest).text
        return translated_text
    except Exception as e:
        return ""

def generate_questions_from_prompt_with_rag(prompt, pdf_text, total_questions=10, top_n_chunks=5, questions_per_chunk=3):
    translated_prompt = safe_translate(prompt, src='hi', dest='en')

    if not translated_prompt:
        print("Prompt translation failed. Exiting.")
        return []

    relevant_chunks = retrieve_relevant_chunks(translated_prompt, pdf_text, top_n=top_n_chunks)

    if not relevant_chunks:
        print("No relevant chunks found.")
        return []

    all_generated_questions = []
    for chunk in relevant_chunks:
        translated_chunk = safe_translate(chunk, src='hi', dest='en')
        if not translated_chunk:
            print("Chunk translation failed, skipping this chunk.")
            continue

        questions = generate_questions(translated_chunk, num_questions=questions_per_chunk)

        for q in questions:
            cleaned_question = re.sub(r'\s+', ' ', q['generated_text']).strip()
            if cleaned_question:
                all_generated_questions.append(cleaned_question)

        if len(all_generated_questions) >= total_questions:
            break

    translated_questions = []
    for q in all_generated_questions:
        translated_q = safe_translate(q, src='en', dest='hi')
        if translated_q:
            translated_questions.append(translated_q)

    return translated_questions

if __name__ == "__main__":
    pdf_path = input("Enter the path of the PDF file: ")
    prompt = input("Enter a prompt (in Hindi): ")

    text_chunks = extract_clean_text_chunks_from_pdf(pdf_path)

    if text_chunks:
        print("Text chunks extracted from the PDF.")
    else:
        print("No text found in the PDF.")

    if text_chunks:
        print(f"Generating questions based on the prompt: '{prompt}'...")
        generated_questions = generate_questions_from_prompt_with_rag(
            prompt, text_chunks, total_questions=10, top_n_chunks=5, questions_per_chunk=2
        )

        if generated_questions:
            print("Generated Questions:")
            for idx, question in enumerate(generated_questions, 1):
                print(f"{idx}. {question}")
        else:
            print("No questions were generated.")

Enter the path of the PDF file: /content/godan.pdf
Enter a prompt (in Hindi): होरी कंधों पर लाठी रख कर घर से ननकला,
Text chunks extracted from the PDF.
Generating questions based on the prompt: 'होरी कंधों पर लाठी रख कर घर से ननकला,'...
Generated Questions:
1. सतू ली की संरचना का क्या हुआ?
2. धान्याय यंत की सतू ली संरचना का क्या हुआ?
3. धन्य्यासभा ने गले से क्या कहा और कहा - पंचो, आपको गरीबों को सताते हुए खुशी नहीं मिलेगी?
4. धन्य्यासभा ने गले से क्या कहा और कहा - पंचो, आपको गरीबों को सताते हुए खुशी नहीं मिलेगी?
5. झुन्नाया का सबसे गहरा और नीच 214 क्या था?
6. झुन्नाया की नीली 214 क्या थी?
7. अम्मुन झुन्नाया का क्या हुआ?
8. जब वह नासर पर कराह रहा था तो अम्मुन झुन्नाया का क्या हुआ?
9. झुन्नाया ने एक मन से क्या कहा - आपका अम्मुन बहुत गुस्से में है?
10. झुन्नाया ने एक मन से क्या कहा?
