In [None]:
import re
import warnings

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import numpy as np
import PyPDF2

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


In [None]:
# Location of the PDF to process; this is a placeholder to replace with a real file path.
pdf_path = "path_to_your_pdf.pdf"
# LLM handle used by the LLMChain built later in the notebook.
# NOTE(review): "gpt-4" is a chat model — confirm that the installed LangChain
# version's `OpenAI` wrapper accepts it (newer releases expect a chat-model class).
llm = OpenAI(model_name="gpt-4")

In [None]:

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, joined with newlines.
        A page that yields no text contributes an empty string.
    """
    with open(pdf_path, "rb") as file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage, which
        # were removed in PyPDF2 3.0.
        reader = PyPDF2.PdfReader(file)
        # Pages must be read while the file is still open. `or ""` is a
        # defensive guard in case a page returns nothing.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # A newline between pages keeps the last line of one page from fusing with
    # the first line of the next (downstream code splits on "\n").
    return "\n".join(page_texts)


# Extract the PDF's text once; later cells reuse `pdf_text`.
pdf_text = extract_text_from_pdf(pdf_path)

In [None]:
def preprocess_text(text):
    """Build a document-term matrix from raw text, one document per non-blank line.

    Args:
        text: Raw text, e.g. the output of PDF extraction.

    Returns:
        A ``(dtm, terms)`` tuple: the count matrix returned by
        ``CountVectorizer.fit_transform`` (one row per non-blank line) and the
        vocabulary returned by ``get_feature_names_out()``.

    Raises:
        ValueError: If ``text`` contains no non-blank lines. scikit-learn may
            also raise ``ValueError`` when no term survives the frequency
            filters.
    """
    # Blank lines would otherwise count as (empty) documents: they add
    # all-zero rows and inflate the document total that max_df's proportion
    # is measured against.
    documents = [line for line in text.split("\n") if line.strip()]
    if not documents:
        raise ValueError("No text to vectorize: the input contains only blank lines.")

    vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
    dtm = vectorizer.fit_transform(documents)
    terms = vectorizer.get_feature_names_out()
    return dtm, terms

# Vectorize the extracted text; `dtm` and `terms` feed the topic-model sweep below.
dtm, terms = preprocess_text(pdf_text)

In [None]:
def compute_coherence_values(dtm, terms, start=10, step=2, limit=20, top_n=10):
    """Fit one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dtm: Document-term count matrix (documents x terms). It must expose
            ``nonzero()`` and be accepted by ``LatentDirichletAllocation.fit``.
        terms: Vocabulary, indexed by ``dtm`` column.
        start: First candidate topic count.
        step: Increment between candidate topic counts.
        limit: Exclusive upper bound; candidates are ``range(start, limit, step)``.
        top_n: Number of highest-weighted words per topic used for scoring.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists holding the
        fitted model and its coherence score for each candidate topic count.

    Raises:
        ValueError: If ``dtm`` has no non-zero entries.
    """
    # Rebuild one token list per document from the matrix. gensim expects
    # `texts` to be a list of tokenised documents; scoring against a single
    # flattened pseudo-document discards the per-document co-occurrence that
    # coherence measures. Rows without any term are skipped.
    row_idx, col_idx = dtm.nonzero()
    tokens_by_doc = {}
    for row, col in zip(row_idx, col_idx):
        tokens_by_doc.setdefault(int(row), []).append(str(terms[col]))
    texts = [tokens_by_doc[row] for row in sorted(tokens_by_doc)]
    if not texts:
        raise ValueError("Cannot compute coherence: the document-term matrix is empty.")

    # The reference corpus does not depend on the topic count, so build its
    # dictionary once rather than on every iteration.
    dictionary = Dictionary(texts)

    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
        lda_model.fit(dtm)
        model_list.append(lda_model)

        # Highest-weighted `top_n` words of each topic, strongest first.
        topic_words = [
            [str(terms[i]) for i in topic.argsort()[::-1][:top_n]]
            for topic in lda_model.components_
        ]

        coherence_model = CoherenceModel(
            topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values

# Sweep candidate topic counts and keep the model with the highest coherence score.
model_list, coherence_values = compute_coherence_values(dtm, terms, start=10, step=2, limit=30)
best_index = int(np.argmax(coherence_values))
optimal_model = model_list[best_index]
# Read the topic count off the selected model instead of re-deriving it from
# the sweep's start/step, so it stays correct if the sweep arguments change.
optimal_num_topics = optimal_model.n_components
print(f"Optimal number of topics: {optimal_num_topics}")

In [None]:
# Prompt asking for three questions, each answered at five quality levels.
# The "Qn:" / "An (<Level>, Score k):" labels are the markers that
# generate_qa_pairs looks for, so they must stay in sync with that parser.
qa_prompt = PromptTemplate(
    input_variables=["context"],
    template="""
    Generate three high-quality question-answer pairs based on the following context:
    Context: {context}

    For each question, provide five levels of answers, from most to least complete:
    - Thorough understanding (Score 5)
    - Good understanding (Score 4)
    - Average understanding (Score 3)
    - Below-average understanding (Score 2)
    - Limited understanding (Score 1)

    Use exactly the following labels, in this order:

    Q1: 
    A1 (Thorough, Score 5): 
    A1 (Good, Score 4): 
    A1 (Average, Score 3): 
    A1 (Below Average, Score 2): 
    A1 (Limited, Score 1): 
    Q2: 
    A2 (Thorough, Score 5): 
    A2 (Good, Score 4): 
    A2 (Average, Score 3): 
    A2 (Below Average, Score 2): 
    A2 (Limited, Score 1): 
    Q3: 
    A3 (Thorough, Score 5): 
    A3 (Good, Score 4): 
    A3 (Average, Score 3): 
    A3 (Below Average, Score 2): 
    A3 (Limited, Score 1): 
    """
)

# Create the LLMChain
qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

In [None]:
# (label, score) of each answer level, in the order the prompt lists them.
_ANSWER_LEVELS = (
    ("Thorough", 5),
    ("Good", 4),
    ("Average", 3),
    ("Below Average", 2),
    ("Limited", 1),
)


def _parse_qa_pairs(qa_text, num_pairs):
    """Extract up to ``num_pairs`` labelled question/answer groups from LLM output."""
    qa_pairs = []
    for number in range(1, num_pairs + 1):
        markers = [f"Q{number}:"] + [
            f"A{number} ({label}, Score {score}):" for label, score in _ANSWER_LEVELS
        ]

        # Locate the labels left to right; each one must follow the previous.
        spans = []  # (index where the label starts, index just past the label)
        cursor = 0
        for marker in markers:
            start = qa_text.find(marker, cursor)
            if start == -1:
                break
            cursor = start + len(marker)
            spans.append((start, cursor))

        if len(spans) != len(markers):
            # LLM output is not guaranteed to follow the template; drop the
            # incomplete pair instead of aborting the whole run.
            warnings.warn(
                f"Skipping Q{number}: expected label {markers[len(spans)]!r} "
                "not found in the model output.",
                stacklevel=3,
            )
            continue

        # The last answer runs up to the next question's label when another
        # pair is expected and present, otherwise to the end of the text.
        next_question = qa_text.find(f"Q{number + 1}:", cursor) if number < num_pairs else -1
        end = next_question if next_question != -1 else len(qa_text)

        # Each segment is the text between one label and the start of the next.
        boundaries = [start for start, _ in spans[1:]] + [end]
        segments = [
            qa_text[label_end:boundary].strip()
            for (_, label_end), boundary in zip(spans, boundaries)
        ]
        question, answers = segments[0], segments[1:]

        qa_pairs.append({
            "question": question,
            "answers": [
                {"text": text, "score": score}
                for text, (_, score) in zip(answers, _ANSWER_LEVELS)
            ],
        })
    return qa_pairs


def generate_qa_pairs(context, num_pairs=3):
    """Ask the LLM chain for question-answer pairs about ``context`` and parse them.

    Args:
        context: Source text passed to ``qa_chain`` as the prompt's ``context``.
        num_pairs: Number of ``Qn`` groups to look for in the reply.

    Returns:
        A list of dicts, one per successfully parsed pair, each with a
        ``"question"`` string and an ``"answers"`` list of five
        ``{"text", "score"}`` dicts ordered from score 5 down to score 1.
        A pair whose labels are missing from the reply is skipped with a
        warning, so the list may be shorter than ``num_pairs``.
    """
    qa_text = qa_chain.run(context=context)
    return _parse_qa_pairs(qa_text, num_pairs)

In [None]:
def create_qa_dataset(pdf_text, lda_model, terms, n_questions_per_topic=3, total_questions=1200,
                      n_top_words=10, max_context_sentences=10):
    """Generate question-answer pairs for each topic of a fitted LDA model.

    For every topic, the sentences of ``pdf_text`` that mention one of the
    topic's top words form the context handed to ``generate_qa_pairs``.

    Args:
        pdf_text: Full document text; sentences are obtained by splitting on ". ".
        lda_model: Fitted model exposing ``components_`` (topics x terms weights).
        terms: Vocabulary, indexed by ``components_`` column.
        n_questions_per_topic: Number of pairs requested per topic.
        total_questions: Upper bound on the number of pairs returned.
        n_top_words: Number of highest-weighted topic words used to select
            context sentences.
        max_context_sentences: Maximum number of matching sentences per context.

    Returns:
        A list of at most ``total_questions`` question-answer dicts as
        produced by ``generate_qa_pairs``.
    """
    sentences = pdf_text.split(". ")
    # Tokenise every sentence once, lower-cased: CountVectorizer lower-cases
    # its vocabulary by default, and whole-word comparison avoids substring
    # hits such as "art" inside "particle".
    sentence_tokens = [set(re.findall(r"\w+", sentence.lower())) for sentence in sentences]

    all_qa_pairs = []
    for topic in lda_model.components_:
        if len(all_qa_pairs) >= total_questions:
            break

        top_indices = topic.argsort()[::-1][:n_top_words]
        topic_words = {str(terms[i]).lower() for i in top_indices}
        context_sentences = [
            sentence
            for sentence, tokens in zip(sentences, sentence_tokens)
            if not topic_words.isdisjoint(tokens)
        ]
        if not context_sentences:
            # Nothing in the document mentions this topic; skip the LLM call
            # rather than prompt it with an empty context.
            continue

        # Limit context size for better generation quality
        context = ". ".join(context_sentences[:max_context_sentences])
        all_qa_pairs.extend(generate_qa_pairs(context, num_pairs=n_questions_per_topic))

    return all_qa_pairs[:total_questions]

In [None]:

# Build the Q&A dataset from the extracted text and the selected LDA model, then print it.
# NOTE(review): print() dumps every generated pair; with the default cap of
# 1200 questions the output can be very long.
qa_dataset = create_qa_dataset(pdf_text, optimal_model, terms)
print(qa_dataset)