In [None]:
import json
import os
import re

import numpy as np
import PyPDF2
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Path of the PDF whose text is mined for question-answer pairs.
# The value below is a placeholder; point it at a real file before running.
pdf_path = "path_to_your_pdf.pdf"
# LLM handle passed to the QA-generation chain (`qa_chain`).
# NOTE(review): "gpt-4" is a chat model while `langchain.llms.OpenAI` is the
# completion-style wrapper — confirm the installed LangChain version accepts
# this combination (a chat-model wrapper may be the intended class).
llm = OpenAI(model_name="gpt-4")

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A single string containing the extracted text of all pages, in page
        order, concatenated without any separator.

    Raises:
        OSError: if ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, "rb") as file:
        # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0;
        # PdfReader and its `pages` sequence are the supported equivalents.
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe should a page yield a falsy result
        # (e.g. an image-only page with no extractable text).
        return "".join(page.extract_text() or "" for page in reader.pages)


# Load the raw text of the configured PDF once; later cells reuse `pdf_text`.
pdf_text = extract_text_from_pdf(pdf_path=pdf_path)

In [None]:
def preprocess_text(text):
    """Vectorize newline-delimited text into a document-term count matrix.

    Every line of ``text`` (split on ``"\\n"``) is treated as one document.
    The vectorizer is configured with English stop words, ``max_df=0.95``
    and ``min_df=2``.

    Args:
        text: The full text to vectorize.

    Returns:
        A ``(dtm, terms)`` tuple: the fitted document-term matrix and the
        feature names reported by the vectorizer for its columns.
    """
    documents = text.split("\n")
    count_vectorizer = CountVectorizer(min_df=2, max_df=0.95, stop_words='english')
    document_term_matrix = count_vectorizer.fit_transform(documents)
    vocabulary = count_vectorizer.get_feature_names_out()
    return document_term_matrix, vocabulary

# Vectorize the extracted PDF text; `dtm` and `terms` feed the topic models.
dtm, terms = preprocess_text(text=pdf_text)

In [None]:
def compute_coherence_values(dtm, terms, start=10, step=2, limit=30, top_n=10):
    """Fit one LDA model per candidate topic count and score it with c_v coherence.

    Args:
        dtm: Document-term count matrix (one row per document, one column per
            entry of ``terms``).
        terms: Vocabulary; ``terms[j]`` is the token for column ``j`` of ``dtm``.
        start: First candidate number of topics.
        step: Increment between candidate numbers of topics.
        limit: Exclusive upper bound on the number of topics
            (candidates are ``range(start, limit, step)``).
        top_n: How many of the highest-weighted terms represent each topic
            when computing coherence.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists holding the
        fitted ``LatentDirichletAllocation`` models and their coherence scores,
        in candidate order.

    Raises:
        ValueError: if ``dtm`` contains no non-zero entries, so no reference
            documents can be built for the coherence computation.
    """
    # Rebuild one token list per document from the non-zero cells of the DTM.
    # This is loop-invariant, so it is done once rather than per candidate.
    # Keeping documents separate (instead of one flattened pseudo-document)
    # lets the coherence measure see real per-document co-occurrence.
    doc_tokens = [[] for _ in range(dtm.shape[0])]
    for row, col in zip(*dtm.nonzero()):
        doc_tokens[row].append(str(terms[col]))
    texts = [tokens for tokens in doc_tokens if tokens]
    if not texts:
        raise ValueError("dtm has no non-zero entries; cannot compute topic coherence")
    dictionary = Dictionary(texts)

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
        lda_model.fit(dtm)
        model_list.append(lda_model)

        # Highest-weighted `top_n` terms of each topic, strongest first.
        topic_words = [
            [str(terms[i]) for i in topic.argsort()[::-1][:top_n]]
            for topic in lda_model.components_
        ]

        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values

In [None]:
# Fit and score an LDA model for each candidate topic count, then keep the
# model with the highest coherence score.
model_list, coherence_values = compute_coherence_values(dtm, terms, start=10, step=2, limit=30)
best_index = int(np.argmax(coherence_values))
optimal_model = model_list[best_index]
# Read the topic count from the selected model instead of re-deriving it from
# the start/step literals, which would silently drift if the call above changes.
optimal_num_topics = optimal_model.n_components
print(f"Optimal number of topics: {optimal_num_topics}")

In [None]:
# Define the prompt template.
# Every placeholder used in the template must be declared in `input_variables`;
# `questions_and_answers` is filled with the empty "Qn:/An:" skeleton that the
# model is expected to complete.
qa_prompt = PromptTemplate(
    input_variables=["context", "num_pairs", "questions_and_answers"],
    template="""
    Based on the following context, generate {num_pairs} high-quality question-answer pairs. Ensure the questions are relevant and the answers are detailed.

    Context: {context}

    {questions_and_answers}
    """
)
# Create the LLMChain that binds the prompt to the configured LLM.
qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

In [None]:
def _parse_qa_pairs(qa_text, num_pairs):
    """Extract up to ``num_pairs`` complete ``Qn:`` / ``An:`` pairs from LLM output."""
    qa_pairs = []
    for number in range(1, num_pairs + 1):
        _, q_found, after_q = qa_text.partition(f"Q{number}:")
        question, a_found, after_a = after_q.partition(f"A{number}:")
        if not (q_found and a_found):
            # The model omitted or truncated this pair; skip it rather than
            # raising and discarding everything generated so far.
            continue
        # Stop at the next question marker if present; for the final pair this
        # also trims any extra pairs the model produced beyond `num_pairs`.
        answer = after_a.partition(f"Q{number + 1}:")[0]
        question, answer = question.strip(), answer.strip()
        if question and answer:
            qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs


def generate_multiple_qa_pairs(context, num_pairs):
    """Ask the QA chain for ``num_pairs`` question-answer pairs about ``context``.

    Args:
        context: Source text the questions should be grounded in.
        num_pairs: Number of question-answer pairs to request.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts. The list may be
        shorter than ``num_pairs`` when the model's reply is missing markers
        or contains empty questions/answers; it is empty (and no LLM call is
        made) when ``num_pairs`` is not positive.
    """
    if num_pairs <= 0:
        return []
    # Empty "Qn:/An:" skeleton appended to the prompt for the model to fill in.
    questions_and_answers = "\n".join([f"Q{i+1}: \nA{i+1}: \n" for i in range(num_pairs)])
    qa_text = qa_chain.run(context=context, num_pairs=num_pairs, questions_and_answers=questions_and_answers)
    return _parse_qa_pairs(qa_text, num_pairs)

In [None]:
def create_qa_dataset(pdf_text, lda_model, terms, total_questions=1200, num_pairs_per_call=10):
    """Generate a question-answer dataset spread across the LDA topics.

    For each topic, sentences of ``pdf_text`` that mention one of the topic's
    ten highest-weighted terms form the context handed to
    ``generate_multiple_qa_pairs``.

    Args:
        pdf_text: Full document text; sentences are obtained by splitting on ". ".
        lda_model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        terms: Vocabulary; ``terms[j]`` is the token for column ``j`` of
            ``components_``.
        total_questions: Target (and maximum) number of pairs to return.
        num_pairs_per_call: Maximum number of pairs requested per LLM call.

    Returns:
        A list of at most ``total_questions`` ``{"question", "answer"}`` dicts.
        Fewer are returned when topics have no matching sentences (they are
        skipped instead of prompting with an empty context) or when the
        generator yields fewer pairs than requested.
    """
    topics = lda_model.components_
    num_topics = len(topics)
    if num_topics == 0 or total_questions <= 0 or num_pairs_per_call <= 0:
        return []

    # Ceiling division: flooring here (and again for the call count) would
    # under-produce, and would produce nothing at all for small targets.
    n_questions_per_topic = -(-total_questions // num_topics)
    sentences = pdf_text.split(". ")  # loop-invariant, so split only once
    all_qa_pairs = []

    for topic in topics:
        remaining = total_questions - len(all_qa_pairs)
        if remaining <= 0:
            break

        topic_words = [str(terms[i]) for i in topic.argsort()[::-1][:10]]
        if not topic_words:
            continue
        # Vocabulary terms are lower-cased by the vectorizer, so match
        # case-insensitively, and on word boundaries so that a short term does
        # not match inside an unrelated longer word.
        word_pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(word) for word in topic_words) + r")\b",
            re.IGNORECASE,
        )
        context_sentences = [sentence for sentence in sentences if word_pattern.search(sentence)]
        if not context_sentences:
            # No supporting text: skip rather than ask the LLM to invent content.
            continue
        context = ". ".join(context_sentences[:10])  # Limit context size for better generation quality

        topic_target = min(n_questions_per_topic, remaining)
        produced = 0
        # Bounded number of calls, so a generator that keeps returning too few
        # pairs cannot cause an endless loop.
        for _ in range(-(-topic_target // num_pairs_per_call)):
            batch_size = min(num_pairs_per_call, topic_target - produced)
            qa_pairs = generate_multiple_qa_pairs(context, batch_size)
            all_qa_pairs.extend(qa_pairs)
            produced += len(qa_pairs)
            if produced >= topic_target:
                break

    return all_qa_pairs[:total_questions]

In [None]:
# Build the QA dataset from the PDF text using the best-scoring topic model.
qa_dataset = create_qa_dataset(pdf_text=pdf_text, lda_model=optimal_model, terms=terms)

In [None]:
def save_json(data, filename, directory):
    """Write ``data`` as indented JSON to ``directory/filename``.

    Args:
        data: JSON-serializable object to write.
        filename: Name of the output file.
        directory: Directory to write into; created (including parents) if it
            does not exist. An empty string means the current directory.

    Raises:
        OSError: if the directory cannot be created or the file cannot be written.
        TypeError: if ``data`` is not JSON-serializable.
    """
    if directory:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    with open(filepath, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)


In [None]:
# Persist the generated dataset to qa_datasets/qa_dataset.json.
save_json(data=qa_dataset, filename='qa_dataset.json', directory='qa_datasets')