In [1]:
import logging
import random
import os
import tempfile
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings




In [2]:
os.environ["GROQ_API_KEY"] = "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"

In [3]:
llm = ChatGroq(
        model= "llama3-70b-8192",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2
    )

In [4]:
# loader = GenericLoader.from_filesystem(
#     "/Users/sathya/Desktop/Rag/Electromagnetics_Vol1.pdf",
#     glob="*",
#     suffixes=[".pdf"],
#     parser=GrobidParser(segment_sentences=False),
# )
loader = PyPDFLoader("/Users/sathya/Desktop/Rag/Electromagnetics_Vol1.pdf")

In [5]:
pages = loader.load()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)

In [7]:
from langchain.chains import QAGenerationChain
docs = text_splitter.split_documents(pages)

In [8]:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
# took 37s 

#from langchain.vectorstores import FAISS

#vectorstore = FAISS.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())
#toook 36s 
#----------------------------------------

fast_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=docs, embedding=fast_embeddings)
#fastest so far took 22s


  from tqdm.autonotebook import tqdm, trange


In [9]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm.auto import tqdm

# Initialize the embedding model
# fast_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# # Initialize vectorstore as None
# vectorstore = None

# # Process documents in batches
# batch_size = 1000
# for i in tqdm(range(0, len(docs), batch_size)):
#     batch = docs[i:i+batch_size]
    
#     if vectorstore is None:
#         # Create the vectorstore with the first batch
#         vectorstore = FAISS.from_documents(batch, embedding=fast_embeddings)
#     else:
#         # Add subsequent batches to the existing vectorstore
#         vectorstore.add_documents(batch)


#Fastest so far, doing batch processing and done in 16s 


In [10]:
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k":20})


In [11]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [12]:

question_gen_template = """
You are an expert in generating educational content and question generation. Your task is to create {number_questions} high-quality multiple-choice questions based on the following text. Be sure to generate questions from different areas of focus in your given text and do not stick to the same concept.

IMPORTANT: DO NOT generate questions about specific section numbers, page numbers, or any metadata such as author names, publication dates, etc. Focus only on the core concepts and content related to electromagnetics.

For each question:
1. Identify key concepts or facts from the text.
2. Formulate a clear and concise question based on these key concepts or facts.
3. Generate exactly four distinct and plausible options labeled A, B, C and D. Ensure that only one option is correct.
4. Provide sufficient context in the question so that it can be understood without referring to specific sections or equations.

IMPORTANT: Generate diverse and unique questions. Do not repeat concepts or question types.
Cover a wide range of topics from the given context.

IMPORTANT: Generate exactly {number_questions} multiple-choice questions and do not summarise or ask again if you can generate more, generate exactly that number.

Use this format for each question:
Q: [Question text]
A: [Option A]
B: [Option B]
C: [Option C]
D: [Option D]
Correct Answer: [A/B/C/D]

Example of a good question:
Q: What is the relationship between electric field and magnetic field in an electromagnetic wave?
A: They are perpendicular to each other
B: They are parallel to each other
C: They are at a 45-degree angle to each other
D: There is no relationship between them
Correct Answer: A

Example of a bad question:
Q: What does section 5.1 talk about?


Text:
{context}

Generate {number_questions} multiple-choice questions:
"""


custom_rag_prompt = PromptTemplate.from_template(question_gen_template)

question_gen_chain = (
    {"context": retriever | format_docs, "number_questions": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)



# retriever_chain = retriever | format_docs

# # Define the question generation chain
# question_gen_chain = (
#     {
#         "context": lambda _: retriever_chain.invoke(None),  # Invoke retriever with None input
#         "number_questions": RunnablePassthrough()
#     }
#     | custom_rag_prompt
#     | llm
#     | StrOutputParser()
# )


In [13]:
questions = question_gen_chain.invoke("100")
#all_questions = []
#num_questions = 100
    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
print(questions)

Here are the 100 multiple-choice questions based on the provided text:

Q1: What is the conductivity of highly distilled water?
A: 6µS/m
B: 5S/m
C: 5–50 mS/m
D: 10−4S/m
Correct Answer: A

Q2: What is the relationship between electric field and magnetic field in an electromagnetic wave?
A: They are perpendicular to each other
B: They are parallel to each other
C: They are at a 45-degree angle to each other
D: There is no relationship between them
Correct Answer: A

Q3: What is the voltage reflection coefficient?
A: Γ = ZL + Z0
B: Γ = ZL - Z0
C: Γ = ZL / Z0
D: Γ = ZL × Z0
Correct Answer: B

Q4: What is impedance matching?
A: Transforming a particular impedance ZL into a modified impedance Zin
B: Matching the impedance of a device to its output impedance
C: Matching the impedance of a device to its input impedance
D: Transforming a particular impedance Zin into a modified impedance ZL
Correct Answer: A

Q5: Why is impedance matching necessary?
A: Because all devices in a system operate at