In [12]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv
import google.generativeai as genai

In [21]:
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)
print(api_key)

AIzaSyCeMUHMBYeK-u2-j83Texb0Ma2hnhbce6E


In [14]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()
    return documents

In [15]:
extracted_data = load_pdf(r"C:\Users\rokig\OneDrive\Desktop\Algo Generator\data")

Previous trailer cannot be read: ("'NumberObject' object is not subscriptable",)
parsing for Object Streams


In [16]:
extracted_data

[Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\data\\cormen book.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\data\\cormen book.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\data\\cormen book.pdf', 'page': 2}, page_content='Introduction to Algorithms \nFourth Edition '),
 Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\data\\cormen book.pdf', 'page': 3}, page_content=''),
 Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\data\\cormen book.pdf', 'page': 4}, page_content='Thomas H. Cormen \nCharles E. Leiserson \nRonald L. Rivest \nClifford Stein \nIntroduction to Algorithms \nFourth Edition \nThe MIT Press \nCambridge, Massachusetts London, England '),
 Document(metadata={'source': 'C:\\Users\\rokig\\OneDrive\\Desktop\\Algo Generator\\d

In [17]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 10)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [18]:
chunks = text_split(extracted_data)
length = len(chunks)
print("Total number of chunks:", length)

Total number of chunks: 6508


In [20]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_index = FAISS.from_documents(chunks, embedding_model)
faiss_index.save_local("faiss_index")




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
faiss_index = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True
)
def search_similar_documents(query, k=2):
    results = faiss_index.similarity_search(query, k=k)
    return [{"content": result.page_content, "metadata": result.metadata} for result in results]


query = "What is the time complexity of merge sort"
top_results = search_similar_documents(query)

for idx, result in enumerate(top_results, 1):
    print(f"Result {idx}:\n{result['content']}\n")

Result 1:
10 7 instructions/second  1163 seconds (under 20 minutes) : 
By using an algorithm whose running time grows more  slowly, even with a poor 
compiler, computer B runs more than 17 times faster than computer A! The ad- 
vantage of merge sort is even more pronounced when sorting 100 million numbers: 
where insertion sort takes more than 23 days, merge sort takes under four hours. 
Although 100 million might seem like a large number, there are more than 100 mil-

Result 2:
merge sort’s dividing and combining times together are ‚.n/. Adding ‚.n/ to 
the 2T.n=2/ term from the conquer step gives the recurrence for  the worst -case 
running time T.n/ of merge sort: 
T.n/ D 2T.n=2/ C ‚.n/: (2.3) 
Chapter 4 presents the <master theorem,= which shows that T.n/ D ‚.n lg n/. 17 
Compared with insertion sort, whose worst-case running time is ‚.n 2 /, merge sort 
trades away a factor of n for a factor of lg n. Because the logarithm function grows



In [25]:
def get_conversational_chain():
    prompt_template = """
    use the given algorithm informations and provide the user answers.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}
    Question: {question}

    Only return the helpful answer below and nothing else.
    Helpful answer:
    """
    model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

In [26]:
def user_input(user_question):
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    new_db = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
    return response["output_text"]

In [27]:
while True:
    user_question = input("You: ")
    if user_question.lower() == "exit":
        break
    answer = user_input(user_question)
    print(f"Algo bot: {answer}\n")

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
  response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)


Algo bot: Θ(n lg n)


