In [1]:
import os
import uuid
from pathlib import Path

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups for the API keys further down can find them. The
# trailing semicolon keeps the notebook from echoing the call's return value.
load_dotenv();


In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_qdrant import QdrantVectorStore
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser


In [3]:
# Locate the PDF in the "documents" folder next to the notebook's working
# directory and load it into a list of Document objects.
pdf_path = Path.cwd().parent / "documents" / "doc_nodejs.pdf"
loader = PyPDFLoader(file_path=str(pdf_path))
docs = loader.load()


Ignoring wrong pointing object 268 0 (offset 0)
Ignoring wrong pointing object 309 0 (offset 0)


In [4]:
# Peek at the first loaded document (text and metadata).
print(docs[0])


page_content='A PDF Reference for        The Complete Node.js Dev Course                Version 3.0' metadata={'producer': 'macOS Version 10.14.1 (Build 18B75) Quartz PDFContext', 'creator': 'Acrobat PDFMaker 17 for Word', 'creationdate': "D:20190227140340Z00'00'", 'author': 'Andrew Mead', 'moddate': "D:20190227140340Z00'00'", 'source': 'd:\\Python\\gen_ai\\documents\\doc_nodejs.pdf', 'total_pages': 125, 'page': 0, 'page_label': '1'}


In [None]:
# Cut the loaded documents into overlapping chunks and report how many chunks
# were produced.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)
splitted_docs = splitter.split_documents(docs)
print(len(splitted_docs))


333


In [None]:
# Inspect the first chunk produced by the splitter (text plus its metadata).
print(splitted_docs[0])


page_content='A PDF Reference for        The Complete Node.js Dev Course                Version 3.0' metadata={'producer': 'macOS Version 10.14.1 (Build 18B75) Quartz PDFContext', 'creator': 'Acrobat PDFMaker 17 for Word', 'creationdate': "D:20190227140340Z00'00'", 'author': 'Andrew Mead', 'moddate': "D:20190227140340Z00'00'", 'source': 'd:\\Python\\gen_ai\\documents\\doc_nodejs.pdf', 'total_pages': 125, 'page': 0, 'page_label': '1'}


In [3]:
# Embedding client used for both indexing and querying; the API key is read
# from the environment.
embedder = CohereEmbeddings(
    model="embed-english-v3.0",
    cohere_api_key=os.getenv("COHERE_API_KEY"),
)


In [None]:
# Smoke-test the embedding client. Show only the vector's length and its first
# few values: echoing the entire embedding would flood the notebook output.
sample_vector = embedder.embed_query("Hey there")
len(sample_vector), sample_vector[:5]


In [4]:
# Build the vector-store handle with an empty document list; the chunks are
# pushed separately by the add_documents cell that follows.
vector_store = QdrantVectorStore.from_documents(
    documents=[],
    embedding=embedder,
    collection_name="qdrant_basic_rag",
    url="http://localhost:6333",
)


In [None]:
# Derive a deterministic point id for every chunk from its source, page and
# text. Points are written by id, so re-running this cell overwrites the same
# points instead of appending a duplicate copy of every chunk (which is what
# happens when ids are left to be generated randomly on each call).
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{chunk.metadata.get('source')}|{chunk.metadata.get('page')}|{chunk.page_content}",
        )
    )
    for chunk in splitted_docs
]
inserted_ids = vector_store.add_documents(documents=splitted_docs, ids=chunk_ids)
# Report a count rather than echoing the full list of ids.
print(f"Upserted {len(inserted_ids)} chunks")


In [5]:
# Handle onto the existing Qdrant collection; the functions below call
# retriever.similarity_search(...) on it.
retriever = QdrantVectorStore.from_existing_collection(
    embedding=embedder,
    url="http://localhost:6333",
    collection_name="qdrant_basic_rag",
)


In [6]:
# Chat model shared by every chain in this notebook; the API key is read from
# the environment.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    api_key=os.getenv("GOOGLE_API_KEY"),
)


In [8]:
# Chat prompt for the content-writer step. The system message carries the
# instructions, the expected JSON shape, a worked example and the retrieved
# {context}; the human message carries the user's {query}. Doubled braces
# ({{ }}) are template escapes that render as literal braces in the JSON
# samples, so only {context} and {query} are substituted.
content_writer_prompt = ChatPromptTemplate.from_messages([
    ("system", """
        You are an expert content writer for LLMs.

        Goal:
        Your task is to write detailed, structured content that helps the LLM respond accurately to a user's query using the provided context.

        Instructions:
        - Write informative and explanatory content tailored to the user's query based on the given context.
        - Group the content by **relevant page ranges**. For example:
        - Pages 2-4 will be grouped together.
        - Pages 9-12 will be grouped together.
        - Ensure each content block is clear, complete, and helps answer the query.

        Rules:
        - The output must be in valid **JSON** format.
        - Each content block should have an "id" based on the **starting page number** of that block.  
        (e.g., if content is from pages 3-5, then id = 3)

        Output Format:
        {{
            "query": "user query here",
            "output": [
                {{
                    "id": <starting_page_number>,
                    "content": "Relevant content here"
                }},
                ...
            ]
        }}

        Example:
        Input Query: What is the fs module?
        Given Context:
        Page 2: The fs module in Node.js is used to interact with the file system. It allows reading, writing, and deleting files.
        Page 3: It supports both synchronous and asynchronous file operations. Common methods include fs.readFile and fs.writeFile.
        Page 7: The path module is often used with fs to resolve directory paths.
        Page 8: This helps ensure cross-platform compatibility for file and folder paths.
        Output:
        {{
        "query": "What is the fs module?",
        "output": [
            {{
                "id": 2,
                "content": "The fs (File System) module in Node.js provides an API for interacting with the file system. It supports both synchronous and asynchronous methods for reading, writing, appending, and deleting files. Common methods include fs.readFile, fs.writeFile, and fs.unlink. These functions are critical in server-side applications where file manipulation is necessary."
            }},
            {{
                "id": 7,
                "content": "The path module is commonly used alongside the fs module to handle and resolve file paths in a consistent, cross-platform way. It ensures that directory structures and file locations work correctly across different operating systems, which enhances the reliability of file operations performed using fs."
            }}]
        }}

        Now, here is your context:
        <context>
        {context}
        </context>
    """),
    ("human", "{query}")
])


In [9]:
def _format_context(documents) -> str:
    """Render retrieved documents as ``Page N: text`` lines for the prompt."""
    lines = []
    for doc in documents:
        metadata = getattr(doc, "metadata", None) or {}
        # Prefer the printed page label; fall back to the raw page index.
        page = metadata.get("page_label", metadata.get("page", "?"))
        text = getattr(doc, "page_content", str(doc))
        lines.append(f"Page {page}: {text}")
    return "\n".join(lines)


def write_content(query: str):
    """Retrieve chunks for ``query`` and have the LLM write page-grouped content.

    The retrieved documents are rendered as ``Page N: text`` lines, which is
    the layout the content-writer prompt's worked example uses, instead of
    being passed as raw ``Document`` reprs with unrelated metadata.

    Args:
        query: The search/query text.

    Returns:
        ``"Invalid query"`` when ``query`` is empty or ``None``; otherwise the
        parsed JSON dict from the LLM when it is a dict containing an
        ``"output"`` key; otherwise the fallback dict
        ``{"query": query, "output": "Something went wrong"}``.
    """
    # `not query` also covers None, where len(query) would raise TypeError.
    if not query:
        return "Invalid query"
    content_writer_chain = content_writer_prompt | llm | JsonOutputParser()
    documents = retriever.similarity_search(query)
    results = content_writer_chain.invoke(input={
        "query": query,
        "context": _format_context(documents),
    })
    # The parser can yield non-dict JSON (a list, a string, None); a bare
    # `"output" in results` would raise on None or match a substring of a str.
    if isinstance(results, dict) and "output" in results:
        return results
    return {"query": query, "output": "Something went wrong"}


In [55]:
# answer = write_content("What is bcryptjs?")
# print(answer)


In [10]:
# Chat prompt for the query-expansion step. The system message asks for up to
# four alternative phrasings of the user's query, returned as JSON; the human
# message carries the user's {query}. Doubled braces ({{ }}) are template
# escapes that render as literal braces in the JSON samples.
query_generator_prompt = ChatPromptTemplate.from_messages([
    ("system", """
        You are an expert query generator for LLMs and search engines.

        Goal:
        Your task is to generate **alternative but equivalent queries** based on a given user query. These should capture the **same intent** but use different phrasings or keywords. This helps in improving retrieval quality from vector databases or semantic search systems.

        Instructions:
        - Write **up to 4 alternative queries** that tackle the same problem the user wants to solve.
        - Use **semantically rich keywords** that are helpful for retrieving relevant documents from a vector database.
        - Keep the alternate queries **short and focused**, avoiding unnecessary words.
        - Ensure all alternate queries preserve the **original intent**.

        Rules:
        - Output must be in **valid JSON** format.
        - Avoid generating queries that are too vague or overly broad.
        - Do not repeat the original query verbatim.

        Output Format:
        {{
            "query": "<original_query>",
            "output": [
                "alternate_query_1",
                "alternate_query_2",
                "alternate_query_3",
                "alternate_query_4"
            ]
        }}

        Example:
        Input Query: What is the fs module?
        Output:
        {{
            "query": "What is the fs module?",
            "output": [
                "Node.js file system module explanation",
                "how to use fs module in Node.js",
                "fs module functions for file handling",
                "Node.js read and write files with fs"
            ]
        }}
    """),
    ("human", "{query}")
])


In [11]:
def query_generator(query: str):
    """Ask the LLM for alternative phrasings of ``query``.

    Args:
        query: The user's original query text.

    Returns:
        ``"Invalid query"`` when ``query`` is empty or ``None``; otherwise the
        parsed JSON dict from the LLM when it is a dict containing an
        ``"output"`` key; otherwise the fallback dict
        ``{"query": query, "output": "Something went wrong"}``.
    """
    # `not query` also covers None, where len(query) would raise TypeError.
    if not query:
        return "Invalid query"
    query_generator_chain = query_generator_prompt | llm | JsonOutputParser()
    results = query_generator_chain.invoke(input={"query": query})
    # The parser can yield non-dict JSON (a list, a string, None); a bare
    # `"output" in results` would raise on None or match a substring of a str.
    if isinstance(results, dict) and "output" in results:
        return results
    return {"query": query, "output": "Something went wrong"}


In [58]:
# queries = query_generator("What is bcrypt?")
# queries


In [17]:
def content_generator(query):
    """Expand ``query`` into alternates, write content for each, and rank it.

    Each alternate query produced by ``query_generator`` is passed to
    ``write_content``. Every returned content block is keyed by its ``"id"``;
    ids are ranked by how many times they were returned across all alternates
    (ties keep first-seen order), and for each id the first content seen is
    kept.

    Args:
        query: The user's original query text.

    Returns:
        ``"Invalid query"`` when ``query`` is falsy; otherwise a list of
        content values ordered from most to least frequently returned id. The
        list is empty when no alternate query produced usable content.
    """
    if not query:
        return "Invalid query"

    generated = query_generator(query)
    candidates = generated.get("output") if isinstance(generated, dict) else None
    # The generator's failure payload is a plain string; iterating it would
    # fire one retrieval + LLM call per character, so only a real list is used.
    if not isinstance(candidates, list):
        candidates = []
    queries = [q for q in candidates if isinstance(q, str) and q.strip()]

    ranking = {}
    first_content = {}
    for alternate in queries:
        written = write_content(alternate)
        # write_content may hand back a bare string or a dict whose "output"
        # is an error string; neither holds content blocks.
        blocks = written.get("output") if isinstance(written, dict) else None
        if not isinstance(blocks, list):
            continue
        for block in blocks:
            if not isinstance(block, dict) or "content" not in block:
                continue
            block_id = block.get("id")
            # Skip blocks with a missing or unhashable id instead of crashing.
            if not isinstance(block_id, (int, float, str)):
                continue
            ranking[block_id] = ranking.get(block_id, 0) + 1
            first_content.setdefault(block_id, block["content"])

    print(ranking)

    sorted_ids = sorted(ranking, key=ranking.get, reverse=True)
    return [first_content[block_id] for block_id in sorted_ids]


In [None]:
# context = content_generator("What is bcrypt?")
# context


{77: 3, 78: 3, 79: 1, 80: 1}


['Storing passwords as plain text is highly insecure.  If a database is compromised, hackers gain access to all user passwords, enabling them to use those credentials on other sites.  The solution is to hash passwords using a secure one-way hashing algorithm like bcrypt.  This ensures that even if the database is compromised, the actual passwords remain hidden and secure.',
 "Bcrypt is a library used for hashing passwords.  The `hash` method takes a plain text password and an integer (typically 8) representing the number of hashing rounds. The result is a hashed password which is stored in the database. The `compare` method is used to verify a user's login attempt.  It compares the user's provided plain text password against the stored hashed password.  If they match, the login is successful. An example using `bcrypt.hash` and `bcrypt.compare` is provided in this section;  `bcrypt.hash` hashes the password 'Red12345!', and `bcrypt.compare` compares 'red12345!' (note the case difference

In [19]:
# Chat prompt for the final answering step. The system message casts the model
# as a teacher, asks for a JSON object with "query" and "output" keys, and
# injects the prepared {context} between <expertise> tags; the human message
# carries the user's {query}. Doubled braces ({{ }}) are template escapes that
# render as literal braces in the JSON sample.
query_resolver_prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are an experienced and knowledgeable teacher dedicated to providing the best educational experience for your students.

    ### Instructions:
    - Answer the student's query **thoroughly** based on your expertise and knowledge.
    - If you do **not** have the answer, respond with a polite message such as:
    *"I'm sorry, but I don't have the information to answer that question right now. Is there something else you would like to know?"*
    - Ensure your answer is **comprehensive, clear**, and exceeds student expectations in detail.
    - Never refer to your knowledge base directly in your response (e.g., avoid saying "based on my knowledge...").
    - Always maintain a respectful, engaging, and informative tone.
    - Utilize your full expertise to provide the most complete answer possible.

    ### Rules:
    - Output must be in **valid JSON** format.
    - Never include content outside of your expertise.
    - Never mention your knowledge base directly.
    - The answer must be as **detailed and informative as possible**, avoiding short or summarized responses.

    ---

    ### Output Format:
    {{
        "query": "<student_query>",
        "output": "<your_detailed_answer>"
    }}

    Now, here is your expertise:
    <expertise>
    {context}
    </expertise>
"""),
    ("human", "{query}")
])


In [20]:
def query_resolver(query: str):
    """Answer ``query`` with the LLM, grounded in ranked or retrieved context.

    Context comes from ``content_generator``. When that yields nothing usable
    (a non-list value, or an empty list), the chunks retrieved directly for the
    original query are used instead, so the answering prompt is not left with
    empty expertise.

    Args:
        query: The user's question.

    Returns:
        The parsed JSON dict from the LLM when it is a dict containing an
        ``"output"`` key; otherwise the fallback dict
        ``{"query": query, "output": "Something went wrong"}``.
    """
    context = content_generator(query)
    # content_generator returns a string for an invalid query and an empty
    # list when no alternate query produced content; fall back in both cases.
    if not isinstance(context, list) or not context:
        context = retriever.similarity_search(query)
    query_resolver_chain = query_resolver_prompt | llm | JsonOutputParser()
    results = query_resolver_chain.invoke(input={
        "query": query,
        "context": context,
    })
    # The parser can yield non-dict JSON (a list, a string, None); a bare
    # `"output" in results` would raise on None or match a substring of a str.
    if isinstance(results, dict) and "output" in results:
        return results
    return {"query": query, "output": "Something went wrong"}


In [21]:
# Interactive entry point: read a question, resolve it through the full
# pipeline, and print the resulting dict.
query = input(">>>>")
results = query_resolver(query)
print(results)


{78: 5, 79: 4, 80: 3, 81: 1, 77: 2}
{'query': 'what is bcryptjs?', 'output': "Bcryptjs is a powerful and widely-used JavaScript library specifically designed for securely hashing passwords.  It's crucial for protecting user credentials and preventing unauthorized access in any application dealing with user accounts.  At its core, bcryptjs implements the bcrypt algorithm, a robust and well-regarded one-way hashing function.  This means that it's computationally infeasible to reverse the hashing process, making it extremely difficult for attackers to recover plain text passwords even if they gain access to your database.\n\nHere's a breakdown of its key features and functionalities:\n\n* **Password Hashing:**  The primary function of bcryptjs is to transform plain text passwords into secure, randomly-salted hashes.  The 'salt' is a randomly generated string added to the password before hashing, making it computationally expensive to crack multiple passwords even if they use the same pass