In [20]:
import getpass
import os
import dotenv
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import StrOutputParser # It is used for p=output parsing
from typing import List
from pydantic import BaseModel, Field
from typing import List
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader # text loader 
from langchain_text_splitters import RecursiveCharacterTextSplitter # splitting the document reccursively
from typing import List
from langchain_core.documents import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage
from langchain_google_genai import ChatGoogleGenerativeAI


dotenv.load_dotenv()

True

## Document Loading

In [10]:
def load_documents(folder_path: str) -> List[Document]: # returns a list of Document object
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path) # It is a loader for PDF files.
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path) # It is a loader for docx file. 
        else:
            print(f"Unsupported file type: {filename}") # if the file is none of any type, print unsupported file.
            continue
        documents.extend(loader.load()) # loader.load() function is specified above.
    return documents

folder_path = "D:/MLops/DataScienceProject_1/RAG_ChatBot/docs"
documents = load_documents(folder_path)
print(f"Loaded {len(documents)} documents from the folder.")

Loaded 3 documents from the folder.


In [11]:
documents

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2024-05-12T03:26:02+00:00', 'author': 'Ujjwal Gupta', 'moddate': '2024-05-12T03:26:02+00:00', 'source': 'D:/MLops/DataScienceProject_1/RAG_ChatBot/docs\\Ujjwal Gupta CV.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="CV \n \nSubject: Application for Internship Opportunity at The Indian Patent Offive \n \nDear Hiring Manager, \n \nI am writing to express my keen interest in the internship opportunity within your esteemed \norganization, as advertised. With a strong foundation in Computer Science and Engineering, coupled \nwith a deep passion for Artificial Intelligence (AI) and Machine Learning (ML), I am excited about the \npossibility of contributing to your team and furthering my professional development in this dynamic \nfield. \n \nCurrently pursuing my second year of B.Tech. in Computer Science and Engineering at Maharaja \nAgrasen Institute of Technology, m

## Document Splitting

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

splits = text_splitter.split_documents(documents)

## Document Embedding

In [13]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
document_embeddings = embedding_function.embed_documents([split.page_content for split in splits])

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [14]:
document_embeddings[0]

[-0.10116024315357208,
 -0.00020280103490222245,
 0.04198579117655754,
 -0.06857962161302567,
 -0.02047364041209221,
 -0.04980912804603577,
 0.029408005997538567,
 0.028651801869273186,
 -0.05072375386953354,
 -0.03247469663619995,
 -0.05381416901946068,
 -0.07052727788686752,
 0.08150126039981842,
 -0.06255161762237549,
 -0.03781336173415184,
 0.10670776665210724,
 0.06844929605722427,
 -0.08297166228294373,
 -0.01851995848119259,
 -0.06903177499771118,
 0.023464174941182137,
 0.003555864095687866,
 0.025925319641828537,
 -0.0833381935954094,
 -0.09990009665489197,
 0.08743153512477875,
 0.069369375705719,
 -0.03300537168979645,
 0.003436449682340026,
 -0.02685597911477089,
 0.029566125944256783,
 0.04432011395692825,
 0.048100147396326065,
 0.046200234442949295,
 0.049375344067811966,
 0.04202382266521454,
 -0.03465970605611801,
 -0.003917180933058262,
 0.09273724257946014,
 -0.05046144872903824,
 -0.009799106977880001,
 -0.03801983222365379,
 -0.002897376660257578,
 -0.0450800843536

## Putting Embeddings in chroma DB

In [17]:
collection_name = "my_collection"
vector_db = Chroma.from_documents(
    collection_name=collection_name,
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db"
)
print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


## Context Retriever Chain

In [18]:
def _get_context_retriever_chain(vector_db, llm) :
    retriever = vector_db.as_retriever(search_kwargs={"k": 2})
    contextualize_q_system_prompt = """
        Given a chat history and the latest user question
        which might reference context in the chat history,
        formulate a standalone question which can be understood
        without the chat history. Do NOT answer the question,
        just reformulate it if needed and otherwise return it as is.
    """

    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )


    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )


    return history_aware_retriever



In [19]:
def get_conversational_rag_chain(llm) :
    retriever_chain = _get_context_retriever_chain(vector_db, llm)
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
        ("system", "Context: {context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ])

    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(retriever_chain, question_answer_chain)

    return rag_chain



In [None]:
# Augmented Generation

llm_stream_gemini = init_chat_model("gemini-2.0-flash", model_provider="google_genai")


llm_stream = llm_stream_gemini  # Select between OpenAI and Anthropic models for the response

rag_chain = ChatGoogleGenerativeAI(
        model="models/gemini-2.0-flash-latest",
        temperature=0.3,
        convert_system_message_to_human=True,
        verbose=True,
    )
chat_history = []
question1 = "When are skills of Ujjwal Gupta ?"
answer1 = rag_chain.invoke({"input": question1, "chat_history": chat_history})['answer']
chat_history.extend([
    HumanMessage(content=question1),
    AIMessage(content=answer1)
])

print(f"Human: {question1}")
print(f"AI: {answer1}\n")

question2 = "What is his Email-id?"
answer2 = rag_chain.invoke({"input": question2, "chat_history": chat_history})['answer']
chat_history.extend([
    HumanMessage(content=question2),
    AIMessage(content=answer2)
])

print(f"Human: {question2}")
print(f"AI: {answer2}")

Human: When are skills of Ujjwal Gupta ?
AI: Based on the provided CV, Ujjwal Gupta has skills in the domains of Android development, cybersecurity, and Data Structures and Algorithms (DSA).

Human: What is his Email-id?
AI: I am sorry, but the email id of the applicant is not mentioned in the context.
