In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
#from langchain.schema import Document

from langchain_core.documents import Document

#from langchain_community.vectorstores import chroma
from langchain_chroma import Chroma
#utility imports
import numpy as np
from typing import List




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#RAG IMPLIMENTATION
print("""
RAG ARCHITECTURE:
      1. Document Loading: Loading required documents
      2.Document Splitting: Breaking documets into small chunks
      3.Embedding Generation: Converting all the embeddings into vectors
      4.vector store: storing all the vectors into chromadb
      5.Query processing: converting user query to vector embedding
      6.Sinilarity search: Finding relevant chunks from the vector store
      7.Context Augmentation:combine retrieved chunks with the query
      8.Response generation:LLM generates answers using context
""")
print("""
     it is used to reduce the hallucinations
     provide upto date informations
      it allows to work with domain specific knowldge
      it reduces training cost unlike fine tuning """)


RAG ARCHITECTURE:
      1. Document Loading: Loading required documents
      2.Document Splitting: Breaking documets into small chunks
      3.Embedding Generation: Converting all the embeddings into vectors
      4.vector store: storing all the vectors into chromadb
      5.Query processing: converting user query to vector embedding
      6.Sinilarity search: Finding relevant chunks from the vector store
      7.Context Augmentation:combine retrieved chunks with the query
      8.Response generation:LLM generates answers using context


     it is used to reduce the hallucinations
     provide upto date informations
      it allows to work with domain specific knowldge
      it reduces training cost unlike fine tuning 


In [6]:
docs=[
    """
   Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. 
   With a strong curiosity about how intelligent systems function, Sri is focused on building 
   solid practical foundations in AI, including Python programming, data structures, model training, and deployment workflows. Instead of limiting learning to theory, Sri
     strongly prefers a hands-on, project-based approach, using tools such as Google Colab, Python libraries, and real-world datasets to gain deeper understanding. 
     This approach helps translate abstract concepts into real implementations, especially in areas like regression, classification, and neural networks.
    """,

    """
Sri has a growing interest in AI-driven healthcare solutions, particularly in cancer detection using MRI and medical imaging.
 This interest has led to working on literature surveys, studying CNN architectures, and exploring how image processing techniques can support early and accurate diagnosis. 
 At the same time, Sri is actively learning about Retrieval-Augmented Generation (RAG) systems, vector databases, embeddings, and modern AI frameworks like LangChain. 
 A key focus is understanding how documents are split, embedded, stored, and retrieved, enabling intelligent systems to reason over custom knowledge bases rather 
 than relying solely on pretrained model knowledge.
""",

"""
Beyond traditional ML and DL, Sri enjoys exploring emerging technologies such as Generative AI, Agentic AI, chatbots, and multimodal systems. 
There is a strong ambition to build end-to-end AI systems, starting from raw data ingestion, moving through embeddings and vector search, and 
ultimately generating meaningful, context-aware responses using large language models. Sri is detail-oriented and
 values updated, correct, and production-ready code, especially when working with fast-evolving libraries. 
 With consistent effort, curiosity, and a strong preference for learning by doing, Sri is steadily developing the skills required to become a high-impact, 
 highly-paid machine learning engineer.
"""
]
docs

['\n   Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. \n   With a strong curiosity about how intelligent systems function, Sri is focused on building \n   solid practical foundations in AI, including Python programming, data structures, model training, and deployment workflows. Instead of limiting learning to theory, Sri\n     strongly prefers a hands-on, project-based approach, using tools such as Google Colab, Python libraries, and real-world datasets to gain deeper understanding. \n     This approach helps translate abstract concepts into real implementations, especially in areas like regression, classification, and neural networks.\n    ',
 '\nSri has a growing interest in AI-driven healthcare solutions, particularly in cancer detection using MRI and medical imaging.\n This interest has led to working on literature surveys, studying CNN architectures, and exploring how image proce

In [6]:
## save sample documents to files
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(docs):
    with open(f"{temp_dir}/doc_{i}.txt","w") as f:
        f.write(doc)

print(f"Sample document create in : {temp_dir}")

Sample document create in : C:\Users\DELL\AppData\Local\Temp\tmpuyrge44k


In [7]:
temp_dir

'C:\\Users\\DELL\\AppData\\Local\\Temp\\tmpuyrge44k'

In [7]:
#STEP1:  -----> DOCUMENT LOADING
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Load documents from directory
loader = DirectoryLoader(
    "data", 
    glob="*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"\nFirst document preview:")
print(documents[0].page_content[:200] + "...")


Loaded 3 documents

First document preview:

   Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. 
   With a strong curiosity about how intellig...


In [9]:
documents

[Document(metadata={'source': 'data\\doc_0.txt'}, page_content='\n   Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. \n   With a strong curiosity about how intelligent systems function, Sri is focused on building \n   solid practical foundations in AI, including Python programming, data structures, model training, and deployment workflows. Instead of limiting learning to theory, Sri\n     strongly prefers a hands-on, project-based approach, using tools such as Google Colab, Python libraries, and real-world datasets to gain deeper understanding. \n     This approach helps translate abstract concepts into real implementations, especially in areas like regression, classification, and neural networks.\n    '),
 Document(metadata={'source': 'data\\doc_1.txt'}, page_content='\nSri has a growing interest in AI-driven healthcare solutions, particularly in cancer detection using MRI and medical

In [8]:
#step2----->DOCUMENT SPLITTING
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Maximum size of each chunk
    chunk_overlap=50,  # Overlap between chunks to maintain context
    length_function=len,
    separators=[" "]  # Hierarchy of separators
)
chunks=text_splitter.split_documents(documents)

print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"\nChunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 6 chunks from 3 documents

Chunk example:
Content: Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. ...
Metadata: {'source': 'data\\doc_0.txt'}


In [11]:
print("""EMBEDDINGS can be created using openaiembeddings but 
"vectorstore(chromadb) --->converts chunks into embeddings(provided api for openai...it uses any embedding model )-->stores it--->does similarity search also.""")


EMBEDDINGS can be created using openaiembeddings but 
"vectorstore(chromadb) --->converts chunks into embeddings(provided api for openai...it uses any embedding model )-->stores it--->does similarity search also.


In [12]:
chunks

[Document(metadata={'source': 'data\\doc_0.txt'}, page_content='Sri is a passionate and highly motivated learner who is actively transitioning into the fields of Machine Learning, Deep Learning, and Generative AI. \n   With a strong curiosity about how intelligent systems function, Sri is focused on building \n   solid practical foundations in AI, including Python programming, data structures, model training, and deployment workflows. Instead of limiting learning to theory, Sri\n     strongly prefers a hands-on, project-based approach, using tools such as'),
 Document(metadata={'source': 'data\\doc_0.txt'}, page_content='project-based approach, using tools such as Google Colab, Python libraries, and real-world datasets to gain deeper understanding. \n     This approach helps translate abstract concepts into real implementations, especially in areas like regression, classification, and neural networks.'),
 Document(metadata={'source': 'data\\doc_1.txt'}, page_content='Sri has a growing 

In [13]:
print("""
initializing Chromadb and storing chunks in it

""")


initializing Chromadb and storing chunks in it




In [14]:
## Create a Chromdb vector store
persist_directory="./chroma_db"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"

)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 6 vectors
Persisted to: ./chroma_db


In [None]:

#vectorstore.delete_collection()

#print("Collection deleted successfully")

In [15]:
#SIMILARITY SEARCH----->
query="what does sri enjoys doing??"

similar_docs=vectorstore.similarity_search(query,k=3)
similar_docs


print("""
top 3 chunks are picked from similarity search--LLM IS NOT INVOLVED YET
""")


top 3 chunks are picked from similarity search--LLM IS NOT INVOLVED YET



In [16]:
print(
    """
ChromaDB stores 4 main things.

1.Embeddings (Vectors) 
These are the numerical representations of your text.

2.Original Text (Document Content)
The actual text chunk---Not just the vector cause After similarity search, you need the original text to send to the LLM

3.Metadata
Metadata is extra information about each chunk

4.Internal IDs & Indexes 
ChromaDB also stores:
-->Unique IDs for each vector
-->Index structures (for fast search)
-->Collection information
-->You usually don't interact with this directly
"""
)


ChromaDB stores 4 main things.

1.Embeddings (Vectors) 
These are the numerical representations of your text.

2.Original Text (Document Content)
The actual text chunk---Not just the vector cause After similarity search, you need the original text to send to the LLM

3.Metadata
Metadata is extra information about each chunk

4.Internal IDs & Indexes 
ChromaDB also stores:
-->Unique IDs for each vector
-->Index structures (for fast search)
-->Collection information
-->You usually don't interact with this directly



In [16]:
#convert vector store to retriever
#create document chain
#create rag chain

In [18]:
query="what is Deep Learning?"

similar_docs=vectorstore.similarity_search(query,k=3)
similar_docs


#similarity search with scores
results_scores=vectorstore.similarity_search_with_score(query,k=3)
results_scores

[(Document(id='0e06feb1-fda8-430b-8d7e-d1fd3726263a', metadata={'source': 'data\\doc_0.txt'}, page_content='project-based approach, using tools such as Google Colab, Python libraries, and real-world datasets to gain deeper understanding. \n     This approach helps translate abstract concepts into real implementations, especially in areas like regression, classification, and neural networks.'),
  0.3926593065261841),
 (Document(id='846d1fb1-18fb-4726-a432-d378215521aa', metadata={'source': 'data\\doc_2.txt'}, page_content='Beyond traditional ML and DL, Sri enjoys exploring emerging technologies such as Generative AI, Agentic AI, chatbots, and multimodal systems. \nThere is a strong ambition to build end-to-end AI systems, starting from raw data ingestion, moving through embeddings and vector search, and \nultimately generating meaningful, context-aware responses using large language models. Sri is detail-oriented and\n values updated, correct, and production-ready code, especially when 

In [19]:
#### Understanding Similarity Scores
print("""
The similarity score represents how closely related a document chunk is to your query. The scoring depends on the distance metric used:

ChromaDB default: Uses L2 distance (Euclidean distance)

- Lower scores = MORE similar (closer in vector space)
- Score of 0 = identical vectors
- Typical range: 0 to 2 (but can be higher)


Cosine similarity (if configured):

- Higher scores = MORE similar
- Range: -1 to 1 (1 being identical)
      """)


The similarity score represents how closely related a document chunk is to your query. The scoring depends on the distance metric used:

ChromaDB default: Uses L2 distance (Euclidean distance)

- Lower scores = MORE similar (closer in vector space)
- Score of 0 = identical vectors
- Typical range: 0 to 2 (but can be higher)


Cosine similarity (if configured):

- Higher scores = MORE similar
- Range: -1 to 1 (1 being identical)
      


In [20]:
from langchain_openai import ChatOpenAI

llm=ChatOpenAI(
    model_name="gpt-3.5-turbo"
)


In [21]:
test_response=llm.invoke("What is Large Language Models")
test_response

AIMessage(content="Large language models are a type of artificial intelligence that can process and generate human-like text. These models are typically trained on large datasets of text and are capable of understanding and generating language with a high degree of accuracy and fluency. They can be used for a wide range of natural language processing tasks, including text generation, translation, sentiment analysis, and more. These models have been a significant advancement in the field of AI and have led to improvements in various applications such as chatbots, language translation, and content generation. Some popular examples of large language models include OpenAI's GPT-3 and Google's BERT.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 125, 'prompt_tokens': 12, 'total_tokens': 137, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_

In [22]:
## Convert vector store to retriever
retriever=vectorstore.as_retriever(
    search_kwarg={"k":3} ## Retrieve top 3 relevant chunks
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001F4A32DC690>, search_kwargs={})

In [23]:
## Create a prompt template
from langchain_core.prompts import ChatPromptTemplate
system_prompt="""You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context: {context}"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

In [24]:
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\n\nContext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])