### Data Ingestion



In [47]:
### Document Structure

from langchain_core.documents import Document

In [48]:
# Hand-built example Document: page_content carries the text, metadata carries free-form tags.
doc = Document(
    page_content="This is the main content of the document. I am using it to create RAG",
    metadata={
        "source": "idontknow.txt",
        "author": "Sumit Padwal",
        "pages": 5,
        "date_created": "2026-02-02",
    },
)
doc

Document(metadata={'source': 'idontknow.txt', 'author': 'Sumit Padwal', 'pages': 5, 'date_created': '2026-02-02'}, page_content='This is the main content of the document. I am using it to create RAG')

In [49]:
## Create the directory that will hold the sample text files
import os
os.makedirs("../data/data", exist_ok=True)  # exist_ok=True: no error if the directory already exists

In [50]:
# Maps each output file path to the text that is written into that file by the loop below.
sample_texts={
    "../data/data/python_intro.txt":"""Python Programming Language

Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991.

Python is widely used in:
- Web development
- Data science and machine learning
- Automation and scripting
- Artificial intelligence
- Software development

One of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write
clear and concise code. This makes Python an excellent choice for beginners as well as professionals.

Python supports multiple programming paradigms, including procedural, object-oriented, and functional
programming. It also has a large standard library and a strong community, which helps developers build
applications faster and more efficiently.
"""
}

# Write every sample text to its target path (UTF-8), then report once.
for target_path, text in sample_texts.items():
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)
print("Sample text files created.")

Sample text files created.


In [51]:
### TextLoader: load a single text file
from langchain_community.document_loaders import TextLoader

# Read the sample file written earlier, decoding it as UTF-8.
loader = TextLoader("../data/data/python_intro.txt", encoding="utf-8")
Documents = loader.load()  # the printed result is a list holding one Document
print(Documents)

[Document(metadata={'source': '../data/data/python_intro.txt'}, page_content='Python Programming Language\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991.\n\nPython is widely used in:\n- Web development\n- Data science and machine learning\n- Automation and scripting\n- Artificial intelligence\n- Software development\n\nOne of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write\nclear and concise code. This makes Python an excellent choice for beginners as well as professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional\nprogramming. It also has a large standard library and a strong community, which helps developers build\napplications faster and more efficiently.\n')]


In [52]:
### DirectoryLoader: load every matching file under a directory
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/data",
    glob="**/*.txt",  # pattern selecting the .txt files to load
    loader_cls=TextLoader,  # loader class applied to each matched file
    loader_kwargs={"encoding": "utf-8"},  # keyword arguments handed to TextLoader
    show_progress=False
)

Documents = dir_loader.load()  # rebinds `Documents`, replacing the single-file TextLoader result
Documents

[Document(metadata={'source': '..\\data\\data\\python_intro.txt'}, page_content='Python Programming Language\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991.\n\nPython is widely used in:\n- Web development\n- Data science and machine learning\n- Automation and scripting\n- Artificial intelligence\n- Software development\n\nOne of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write\nclear and concise code. This makes Python an excellent choice for beginners as well as professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional\nprogramming. It also has a large standard library and a strong community, which helps developers build\napplications faster and more efficiently.\n')]

In [53]:

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# NOTE(review): PyPDFLoader is imported but not used in this cell.
## load all the PDF files from the directory
# `dir_loader` is rebound here; it no longer refers to the .txt loader created earlier.
dir_loader = DirectoryLoader(
    "../data/data/pdf",
    glob="**/*.pdf",  # pattern selecting the .pdf files to load
    loader_cls=PyMuPDFLoader,  # loader class applied to each matched file
    show_progress=False
)
pdf_Documents = dir_loader.load()
pdf_Documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}, page_content='SUMIT SUNIL PADWAL \nChicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github \nEDUCATION \n \nIllinois Institute of Technology, Chicago, IL                                                                                               Expected Graduation – May 2026 \nMaster of Science, Computer Science \nAjeenkya D Y Patil University, India                                                                                                                      

In [54]:
# Check the type of the first loaded item (raises IndexError if no PDFs were loaded).
type(pdf_Documents[0])

langchain_core.documents.base.Document

### Chunks

In [55]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200, separators=None):
    """Split documents into overlapping chunks and print a short summary.

    Args:
        documents: List of LangChain ``Document`` objects to split.
        chunk_size: Target maximum chunk length passed to the splitter
            (default 1000, the value previously hard-coded).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 200, the value previously hard-coded).
        separators: Split points tried in order by the splitter. ``None``
            selects the previous hard-coded list: blank line, newline,
            space, then the empty string.

    Returns:
        The list of chunk ``Document`` objects produced by the splitter.
    """
    if separators is None:
        separators = ["\n\n", "\n", " ", ""]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
    )
    split_docs = text_splitter.split_documents(documents)

    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        first_chunk = split_docs[0]
        preview = first_chunk.page_content[:200]
        # Only add the ellipsis when the preview really is shorter than the chunk.
        ellipsis = "..." if len(first_chunk.page_content) > 200 else ""
        print("\nExample chunk:")
        print(f"Content: {preview}{ellipsis}")
        print(f"Metadata: {first_chunk.metadata}")

    return split_docs

In [56]:
# Split the loaded PDF documents into chunks; split_documents also prints a summary.
chunks = split_documents(pdf_Documents)

# Display every chunk. NOTE(review): this echoes the full document text into the saved output.
chunks



Split 1 documents into 6 chunks

Example chunk:
Content: SUMIT SUNIL PADWAL 
Chicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github 
EDUCATION 
 
Illinois Institute of Technology, Chicago, IL                                                  ...
Metadata: {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}


[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}, page_content='SUMIT SUNIL PADWAL \nChicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github \nEDUCATION \n \nIllinois Institute of Technology, Chicago, IL                                                                                               Expected Graduation – May 2026 \nMaster of Science, Computer Science \nAjeenkya D Y Patil University, India                                                                                                                      

### Embedding and VectorStoreDB

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model handed to the vector store below; selected by its Hugging Face model id.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5"
)


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 472.35it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [59]:
from langchain_community.vectorstores import Chroma

import uuid

# Derive a stable id for every chunk from its source, page and position so that
# re-running this cell overwrites the same records instead of appending duplicate
# copies of every chunk to the persisted collection.
# Records written by earlier runs under random ids are not removed; delete
# ./chroma_db once to start from a clean store.
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{chunk.metadata.get('source', '')}|{chunk.metadata.get('page', '')}|{index}",
        )
    )
    for index, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db"  # optional but recommended
)

# No explicit vectorstore.persist() call: since Chroma 0.4 a store created with a
# persist_directory is written to disk automatically and persist() is deprecated.


In [60]:
# Retriever over the vector store: similarity search, configured with k=5 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # with only 6 chunks in the store, k=5 returns almost all of them
)


In [61]:
from langchain_openai import ChatOpenAI


In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI  # or Ollama
import os

# Never hard-code the key, and never overwrite a key that is already configured
# with an empty string. Use the environment value when present; otherwise ask
# for it interactively without echoing it into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Chat model used as the final step of the RAG chain.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


# Prompt text with two placeholders: {context} and {question}.
rag_template = """
Answer the question using ONLY the context below.

Context:
{context}

Question:
{question}
"""
prompt = PromptTemplate.from_template(rag_template)

def format_docs(docs):
    """Concatenate the page_content of every document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

# Inputs for the prompt: the retriever output rendered as text by format_docs,
# plus the incoming question passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: gather inputs -> fill the prompt -> call the chat model.
rag_chain = chain_inputs | prompt | llm

response = rag_chain.invoke("What is Sumit Padwal number")
print(response.content)


Sumit Padwal's number is 312-358-0500.
