In [15]:
from typing import Dict, Any, List, Optional, Union, TypedDict
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.document_loaders.pdf import PyPDFDirectoryLoader 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import OpenAIEmbeddings 
from langchain.chains import RetrievalQA
from langchain.schema import Document 
#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
import datetime, json, time, os
from dotenv import load_dotenv
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# imported libraries are hidden as well.
warnings.simplefilter("ignore")

# Kept as the last expression so the cell displays load_dotenv()'s return value.
load_dotenv()

True

In [2]:
# Folder scanned for the source PDF files.
pdf_directory = "Data"
# Folder the Chroma vector store is persisted to / reopened from.
persist_directory = "chroma_db"

# Load Documents

In [3]:
# Collect the Documents produced by PyPDFLoader for every PDF in `pdf_directory`.
documents = []
# Sorted so the load order (and therefore the chunk order) is the same on every
# run; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(pdf_directory)):
    # Filter first, so "Processing file" is only reported for files that are
    # actually loaded. The check is case-insensitive to also accept ".PDF".
    if not filename.lower().endswith(".pdf"):
        continue
    print(f"Processing file {filename}")
    loader = PyPDFLoader(os.path.join(pdf_directory, filename))
    documents.extend(loader.load())

Processing file llama-3.pdf
Processing file deepseek-v3.pdf


In [4]:
# Report how many Document objects were loaded from the PDFs.
print("Total document Pages: {}  pages".format(len(documents)))

Total document Pages: 145  pages


# Chunking

In [5]:
# Split the loaded documents into chunks of at most 1500 characters,
# with 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1500,
)
texts = text_splitter.split_documents(documents)

In [6]:
# This cell only reports whether the persist directory is already on disk;
# it does not build or skip anything itself.
if os.path.exists(persist_directory):
    print("ChromaDB already exists. Skipping build.")

# Jupyter only displays the value of a cell's final expression, so the chunk
# count has to come last to be shown.
len(texts)

# Build Vector Store

In [7]:
# The embedding model is needed both to build the store and to reopen it later,
# so it is created regardless of whether the build runs.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Build the persisted store only when it is missing or empty. Building again
# into an existing persist directory would add the same chunks a second time
# (duplicate retrieval hits) and pay for the embeddings again.
# To force a rebuild (new PDFs, different chunking or embedding model), delete
# the persist directory first.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    print(f"Reusing existing ChromaDB at {persist_directory!r}; build skipped.")
else:
    Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
    print("ChromaDB built successfully.")

ChromaDB built successfully.


In [8]:
# Chat model used both on its own and inside the retrieval chain;
# sampling temperature is fixed at 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4.1-nano",
)

In [17]:
# Open the persisted Chroma store with the same embedding function used above,
# then expose it as a retriever with the default search settings.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)
retriever = vectordb.as_retriever()

In [22]:
# Retrieval-augmented QA chain using the "stuff" chain type.
# return_source_documents=True makes the result carry the retrieved chunks
# alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

# Hallucination without RAG

In [25]:
# Ask the bare chat model (no retrieved context) and show only the reply text.
(
    llm
    .invoke("how many parameters in deep-seek v3?")
    .content
)

'DeepSeek V3 has approximately 1.2 billion parameters.'

# RAG

In [23]:
# Same question through the retrieval chain; the input is passed under the
# chain's "query" key, which is also the key echoed back in the result.
qa.invoke({"query": "how many parameters in deep-seek v3?"})

{'query': 'how many parameters in deep-seek v3?',
 'result': 'DeepSeek-V3 comprises 671 billion (671B) total parameters.',
 'source_documents': [Document(id='89061abd-f233-4845-845a-19679fa4bc0d', metadata={'total_pages': 53, 'creator': 'LaTeX with hyperref', 'page': 34, 'creationdate': '2025-02-19T02:11:22+00:00', 'subject': '', 'producer': 'pdfTeX-1.40.25', 'source': 'Data/deepseek-v3.pdf', 'author': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'title': '', 'keywords': '', 'page_label': '35', 'trapped': '/False', 'moddate': '2025-02-19T02:11:22+00:00'}, page_content='DeepSeek-V3 has some limitations, especially on the deployment. Firstly, to ensure efficient\ninference, the recommended deployment unit for DeepSeek-V3 is relatively large, which might\npose a burden for small-sized teams. Secondly, although our deployment strategy for DeepSeek-\nV3 has achieved an end-to-end generation speed of more than two times that

In [19]:
# Query the vector store directly to inspect the 5 chunks it returns for the question.
vectordb.similarity_search(
    k=5,
    query="how many parameters in deep-seek v3?",
)

[Document(id='89061abd-f233-4845-845a-19679fa4bc0d', metadata={'source': 'Data/deepseek-v3.pdf', 'author': '', 'creator': 'LaTeX with hyperref', 'keywords': '', 'total_pages': 53, 'creationdate': '2025-02-19T02:11:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'moddate': '2025-02-19T02:11:22+00:00', 'page': 34, 'trapped': '/False', 'producer': 'pdfTeX-1.40.25', 'title': '', 'page_label': '35'}, page_content='DeepSeek-V3 has some limitations, especially on the deployment. Firstly, to ensure efficient\ninference, the recommended deployment unit for DeepSeek-V3 is relatively large, which might\npose a burden for small-sized teams. Secondly, although our deployment strategy for DeepSeek-\nV3 has achieved an end-to-end generation speed of more than two times that of DeepSeek-V2,\nthere still remains potential for further enhancement. Fortunately, these limitations are expected\nto be naturally addressed 