In [None]:
!pip install langchain chromadb sentence-transformers pypdf langchain-community google-generativeai langchain_chroma -qU


In [None]:
import os
import chromadb
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain.embeddings.base import Embeddings
from langchain_community.document_loaders import (
    TextLoader,
    Docx2txtLoader,
    PyPDFLoader
)

In [None]:


def load_documents(file_path: str):
    """
    Read a PDF, DOCX, or TXT file and return its contents as LangChain Documents.

    The loader is picked from the (case-insensitive) file extension, run
    immediately, and a one-line summary is printed.

    Args:
        file_path (str): Path to the file to load.

    Returns:
        list[Document]: The documents produced by the selected loader.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the extension is not .pdf, .docx, or .txt.
    """
    # Guard: the path must exist before we look at its type.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Normalise the extension so ".PDF" and ".pdf" are treated alike.
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard: reject anything we have no loader for.
    if suffix not in (".pdf", ".docx", ".txt"):
        raise ValueError(f"Unsupported file type: {suffix}")

    # Pick the loader class that matches the extension.
    if suffix == ".pdf":
        loader_cls = PyPDFLoader
    elif suffix == ".docx":
        loader_cls = Docx2txtLoader
    else:
        loader_cls = TextLoader

    # Run the loader and report how many Document objects came back.
    documents = loader_cls(file_path).load()
    print(f"âœ… Loaded {len(documents)} document(s) from {file_path}")

    return documents


# Choose your file
pdf_path = "/content/Data Science with Generative AI outline.pdf"

# load_documents() already runs the loader and returns the list of Document
# objects, so its result is used directly. (The previous code called .load()
# on that returned list, which raises AttributeError.)
documents = load_documents(pdf_path)

print(f"âœ… Loaded {len(documents)} pages from {pdf_path}")


In [None]:
# Cut the loaded documents into chunks of up to 500 characters, with 100
# characters of overlap between neighbouring chunks so context carries across
# chunk boundaries. Length is measured with the built-in len().
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, length_function=len)

# `chunks` is consumed by the vector-store cell further down.
chunks = splitter.split_documents(documents)

print(f"âœ… Split into {len(chunks)} text chunks")


In [None]:
# Sentence-transformers model shared by embed_text(); it embeds both the
# document chunks and the user queries (via HFEmbedding below).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text(texts):
    """Encode ``texts`` with the module-level ``embedding_model``.

    Args:
        texts: The strings to embed, passed straight to ``embedding_model.encode``.

    Returns:
        The encoder's output converted with ``.tolist()``.
    """
    vectors = embedding_model.encode(texts)
    return vectors.tolist()

from langchain.embeddings.base import Embeddings

class HFEmbedding(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``embed_text``."""

    def embed_documents(self, texts):
        """Return one embedding per entry in ``texts``."""
        return embed_text(texts)

    def embed_query(self, text):
        """Return the embedding for a single query string."""
        # Embed the query as a one-element batch and unwrap the only result.
        return self.embed_documents([text])[0]


In [None]:
import hashlib

from chromadb.config import Settings  # no longer used below; kept in case other cells reference it

# chromadb.Client(Settings(persist_directory=...)) creates an in-memory client:
# persistence is off unless explicitly enabled, so ./rag_db was never written.
# PersistentClient stores the collection on disk at the given path.
client = chromadb.PersistentClient(path="./rag_db")
collection_name = "pdf_knowledge"

vectorstore = Chroma(
    client=client,
    collection_name=collection_name,
    embedding_function=HFEmbedding()
)

# Add chunks to vectorstore
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

# Deterministic ids derived from source, position and content. Because the
# store now survives restarts, random ids would append a fresh copy of every
# chunk on each run; stable ids let a re-run overwrite the same entries instead.
# The position keeps ids unique even if two chunks have identical text.
ids = [
    hashlib.sha256(
        f"{metadata.get('source', '')}:{position}:{text}".encode("utf-8")
    ).hexdigest()
    for position, (text, metadata) in enumerate(zip(texts, metadatas))
]

vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)

print("âœ… ChromaDB populated successfully with PDF chunks!")


In [None]:
# Retrieve the 3 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

import getpass

import google.generativeai as genai

# SECURITY: the API key must never be written into the notebook. A literal key
# was previously committed here; treat it as compromised and revoke it.
# The key is read from the GOOGLE_API_KEY environment variable, and only if it
# is missing is the user prompted (getpass does not echo the input).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def generate_answer(context, question):
    """Ask the Gemini model to answer ``question`` using only ``context``.

    Args:
        context: Text inserted into the prompt as the sole source of truth.
        question: The user's question, appended after the context.

    Returns:
        The ``text`` attribute of the model's response.
    """
    # The prompt layout (including the indentation inside the literal) is
    # what gets sent to the model, so it is kept exactly as-is.
    prompt = f"""Answer the question based only on the following context:
    {context}
    Question: {question}"""

    model = genai.GenerativeModel("gemini-2.5-flash")
    reply = model.generate_content(prompt)
    return reply.text


In [None]:
def ask_question(question):
    """Answer ``question`` from the indexed documents.

    Fetches matching chunks through the module-level ``retriever``, joins
    their text with blank lines, and passes that context together with the
    question to ``generate_answer``.

    Args:
        question: The question to look up and answer.

    Returns:
        Whatever ``generate_answer`` returns for the assembled context.
    """
    retrieved = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in retrieved)
    return generate_answer(context, question)

# Example run: send one question through retrieval + generation and show the reply.
question = "name of some project that will perform ?"
response = ask_question(question=question)
print("ðŸ¤– Answer:", response)
