In [16]:
!pip3 install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [18]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
import os
from google import genai
from google.genai import types
from PyPDF2 import PdfReader



In [19]:
# Sentence-embedding model shared by indexing (load_pdf_to_chroma) and
# retrieval (get_top_chunks); both must use the same model so that stored
# vectors and query vectors are comparable.
embed_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
)

In [20]:
# On-disk Chroma store: data lives under ./chroma_db relative to the
# notebook's working directory.
client_chroma = chromadb.PersistentClient(
    path="./chroma_db",
)
# Single collection holding every document chunk and its embedding.
collection = client_chroma.get_or_create_collection(
    name="docs_embeddings",
)

In [21]:
# Never embed the API key in the notebook: it ends up in version control and
# in every shared copy. Export GEMINI_API_KEY in the environment that starts
# the kernel instead. A missing variable raises KeyError here, which is
# preferable to a confusing authentication failure later.
# NOTE(review): the key previously committed in this cell should be revoked.
gemini_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

In [22]:
# In-memory conversation log: a list of {"role": ..., "content": ...} dicts,
# appended to by add_to_history. Re-running this cell clears the conversation.
chat_history = list()

In [23]:
def load_pdf_to_chroma(pdf_path: str, chunk_size: int = 500, overlap: int = 50):
    """Load a PDF, split its text into chunks, embed them and store them in ChromaDB.

    Args:
        pdf_path: Path of the PDF file to index.
        chunk_size: Maximum chunk size handed to the text splitter.
        overlap: Overlap between consecutive chunks handed to the text splitter.

    Side effects:
        Adds one record per chunk to the module-level ``collection`` using the
        ids ``doc_0``, ``doc_1``, ... and prints a summary line.

    Note:
        Ids are positional and not namespaced by file, so chunks of a second
        PDF reuse the ids of the first one.
        NOTE(review): confirm how the collection should treat duplicate ids
        before indexing more than one document.
    """
    # Open the file the caller asked for (previously a hardcoded path was
    # used and ``pdf_path`` was silently ignored).
    reader = PdfReader(pdf_path)

    # A page with no extractable text (e.g. a scanned image) must not break
    # the concatenation, so fall back to an empty string.
    raw_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap
    )
    chunks = splitter.split_text(raw_text)

    # Embed and store in Chroma
    embeddings = embed_model.encode(chunks, convert_to_numpy=True).tolist()
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        collection.add(
            ids=[f"doc_{i}"],
            documents=[chunk],
            embeddings=[emb]
        )
    print(f"✅ Loaded {len(chunks)} chunks into ChromaDB")

In [24]:
def add_to_history(role: str, content: str):
    """Append one message to the module-level ``chat_history`` list.

    Args:
        role: Speaker label stored under the ``"role"`` key.
        content: Message text stored under the ``"content"`` key.
    """
    entry = dict(role=role, content=content)
    chat_history.append(entry)

In [25]:
def get_history_context(max_turns: int = 3) -> str:
    """Render the most recent conversation turns as a plain-text transcript.

    A turn is counted as two messages (one user, one assistant), so the last
    ``max_turns * 2`` entries of ``chat_history`` are used.

    Args:
        max_turns: Number of most recent turns to include. Zero or a negative
            value yields an empty transcript.

    Returns:
        One ``"Role: content"`` line per message, joined with newlines; the
        empty string when there is nothing to include.
    """
    # Guard needed because chat_history[-0:] is the *whole* list, not an
    # empty one, and a negative count would slice from the front.
    if max_turns <= 0:
        return ""
    recent = chat_history[-max_turns * 2:]
    return "\n".join(f"{msg['role'].capitalize()}: {msg['content']}" for msg in recent)

In [26]:
def get_top_chunks(query: str, top_k: int = 2):
    """Return the stored document chunks that best match ``query``.

    The query is embedded with the module-level ``embed_model`` and looked up
    in the module-level Chroma ``collection``.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the collection.

    Returns:
        The documents matched for the query, or an empty list when the
        collection returns no ``"documents"`` payload.
    """
    query_vectors = embed_model.encode([query], convert_to_numpy=True).tolist()
    hits = collection.query(query_embeddings=query_vectors, n_results=top_k)

    documents = hits["documents"]
    if not documents:
        return []
    # One result list per query embedding; only a single query was sent.
    return documents[0]


In [27]:
def rephrase_with_gemini(query: str, retrieved_chunks: list, history: str = "") -> str:
    """Ask Gemini to answer ``query`` using the retrieved chunks and chat history.

    Args:
        query: The user's question.
        retrieved_chunks: Document chunks to present as context; they are
            joined with newlines into the prompt.
        history: Pre-rendered conversation transcript (may be empty).

    Returns:
        The concatenated text parts of the first response candidate with
        surrounding whitespace stripped, or the empty string when the
        response carries no candidate, content or text parts.
    """
    context = "\n".join(retrieved_chunks)
    user_prompt = f"""
You are a helpful assistant.

Chat history (for context):
{history}

User question:
{query}

Relevant context from documents:
{context}

Task: Provide a clear, concise, user-friendly answer. Rephrase in your own words, don't copy chunks verbatim.
"""

    response = gemini_client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[types.Content(role="user", parts=[types.Part(text=user_prompt)])]
    )

    # A response may come back without candidates, or with a candidate that
    # has no content/parts (e.g. a blocked prompt). Indexing blindly would
    # raise IndexError/TypeError/AttributeError, so each level is checked.
    candidates = getattr(response, "candidates", None) or []
    if not candidates:
        return ""
    content = getattr(candidates[0], "content", None)
    parts = getattr(content, "parts", None) or []

    # Non-text parts have a falsy ``text`` and are skipped.
    rephrased = "".join(p.text for p in parts if p.text)
    return rephrased.strip()


In [28]:
def answer_query(user_query: str, top_k: int = 2) -> str:
    """Answer one user question and record the exchange in the chat history.

    Retrieves the ``top_k`` best-matching chunks, renders the last three
    turns of history, asks Gemini for an answer, then appends both the
    question and the answer to the history.

    Args:
        user_query: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The assistant's answer text.
    """
    retrieved = get_top_chunks(user_query, top_k=top_k)
    # History is rendered before the current question is recorded, so the
    # prompt does not contain the question twice.
    prior_turns = get_history_context(max_turns=3)
    reply = rephrase_with_gemini(user_query, retrieved, prior_turns)

    for speaker, text in (("user", user_query), ("assistant", reply)):
        add_to_history(speaker, text)
    return reply

In [33]:
if __name__ == "__main__":
    # Load your PDF first (run only once per PDF).
    # The path must be absolute (leading "/"); without it the path is
    # resolved relative to the notebook's working directory.
    load_pdf_to_chroma("/Users/vrushalipatil/Downloads/Revati_Resume.pdf")

    # Ask questions
    q1 = "Who is narendra modi?"
    print("User:", q1)
    a1 = answer_query(q1)
    print("Assistant:", a1, "\n")

    # Follow-up question that can be answered from the indexed document.
    q2 = "give me the email?"
    print("User:", q2)
    a2 = answer_query(q2)
    print("Assistant:", a2, "\n")

    # Print chat history
    print("\n📜 Chat History:")
    for turn in chat_history:
        print(f"{turn['role'].capitalize()}: {turn['content']}")


✅ Loaded 6 chunks into ChromaDB
User: Who is narendra modi?
Assistant: I'm sorry, but I don't have information about Narendra Modi in the documents I have access to. My information is limited to Revati Patil and the Machine Learning course taught by Keerti. 

User: give me the email?
Assistant: Revati Patil's email address is revatip290@gmail.com. 


📜 Chat History:
User: What is the main topic of this PDF?
Assistant: The PDF is about a Machine Learning course taught by Keerti. It seems to cover a wide range of topics, from the basics of neural networks to more advanced models like CNNs, RNNs, and transformers. The course is designed to be easy to understand, even for complex topics, and is recommended for anyone interested in AI and ML, regardless of their experience level.
User: Can you explain it in simpler terms?
Assistant: Essentially, the PDF describes a Machine Learning course taught by Keerti that's designed to be easy to grasp, even if you're new to the field. It covers everyt