## importing chromadb library 
import chromadb

In [7]:
import chromadb
import uuid
client = chromadb.Client()

## creating collection 

In [8]:
collection_name = "notes"

# Check if a collection with this name already exists
existing_collections = client.list_collections()
existing_names = [c.name for c in existing_collections]

if collection_name in existing_names:
    collection = client.get_collection(name=collection_name)
    print(f"Collection '{collection_name}' already exists. Using existing collection.")
else:
    collection = client.create_collection(name=collection_name)
    print(f"Collection '{collection_name}' created successfully!")

Collection 'notes' created successfully!


In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load the full notes from file
with open("Demystifying The Myth.txt", "r", encoding="utf-8") as f:
    note_text = f.read()

# 2. Split into paragraph-level chunks (separated by blank lines)
raw_chunks = [p.strip() for p in note_text.split("\n\n") if p.strip()]
print(f"Total paragraphs (chunks) found: {len(raw_chunks)}")

# 3. Initialize the embedding model (once per notebook)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 4. Generate UUID ids for each chunk
ids = [str(uuid.uuid4()) for _ in raw_chunks]

# 5. Embed each chunk
embeddings = model.encode(raw_chunks).tolist()  # list of [dim]-vectors

# 6. Add all chunks to the 'notes' collection
collection.add(
    ids=ids,
    documents=raw_chunks,
    embeddings=embeddings,
    metadatas=[{"source": "Demystifying The Myth.txt", "type": "paragraph"} for _ in raw_chunks],
)

print("Paragraphs added to Chroma collection.")

Total paragraphs (chunks) found: 10
Paragraphs added to Chroma collection.


In [None]:
# Query the notes collection with a natural language question
query_text = "What is AI?"

# Embed the query
query_embedding = model.encode([query_text]).tolist()

# Retrieve the top-k most similar paragraphs from Chroma
results = collection.query(
    query_embeddings=query_embedding,
    n_results=5,
)

# Pretty-print the results
for i, (doc, meta, dist) in enumerate(zip(results["documents"][0], results["metadatas"][0], results["distances"][0])):
    print(f"Result {i+1}")
    print("Distance:", dist)
    print("Metadata:", meta)
    print("Document:\n", doc)
    print("-" * 80)

Result 1
Distance: 0.481627494096756
Metadata: {'source': 'Demystifying The Myth.txt', 'type': 'paragraph'}
Document:
 What is AI?
Before discussing intelligence, we must first understand AI. AI can be described as the field of computer science that seeks to build systems that mimic human intelligence (Russell, 2010). Mimicking human intelligence will mean thinking and acting rationally, which are components of human intelligence that also expose AI’s limitations to some extent.
--------------------------------------------------------------------------------
Result 2
Distance: 0.481627494096756
Metadata: {'type': 'paragraph', 'source': 'Demystifying The Myth.txt'}
Document:
 What is AI?
Before discussing intelligence, we must first understand AI. AI can be described as the field of computer science that seeks to build systems that mimic human intelligence (Russell, 2010). Mimicking human intelligence will mean thinking and acting rationally, which are components of human intelligence t

In [16]:
# Configure Gemini LLM using API key from .env
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not set in .env. Please edit .env and set GEMINI_API_KEY=your_actual_key.")

# Configure Gemini client
genai.configure(api_key=api_key)

# Use the specified Gemini model
llm = genai.GenerativeModel("gemini-2.5-flash")
print("Gemini LLM (gemini-2.5-flash) initialized.")

Gemini LLM (gemini-2.5-flash) initialized.


In [None]:
# RAG-style helper: retrieve from Chroma, then answer with Gemini

def answer_notes_question(question: str, k: int = 3) -> str:
    """Retrieve top-k note paragraphs relevant to the question and ask Gemini to answer.

    Assumes:
    - `collection` is a Chroma collection with note paragraphs.
    - `model` is the SentenceTransformer embedding model.
    - `llm` is the configured Gemini GenerativeModel.
    """
    # 1. Embed the question
    query_embedding = model.encode([question]).tolist()

    # 2. Retrieve top-k paragraphs from Chroma
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=k,
    )

    retrieved_docs = results["documents"][0]

    # 3. Build a context string from the retrieved paragraphs
    context = "\n\n".join(retrieved_docs)

    # 4. Construct a prompt for Gemini
    prompt = f"""You are a helpful teaching assistant for a class.
Use ONLY the information in the CONTEXT section below to answer the QUESTION.
If the answer is not clearly present in the context, say that you don't know based on the given notes.

CONTEXT:
{context}

QUESTION:
{question}

Answer clearly and concisely.
"""

    response = llm.generate_content(prompt)

    # `response.text` is usually the main text answer. Fallback to str(response) if needed.
    return getattr(response, "text", str(response))


# Example usage (you can change the question text and re-run):
example_answer = answer_notes_question("What is AI?", k=5)
print(example_answer)

AI can be described as the field of computer science that seeks to build systems that mimic human intelligence (Russell, 2010).
