In [None]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


2.5.1+cu121
12.1
True


In [None]:
FILINGS = [
    {"id":"apple_2023","company":"Apple","year":2023,"path":r"C:\Users\rushy\Downloads\FINBOT\GenAI_FInBot\NOV_2023.pdf"},
    {"id":"apple_2024","company":"Apple","year":2024,"path":r"C:\Users\rushy\Downloads\FINBOT\GenAI_FInBot\NOV_2024.pdf"},
    {"id":"apple_2025","company":"Apple","year":2025,"path":r"C:\Users\rushy\Downloads\FINBOT\GenAI_FInBot\OCT_2025.pdf"},
]

In [3]:
import fitz
import os
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    pages = []
    for i,page in enumerate(doc):
        pages.append({"page":i+1,"text":page.get_text("text")})
    return pages


In [4]:
import re

def chunk_pages(pages,chunk_chars=1200,overlap=200):
    chunks = []
    for page in pages:
        text = re.sub(r"\s+"," ",page['text']).strip()
        start = 0
        while start < len(text):
            end = min(len(text),start+chunk_chars)
            chunk_text = text[start:end]
            chunks.append({
                "text": chunk_text,
                "page_start":page["page"],
                "page_end":page["page"],
            })
            start = max(end-overlap,end)
    return chunks

In [None]:
# Install sentence-transformers (run once)
%pip install sentence-transformers -q


In [48]:
import os
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid

load_dotenv()

# ============================================================
# GPU-accelerated embeddings using sentence-transformers
# Model: all-MiniLM-L6-v2 (384 dimensions, very fast)
# ============================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"ðŸš€ Using device: {device}")

# Load the embedding model on GPU
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
print(f"âœ“ Loaded all-MiniLM-L6-v2 on {device}")

def get_embeddings(texts):
    """Get embeddings using local GPU-accelerated sentence-transformers."""
    # encode() returns numpy arrays, convert to list for ChromaDB
    embeddings = embedding_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    return embeddings.tolist()

# Initialize ChromaDB
client = chromadb.Client(Settings(persist_directory="./chroma_db"))

# Delete existing collection (dimension changed: Gemini=768 â†’ MiniLM=384)
try:
    client.delete_collection("filings")
    print("Deleted old collection to handle embedding dimension change")
except:
    pass

collection = client.get_or_create_collection("filings")

def ingest_filings(filings, batch_size=64):  # Larger batch OK for local GPU
    for filing in filings:
        pages = extract_text_from_pdf(filing["path"])
        chunks = chunk_pages(pages)

        documents = [ch["text"] for ch in chunks]
        metadatas = [{
            "filing_id": filing["id"],
            "company": filing["company"],
            "year": filing["year"],
            "page_start": ch["page_start"],
            "page_end": ch["page_end"],
        } for ch in chunks]

        ids = [str(uuid.uuid4()) for _ in chunks]

        for i in range(0, len(documents), batch_size):
            docs_b  = documents[i:i+batch_size]
            metas_b = metadatas[i:i+batch_size]
            ids_b   = ids[i:i+batch_size]

            # Local GPU embeddings - much faster than API calls!
            embeddings_b = get_embeddings(docs_b)

            collection.add(
                documents=docs_b,
                metadatas=metas_b,
                ids=ids_b,
                embeddings=embeddings_b
            )

        print(f"[OK] {filing['id']} â†’ {len(documents)} chunks")

ingest_filings(FILINGS)

ðŸš€ Using device: cuda
âœ“ Loaded all-MiniLM-L6-v2 on cuda
Deleted old collection to handle embedding dimension change


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  3.20it/s]
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  9.10it/s]
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  8.96it/s]
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  7.98it/s]
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 43.25it/s]

[OK] apple_2023 â†’ 262 chunks





In [51]:
import os
from dotenv import load_dotenv
from google import genai
from langchain_google_genai import ChatGoogleGenerativeAI

# Load API key from .env file
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

gemini_client = genai.Client(api_key=GEMINI_API_KEY)


In [52]:
# List available models
for model in gemini_client.models.list():
    print(model.name)


models/embedding-gecko-001
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-flash-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/g

In [53]:
SYSTEM_INSTRUCTIONS = """
You are a financial assistant. Answer ONLY using the provided filing snippets.
Cite sources as (filing_id, year, pages X-Y). If information is not present, say "I don't know based on the filings."
Keep answers concise.
"""

def retrieve(query, k=6, filter_by=None):
    # filter_by example: {"year": 2023} or {"filing_id": "apple_2023"}
    # Use local GPU embeddings for the query (same model used during ingestion)
    query_embedding = get_embeddings([query])[0]
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        where=filter_by if filter_by else None
    )
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    snippets = []
    for d, m in zip(docs, metas):
        cite = f"{m['filing_id']} ({m['year']}), pages {m['page_start']}-{m['page_end']}"
        snippets.append(f"[{cite}] {d}")
    context = "\n\n".join(snippets)
    return context

def build_prompt(query, context):
    return f"{SYSTEM_INSTRUCTIONS}\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"

In [60]:
def answer(query, filter_by=None):
    context = retrieve(query, k=6, filter_by=filter_by)
    prompt = build_prompt(query, context)
    resp = gemini_client.models.generate_content(
        model="gemini-2.5-pro",
        contents=[{"role": "user", "parts": [{"text": prompt}]}]
    )
    return resp.text

print(answer("what is the Proceeds from commercial paper in 2023?"))

Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 76.89it/s]


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.5-pro\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.5-pro\nPlease retry in 40.634288827s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-pro', 'location': 'global'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerDay-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '40s'}]}}