In [None]:
from google.genai import types

retry_config = types.HttpRetryOptions(
    attempts= 5,
    exp_base = 0.2,
    initial_delay = 0.5,
    http_status_codes = [500, 502, 503, 504],   
)

In [13]:
import os
from dotenv import load_dotenv

load_dotenv()

LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
OPENAI_API_KEY = os.getenv("OPEN_AI_API_KEY")

print("API keys loaded" if LLAMA_CLOUD_API_KEY and GEMINI_API_KEY and OPENAI_API_KEY else "âœ— Missing API keys")

API keys loaded


In [33]:
FILINGS= [
    {"id":"1","company":"Apple","year":"2023","path": r"C:\Users\rushy\Downloads\FINBOT\GenAI_FInBot\NOV_2023.pdf"},
    {"id":"2","company":"Tesla","year":"2023","path": r"C:\Users\rushy\Downloads\FINBOT\GenAI_FInBot\Tesla_2023.pdf"},
]

In [None]:
embedding_models = [
    {
        "provider":"OpenAI",
        "model_name":"text-embedding-3-small",
        "api_key": "OPENAI_API_KEY",
        "dimensions":1536,
    },
    {
        "provider":"Google",
        "model_name": "text-embedding-004",
        "api_key": "GEMINI_API_KEY",
        "dimensions":768,
    },
    {
        "provider":"Local",
        "model_name":"all-MiniLM-L6-v2",
        "api_key":None,
        "dimensions":384,
    },
    {
        "provider":"Local",
        "model_name":"intfloat/e5-large-v2",
        "api_key":None,
        "dimensions":1024,
    },
]

Need to check
1. Latency p90
2. Recall p50
3. Semantic Quality


In [18]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.5.1+cu121
12.1
True


Pipleline Skeleton
1. pages = extract_pages_text(pdf_paths)
2. tables = extract_tables(pdf_paths)
3. docs = build_docs(pages,tables)
4. chunks = chuk_docs(docs)
5. index(chuks,embedding_model_cfg)
6. retrieve(query) -> context
7. generate(context,query)

In [34]:
import fitz
import re
from pathlib import Path

def clean_text(text:str) -> str:
    return re.sub(r"[^a-zA-Z0-9\s]", "", text).strip()


def extract_pages_from_filings(filings):
    all_pages = []

    for filing in filings:
        doc = fitz.open(filing['path'])

        for page_idx,page in enumerate(doc,start=1):
            text = page.get_text("text")
            all_pages.append({
                "filing_id":filing['id'],
                "company": filing['company'],
                "year":filing['year'],
                "page":page_idx,
                "text":clean_text(text),
            })

        doc.close()

    return all_pages



In [40]:
pages = extract_pages_from_filings(FILINGS)

print(pages[100])
print(len(pages))

{'filing_id': '2', 'company': 'Tesla', 'year': '2023', 'page': 21, 'text': 'Our information technology systems or data or those of our service providers or customers or users could be subject to cyber\nattacks or other security incidents which could result in data breaches intellectual property theft claims litigation regulatory \ninvestigations significant liability reputational damage and other adverse consequences \nWe continue to expand our information technology systems as our operations grow such as product data management procurement inventory \nmanagement production planning and execution sales service and logistics dealer management financial tax and regulatory compliance systems \nThis includes the implementation of new internally developed systems and the deployment of such systems in the US and abroad While we maintain \ninformation technology measures designed to protect us against intellectual property theft data breaches sabotage and other external or internal \ncyberatt

In [49]:
import pdfplumber
import pandas as pd


def extract_tables_from_filings(filings,*,max_tables_per_page=None):
    all_tables = []
    for filing in filings:
        pdf_path = filing['path']
        source_name = Path(pdf_path).name

        with pdfplumber.open(pdf_path) as pdf:
            for page_idx,page in enumerate(pdf.pages,start=1):
                tables = page.extract_tables()

                if not tables:
                    continue

                if max_tables_per_page is not None:
                    tables = tables[:max_tables_per_page]

                for t_idx,table in enumerate(tables):
                    df = pd.DataFrame(table)

                    all_tables.append({
                        "filing_id":filing['id'],
                        "company":filing['company'],
                        "year":filing['year'],
                        "source":source_name,
                        "page":page_idx,
                        "table_id":f"{source_name}_p{page_idx}_t{t_idx}",
                        "df": df
                    })
    return all_tables

In [50]:
tables = extract_tables_from_filings(FILINGS)
len(tables)

151

In [56]:
tables[11]['page']

26

In [58]:
tables[10]['df']

Unnamed: 0,0,1,2,3,4,5,6
0,Products,$,108803.0,,"$ 114,728",$,105126.0
1,Services,60345,,,56054,47710,
2,Total gross margin,$,169148.0,,"$ 170,782",$,152836.0


In [55]:
def table_to_text(df):
    df = df.fillna("").astype(str)
    lines = []
    for row in df.values.tolist():
        row = [cell.strip() for cell in row if cell.strip()]
        lines.append(" | ".join(row))

    return "\n".join(lines)



table_text = table_to_text(tables[11]["df"])
print(table_text)


Products | 36.5 | % | 36.3 | % | 35.3 | %
Services | 70.8 | % | 71.7 | % | 69.7 | %
Total gross margin percentage | 44.1 | % | 43.3 | % | 41.8 | %
