In [None]:
!pip install -q docling chromadb langchain-text-splitters sentence-transformers torch accelerate

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import io
import time
import google.generativeai as genai
from PIL import Image
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions
from langchain_text_splitters import MarkdownHeaderTextSplitter
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import DocItemLabel
from sentence_transformers import SentenceTransformer
import chromadb
import urllib.request

# The Gemini API key is read from the environment rather than being written
# into the notebook: a key stored in source is visible to everyone who can read
# the file (and its history). Any key that was previously committed here should
# be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it as an environment variable "
        "before running this notebook."
    )
genai.configure(api_key=GEMINI_API_KEY)
vision_model = genai.GenerativeModel('gemini-2.5-flash-lite')

# Working directories for the downloaded PDF and the extracted pictures.
os.makedirs("/kaggle/working/data", exist_ok=True)
os.makedirs("/kaggle/working/images", exist_ok=True)

print("Setup Complete.")

In [None]:
pdf_url = "https://ir.tesla.com/_flysystem/s3/sec/000162828024002390/tsla-20231231-gen.pdf"
pdf_path = "/kaggle/working/data/tesla_2023_10k.pdf"

if os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0:
    # Reuse the copy from an earlier run so re-executing the notebook does not
    # fetch the filing again.
    print("PDF already present, skipping download.")
else:
    # Download under a temporary name and rename only after the transfer
    # finished, so an interrupted download never leaves a truncated file at
    # pdf_path for the conversion step to pick up.
    partial_path = pdf_path + ".part"
    urllib.request.urlretrieve(pdf_url, partial_path)
    os.replace(partial_path, pdf_path)
    print("PDF Downloaded successfully!")

In [None]:
# Docling PDF pipeline settings: CUDA accelerator with 4 threads, image scale
# of 2.0, and page images kept on the result (presumably needed by the
# picture-extraction cell further down — confirm).
pipeline_options = PdfPipelineOptions(
    accelerator_options=AcceleratorOptions(num_threads=4, device="cuda"),
    images_scale=2.0,
    generate_page_images=True,
)

# Converter that applies the options above to PDF inputs.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

# Convert the PDF and report the elapsed wall-clock time. perf_counter() is a
# monotonic clock, so the measured duration cannot be skewed by system clock
# adjustments the way time.time() differences can.
start = time.perf_counter()
result = converter.convert(pdf_path)
end = time.perf_counter()
print(f"Conversion Done in {end - start:.2f} seconds!")

markdown_text = result.document.export_to_markdown()

# Write with an explicit UTF-8 encoding: the exported text can contain
# non-ASCII characters, and the platform default encoding is not guaranteed to
# be able to represent them.
with open("/kaggle/working/tesla.md", "w", encoding="utf-8") as f:
    f.write(markdown_text)



In [None]:
# Collects one entry per successfully captioned picture:
# {"text": "[IMAGE SUMMARY]: ...", "metadata": {...}}.
visual_data = []

print("Extracting images and analyzing with Gemini...")
output_dir = "/kaggle/working/images"

# Running count of picture elements. It replaces id(element) in the file name
# so that names are identical from run to run and re-running the cell
# overwrites earlier files instead of accumulating new ones.
picture_index = 0

for element, _level in result.document.iterate_items():
    if element.label != DocItemLabel.PICTURE:
        continue

    picture_index += 1
    page_no = element.prov[0].page_no if element.prov else "unknown"

    # Prefer the image stored on the element; otherwise ask the element to
    # produce one from the document.
    embedded_image = getattr(element, "image", None)
    if embedded_image is not None:
        img = embedded_image.pil_image
    else:
        try:
            img = element.get_image(result.document)
        except Exception as e:
            # Best effort: report and skip this picture. `except Exception`
            # (not a bare except) lets KeyboardInterrupt stop the loop.
            print(f" Could not extract image on page {page_no}: {e}")
            continue

    if img is None:
        continue

    img_filename = f"tesla_p{page_no}_{picture_index}.png"
    img_path = os.path.join(output_dir, img_filename)
    img.save(img_path)

    prompt = f"""Analyze this image from Tesla's 10-K report (Page {page_no}). 
            Provide a structured summary:
            1. What is this chart/image showing?
            2. Exact numbers or data points visible.
            3. The key trend or takeaway.
            Format output clearly."""

    try:
        response = vision_model.generate_content([prompt, img])

        # response.text is read inside the try block so that a failure to
        # obtain the text is reported below instead of aborting the loop.
        visual_data.append({
            "text": f"[IMAGE SUMMARY]: {response.text}",
            "metadata": {
                "page_number": page_no,
                "content_type": "visual",
                "file_name": img_filename,
                "display_mode": "image_only"
            }
        })

        print(f" Analyzed image on page {page_no}")
    except Exception as e:
        # The picture file stays on disk, but no entry is indexed for it.
        print(f" Gemini failed on page {page_no}: {e}")


print(f"Successfully processed {len(visual_data)} images.")



In [None]:
from docling_core.types.doc import PictureItem, TableItem, SectionHeaderItem
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter applied to long non-table elements by the chunking loop
# (chunk_size=1000, chunk_overlap=100, start index tracking enabled).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, add_start_index=True)

# Builds `text_data`: one entry per text/table chunk, each
# {"text": ..., "metadata": {...}}. Pictures are skipped in this loop.
text_data = []
# Most recent heading or long paragraph; passed to Gemini as context for the
# next table.
last_text_context = ""
# Text of the most recent section header; stored in every chunk's metadata.
current_section = "General"

for element, _level in result.document.iterate_items():
    if isinstance(element, PictureItem):
        continue

    page_no = int(element.prov[0].page_no) if element.prov else 0
    is_table = isinstance(element, TableItem)

    if is_table:
        table_markdown = element.export_to_markdown(result.document)

        # NOTE: the em-dash in item 5 was previously stored as mis-encoded
        # bytes; it is restored to the intended character here.
        bridge_prompt = f"""
            Act as a professional Financial Data Indexer. Your goal is to write a high-density keyword summary for a retrieval system (RAG) to index this specific table.
            
            CONTEXT OF THE DOCUMENT:
            "{last_text_context}"
            
            TABLE DATA (MARKDOWN):
            {table_markdown}
            
            TASK:
            Generate a dense, 6-to-7 sentence summary that makes this table highly searchable. 
            Do NOT use generic phrases like "This table shows."
            
            REQUIRED ELEMENTS TO INCLUDE:
            1. THE CORE TOPIC: What specific financial or operational data is being measured?
            2. COLUMN HEADERS: Explicitly list every column header.
            3. ROW ENTITIES: Explicitly list the primary entities or categories in the rows (e.g., specific products, subsidiaries, or dates).
            4. TERMS & STATUSES: Include any specific keywords, metrics, or status labels found in the cells (e.g., "Pilot production", "Adjusted EBITDA", "Restricted").
            5. SYMBOL CLARIFICATION: If the preceding context explains what symbols like dashes (—) or asterisks (*) mean in this table, state that meaning (e.g., "Dashes indicate milestones not yet achieved").
            
            OUTPUT FORMAT:
            "Topic: [Topic]. Headers: [Headers]. Entities/Rows: [Rows]. Key Metrics/Statuses: [Keywords]."
        """
        try:
            table_enrichment = vision_model.generate_content(bridge_prompt).text
            # Fixed pause after each successful call (presumably to stay under
            # the API rate limit — confirm the required spacing).
            time.sleep(7)
        except Exception as e:
            # The exception must be bound to `e`: the message below uses it, and
            # with a bare `except:` this line itself raised NameError, so the
            # fallback summary was never reached.
            print(f" Gemini failed on page {page_no}: {e}")
            table_enrichment = "Tesla Financial Data Table."

        # Summary first, raw table after the separator.
        content = f"{table_enrichment} \n---UI_SEPARATOR---\n {table_markdown}"
        c_type = "table"

    elif isinstance(element, SectionHeaderItem):
        content = f"{'#' * element.level} {element.text}"
        current_section = element.text
        last_text_context = content
        c_type = "text"
    else:
        # Not every document item is guaranteed to expose `.text`; treat a
        # missing or None value as empty so the item is dropped by the length
        # filter below instead of raising.
        content = getattr(element, "text", None) or ""
        if len(content) > 100:
            last_text_context = content
        c_type = "text"

    # Drop empty or very short fragments (fewer than 40 non-blank characters).
    if not content or len(content.strip()) < 40:
        continue

    # Only long non-table content is split; tables are always kept whole.
    # The 1000 threshold matches the splitter's chunk_size.
    needs_split = len(content) > 1000 and not is_table
    pieces = text_splitter.split_text(content) if needs_split else [content]

    for piece in pieces:
        text_data.append({
            "text": piece,
            "metadata": {
                "page_number": page_no,
                "content_type": c_type,
                "section": current_section,
                "file_name": "tesla_2023_10k.pdf",
                "is_sub_chunk": needs_split
            }
        })

print(f"Total chunks created: {len(text_data)}")

In [None]:
# Persist the text and image-summary chunks, with their embeddings, into a
# Chroma collection stored under /kaggle/working/chroma_db.
client = chromadb.PersistentClient(path="/kaggle/working/chroma_db")
collection = client.get_or_create_collection(name="tesla_reports")

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

all_entries = text_data + visual_data

docs = [item["text"] for item in all_entries]
metas = [item["metadata"] for item in all_entries]
# Ids are positional, so the same position maps to the same id on every run.
ids = [f"id_{i}" for i in range(len(all_entries))]

print("Generating embeddings and saving to ChromaDB...")
embeddings = embed_model.encode(docs, show_progress_bar=True).tolist()

# upsert() instead of add(): the collection is persistent and the ids repeat
# between runs, and add() does not replace records whose id already exists, so
# a re-run would keep serving stale content. Writing in fixed-size batches
# keeps each call well below Chroma's per-call batch limit.
# Limitation: records left over from an earlier run with MORE entries than the
# current one are not removed.
UPSERT_BATCH_SIZE = 1000
for batch_start in range(0, len(ids), UPSERT_BATCH_SIZE):
    batch = slice(batch_start, batch_start + UPSERT_BATCH_SIZE)
    collection.upsert(
        documents=docs[batch],
        embeddings=embeddings[batch],
        metadatas=metas[batch],
        ids=ids[batch]
    )

print("Database built successfully!")

In [None]:
import shutil

shutil.make_archive("/kaggle/working/financial_rag_data", 'zip', "/kaggle/working/", "chroma_db")
!zip -ur /kaggle/working/financial_rag_data.zip /kaggle/working/images /kaggle/working/data/tesla_2023_10k.pdf

from IPython.display import FileLink
FileLink(r'financial_rag_data.zip')