In [1]:
import csv
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from dotenv import load_dotenv
from tqdm.auto import tqdm

# The notebook may be started from notebooks/ or from the project root.
# Resolve the root either way, make it the working directory, and put it on
# sys.path so the project-local imports below can be found.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == "notebooks" else notebook_dir

os.chdir(project_root)
sys.path.insert(0, str(project_root))

print(f"üìÇ Working directory: {os.getcwd()}")

# Project-local imports: these must stay below the sys.path tweak above.
from src.services.llm_services import (
    load_config,
    get_llm,
    get_text_embeddings,
    validate_api_keys,
    print_config_summary
)

load_dotenv()

# Relative path: resolved against the working directory set above.
config = load_config("src/config/config.yaml")

from utils.llm_client import LLMClient

# Initialize LLM client (LLM Configuration)
# NOTE(review): "openrouter_model" is preferred whenever the key is present,
# regardless of which provider is configured - confirm that is intended.
llm_client_provider = config["llm_provider"].lower()
llm_client_model = config.get("openrouter_model", config.get("llm_model"))
client = LLMClient(provider=llm_client_provider, model=llm_client_model)

# Report which API keys were found
api_key_status = validate_api_keys(config, verbose=True)
print(api_key_status)

# Print summary
print_config_summary(config)

# Ensure output directory exists
Path(config["output_dir"]).mkdir(exist_ok=True, parents=True)

üìÇ Working directory: d:\Bootcamps\AEE 2026 Jan\Mini Projects\Operation Ledger Mind
{'OPENAI_API_KEY': True, 'OPENROUTER_API_KEY': True, 'GROQ_API_KEY': False, 'GOOGLE_API_KEY': False, 'COHERE_API_KEY': False}
‚úÖ Config loaded:
  LLM: openai / gpt-4o-mini
  Embeddings: sbert / sentence-transformers/all-MiniLM-L6-v2
  Temperature: 0.2
  Artifacts: ./artifacts




In [2]:
# Initialize LLM, Embeddings, and Reranker
from sentence_transformers import CrossEncoder

# Cross-encoder checkpoint used for reranking (trained on the MS MARCO dataset).
# Other options: "cross-encoder/ms-marco-TinyBERT-L-2-v2" (faster)
#                "cross-encoder/ms-marco-MiniLM-L-12-v2" (more accurate)
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

llm = get_llm(config)
embeddings = get_text_embeddings(config)

# CrossEncoder: a reranker that scores query-document pairs.
# Unlike bi-encoders (embeddings), it sees the query AND the document together,
# which gives higher accuracy but is slower (embeddings can't be pre-computed).
reranker = CrossEncoder(RERANKER_MODEL_NAME)

print(f"‚úÖ LLM: {config['llm_provider']} / {config['llm_model']}")
print(f"‚úÖ Embeddings: {config['text_emb_model']}")
print(f"‚úÖ Reranker: {RERANKER_MODEL_NAME}")

# Smoke-test the LLM with a single completion so a bad key fails here
print("\nüîç Testing LLM API connection...")
try:
    test_response = llm.invoke("Say 'API working!' if you can read this.")
    # Some clients return a message object with .content, others a plain value
    if hasattr(test_response, 'content'):
        test_msg = test_response.content
    else:
        test_msg = str(test_response)
    print(f"‚úÖ LLM API verified: {test_msg[:50]}")
except Exception as e:
    print(f"‚ùå LLM API test failed: {e}")
    print("‚ö†Ô∏è  Please check your .env file and API key configuration.")


  return HuggingFaceEmbeddings(


‚úÖ LLM: openai / gpt-4o-mini
‚úÖ Embeddings: sentence-transformers/all-MiniLM-L6-v2
‚úÖ Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2

üîç Testing LLM API connection...
‚úÖ LLM API verified: API working!


PDF Load + Clean

In [3]:
from pypdf import PdfReader

def load_and_clean_pdf(pdf_path: str) -> str:
    """
    Load a PDF and return its text with boilerplate header/footer strings removed.

    For every page the text is extracted, the literal strings
    "Uber Technologies, Inc." and "2024 Annual Report" are deleted, and all
    whitespace runs (including newlines) are collapsed to single spaces.
    Pages are then joined with a blank line between them. A page that yields
    no text contributes an empty string.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Cleaned text content of the whole document
    """
    # Literal strings stripped from every page. Every occurrence is removed,
    # not only those in headers/footers. "Form 10-K" is intentionally absent
    # from this tuple so that it is kept in the text.
    boilerplate_phrases = ("Uber Technologies, Inc.", "2024 Annual Report")

    reader = PdfReader(pdf_path)

    print(f"üìÑ Loading PDF: {pdf_path}")
    print(f"    Total pages: {len(reader.pages)}")

    pages = []
    for page in reader.pages:
        # Fall back to "" so a page without extractable text cannot break
        # the string operations below.
        text = page.extract_text() or ""

        for phrase in boilerplate_phrases:
            text = text.replace(phrase, "")

        # Remove excessive whitespace
        pages.append(" ".join(text.split()))

    full_text = "\n\n".join(pages)
    print(f"‚úÖ Extracted {len(full_text):,} characters from {len(pages)} pages")

    return full_text

# Read and clean the annual report from the configured data directory
document_text = load_and_clean_pdf(
    f'{config["data_root"]}/Uber_annual_report_2024.pdf'
)

üìÑ Loading PDF: ./data/raw/Uber_annual_report_2024.pdf
    Total pages: 142
‚úÖ Extracted 624,112 characters from 142 pages


Chunking (1,500-character chunks)

Use fixed-size chunks.

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rpds import List

def create_chunks(text: str, chunk_size: int = 1500, chunk_overlap: int = 200) -> list:
    """
    Split document text into overlapping chunks and print size statistics.

    Splitting uses RecursiveCharacterTextSplitter with the separators
    "\\n\\n", "\\n", ". ", " " and "" (in that order).

    Args:
        text: Full document text
        chunk_size: Target size of each chunk (defaults to 1500)
        chunk_overlap: Overlap between consecutive chunks to maintain context
            (defaults to 200)

    Returns:
        List of text chunks; empty if the splitter produces none
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    chunks = text_splitter.split_text(text)

    print(f"Chunking Results:")
    print(f"\tTotal chunks: {len(chunks)}")

    if not chunks:
        # Nothing to summarise: the average would divide by zero and
        # min()/max() would raise on an empty sequence.
        return chunks

    chunk_lengths = [len(c) for c in chunks]
    print(f"\tAvg chunk size: {sum(chunk_lengths) // len(chunk_lengths)} chars")
    print(f"\tMin/Max: {min(chunk_lengths)} / {max(chunk_lengths)} chars")

    return chunks

# Chunk the document using the sizes from the loaded config
chunks = create_chunks(
    document_text,
    chunk_size=config["chunk_size"],
    chunk_overlap=config["chunk_overlap"],
)

# Show the first chunk as a quick sanity check
print(f"\nüìù Sample Chunk:")
sample_chunk = chunks[0]
print(f"   Length: {len(sample_chunk)} chars")
print(f"   Preview: {sample_chunk[:300]}...")

Chunking Results:
	Total chunks: 544
	Avg chunk size: 1209 chars
	Min/Max: 25 / 1500 chars

üìù Sample Chunk:
   Length: 610 chars
   Preview: On Our Way 2024 ANNUAL REPORT

Uber‚Äôs Mission We reimagine the way the world moves for the better We are Uber. The go-getters. The kind of people who are relentless about our mission to help people go anywhere and get anything and earn their way. Movement is what we power. It‚Äôs our lifeblood. It run...


Master prompt

In [5]:
# STEP A: Question Generation Prompt (LLM A)
# str.format template: {chunk} is the only placeholder. The doubled braces in
# the JSON example are format escapes that render as literal { and }.
# The prompt asks for 10 questions (4 Hard Facts / 3 Strategic Summary /
# 3 Stylistic/Creative) as a JSON array of {"category", "question"} objects.
QUESTION_GENERATION_PROMPT = """You are a financial analyst creating training questions for a model that will answer questions about Uber's 2024 Annual Report.

Based on the following text excerpt, generate EXACTLY 10 diverse questions that cover:

**Category Distribution:**
- 4 Hard Facts questions: Specific numbers, dates, percentages, financial metrics, concrete data points
- 3 Strategic Summary questions: Business strategies, competitive advantages, risk factors, market analysis
- 3 Stylistic/Creative questions: Trends analysis, future implications, comparative insights, interpretations

**Context:**
{chunk}

**Instructions:**
1. Questions must be answerable ONLY from the provided context
2. Be specific - reference exact metrics, dates, or concepts from the text
3. Vary question complexity and depth
4. Use natural language (avoid overly formal phrasing)

**Output Format (JSON):**
[
  {{"category": "Hard Facts", "question": "What was Uber's total revenue in Q4 2024?"}},
  {{"category": "Strategic Summary", "question": "How is Uber addressing regulatory challenges in European markets?"}},
  {{"category": "Stylistic/Creative", "question": "What does the shift toward autonomous vehicles suggest about Uber's long-term vision?"}}
]

Generate the 10 questions now:"""

# STEP B: Answer Generation Prompt (LLM B)
# str.format template with two placeholders: {chunk} (the source text) and
# {question} (a single question string). It asks for a free-text answer
# grounded only in the supplied context.
# NOTE(review): instruction 3 only gives an opening phrase for the
# "not enough information" case - confirm this wording is intended.
ANSWER_GENERATION_PROMPT = """You are an expert financial analyst providing precise answers based on Uber's 2024 Annual Report.

**Context:**
{chunk}

**Question:**
{question}

**Instructions:**
1. Answer ONLY based on the information in the context above
2. Be specific - cite exact numbers, dates, and details when available
3. If the context doesn't contain enough information, say "Based on the provided context..."
4. For Hard Facts: Provide concise, data-driven answers
5. For Strategic/Creative: Provide thoughtful analysis while staying grounded in the text
6. Keep answers between 2-5 sentences (50-150 words)

Provide your answer:"""

Generate Q/A (LLM A → LLM B)

In [6]:
def generate_questions(chunk: str) -> list[Dict[str, str]]:
    """
    Ask the LLM client for questions about one chunk and keep the well-formed ones.

    The chunk is inserted into QUESTION_GENERATION_PROMPT and sent through
    ``client.chat`` with ``expect_json=True``. An item is kept when it is a
    dict containing both a "question" and a "category" key.

    Args:
        chunk: Text excerpt the questions should be based on

    Returns:
        At most 10 question dicts. An empty list is returned when the client
        reports a parse error, returns no parsed payload, or the payload is
        not a list.
    """
    response = client.chat(
        messages=[
            {"role": "user", "content": QUESTION_GENERATION_PROMPT.format(chunk=chunk)}
        ],
        expect_json=True,           # ask the client to parse the reply as JSON
        temperature=0.7,
        max_tokens=1200,
    )

    # Failure paths: log what is available and give up on this chunk
    parse_error = response.get("parse_error")
    if parse_error:
        print(f"‚ö†Ô∏è JSON parse error in chunk: {parse_error}")
        print(f"Raw text snippet: {response['text'][:400]}...\n")
        return []

    parsed = response.get("parsed")

    if parsed is None:
        print("‚ö†Ô∏è No parsed JSON returned")
        print(f"Raw text: {response['text'][:400]}...\n")
        return []

    if not isinstance(parsed, list):
        print(f"‚ö†Ô∏è Parsed content is not a list: {type(parsed)}")
        return []

    # Keep only items that carry both required keys
    valid_questions = []
    for item in parsed:
        has_required_keys = (
            isinstance(item, dict) and "question" in item and "category" in item
        )
        if not has_required_keys:
            print(f"‚ö†Ô∏è Invalid question format: {item}")
            continue
        valid_questions.append(item)

    # The count is reported before the list is capped at 10
    print(f"Generated {len(valid_questions)} valid questions")
    return valid_questions[:10]

def generate_answer(chunk: str, question_data: Dict[str, str]) -> str:
    """
    Generate a free-text answer to one question using LLM B.

    Args:
        chunk: Context the answer must be based on
        question_data: Question dict; only its "question" entry is read

    Returns:
        The stripped "text" field of the client response, or a fixed fallback
        sentence when that text is empty.
    """
    user_message = {
        "role": "user",
        "content": ANSWER_GENERATION_PROMPT.format(
            chunk=chunk,
            question=question_data["question"],
        ),
    }

    response = client.chat(
        messages=[user_message],
        expect_json=False,           # answers are free text, not JSON
        temperature=0.3,             # lower for factual answers
        max_tokens=400,
    )

    # A parse error is only reported; the raw text is used either way
    if response.get("parse_error"):
        print("Warning: parse error in answer - using raw text")

    answer_text = response.get("text", "").strip()

    if answer_text:
        return answer_text
    return "No answer could be generated from the provided context."

def process_all_chunks(
    chunks: list[str],
    questions_per_chunk: int = 10,
    max_chunks: Optional[int] = 20,
) -> list[Dict]:
    """
    Main generation loop: process chunks and create Q/A pairs.

    For each processed chunk, questions are generated with
    ``generate_questions`` and each of the first ``questions_per_chunk``
    questions is answered with ``generate_answer``. Failures are printed and
    skipped so that one bad chunk or question does not abort the run.

    Args:
        chunks: Text chunks to generate Q/A pairs from
        questions_per_chunk: Maximum number of questions answered per chunk
        max_chunks: Number of leading chunks to process. Defaults to 20
            (demo-sized run); pass None to process every chunk.

    Returns:
        List of dicts with the keys "chunk_id", "category", "question",
        "answer" and "context".

    Raises:
        ValueError: If ``max_chunks`` is negative.
    """
    if max_chunks is not None and max_chunks < 0:
        raise ValueError(f"max_chunks must be >= 0 or None, got {max_chunks}")

    all_qa_pairs = []

    # A slice bound of None selects the full list
    chunks_to_process = chunks[:max_chunks]

    print(f"\nüîÑ Processing {len(chunks_to_process)} chunks...\n")

    for chunk_idx, chunk in enumerate(tqdm(chunks_to_process, desc="Generating Q/A pairs")):
        try:
            # Step A: Generate questions
            questions = generate_questions(chunk)

            if not questions:
                print(f"‚ö†Ô∏è Skipping chunk {chunk_idx} - no questions generated")
                continue

            # Step B: Generate answers for each question
            for q_data in questions[:questions_per_chunk]:
                try:
                    answer = generate_answer(chunk, q_data)

                    all_qa_pairs.append({
                        "chunk_id": chunk_idx,
                        "category": q_data.get("category", "Unknown"),
                        "question": q_data["question"],
                        "answer": answer,
                        "context": chunk
                    })
                except Exception as e:
                    # Best effort: drop this question, keep the rest of the chunk
                    print(f"‚ö†Ô∏è Error generating answer for chunk {chunk_idx}: {e}")

            # Rate limiting (only reached when the chunk produced questions)
            time.sleep(0.5)

        except Exception as e:
            # Best effort: drop this chunk and move on to the next one
            print(f"‚ö†Ô∏è Error processing chunk {chunk_idx}: {e}")

    print(f"\n‚úÖ Generated {len(all_qa_pairs)} Q/A pairs from {len(chunks_to_process)} chunks")

    # Display statistics
    categories = {}
    for pair in all_qa_pairs:
        cat = pair["category"]
        categories[cat] = categories.get(cat, 0) + 1

    print(f"\nüìä Category Distribution:")
    # The loop body only runs when there is at least one pair, so the
    # division below cannot divide by zero.
    for cat, count in categories.items():
        print(f"   {cat}: {count} ({count/len(all_qa_pairs)*100:.1f}%)")

    return all_qa_pairs

# Kick off Q/A generation, answering up to 10 questions per chunk
qa_pairs = process_all_chunks(chunks, questions_per_chunk=10)


üîÑ Processing 20 chunks...



Generating Q/A pairs:   0%|          | 0/20 [00:00<?, ?it/s]

Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions
Generated 10 valid questions

‚úÖ Generated 200 Q/A pairs from 20 chunks

üìä Category Distribution:
   Hard Facts: 80 (40.0%)
   Strategic Summary: 60 (30.0%)
   Stylistic/Creative: 60 (30.0%)


In [7]:
# Save all generated Q/A pairs to a CSV file under config["generated_dir"]
output_file = Path(config["generated_dir"]) / "generated_data.csv"

# Column order of the CSV; matches the keys of each Q/A pair dict
fieldnames = [
    "chunk_id",
    "category",
    "question",
    "answer",
    "context"
]

try:
    # The target directory may not exist yet (e.g. on a fresh checkout);
    # create it so the write does not fail.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)

        # Header row followed by one row per Q/A pair
        writer.writeheader()
        writer.writerows(qa_pairs)

    print(f"\n‚úÖ Successfully saved {len(qa_pairs)} Q/A pairs to {output_file}")

except Exception as e:
    # Best effort: report the failure but keep the in-memory data usable
    print(f"\n‚ùå Failed to save CSV: {e}")

# Optional: also show a quick preview of first few rows
if qa_pairs:
    print("\nFirst 3 saved entries:")
    for pair in qa_pairs[:3]:
        print(f"  - {pair['category']} | Q: {pair['question'][:80]}...")


‚úÖ Successfully saved 200 Q/A pairs to data\generated\generated_data.csv

First 3 saved entries:
  - Hard Facts | Q: What is Uber's mission statement as outlined in the 2024 Annual Report?...
  - Hard Facts | Q: What key performance indicator is described as Uber's 'lifeblood'?...
  - Hard Facts | Q: In what year did Uber aim to reimagine the way the world moves according to thei...


Train / Test Split

In [8]:
from sklearn.model_selection import train_test_split

def save_jsonl(data: list[Dict], filepath: str):
    """
    Save data in JSONL format (one JSON object per line).

    The parent directory is created if it does not exist. An existing file
    is overwritten. Non-ASCII characters are written as-is (UTF-8), not
    escaped.

    Args:
        data: Items to serialise; each must be JSON-serialisable
        filepath: Destination file path
    """
    target = Path(filepath)
    # Create missing parent directories so the open() below cannot fail
    # merely because the output folder has not been created yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)
    print(f"‚úÖ Saved {len(data)} items to {filepath}")

# Split the Q/A pairs into train/test sets according to config["train_ratio"]
# NOTE(review): the split is per Q/A pair, so pairs generated from the same
# chunk can land in both sets - confirm this is acceptable for the test set.
category_labels = [pair["category"] for pair in qa_pairs]

# Stratified splitting needs at least 2 samples of every category. Categories
# come from LLM output, so a stray label can occur only once; fall back to a
# plain random split in that case instead of crashing.
rare_categories = sorted(
    {label for label in category_labels if category_labels.count(label) < 2},
    key=str,
)
if rare_categories:
    print(
        f"Warning: categories with fewer than 2 samples {rare_categories}; "
        "splitting without stratification"
    )

train_data, test_data = train_test_split(
    qa_pairs,
    # Pass the ratio directly: computing 1 - train_ratio introduces float
    # error (e.g. 1 - 0.7 = 0.30000000000000004) that can shift one sample
    # from the training set to the test set.
    train_size=config["train_ratio"],
    random_state=42,
    stratify=None if rare_categories else category_labels,  # Maintain category distribution
)

print(f"\nüì¶ Dataset Split:")
print(f"   Training set: {len(train_data)} pairs ({len(train_data)/len(qa_pairs)*100:.1f}%)")
print(f"   Test set: {len(test_data)} pairs ({len(test_data)/len(qa_pairs)*100:.1f}%)")

# Save to JSONL files
train_path = Path(config["output_dir"]) / "train.jsonl"
test_path = Path(config["output_dir"]) / "golden_test_set.jsonl"

save_jsonl(train_data, str(train_path))
save_jsonl(test_data, str(test_path))

print(f"\n‚úÖ Data factory complete!")
print(f"   üìÅ Training data: {train_path}")
print(f"   üìÅ Test data: {test_path}")


üì¶ Dataset Split:
   Training set: 160 pairs (80.0%)
   Test set: 40 pairs (20.0%)
‚úÖ Saved 160 items to data\processed\train.jsonl
‚úÖ Saved 40 items to data\processed\golden_test_set.jsonl

‚úÖ Data factory complete!
   üìÅ Training data: data\processed\train.jsonl
   üìÅ Test data: data\processed\golden_test_set.jsonl
