# ESG Document Chunking & Retrieval

This notebook demonstrates how to build a document retrieval system for ESG reports using chunking and vector search.

## Setup
First, we'll import the necessary libraries and our custom modules.

In [None]:
import os
import json
import pandas as pd
from pathlib import Path
from IPython.display import display

# Import our custom modules
import sys
sys.path.append('../scripts')
from chunking import ESGDocumentChunker, TextChunk
from embedding import ESGEmbeddingManager, EmbeddedChunk

## 1. Document Chunking

Let's start by loading our processed ESG reports and splitting the first one into chunks.

In [None]:
# Load processed ESG data from Day 1.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON files are conventionally UTF-8).
with open('../data/processed_esg_data.json', 'r', encoding='utf-8') as f:
    esg_data = json.load(f)

# Fail early with a clear message instead of an IndexError on esg_data[0] below.
if not esg_data:
    raise ValueError("processed_esg_data.json contains no documents to chunk")

# Initialize chunker
chunker = ESGDocumentChunker(
    chunk_size=500,      # Target chunk size in words
    chunk_overlap=50,    # Words to overlap between chunks
    min_chunk_size=100   # Minimum chunk size to keep
)

# Process first document as example
doc = esg_data[0]
metadata = {
    'company': doc['company'],
    'year': doc['year'],
    'source': doc['original_file']
}

# Try both chunking methods on the same cleaned text so the counts are comparable
fixed_chunks = chunker.chunk_document(doc['cleaned_text'], metadata, method='fixed')
section_chunks = chunker.chunk_document(doc['cleaned_text'], metadata, method='section')

print(f"Fixed-size chunks: {len(fixed_chunks)}")
print(f"Section-based chunks: {len(section_chunks)}")

# Display example chunks (first 200 characters of the first chunk).
# Guarded so that an empty chunk list prints a notice instead of raising IndexError.
print("\nExample fixed-size chunk:")
if len(fixed_chunks) > 0:
    print(fixed_chunks[0].text[:200], "...")
else:
    print("(no chunks produced)")
print("\nExample section-based chunk:")
if len(section_chunks) > 0:
    print(section_chunks[0].text[:200], "...")
else:
    print("(no chunks produced)")

## Exercise 1: Chunk Size Analysis

Analyze how different chunk sizes affect the number and average length of the resulting text chunks.

In [None]:
# Try different chunk sizes and record, for each, how many chunks come out and
# their average length in whitespace-separated words.
chunk_sizes = [200, 500, 1000]
results = []

for size in chunk_sizes:
    # Use a separate name so the notebook-level `chunker` is not silently
    # replaced by the last experiment (hidden state on re-runs).
    size_chunker = ESGDocumentChunker(chunk_size=size)
    chunks = size_chunker.chunk_document(doc['cleaned_text'], metadata)
    word_counts = [len(c.text.split()) for c in chunks]

    results.append({
        'chunk_size': size,
        'num_chunks': len(word_counts),
        # Guard: an empty chunk list would otherwise raise ZeroDivisionError.
        'avg_length': sum(word_counts) / len(word_counts) if word_counts else 0.0
    })

# Last expression of the cell: rendered as a table by the notebook
pd.DataFrame(results)

## 2. Generating Embeddings

Now let's convert our chunks into vector embeddings for semantic search.

In [None]:
# Collect the raw text of every fixed-size chunk, in order
texts = [piece.text for piece in fixed_chunks]

# Set up the embedding manager and embed all texts in one call
embedding_manager = ESGEmbeddingManager()
embeddings = embedding_manager.generate_embeddings(texts)

# Pair each chunk with its embedding (same order as `texts`), carrying the
# chunk's metadata along
embedded_chunks = [
    EmbeddedChunk(text=piece.text, embedding=vector, metadata=piece.metadata)
    for piece, vector in zip(fixed_chunks, embeddings)
]

# Hand the embedded chunks to the manager to build its search index
embedding_manager.build_index(embedded_chunks)

## 3. Testing Document Retrieval

Let's test our retrieval system with some ESG-related queries.

In [None]:
def search_and_display(query: str, k: int = 3, manager=None) -> None:
    """Search for chunks matching `query` and print the results.

    Args:
        query: Free-text search string passed to the manager's ``search``.
        k: Number of results to request.
        manager: Object exposing ``search(query, k=...)``. Defaults to the
            notebook-level ``embedding_manager``; pass one explicitly to
            search a different index without relying on global state.

    Prints the query, then for each result its 1-based rank, score (three
    decimals), up to the first 200 characters of its text, and its metadata
    ``source``. Prints a notice when the search returns no results.
    """
    if manager is None:
        # Resolved at call time so the default follows the current notebook state.
        manager = embedding_manager
    results = manager.search(query, k=k)

    print(f"Query: {query}\n")
    shown = 0
    for shown, result in enumerate(results, 1):
        text = result['text']
        # Only mark the preview as truncated when something was actually cut off.
        suffix = "..." if len(text) > 200 else ""
        print(f"Result {shown} (Score: {result['score']:.3f})")
        print(f"Text: {text[:200]}{suffix}")
        print(f"Source: {result['metadata']['source']}\n")

    if shown == 0:
        # Make an empty result set explicit rather than printing nothing.
        print("No results found.\n")

# Sample ESG queries to run against the index
queries = [
    "carbon emissions reduction targets",
    "board diversity and inclusion",
    "environmental impact assessment",
]

for query in queries:
    search_and_display(query)
    # 80-dash separator followed by a blank line
    print("-" * 80, end="\n\n")

## Exercise 2: Query Analysis

Try different types of queries and analyze the retrieval quality.

In [None]:
# Your turn! Ideas to explore:
#   1. Rephrase the same question in several ways
#   2. Ask about specific ESG metrics
#   3. Vary k and compare the results

# Example: three phrasings of a CO2-related question, two results each
variations = [
    "What are the company's CO2 emissions?",
    "carbon dioxide emissions data",
    "CO2 reduction goals",
]

for query in variations:
    search_and_display(query, k=2)
    # 80-dash separator followed by a blank line
    print("-" * 80, end="\n\n")

## 4. Saving the Index

Finally, let's save our index and chunks for later use.

In [None]:
# Write the search index and the embedded chunks into the data directory
output_dir = Path('../data')
embedding_manager.save_index(
    index_path=output_dir.joinpath('faiss_index.bin'),
    chunks_path=output_dir.joinpath('embedded_chunks.json'),
)

print("Index and chunks saved successfully!")