# 01 - Indexing and Data Preparation

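This notebook handles:
1. Extracting functions from the reference corpus
2. Generating embeddings for all functions
3. Building the dense retriever and BM25 indexes
4. Saving all indexes and metadata to disk for use by other notebooks
5. Verifying that the saved indexes load and return results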

In [1]:
import json
import os
import pickle
import sys

import numpy as np
from tqdm import tqdm

# Make the current working directory importable so the `src` package resolves.
# NOTE(review): this assumes the kernel was started from the project root — confirm.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath('.')
if project_root not in sys.path:
    sys.path.append(project_root)

from src.chunking import PythonFunctionExtractor, CodeChunk
from src.embeddings import EmbeddingGenerator
from src.retrieval import DenseRetriever, BM25Retriever
from src.config import RANDOM_SEED

# Seed NumPy's global random state for reproducibility
np.random.seed(RANDOM_SEED)

print("✓ Imports successful")

✓ Imports successful


  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Extract Functions from Reference Corpus

In [2]:
# Build the extractor with the line-count limits used for this corpus
extractor = PythonFunctionExtractor(min_lines=3, max_lines=500)

# Walk the reference corpus directory and collect every extracted function chunk
reference_corpus_dir = "data/reference_corpus"
all_chunks = extractor.extract_from_directory(reference_corpus_dir)

# Report the total, then preview the first few chunks as a sanity check
print(f"\nExtracted {len(all_chunks)} functions from reference corpus")
print("\nSample functions:")
preview_chunks = all_chunks[:5]
for i, chunk in enumerate(preview_chunks):
    print(f"{i+1}. {chunk.function_name} from {chunk.file_path}")


Extracted 3520 functions from reference corpus

Sample functions:
1. hidden_prompt_func from data/reference_corpus/click/src/click/termui.py
2. _build_prompt from data/reference_corpus/click/src/click/termui.py
3. _format_default from data/reference_corpus/click/src/click/termui.py
4. prompt from data/reference_corpus/click/src/click/termui.py
5. confirm from data/reference_corpus/click/src/click/termui.py


## Step 2: Generate Embeddings for All Functions

In [3]:
# Initialize embedding generator
embedding_gen = EmbeddingGenerator()

# Extract code content from chunks
code_texts = [chunk.content for chunk in all_chunks]

# Fail fast on an empty corpus: without this the problem only surfaces later
# as an opaque error when the matrix shape is inspected.
if not code_texts:
    raise ValueError(
        "No functions to embed: all_chunks is empty. "
        "Re-run Step 1 and check the reference corpus path."
    )

print(f"Generating embeddings for {len(code_texts)} functions...")
print("This may take a few minutes...\n")

# Generate embeddings with progress bar
embeddings = embedding_gen.embed_batch(code_texts, batch_size=50, show_progress=True)

# Convert to numpy array
embeddings_matrix = np.array(embeddings)

# The later steps pass all_chunks and embeddings_matrix together and read
# embeddings_matrix.shape[1], so require a 2-D matrix with one row per chunk
# before anything is written to disk.
if embeddings_matrix.ndim != 2 or embeddings_matrix.shape[0] != len(all_chunks):
    raise ValueError(
        f"Embedding matrix has shape {embeddings_matrix.shape}, "
        f"expected ({len(all_chunks)}, embedding_dim)"
    )

print(f"\n✓ Generated embeddings with shape: {embeddings_matrix.shape}")

Generating embeddings for 3520 functions...
This may take a few minutes...



embedding:   0%|          | 0/71 [00:07<?, ?it/s]


KeyboardInterrupt: 

## Step 3: Build Dense Retriever Index

In [4]:
# Build the dense retriever from the extracted chunks and their embedding matrix
dense_retriever = DenseRetriever(all_chunks, embeddings_matrix)

# Persist the index, creating the output folder first if it is missing
dense_index_path = "indexes/dense_retriever.pkl"
os.makedirs(os.path.dirname(dense_index_path), exist_ok=True)
dense_retriever.save(dense_index_path)

print("✓ Dense retriever saved to indexes/dense_retriever.pkl")

NameError: name 'embeddings_matrix' is not defined

## Step 4: Build BM25 Index

In [None]:
# Create BM25 retriever
print("Building BM25 index...")
bm25_retriever = BM25Retriever(all_chunks)

# Save to disk. The output directory is created here as well, so this cell
# does not depend on the dense-retriever cell having run successfully first.
os.makedirs("indexes", exist_ok=True)
bm25_retriever.save("indexes/bm25_retriever.pkl")

print("✓ BM25 retriever saved to indexes/bm25_retriever.pkl")

## Step 5: Save Metadata

In [None]:
# Derive the repository list from the chunks that were actually indexed,
# instead of a hand-maintained list that can drift from the corpus contents.
# A repository is taken to be the first path component below
# reference_corpus_dir; files sitting directly in the corpus root, or paths
# that resolve outside it, are skipped.
repositories = sorted({
    relative_parts[0]
    for relative_parts in (
        os.path.relpath(chunk.file_path, reference_corpus_dir).split(os.sep)
        for chunk in all_chunks
    )
    if len(relative_parts) > 1 and relative_parts[0] != os.pardir
})

metadata = {
    "num_functions": len(all_chunks),
    "embedding_dimension": embeddings_matrix.shape[1],
    "reference_corpus_dir": reference_corpus_dir,
    "repositories": repositories
}

# Create the output directory here too, so this cell does not rely on an
# earlier cell having made it.
os.makedirs("indexes", exist_ok=True)
with open("indexes/metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print("✓ Metadata saved to indexes/metadata.json")
print("\n" + "="*50)
print("INDEXING COMPLETE")
print("="*50)
print(json.dumps(metadata, indent=2))

## Step 6: Test Index Loading (Verification)

In [None]:
# Test loading indexes
# NOTE(review): the .pkl extension suggests pickle-based persistence — if so,
# only load index files produced by this notebook or another trusted source.
print("Testing index loading...\n")

dense_test = DenseRetriever.load("indexes/dense_retriever.pkl")
print(f"✓ Dense retriever loaded: {len(dense_test.chunks)} chunks")

bm25_test = BM25Retriever.load("indexes/bm25_retriever.pkl")
print(f"✓ BM25 retriever loaded: {len(bm25_test.chunks)} chunks")

# Both indexes are built from the same chunk list in this notebook, so a size
# mismatch means one file on disk is stale (e.g. left over from an earlier run).
assert len(dense_test.chunks) == len(bm25_test.chunks), (
    f"Index size mismatch: dense has {len(dense_test.chunks)} chunks, "
    f"BM25 has {len(bm25_test.chunks)}"
)

# Test retrieval with a sample query
test_query = "def reverse_string(s): return s[::-1]"
print(f"\nTest query: {test_query}")

dense_results = dense_test.retrieve(test_query, top_k=3)
assert dense_results, "Dense retriever returned no results for the test query"
print("\nTop-3 Dense Retrieval Results:")
for i, (chunk, score) in enumerate(dense_results):
    print(f"{i+1}. {chunk.function_name} (similarity: {score:.4f})")

bm25_results = bm25_test.retrieve(test_query, top_k=3)
assert bm25_results, "BM25 retriever returned no results for the test query"
print("\nTop-3 BM25 Retrieval Results:")
for i, (chunk, score) in enumerate(bm25_results):
    print(f"{i+1}. {chunk.function_name} (score: {score:.4f})")

# Only reached when every check above passed
print("\n" + "="*50)
print("All indexes built and verified successfully!")
print("Ready to use in 02_interactive.ipynb")
print("="*50)
print("="*50)