In [1]:
import os
import json
import pickle
from pathlib import Path
from typing import List, Dict
import re
from rank_bm25 import BM25Okapi
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import ast


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Model and chunking parameters ---
# Name of the sentence-transformers checkpoint used to embed every chunk.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# NOTE(review): the two chunking knobs below are only copied into the saved index
# config; the AST-based chunker visible in this notebook never reads them.
# Confirm whether they are still meant to drive chunking.
CHUNK_OVERLAP = 40
CHUNK_SIZE = 200   # tokens approx

# --- Filesystem layout ---
# Folder scanned for the reference .py files.
BASE_DIR = Path("data/reference_corpus")
# Output folder for persisted artifacts; created up front so later cells can write into it.
INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
def load_python_files(folder: Path) -> List[Dict]:
    """Read every ``*.py`` file directly inside ``folder`` into a record dict.

    Files are visited in sorted path order so that the position of each record
    in the returned list is the same on every run and every machine. Callers
    that number documents by list position therefore get stable ids.

    Args:
        folder: Directory to scan. The glob is non-recursive.

    Returns:
        One dict per file with keys ``file_name`` (base name), ``file_path``
        (path as a string) and ``raw_code`` (file contents decoded as UTF-8).
        Empty when no file matches.

    Raises:
        UnicodeDecodeError: If a matched file is not valid UTF-8.
    """
    records = []

    # glob() makes no ordering promise; sorting keeps list positions reproducible.
    for file in sorted(folder.glob("*.py")):
        records.append({
            "file_name": file.name,
            "file_path": str(file),
            "raw_code": file.read_text(encoding="utf-8"),
        })

    return records


# Load the reference corpus once; the later cells derive everything from `documents`.
documents = load_python_files(BASE_DIR)

# If BASE_DIR holds no .py files (for example because the path is mistyped),
# stop here with a readable message instead of carrying an empty corpus forward.
if not documents:
    raise FileNotFoundError(f"No .py files found in {BASE_DIR.resolve()}")

print(f"Loaded {len(documents)} files")

Loaded 38 files


In [4]:

def build_chunks(doc):
    """Split one loaded document into one chunk per function definition.

    The source is parsed with ``ast`` and every ``def`` and ``async def`` found
    anywhere in the tree (module level, methods, nested functions) becomes a
    chunk holding that definition's source text.

    Args:
        doc: Record with at least ``raw_code`` (source text) and ``file_name``.

    Returns:
        List of dicts with keys ``type`` (always ``"function"``), ``text``,
        ``function_name`` and ``file_name``. Empty when the source does not
        parse (a message is printed) or defines no functions.
    """
    code = doc["raw_code"]
    filename = doc["file_name"]

    try:
        tree = ast.parse(code)
    except SyntaxError:
        print(f"Skipping {filename}: Syntax Error")
        return []

    chunks = []
    for node in ast.walk(tree):
        # `async def` is a separate node class; matching only FunctionDef would
        # silently leave every coroutine out of the corpus.
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        func_source = ast.get_source_segment(code, node)

        # get_source_segment returns None when location info is missing.
        if func_source:
            chunks.append({
                "type": "function",
                "text": func_source,
                "function_name": node.name, # Useful metadata
                "file_name": filename
            })

    return chunks

In [11]:
# Flatten every document's chunks into one list. The BM25 corpus and the
# embedding matrix are both built from this list in order, so the position of
# a record here is what ties a search hit back to its metadata.
chunked_docs = []

for doc_id, doc in enumerate(documents):
    chunks = build_chunks(doc)

    for i, chunk in enumerate(chunks):
        chunked_docs.append({
            "doc_id": doc_id,      # position of the source file in `documents`
            "chunk_id": i,         # position of the chunk within its file
            "chunk_type": chunk["type"],
            "file_name": chunk["file_name"],
            # Carried through so persisted records can name the function they
            # hold; .get() keeps chunks without this key working.
            "function_name": chunk.get("function_name"),
            "text": chunk["text"]
        })

print(len(chunked_docs))

75


In [6]:
# Human-readable dump of every chunk record. The path is built from INDEX_DIR,
# like the other artifacts, so all outputs move together if that folder changes.
with open(INDEX_DIR / "corpus.json", "w", encoding="utf-8") as f:
    json.dump(chunked_docs, f, indent=4)

In [7]:
def tokenize_code(text):
    """Lower-case ``text`` and break it into identifier-like tokens.

    Every character outside ``a-z``, ``0-9`` and ``_`` acts as a separator.
    The empty strings left between adjacent separators are discarded.

    Returns:
        List of non-empty lower-case tokens in order of appearance.
    """
    pieces = re.split(r'[^a-z0-9_]', text.lower())
    return list(filter(None, pieces))

# Tokenize in chunked_docs order so BM25 document i corresponds to chunked_docs[i].
tokenized_chunks = [tokenize_code(c["text"]) for c in chunked_docs]

# Fail fast with a readable message rather than building a lexical index over nothing.
assert tokenized_chunks, "No chunks to index - check that the corpus was loaded and chunked"

bm25 = BM25Okapi(tokenized_chunks)

# Written through INDEX_DIR, like the other artifacts, so every output shares
# one configurable folder.
with open(INDEX_DIR / "bm25.pkl", "wb") as f:
    pickle.dump(bm25, f)

In [8]:
# Load the sentence-transformers model named by EMBEDDING_MODEL_NAME.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Chunk texts in chunked_docs order, so embedding row i belongs to chunked_docs[i].
texts = [record["text"] for record in chunked_docs]

# normalize_embeddings=True requests normalized vectors from the encoder; the
# result is then coerced to a float32 ndarray.
embeddings = np.array(
    embedder.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=32,
    )
).astype("float32")

Batches: 100%|██████████| 3/3 [00:01<00:00,  1.73it/s]


In [None]:
# Inner-product index over the embedding matrix; the vectors were requested
# normalized at encode time, so inner product acts as cosine similarity.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings) # type: ignore

# The chunk metadata is stored separately from the vectors, so a hit can only
# be mapped back to its chunk if row i of the index is chunked_docs[i].
# Verify the counts agree before anything is persisted.
assert index.ntotal == len(chunked_docs), (
    f"FAISS holds {index.ntotal} vectors but there are {len(chunked_docs)} chunks"
)

print("FAISS index size:", index.ntotal)

FAISS index size: 75


In [10]:
# Summary of how this index was built, saved next to the artifacts.
# NOTE(review): chunk_size / overlap are copied from the config constants, but
# the chunker visible in this notebook splits per function and never reads
# them - confirm these fields still describe the index.
config = {
    "embedding_model": EMBEDDING_MODEL_NAME,
    "chunk_size": CHUNK_SIZE,
    "overlap": CHUNK_OVERLAP,
    "num_documents": len(documents),
    "num_chunks": len(chunked_docs)
}

# Dense vector index.
faiss.write_index(index, str(INDEX_DIR / "reference.index"))

# Chunk records, pickled in the same order the vectors were added to the index.
chunks_metadata_path = INDEX_DIR / "chunks_metadata.pkl"
with chunks_metadata_path.open("wb") as fh:
    pickle.dump(chunked_docs, fh)

# Full source records for every loaded file.
documents_path = INDEX_DIR / "documents.pkl"
with documents_path.open("wb") as fh:
    pickle.dump(documents, fh)

# Build configuration as indented JSON.
config_path = INDEX_DIR / "index_config.json"
with config_path.open("w") as fh:
    json.dump(config, fh, indent=2)