In [None]:
# imports

import json
import re
import requests
from pathlib import Path
from urllib.parse import urlparse
import faiss
from sentence_transformers import SentenceTransformer
import pickle
from rank_bm25 import BM25Okapi

In [None]:
# load docs


def load_go_docs(root):
    """Collect every ``*.go`` file below ``root`` as a list of document dicts.

    Args:
        root: Directory (``pathlib.Path``) that is searched recursively.

    Returns:
        A list of dicts with keys ``"repo"`` (first path component relative to
        ``root``; for a file sitting directly in ``root`` this is the file name
        itself), ``"path"`` (path relative to ``root`` as a string) and
        ``"text"`` (file content decoded as UTF-8, undecodable bytes dropped).
        The list is ordered by path so repeated runs number chunks identically.
    """
    docs = []
    # rglob() yields paths in no guaranteed order; sorting keeps the chunk ids
    # (and therefore the index rows) reproducible across machines and runs.
    for go_file in sorted(root.rglob("*.go")):
        # The pattern also matches directories named "*.go"; reading one would raise.
        if not go_file.is_file():
            continue
        rel_path = go_file.relative_to(root)
        docs.append({
            "repo": rel_path.parts[0],
            "path": str(rel_path),
            "text": go_file.read_text(encoding="utf-8", errors="ignore"),
        })
    return docs


ref_dir = Path("./data/reference_corpus")
all_docs = load_go_docs(ref_dir)

print(f"Loaded {len(all_docs)} .go files")

In [None]:
# chunk docs

# A top-level Go function/method declaration starts a line with "func" + whitespace.
func_pattern = re.compile(r"^func\s", re.MULTILINE)


def split_go_source(text):
    """Split Go source text into non-empty, stripped chunk texts.

    The text is cut at every line that starts a ``func`` declaration. The
    result is, in file order:

    * the preamble before the first ``func`` (package clause, imports, type
      declarations, ...) when it is non-empty, and
    * one chunk per ``func``, running up to the next ``func`` or end of file.

    A file without any ``func`` yields a single chunk holding the whole file.
    Empty or whitespace-only pieces are never returned.
    """
    boundaries = [0, *(m.start() for m in func_pattern.finditer(text)), len(text)]
    pieces = (text[lo:hi].strip() for lo, hi in zip(boundaries, boundaries[1:]))
    return [piece for piece in pieces if piece]


chunk_id_counter = 1  # next id to hand out; ids are 1-based and zero-padded
chunks = []

for doc in all_docs:
    for chunk_text in split_go_source(doc["text"]):
        chunks.append({
            "id": f"chunk_{chunk_id_counter:05d}",
            "repo": doc["repo"],
            "source_path": doc["path"],
            "text": chunk_text,
        })
        chunk_id_counter += 1

print(f"created {len(chunks)} chunks")


In [None]:
# embed

texts = [c["text"] for c in chunks]
if not texts:
    # Fail here, before the model is loaded, instead of letting an empty
    # corpus surface later as an indexing error in the FAISS cell.
    raise RuntimeError("No chunks to embed. Check the chunking cell.")

st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = st_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# A bare None check cannot catch a malformed result; the FAISS cell reads
# emb.shape[1], so require one row per chunk in a 2-D array.
# NOTE(review): assumes encode() returns a numpy array when convert_to_numpy=True — confirm.
if emb is None or emb.ndim != 2 or emb.shape[0] != len(texts):
    raise RuntimeError("No embeddings were built. Check the previous cell.")

print(emb.shape)

In [None]:
# Output directory for the index artifacts written by the cells below.
indexes_dir = Path("./indexes")
# Create it up front: open(..., "w") does not create missing parent
# directories, so a fresh checkout would otherwise fail on the first save.
indexes_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# create faiss flat index

# Cast the embeddings to float32 before indexing
# (presumably the dtype FAISS requires — confirm).
emb = emb.astype("float32")
dim = emb.shape[1]  # embedding width = second axis of the matrix

# Flat L2 index sized to the embedding width; rows are added in chunk order.
index = faiss.IndexFlatL2(dim)
index.add(emb)

dense_index_path = indexes_dir / "dense_index.faiss"
faiss.write_index(index, str(dense_index_path))
print("saved FAISS index to indexes/dense_index.faiss")

In [None]:
# map each vector to original code chunk and save in .json file

# Output key -> chunk field. Every output list is built in chunk order, so
# position i in each list describes the i-th entry of `chunks`.
dense_meta_fields = {
    "chunk_ids": "id",
    "repos": "repo",
    "paths": "source_path",
}
dense_meta = {
    out_key: [chunk[chunk_key] for chunk in chunks]
    for out_key, chunk_key in dense_meta_fields.items()
}

with open(indexes_dir / "dense_meta.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(dense_meta, indent=2))

print("saved dense_meta.json")

In [None]:
# create BM25 index

# Tokens are identifier-like runs: a letter or underscore followed by
# letters, digits or underscores. Case is kept as-is.
identifier_re = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
tokenized_corpus = [identifier_re.findall(chunk["text"]) for chunk in chunks]

bm25 = BM25Okapi(tokenized_corpus)

# The chunks are pickled next to the BM25 object in the same file.
bm25_payload = {"bm25": bm25, "chunks": chunks}
with open(indexes_dir / "bm25_index.pkl", "wb") as f:
    pickle.dump(bm25_payload, f)

print("saved bm25_index.pkl")

In [None]:
# save metadata

# Artifact paths are derived from indexes_dir so the manifest cannot drift
# from where the files were actually written; as_posix() keeps forward
# slashes on every platform.
meta = {
    "num_chunks": len(chunks),
    "dense_index_path": (indexes_dir / "dense_index.faiss").as_posix(),
    "dense_meta_path": (indexes_dir / "dense_meta.json").as_posix(),
    "bm25_index_path": (indexes_dir / "bm25_index.pkl").as_posix(),
    # Must match the model name used in the embedding cell.
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
}

with open(indexes_dir / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("indexing done")
