<a href="https://colab.research.google.com/github/Rongxuan-Zhou/CS6120_project/blob/main/notebooks/3_index_construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Environment Setup
!pip install -q faiss-cpu sentence-transformers
from google.colab import drive
drive.mount('/content/drive')

import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
os.chdir(PROJECT_PATH)

# GPU detection
import torch
print(f"Available GPU: {torch.cuda.is_available()}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 2. Load the fine-tuned SBERT model from the project directory
from sentence_transformers import SentenceTransformer

# Prefer the GPU when torch can see one; otherwise stay on the CPU.
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer("models/sbert_model")
model.to(target_device)
print("Model loaded successfully")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded successfully


In [5]:
# 3. Build FAISS index (based on src/index_builder.py)
#
# Encodes the training corpus with the SBERT model and builds three indexes
# over the same L2-normalized embeddings:
#   * index_flat  - exact inner-product search
#   * index_hnsw  - graph-based approximate search
#   * index_ivfpq - inverted-file + product-quantization (compact) search
import faiss
import numpy as np
import json
from tqdm import tqdm

# Release cached GPU memory before encoding
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load data (explicit encoding so the read does not depend on the locale)
print("Loading data...")
with open("data/processed/combined.json", encoding="utf-8") as f:
    data = json.load(f)
    corpus = data["train"]

print(f"Loaded {len(corpus)} documents")

# Fail fast with a clear message instead of an obscure np.vstack error below.
if len(corpus) == 0:
    raise ValueError(
        "The 'train' split of data/processed/combined.json is empty; nothing to index"
    )

# Batch encoding
print("Generating embeddings...")
batch_size = 128
embedding_batches = []
for i in tqdm(range(0, len(corpus), batch_size)):
    batch = corpus[i:i+batch_size]
    emb = model.encode(batch, show_progress_bar=False)
    embedding_batches.append(emb)

# FAISS expects a C-contiguous float32 matrix; this is a no-op when the
# stacked batches already have that layout.
embeddings = np.ascontiguousarray(np.vstack(embedding_batches), dtype=np.float32)
dimension = embeddings.shape[1]
print(f"Generated {len(embeddings)} embeddings of dimension {dimension}")

# Normalize the vectors (in place) so that inner product equals cosine similarity
print("Normalizing vectors...")
faiss.normalize_L2(embeddings)

# Create flat FAISS index (exact search)
print("Building flat index...")
index_flat = faiss.IndexFlatIP(dimension)
index_flat.add(embeddings)
print(f"Flat index built with {index_flat.ntotal} vectors")

# Create HNSW index (faster retrieval).
# NOTE: no metric is passed here, so FAISS's default (L2) applies. On the
# unit-length vectors above, L2 and inner-product rankings coincide, but the
# distances returned by this index are not on the same scale as the flat
# index's scores.
print("Building HNSW index...")
M = 16  # number of connections per node
ef_construction = 200  # search width used while building the graph
index_hnsw = faiss.IndexHNSWFlat(dimension, M)
index_hnsw.hnsw.efConstruction = ef_construction
index_hnsw.add(embeddings)
print(f"HNSW index built with {index_hnsw.ntotal} vectors")

# Create IVF-PQ index (smaller memory footprint)
print("Building IVF-PQ index...")
# Number of cluster centroids: at most 1/50 of the number of vectors, capped
# at 100, and never below 1 (a corpus under 50 documents would otherwise
# yield nlist == 0).
nlist = max(1, min(100, len(corpus) // 50))
m = 8  # number of sub-vectors
bits = 8  # bits per sub-vector code
quantizer = faiss.IndexFlatL2(dimension)
index_ivfpq = faiss.IndexIVFPQ(quantizer, dimension, nlist, m, bits)
index_ivfpq.train(embeddings)
index_ivfpq.add(embeddings)
print(f"IVF-PQ index built with {index_ivfpq.ntotal} vectors")

Loading data...
Loaded 8800 documents
Generating embeddings...


100%|██████████| 69/69 [00:15<00:00,  4.41it/s]


Generated 8800 embeddings of dimension 768
Normalizing vectors...
Building flat index...
Flat index built with 8800 vectors
Building HNSW index...
HNSW index built with 8800 vectors
Building IVF-PQ index...
IVF-PQ index built with 8800 vectors


In [6]:
# 4. Test indexes with sample queries
#
# Runs the same queries against all three indexes, prints the exact (flat)
# matches, and reports how much of the exact top-k each approximate index
# recovers.
test_queries = ["How does social media affect mental health?",
               "Best programming languages to learn",
               "Artificial intelligence applications"]


def _average_overlap(reference_ids, candidate_ids, k):
    """Return the mean fraction of top-k ids shared between two result sets.

    Args:
        reference_ids: per-query rows of ids from the reference search.
        candidate_ids: per-query rows of ids from the search being compared,
            in the same query order as ``reference_ids``.
        k: number of results requested per query (the denominator per query).

    Returns:
        The per-query overlap ``|ref ∩ cand| / k`` averaged over all queries.
    """
    per_query = (
        len(set(ref_row) & set(cand_row)) / k
        for ref_row, cand_row in zip(reference_ids, candidate_ids)
    )
    return sum(per_query) / len(reference_ids)


print("Testing indexes with sample queries...")
# Encode the test queries
query_embeddings = model.encode(test_queries)

# Normalize the query vectors (in place), matching the indexed vectors
faiss.normalize_L2(query_embeddings)

# Number of results to return per query
k = 3

# Flat index search (most accurate)
print("\nFlat index search results:")
D_flat, I_flat = index_flat.search(query_embeddings, k)

for i, query in enumerate(test_queries):
    print(f"\nQuery: {query}")
    for j in range(k):
        doc_id = int(I_flat[i][j])
        # FAISS pads with -1 when fewer than k results exist; indexing the
        # corpus with -1 would silently show the last document instead.
        if doc_id < 0:
            continue
        print(f"  Match {j+1}: (Score: {D_flat[i][j]:.4f})")
        print(f"  {corpus[doc_id][:100]}...")

# HNSW index search (fast approximate)
print("\nHNSW index search results:")
D_hnsw, I_hnsw = index_hnsw.search(query_embeddings, k)

# Overlap with the exact search
hnsw_overlap = _average_overlap(I_flat, I_hnsw, k)

print(f"HNSW average overlap with flat search: {hnsw_overlap:.2%}")

# IVF-PQ index search (compact)
print("\nIVF-PQ index search results:")
index_ivfpq.nprobe = 10  # number of clusters inspected at search time
D_ivfpq, I_ivfpq = index_ivfpq.search(query_embeddings, k)

# Overlap with the exact search
ivfpq_overlap = _average_overlap(I_flat, I_ivfpq, k)

print(f"IVF-PQ average overlap with flat search: {ivfpq_overlap:.2%}")

Testing indexes with sample queries...

Flat index search results:

Query: How does social media affect mental health?
  Match 1: (Score: 0.3095)
  The Social Cognitive Theory is relevant to health communication. First, the theory deals with cognit...
  Match 2: (Score: 0.3061)
  Practitioners of magnetic field therapy believe that interactions between the body, the earth, and o...
  Match 3: (Score: 0.2737)
  Back in March 2010, I wrote a post called Beware of “Who Viewed My Profile” Apps on Facebook. It’s s...

Query: Best programming languages to learn
  Match 1: (Score: 0.3201)
  An integrated development environment (IDE) is a programming environment that has been packaged as a...
  Match 2: (Score: 0.3157)
  Furthermore, there is no loss of language ability or language learning ability over time. Age is not...
  Match 3: (Score: 0.3010)
  One of the best pieces of PLC programming software when you want to learn Structured Text is Beckhof...

Query: Artificial intelligence applica

In [7]:
# 5. Save indexes
print("Saving indexes...")
index_dir = os.path.join(PROJECT_PATH, "models/indexes")
os.makedirs(index_dir, exist_ok=True)

# Write every index type to its own file inside index_dir
for file_name, built_index in (
    ("flat_index.faiss", index_flat),
    ("hnsw_index.faiss", index_hnsw),
    ("ivfpq_index.faiss", index_ivfpq),
):
    faiss.write_index(built_index, os.path.join(index_dir, file_name))

# Store the document texts alongside the indexes so that later retrieval
# can display the matched documents
corpus_path = os.path.join(index_dir, "corpus_texts.json")
with open(corpus_path, 'w') as f:
    json.dump(corpus, f)

print("All indexes saved successfully")

Saving indexes...
All indexes saved successfully
