In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install ir-datasets sentence-transformers faiss-cpu ranx pandas numpy tqdm -q
print("‚úÖ Setup complete!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.8/23.8 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Setup complete!


In [None]:
%%time
import ir_datasets
from itertools import islice
from collections import defaultdict
import pickle
import os

# Make sure the project folder on Drive exists before anything is written to it.
os.makedirs('/content/drive/MyDrive/msmarco_project', exist_ok=True)

print("üöÄ Loading MS MARCO (50k passages)...")

# Corpus sample: only the first 50,000 documents yielded by docs_iter() are kept.
dataset = ir_datasets.load("msmarco-passage")
first_docs = islice(dataset.docs_iter(), 50000)
corpus = dict((doc.doc_id, doc.text) for doc in first_docs)
print(f"Corpus: {len(corpus)} passages")


def _group_qrels(ds):
    """Group the qrels of `ds` by query.

    Returns a defaultdict(set) mapping each query_id to the set of doc_ids
    that occur in a qrel record for that query.
    """
    grouped = defaultdict(set)
    for judgement in ds.qrels_iter():
        grouped[judgement.query_id].add(judgement.doc_id)
    return grouped


# NOTE(review): the qrels are collected for every query and are NOT filtered
# to the 50,000-passage sample above, so many judged doc_ids are presumably
# absent from `corpus` - confirm that downstream evaluation accounts for this.
train_ds = ir_datasets.load("msmarco-passage/train")
train_qrels = _group_qrels(train_ds)

dev_ds = ir_datasets.load("msmarco-passage/dev")
dev_qrels = _group_qrels(dev_ds)

print(f"Train qrels: {len(train_qrels)} | Dev: {len(dev_qrels)}")

# Persist everything to Drive; the defaultdicts are stored as plain dicts.
data = {
    'corpus': corpus,
    'train_qrels': dict(train_qrels),
    'dev_qrels': dict(dev_qrels),
}
with open('/content/drive/MyDrive/msmarco_project/data.pkl', 'wb') as f:
    pickle.dump(data, f)
print("‚úÖ Data saved to Drive!")


üöÄ Loading MS MARCO (50k passages)...
Corpus: 50000 passages
Train qrels: 502939 | Dev: 55578
‚úÖ Data saved to Drive!
CPU times: user 4.1 s, sys: 258 ms, total: 4.36 s
Wall time: 7.16 s


In [None]:
%%time
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import numpy as np

import torch
import numpy as np
import random

# FIXED SEEDS for reproducibility (Python, NumPy and PyTorch CPU/CUDA RNGs).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
print(f"‚úÖ Seeds set: {SEED}")

print("Loading data...")
# NOTE: pickle.load can execute arbitrary code; this file is the one written
# by the data-loading cell of this notebook - do not point it at untrusted data.
with open('/content/drive/MyDrive/msmarco_project/data.pkl', 'rb') as f:
    data = pickle.load(f)
# keys() and values() iterate in the same order, so doc_ids[i] and
# doc_texts[i] always refer to the same passage; row i of the index
# built below corresponds to that pair.
doc_ids = list(data['corpus'].keys())
doc_texts = list(data['corpus'].values())
N = len(doc_ids)
print(f"Corpus: {N} docs")

print("Encoding...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(doc_texts, show_progress_bar=True, convert_to_tensor=False)
# FAISS operates on C-contiguous float32 matrices. Convert once, *before*
# normalising, so normalize_L2 never sees another dtype/layout and no second
# full copy of the matrix is made when adding to the index
# (ascontiguousarray is a no-op when the array already qualifies).
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

print("Building FAISS...")
# normalize_L2 works in place; with unit-length rows the inner product
# computed by IndexFlatIP equals cosine similarity.
faiss.normalize_L2(embeddings)
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)
print(f"Index: {index.ntotal}/{N} vectors, dim={d}")

# Test immediately (before saving)
def safe_search(query, k=5, max_chars=150):
    """Return up to `k` nearest passages for a free-text query.

    Relies on the notebook-level `model`, `index`, `doc_ids` and `doc_texts`.

    Args:
        query: Query string to encode and search for.
        k: Maximum number of hits to return. Non-positive values yield [].
        max_chars: Maximum number of passage characters kept in the preview.

    Returns:
        A list of dicts, in the order returned by the index, each with keys
        'doc_id', 'score' (float) and 'text' (passage preview; "..." is
        appended only when the passage was actually truncated).
    """
    if k <= 0:
        # Nothing can be returned, and the index search rejects k <= 0.
        return []

    q_emb = model.encode([query]).astype('float32')
    faiss.normalize_L2(q_emb)  # in place, matches the normalised corpus vectors
    scores, indices = index.search(q_emb, k)

    # Bound-check against the list that is actually indexed below rather
    # than a separately maintained global counter.
    n_docs = len(doc_ids)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        idx = int(idx)
        # Skip slots that do not point at a real passage (FAISS reports
        # missing neighbours as -1 when fewer than k vectors are available).
        if not 0 <= idx < n_docs:
            continue
        text = doc_texts[idx]
        preview = text if len(text) <= max_chars else text[:max_chars] + "..."
        results.append({
            'doc_id': doc_ids[idx],
            'score': float(score),
            'text': preview,
        })
    return results

def _format_hit(hit):
    """Render one search hit as a single indented output line."""
    score = hit['score']
    doc_id = hit['doc_id']
    text = hit['text']
    # The doc id is cut to 15 characters and always followed by "...".
    return f"  {score:.3f} | {doc_id[:15]}... | {text}"


# Smoke-test the in-memory index with a few queries before persisting anything.
print("\nüéØ Test search:")
tests = ["microsoft azure cloud", "machine learning", "python libraries"]
for query in tests:
    print(f"\nQuery: '{query}'")
    results = safe_search(query)
    for hit in results:
        print(_format_hit(hit))

# Persist the three baseline artifacts to Drive: the FAISS index, the encoder,
# and the row -> (doc_id, text) mapping that goes with the index.
faiss.write_index(index, '/content/drive/MyDrive/msmarco_project/baseline_index.bin')
model.save('/content/drive/MyDrive/msmarco_project/baseline_model/')
doc_mapping = {'doc_ids': doc_ids, 'doc_texts': doc_texts}
with open('/content/drive/MyDrive/msmarco_project/doc_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(doc_mapping, mapping_file)

print("\n‚úÖ BASELINE COMPLETE!")


Loading data...
Corpus: 50000 docs
Encoding...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Building FAISS...
Index: 50000/50000 vectors, dim=384

üéØ Test search:

Query: 'microsoft azure cloud'
  0.637 | 17728... | With Azure, you can develop hybrid applications that allow your on-premises applications to use cloud services, such as the cloud database and storage...
  0.633 | 47748... | Microsoft Azure Key Vault. Microsoft Azure Key Vault is a cloud-hosted management service that allows users to encrypt keys and small secrets by using...
  0.612 | 17733... | However, this is only one of the options available in Azure. Typically you’ll need to decide whether you deploy the data for your applications in the ...
  0.608 | 17737... | 6: Data center in the cloud. SQL Azure provides organizations with all the benefits of an enterprise-class data center without the hassle, headaches, ...
  0.606 | 17734... | Focus on your app, not on the infrastructure. As you move your applications to Windows Azure, it will help you overcome the issues and worries managin...

Query: 'machine l