# 🇮🇳 Indian Government Schemes — Semantic Search & Recommendation

##Check GPU

In [None]:
import subprocess
r = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(r.stdout if r.returncode == 0 else '⚠️ No GPU! Go to Runtime → Change runtime type → T4 GPU')

Wed Feb 25 05:42:35 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

##Dependencies

In [None]:
# FIX BUG-06: sentence-transformers >= 3.0 removes model.fit() checkpoint args
# Pin to a version that supports both old and new API, or use new trainer API
# We use >= 3.0 new API throughout this notebook
!pip install -q kagglehub "sentence-transformers>=3.0" faiss-cpu pandas numpy \
    scikit-learn matplotlib seaborn tqdm datasets accelerate

import torch
import sentence_transformers
print(f'✅ Dependencies installed')
print(f'   PyTorch:               {torch.__version__}')
print(f'   sentence-transformers: {sentence_transformers.__version__}')
print(f'   CUDA available:        {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'   GPU:                   {torch.cuda.get_device_name(0)}')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Dependencies installed
   PyTorch:               2.10.0+cu128
   sentence-transformers: 5.2.3
   CUDA available:        True
   GPU:                   Tesla T4


In [None]:
# FIX BUG-02: Centralised seed setup — must run before ANY random call
import random
import numpy as np
import torch
import os

SEED = 42

def set_all_seeds(seed: int = SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Make CUDA ops deterministic (slight perf cost — worth it for reproducibility)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_all_seeds(SEED)

# DataLoader worker seed function (used later)
def worker_init_fn(worker_id):
    worker_seed = SEED + worker_id
    random.seed(worker_seed)
    np.random.seed(worker_seed)

print(f'✅ All seeds set to {SEED}')

✅ All seeds set to 42


## Download Dataset

In [None]:
import kagglehub
import glob

path = kagglehub.dataset_download('jainamgada45/indian-government-schemes')
print('Dataset path:', path)

csv_files = glob.glob(os.path.join(path, '**', '*.csv'), recursive=True)
assert len(csv_files) > 0, '❌ No CSV files found — check dataset path!'
print('\nCSV files found:')
for f in csv_files:
    size_mb = os.path.getsize(f) / 1e6
    print(f'  {f}  ({size_mb:.2f} MB)')

Downloading from https://www.kaggle.com/api/v1/datasets/download/jainamgada45/indian-government-schemes?dataset_version_number=1...


100%|██████████| 3.10M/3.10M [00:00<00:00, 4.22MB/s]

Extracting files...
Dataset path: /root/.cache/kagglehub/datasets/jainamgada45/indian-government-schemes/versions/1

CSV files found:
  /root/.cache/kagglehub/datasets/jainamgada45/indian-government-schemes/versions/1/updated_data.csv  (12.86 MB)





## Load & Explore Data

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(csv_files[0])
print(f'Shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')

# Sanity check
assert len(df) > 0, '❌ DataFrame is empty!'
assert len(df.columns) > 0, '❌ No columns found!'

df.head(3)

Shape: (3400, 11)
Columns: ['scheme_name', 'slug', 'details', 'benefits', 'eligibility', 'application', 'documents', 'level', 'schemeCategory', 'Unnamed: 9', 'tags']


Unnamed: 0,scheme_name,slug,details,benefits,eligibility,application,documents,level,schemeCategory,Unnamed: 9,tags
0,"""Immediate Relief Assistance"" under ""Welfare a...",ira-wrflsncs,"The scheme ""Immediate Relief Assistance"" is a ...","₹ 1,00,000, in two installments of ₹ 50,000 ea...",The applicant should be the family (legal heir...,Step 1: The interested applicant should visit ...,Photograph of the Family (Legal Heir) of the M...,State,"Agriculture,Rural & Environment, Social welfar...",,"Missing, Fisherman, Relief, Financial Assistan..."
1,AICTE SHORT TERM TRAINING PROGRAMME-SFURTI SCHEME,astpss,"Short Term Training Programme-SFURTI Program, ...","Financial Assistance : Limit of funding ₹ 4,00...",The institution should be AICTE approved.,Registration of New Institute: Step 01: Visit ...,Feedback Form Copy of Proceedings Completion R...,Central,Education & Learning,,"Trainings, Financial Assistance, AICTE"
2,Burial and Ex-gratia Payment Scheme in Case of...,baepsicodouldwact,"Launched in 2014, the "" Burial and Ex-gratia P...","Funeral Assistance: ₹3,000 payable in case of ...",The deceased construction worker should have b...,Step 1: The interested applicant should visit ...,Aadhaar Card of the applicant (nominee/Legal h...,State,Social welfare & Empowerment,,"Building Worker, Construction Workers, Unregis..."


In [None]:
print('=== Null Values ===')
print(df.isnull().sum())
print('\n=== Data Types ===')
print(df.dtypes)
print('\n=== Unique values per column ===')
for col in df.columns:
    print(f'  {col}: {df[col].nunique()} unique  |  {df[col].isnull().sum()} nulls')

=== Null Values ===
scheme_name          0
slug                 0
details              0
benefits             0
eligibility          0
application          2
documents           11
level                0
schemeCategory       0
Unnamed: 9        3400
tags                29
dtype: int64

=== Data Types ===
scheme_name        object
slug               object
details            object
benefits           object
eligibility        object
application        object
documents          object
level              object
schemeCategory     object
Unnamed: 9        float64
tags               object
dtype: object

=== Unique values per column ===
  scheme_name: 3397 unique  |  0 nulls
  slug: 3397 unique  |  0 nulls
  details: 3382 unique  |  0 nulls
  benefits: 3304 unique  |  0 nulls
  eligibility: 3274 unique  |  0 nulls
  application: 2510 unique  |  2 nulls
  documents: 3052 unique  |  11 nulls
  level: 2 unique  |  0 nulls
  schemeCategory: 210 unique  |  0 nulls
  Unnamed: 9: 0 unique  |  3400

## Clean Data — Remove All Nulls

In [None]:
print(f'Shape before cleaning: {df.shape}')

# ── 1. Drop fully empty rows
df = df.dropna(how='all')

# FIX BUG-05: ONLY process object (string) dtype columns — never touch numeric columns
obj_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = df.select_dtypes(exclude='object').columns.tolist()
print(f'\nText columns ({len(obj_cols)}): {obj_cols}')
print(f'Numeric columns ({len(num_cols)}): {num_cols}')

# ── 2. Strip whitespace + convert empty strings/nan strings to NaN (object cols only)
for col in obj_cols:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'None': np.nan,
                                'NaN': np.nan, 'NULL': np.nan, 'null': np.nan})

# ── 3. Fill remaining nulls in text columns with empty string
text_cols = obj_cols  # alias for clarity — only text cols
df[text_cols] = df[text_cols].fillna('')

# ── 4. Drop rows where ALL text columns are empty
mask = df[text_cols].apply(lambda r: ' '.join(r.values).strip(), axis=1).str.len() > 5
df = df[mask]

# ── 5. Reset index cleanly
df = df.reset_index(drop=True)

# ── 6. Assertions
assert df.isnull().sum().sum() == 0 or df[text_cols].isnull().sum().sum() == 0, \
    '❌ Nulls still present in text columns after cleaning!'
assert len(df) > 0, '❌ DataFrame empty after cleaning — check source data!'

print(f'\n✅ Shape after cleaning: {df.shape}')
print(f'   Nulls in text cols:   {df[text_cols].isnull().sum().sum()}')

Shape before cleaning: (3400, 11)

Text columns (10): ['scheme_name', 'slug', 'details', 'benefits', 'eligibility', 'application', 'documents', 'level', 'schemeCategory', 'tags']
Numeric columns (1): ['Unnamed: 9']

✅ Shape after cleaning: (3400, 11)
   Nulls in text cols:   0


In [None]:
# FIX BUG-09: Smarter name_col detection
# Score ALL columns matching name keywords, pick the one with SHORTEST avg length
# (names are short; descriptions are long — shortest match = most likely the title)

avg_len = {col: df[col].str.len().mean() for col in text_cols}
sorted_cols = sorted(avg_len.items(), key=lambda x: x[1], reverse=True)

print('Columns sorted by avg text length (richest first):')
for col, avg in sorted_cols:
    print(f'  {col}: avg {avg:.0f} chars')

# FIX BUG-09: collect all keyword-matching cols, pick shortest avg length
NAME_KEYWORDS = ['name', 'title', 'scheme', 'yojana', 'programme', 'program']
candidate_name_cols = [
    col for col in text_cols
    if any(kw in col.lower() for kw in NAME_KEYWORDS)
]

if candidate_name_cols:
    # Among candidates, pick the one with the SHORTEST average text length
    name_col = min(candidate_name_cols, key=lambda c: avg_len[c])
else:
    # Fallback: column with shortest avg length overall (most likely a title/name)
    name_col = sorted_cols[-1][0]

# Validate: name_col should have high uniqueness (titles are mostly unique)
uniqueness = df[name_col].nunique() / len(df)
print(f'\n✅ Name/title column: "{name_col}"')
print(f'   Avg length:  {avg_len[name_col]:.0f} chars')
print(f'   Uniqueness:  {uniqueness:.1%} ({df[name_col].nunique()}/{len(df)} unique)')
if uniqueness < 0.5:
    print(f'   ⚠️  Low uniqueness — consider manually setting name_col to a better column')
    print(f'   Available text columns: {text_cols}')

Columns sorted by avg text length (richest first):
  application: avg 1148 chars
  details: avg 791 chars
  eligibility: avg 651 chars
  benefits: avg 510 chars
  documents: avg 471 chars
  tags: avg 60 chars
  scheme_name: avg 56 chars
  schemeCategory: avg 34 chars
  slug: avg 7 chars
  level: avg 5 chars

✅ Name/title column: "schemeCategory"
   Avg length:  34 chars
   Uniqueness:  6.2% (210/3400 unique)
   ⚠️  Low uniqueness — consider manually setting name_col to a better column
   Available text columns: ['scheme_name', 'slug', 'details', 'benefits', 'eligibility', 'application', 'documents', 'level', 'schemeCategory', 'tags']


In [None]:
# ── Create the document string for each scheme ──
other_cols = [c for c in text_cols if c != name_col]

# FIX BUG-04: Pair-2 needs a REAL second sentence from the body text (not the name again)
# We extract from other_cols content specifically, skipping the name repetition
def build_document(row):
    """Build a rich document string. Name repeated twice for semantic weight."""
    name = str(row[name_col]).strip()
    parts = [name, name]  # name twice for higher embedding weight
    for col in other_cols:
        val = str(row[col]).strip()
        if val and val not in ('nan', 'None', ''):
            parts.append(val)
    return '. '.join(parts).strip()

def get_body_text(row):
    """Return only the non-name content (for augmentation pair-2)."""
    body_parts = []
    for col in other_cols:
        val = str(row[col]).strip()
        if val and val not in ('nan', 'None', ''):
            body_parts.append(val)
    return '. '.join(body_parts).strip()

df['document'] = df.apply(build_document, axis=1)
df['body_text'] = df.apply(get_body_text, axis=1)  # FIX BUG-04: used in augmentation

# Remove very short documents
df = df[df['document'].str.len() > 20].reset_index(drop=True)

# FIX BUG-03: Store a permanent positional index column BEFORE any sampling
# This is the ground truth index used in evaluation
df['df_idx'] = df.index  # explicit, stable — not affected by reset_index later

assert 'df_idx' in df.columns, '❌ df_idx column missing!'
assert df['df_idx'].is_unique, '❌ df_idx has duplicates — index not unique!'

print(f'Total schemes to index: {len(df)}')
print(f'Avg document length: {df["document"].str.len().mean():.0f} chars')
print(f'Avg body_text length: {df["body_text"].str.len().mean():.0f} chars')
print(f'\nSample document:')
print(df['document'].iloc[0][:400])
print(f'\nSample body_text (for augmentation):')
print(df['body_text'].iloc[0][:200])

Total schemes to index: 3400
Avg document length: 3786 chars
Avg body_text length: 3714 chars

Sample document:
Agriculture,Rural & Environment, Social welfare & Empowerment. Agriculture,Rural & Environment, Social welfare & Empowerment. "Immediate Relief Assistance" under "Welfare and Relief for Fishermen During Lean Seasons and Natural Calamities Scheme". ira-wrflsncs. The scheme "Immediate Relief Assistance" is a Sub-Component under the scheme "Welfare and Relief for Fishermen During Lean Seasons and Nat

Sample body_text (for augmentation):
"Immediate Relief Assistance" under "Welfare and Relief for Fishermen During Lean Seasons and Natural Calamities Scheme". ira-wrflsncs. The scheme "Immediate Relief Assistance" is a Sub-Component unde


## Fine-tune MiniLM

In [None]:
# FIX BUG-13: Removed unused 'import re'
# FIX BUG-10: Use sentence-transformers v3 API
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer
from datasets import Dataset as HFDataset
import time

MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
print(f'✅ Loaded: {MODEL_NAME}')
print(f'   Max sequence length:  {model.max_seq_length}')
print(f'   Embedding dimension:  {model.get_sentence_embedding_dimension()}')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loaded: all-MiniLM-L6-v2
   Max sequence length:  256
   Embedding dimension:  384


In [None]:
# ── Synthesize training pairs ──
# FIX BUG-04: Pair-2 now uses BODY TEXT first sentence (not the scheme name again)
# FIX BUG-02: random.seed already set globally; shuffle uses seeded state

def dropout_query(text: str, dropout: float = 0.3, max_words: int = 40) -> str:
    """Random word dropout to simulate noisy/partial user queries."""
    words = text.split()
    if not words:
        return text
    kept = [w for w in words if random.random() > dropout]
    if not kept:  # edge case: all words dropped
        kept = words[:3]
    return ' '.join(kept[:max_words])

queries   = {}  # qid -> query text
corpus    = {}  # cid -> document text
relevant  = {}  # qid -> set of relevant cids  (for evaluator)

anchor_texts   = []  # for HFDataset
positive_texts = []  # for HFDataset

pair_counts = {'pair1_name': 0, 'pair2_body': 0, 'pair3_dropout': 0}

for i, row in df.iterrows():
    doc       = row['document']
    name      = str(row[name_col]).strip()
    body      = str(row['body_text']).strip()
    cid       = f'doc_{i}'
    corpus[cid] = doc

    # ── Pair 1: scheme name → full document
    if len(name) > 3:
        qid = f'q1_{i}'
        queries[qid]  = name
        relevant[qid] = {cid}
        anchor_texts.append(name)
        positive_texts.append(doc)
        pair_counts['pair1_name'] += 1

    # ── Pair 2: FIRST SENTENCE OF BODY TEXT → full document (FIX BUG-04)
    # Previously used doc.split('.')[0] which always returned the name itself
    if body:
        first_body_sentence = body.split('.')[0][:200].strip()
        if len(first_body_sentence) > 15:  # must be substantial
            qid = f'q2_{i}'
            queries[qid]  = first_body_sentence
            relevant[qid] = {cid}
            anchor_texts.append(first_body_sentence)
            positive_texts.append(doc)
            pair_counts['pair2_body'] += 1

    # ── Pair 3: random word dropout query → full document
    source_for_dropout = body if len(body) > 20 else doc
    q3 = dropout_query(source_for_dropout)
    if len(q3) > 10:
        qid = f'q3_{i}'
        queries[qid]  = q3
        relevant[qid] = {cid}
        anchor_texts.append(q3)
        positive_texts.append(doc)
        pair_counts['pair3_dropout'] += 1

print(f'Training pair counts:')
for k, v in pair_counts.items():
    print(f'  {k}: {v}')
print(f'  TOTAL: {sum(pair_counts.values())}')

assert len(anchor_texts) == len(positive_texts), '❌ Anchor/positive length mismatch!'
assert len(anchor_texts) > 0, '❌ No training pairs generated!'
print(f'\nExample pair (Pair 2 — body first sentence):')
# Find a pair2 example
for i, (a, p) in enumerate(zip(anchor_texts, positive_texts)):
    if a != p.split('.')[0] and len(a) > 15 and a not in p[:len(a)+5]:
        print(f'  Anchor:   {a[:120]}')
        print(f'  Positive: {p[:120]}')
        break

Training pair counts:
  pair1_name: 3400
  pair2_body: 3259
  pair3_dropout: 3400
  TOTAL: 10059

Example pair (Pair 2 — body first sentence):
  Anchor:   "Immediate Relief Assistance" under "Welfare and Relief for Fishermen During Lean Seasons and Natural Calamities Scheme"
  Positive: Agriculture,Rural & Environment, Social welfare & Empowerment. Agriculture,Rural & Environment, Social welfare & Empower


In [None]:
# ── Build train/eval split from pairs ──
# FIX BUG-02: Use seeded shuffle via numpy instead of random.shuffle for reproducibility

n_total = len(anchor_texts)
indices = np.random.permutation(n_total)  # seeded via set_all_seeds above

# Reserve 10% for evaluator (max 500 pairs to keep eval fast)
n_eval = min(500, max(50, int(n_total * 0.10)))
eval_idx  = indices[:n_eval]
train_idx = indices[n_eval:]

train_dataset = HFDataset.from_dict({
    'anchor':   [anchor_texts[i] for i in train_idx],
    'positive': [positive_texts[i] for i in train_idx],
})

# Build evaluator from held-out pairs
# FIX BUG-01: This evaluator is required for save_best_model=True to work
eval_queries  = {f'eq_{j}': anchor_texts[i]   for j, i in enumerate(eval_idx)}
eval_corpus   = {f'ec_{j}': positive_texts[i] for j, i in enumerate(eval_idx)}
eval_relevant = {f'eq_{j}': {f'ec_{j}'} for j in range(len(eval_idx))}

evaluator = InformationRetrievalEvaluator(
    queries=eval_queries,
    corpus=eval_corpus,
    relevant_docs=eval_relevant,
    name='scheme-eval',
    mrr_at_k=[5, 10],
    ndcg_at_k=[5, 10],
    accuracy_at_k=[1, 3, 5],
)

print(f'Train pairs: {len(train_dataset)}')
print(f'Eval pairs:  {len(eval_idx)}')
print(f'\n✅ InformationRetrievalEvaluator ready — save_best_model will now work correctly')

Train pairs: 9559
Eval pairs:  500

✅ InformationRetrievalEvaluator ready — save_best_model will now work correctly


In [None]:
# FIX BUG-06: Use SentenceTransformerTrainer (v3 API) instead of deprecated model.fit()
# FIX BUG-01: evaluator passed in, best model actually saved

BATCH_SIZE = 64
EPOCHS     = 3
SAVE_DIR   = './schemes_minilm_finetuned'

# FIX BUG-08: Ensure save directory exists before training
os.makedirs(SAVE_DIR, exist_ok=True)

train_loss = losses.MultipleNegativesRankingLoss(model)

steps_per_epoch = max(1, len(train_dataset) // BATCH_SIZE)
total_steps     = steps_per_epoch * EPOCHS
warmup_steps    = int(total_steps * 0.1)

training_args = SentenceTransformerTrainingArguments(
    output_dir=SAVE_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=warmup_steps,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    fp16=torch.cuda.is_available(),
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,       # FIX BUG-01: now works because evaluator exists
    metric_for_best_model='eval_scheme-eval_cosine_accuracy@1',
    greater_is_better=True,
    save_total_limit=2,
    logging_steps=max(1, steps_per_epoch // 4),
    report_to='none',
    seed=SEED,                         # FIX BUG-02: training seed
    dataloader_num_workers=0,          # FIX BUG-02: 0 = main process, fully reproducible
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,               # FIX BUG-01: enables real best-model tracking
)

print(f'Config:')
print(f'  Batch size:    {BATCH_SIZE}')
print(f'  Epochs:        {EPOCHS}')
print(f'  Steps/epoch:   {steps_per_epoch}')
print(f'  Total steps:   {total_steps}')
print(f'  Warmup steps:  {warmup_steps}')
print(f'  fp16:          {torch.cuda.is_available()}')
est = total_steps * 0.5 / 60
print(f'  Est. time:     ~{est:.0f}–{est*1.5:.0f} min on T4')

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Config:
  Batch size:    64
  Epochs:        3
  Steps/epoch:   149
  Total steps:   447
  Warmup steps:  44
  fp16:          True
  Est. time:     ~4–6 min on T4


In [None]:
print('🚀 Starting fine-tuning...')
start = time.time()

trainer.train()

elapsed = (time.time() - start) / 60
print(f'\n✅ Training complete in {elapsed:.1f} minutes')

# Save the best model
model.save_pretrained(SAVE_DIR)
print(f'Model saved to: {SAVE_DIR}')
RF
# Verify model files exist
saved_files = os.listdir(SAVE_DIR)
print(f'Saved files: {saved_files}')
assert any('model' in f or 'pytorch' in f or 'safetensor' in f for f in saved_files), \
    '❌ Model weights not found in save directory!'

🚀 Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Scheme-eval Cosine Accuracy@1,Scheme-eval Cosine Accuracy@3,Scheme-eval Cosine Accuracy@5,Scheme-eval Cosine Precision@1,Scheme-eval Cosine Precision@3,Scheme-eval Cosine Precision@5,Scheme-eval Cosine Precision@10,Scheme-eval Cosine Recall@1,Scheme-eval Cosine Recall@3,Scheme-eval Cosine Recall@5,Scheme-eval Cosine Recall@10,Scheme-eval Cosine Ndcg@5,Scheme-eval Cosine Ndcg@10,Scheme-eval Cosine Mrr@5,Scheme-eval Cosine Mrr@10,Scheme-eval Cosine Map@100
1,0.609552,No log,0.648,0.714,0.74,0.648,0.238,0.148,0.0778,0.648,0.714,0.74,0.778,0.698568,0.711005,0.684667,0.689888,0.696741
2,0.600741,No log,0.644,0.722,0.746,0.644,0.240667,0.1492,0.0772,0.644,0.722,0.746,0.772,0.700578,0.708962,0.6853,0.688744,0.695387
3,0.599976,No log,0.64,0.72,0.742,0.64,0.24,0.1484,0.0766,0.64,0.72,0.742,0.766,0.697067,0.704836,0.6819,0.685113,0.691873


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


✅ Training complete in 2.7 minutes


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved to: ./schemes_minilm_finetuned
Saved files: ['checkpoint-450', 'config.json', 'tokenizer.json', 'checkpoint-150', '1_Pooling', 'config_sentence_transformers.json', 'sentence_bert_config.json', '2_Normalize', 'README.md', 'tokenizer_config.json', 'eval', 'model.safetensors', 'modules.json']


## Build FAISS Search Index


In [None]:
import faiss
from tqdm import tqdm

# Load the fine-tuned model back (best checkpoint)
ft_model = SentenceTransformer(SAVE_DIR)

print(f'Encoding {len(df)} scheme documents...')
start = time.time()

corpus_embeddings = ft_model.encode(
    df['document'].tolist(),
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,  # L2 normalize → cosine sim = dot product
)

# FIX BUG-07: Explicit cast + contiguous array — prevents FAISS type errors
corpus_embeddings = np.ascontiguousarray(corpus_embeddings, dtype=np.float32)

elapsed = time.time() - start
print(f'\nEncoding done in {elapsed:.1f}s')
print(f'Embeddings shape: {corpus_embeddings.shape}')
print(f'Dtype: {corpus_embeddings.dtype}')  # must be float32
print(f'Contiguous: {corpus_embeddings.flags["C_CONTIGUOUS"]}')  # must be True

# Shape assertion
assert corpus_embeddings.shape[0] == len(df), \
    f'❌ Embedding count {corpus_embeddings.shape[0]} != df rows {len(df)}'
assert corpus_embeddings.dtype == np.float32, '❌ Embeddings must be float32 for FAISS'
assert not np.isnan(corpus_embeddings).any(), '❌ NaNs detected in embeddings!'
assert not np.isinf(corpus_embeddings).any(), '❌ Infs detected in embeddings!'
print('\n✅ All embedding assertions passed')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Encoding 3400 scheme documents...


Batches:   0%|          | 0/27 [00:00<?, ?it/s]


Encoding done in 16.1s
Embeddings shape: (3400, 384)
Dtype: float32
Contiguous: True

✅ All embedding assertions passed


In [None]:
# ── Build FAISS index ──
dim = corpus_embeddings.shape[1]

if len(df) < 50_000:
    index = faiss.IndexFlatIP(dim)
    index_type = 'Flat (exact)'
else:
    nlist = min(256, len(df) // 10)
    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
    index.train(corpus_embeddings)
    index.nprobe = 32
    index_type = f'IVF{nlist} (approximate)'

index.add(corpus_embeddings)

# FIX BUG-07 + BUG-08: Guard directory exists before writing
os.makedirs(SAVE_DIR, exist_ok=True)
faiss_path = os.path.join(SAVE_DIR, 'schemes.faiss')
faiss.write_index(index, faiss_path)

# Verify index was saved
assert os.path.exists(faiss_path), f'❌ FAISS index not saved at {faiss_path}'

print(f'✅ FAISS index built and saved')
print(f'   Type:          {index_type}')
print(f'   Total vectors: {index.ntotal}')
print(f'   Saved at:      {faiss_path}')
print(f'   File size:     {os.path.getsize(faiss_path)/1e6:.1f} MB')

✅ FAISS index built and saved
   Type:          Flat (exact)
   Total vectors: 3400
   Saved at:      ./schemes_minilm_finetuned/schemes.faiss
   File size:     5.2 MB


In [None]:
# ── Save metadata ──
# FIX BUG-12: Explicitly convert all values to plain Python types for JSON serialisation
import json

# Save lookup CSV (preserves df_idx column for eval)
lookup_path = os.path.join(SAVE_DIR, 'schemes_lookup.csv')
df.to_csv(lookup_path, index=True)

# FIX BUG-12: list() + str() conversions ensure JSON-serialisable types
meta = {
    'base_model':           str(MODEL_NAME),
    'fine_tuned':           True,
    'embedding_dim':        int(dim),
    'num_schemes':          int(len(df)),
    'name_col':             str(name_col),
    'text_cols':            [str(c) for c in text_cols],   # FIX: numpy str → plain str
    'other_cols':           [str(c) for c in other_cols],  # FIX: numpy str → plain str
    'index_type':           str(index_type),
    'normalize_embeddings': True,
    'seed':                 int(SEED),
}

meta_path = os.path.join(SAVE_DIR, 'metadata.json')
with open(meta_path, 'w') as f:
    json.dump(meta, f, indent=2)

# Verify JSON is valid by reloading it
with open(meta_path) as f:
    _ = json.load(f)
print('✅ metadata.json valid')

print('\n✅ All files saved:')
for fname in sorted(os.listdir(SAVE_DIR)):
    fpath = os.path.join(SAVE_DIR, fname)
    if os.path.isfile(fpath):
        size_mb = os.path.getsize(fpath) / 1e6
        print(f'   {fname:<40} ({size_mb:.1f} MB)')

✅ metadata.json valid

✅ All files saved:
   README.md                                (0.1 MB)
   config.json                              (0.0 MB)
   config_sentence_transformers.json        (0.0 MB)
   metadata.json                            (0.0 MB)
   model.safetensors                        (90.9 MB)
   modules.json                             (0.0 MB)
   schemes.faiss                            (5.2 MB)
   schemes_lookup.csv                       (38.6 MB)
   sentence_bert_config.json                (0.0 MB)
   tokenizer.json                           (0.7 MB)
   tokenizer_config.json                    (0.0 MB)


##Search Demo


In [None]:
# FIX BUG-11: Use None as defaults — resolve at call time, not definition time
def search_schemes(query: str, top_k: int = 5,
                   search_model=None, search_index=None, search_df=None):
    """Semantic search over government schemes."""
    # Resolve defaults at call time — avoids stale reference if globals change
    _model = search_model if search_model is not None else ft_model
    _index = search_index if search_index is not None else index
    _df    = search_df    if search_df    is not None else df

    # Encode query with same normalisation as corpus
    q_emb = _model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)  # FIX BUG-07

    scores, indices_out = _index.search(q_emb, top_k)

    results = []
    for rank, (score, idx) in enumerate(zip(scores[0], indices_out[0]), 1):
        if idx < 0:  # FAISS returns -1 for unfilled slots
            continue
        row = _df.iloc[idx]
        results.append({
            'rank':    rank,
            'score':   float(score),
            'name':    row[name_col],
            'snippet': row['document'][:200] + '...',
            'df_idx':  int(row['df_idx']),  # FIX BUG-03: use explicit stable index
        })
    return results


def pretty_search(query: str, top_k: int = 5):
    print(f'\n🔍 Query: "{query}"')
    print('=' * 70)
    results = search_schemes(query, top_k)
    if not results:
        print('  No results found.')
        return
    for r in results:
        print(f"\n#{r['rank']}  [score: {r['score']:.3f}]  {r['name']}")
        print(f"    {r['snippet']}")
    print('\n' + '=' * 70)

print('✅ Search functions defined')

✅ Search functions defined


In [None]:
# ── Run test queries ──
test_queries = [
    "scheme for poor farmers financial help",
    "education scholarship for girl child",
    "health insurance for low income families",
    "startup business funding entrepreneurship",
    "housing scheme rural poor",
]

for q in test_queries:
    pretty_search(q, top_k=3)


🔍 Query: "scheme for poor farmers financial help"

#1  [score: 0.497]  Agriculture,Rural & Environment, Banking,Financial Services and Insurance
    Agriculture,Rural & Environment, Banking,Financial Services and Insurance. Agriculture,Rural & Environment, Banking,Financial Services and Insurance. Farmers Accidental Insurance Scheme. fais. The "Fa...

#2  [score: 0.489]  Banking,Financial Services and Insurance, Social welfare & Empowerment
    Banking,Financial Services and Insurance, Social welfare & Empowerment. Banking,Financial Services and Insurance, Social welfare & Empowerment. Financial Assistance Scheme for Purchase of Agricultural...

#3  [score: 0.466]  Agriculture,Rural & Environment, Banking,Financial Services and Insurance
    Agriculture,Rural & Environment, Banking,Financial Services and Insurance. Agriculture,Rural & Environment, Banking,Financial Services and Insurance. Fishermen Accident Relief Scheme. fars. The "Fishe...


🔍 Query: "education scholarship for girl 

In [None]:
# ── Interactive search ──
user_query = input('\nEnter your search query: ')
if user_query.strip():
    pretty_search(user_query, top_k=5)
else:
    print('No query entered.')

## Evaluate Search Quality

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# FIX BUG-15: cap sample size to available rows
sample_size = min(200, len(df))

# FIX BUG-03: use df_idx as ground truth (stable, explicit, not affected by reset_index)
sample_df = df.sample(sample_size, random_state=SEED).reset_index(drop=True)

hits_at_1 = 0
hits_at_3 = 0
hits_at_5 = 0

for _, row in tqdm(sample_df.iterrows(), total=sample_size, desc='Evaluating'):
    true_df_idx = int(row['df_idx'])   # FIX BUG-03: explicit stable index
    query       = str(row[name_col])   # scheme name as query

    results = search_schemes(query, top_k=5)
    retrieved_df_idxs = [r['df_idx'] for r in results]  # FIX BUG-03: compare df_idx

    if true_df_idx in retrieved_df_idxs[:1]: hits_at_1 += 1
    if true_df_idx in retrieved_df_idxs[:3]: hits_at_3 += 1
    if true_df_idx in retrieved_df_idxs[:5]: hits_at_5 += 1

print(f'\n=== Search Evaluation (n={sample_size}) ===')
print(f'  Hit Rate @ 1: {hits_at_1/sample_size*100:.1f}%')
print(f'  Hit Rate @ 3: {hits_at_3/sample_size*100:.1f}%')
print(f'  Hit Rate @ 5: {hits_at_5/sample_size*100:.1f}%')

# Sanity check: HR@5 should be >= HR@3 >= HR@1
assert hits_at_5 >= hits_at_3 >= hits_at_1, '❌ Hit rate ordering violated — logic error in eval!'

In [None]:
# ── Plot score distribution ──
# FIX BUG-15: cap to available rows
n_plot = min(100, len(df))
sample_queries_plot = df[name_col].sample(n_plot, random_state=0).tolist()
top1_scores = []

for q in tqdm(sample_queries_plot, desc='Scoring'):
    r = search_schemes(q, top_k=1)
    if r:
        top1_scores.append(r[0]['score'])

plt.figure(figsize=(8, 4))
sns.histplot(top1_scores, bins=20, kde=True, color='steelblue')
plt.axvline(np.mean(top1_scores), color='red', linestyle='--',
            label=f'Mean: {np.mean(top1_scores):.3f}')
plt.title('Distribution of Top-1 Cosine Similarity Scores')
plt.xlabel('Cosine Similarity Score (higher = better match)')
plt.ylabel('Count')
plt.legend()
plt.tight_layout()
plt.savefig('score_distribution.png', dpi=100)
plt.show()
print(f'Mean score: {np.mean(top1_scores):.3f} | Median: {np.median(top1_scores):.3f}')
if np.mean(top1_scores) < 0.3:
    print('⚠️  Low mean score — model may need more training epochs or data')

## Export to Local Machine

In [None]:
import shutil

ZIP_NAME = 'indian_schemes_search_model'

# FIX BUG-14: root_dir=SAVE_DIR, base_dir=None → zip extracts flat (no subfolder)
# Old (WRONG):  make_archive(name, 'zip', '.', SAVE_DIR)  → extracts to schemes_minilm_finetuned/
# Fixed:        make_archive(name, 'zip', SAVE_DIR)        → extracts flat to MODEL_DIR/
shutil.make_archive(ZIP_NAME, 'zip', root_dir=SAVE_DIR)

zip_path = f'{ZIP_NAME}.zip'
assert os.path.exists(zip_path), '❌ ZIP file not created!'
zip_size = os.path.getsize(zip_path) / 1e6

print(f'✅ Package created: {zip_path}  ({zip_size:.1f} MB)')
print('\nContents (extracts FLAT — no subfolder):')
print('  ├── model.safetensors / pytorch_model.bin')
print('  ├── tokenizer files')
print('  ├── schemes.faiss')
print('  ├── schemes_lookup.csv')
print('  └── metadata.json')

In [None]:
# ══════════════════════════════════════════════
# OPTION A: Direct browser download
# ══════════════════════════════════════════════
from google.colab import files
files.download(f'{ZIP_NAME}.zip')
print('⬇️  Download started!')

In [None]:
# ══════════════════════════════════════════════
# OPTION B: Save to Google Drive
# ══════════════════════════════════════════════
from google.colab import drive
drive.mount('/content/drive')
shutil.copy(f'{ZIP_NAME}.zip', f'/content/drive/MyDrive/{ZIP_NAME}.zip')
print(f'✅ Saved to Google Drive: /content/drive/MyDrive/{ZIP_NAME}.zip')

---
## Run Locally After Download


```python
# search_local.py
from sentence_transformers import SentenceTransformer
import faiss, pandas as pd, numpy as np, json

MODEL_DIR = 'indian_schemes_model'

model    = SentenceTransformer(MODEL_DIR)
index    = faiss.read_index(f'{MODEL_DIR}/schemes.faiss')
df       = pd.read_csv(f'{MODEL_DIR}/schemes_lookup.csv', index_col=0)
meta     = json.load(open(f'{MODEL_DIR}/metadata.json'))
name_col = meta['name_col']

def search(query, top_k=5):
    q_emb = np.ascontiguousarray(
        model.encode([query], normalize_embeddings=True), dtype=np.float32
    )
    scores, idxs = index.search(q_emb, top_k)
    for rank, (score, i) in enumerate(zip(scores[0], idxs[0]), 1):
        if i >= 0:
            print(f'#{rank} [{score:.3f}] {df.iloc[i][name_col]}')

search('scheme for women empowerment')
```

---
### 📋 Bug Fix Summary

| ID | Severity | Fix |
|----|----------|-----|
| BUG-01 | CRITICAL | Added `InformationRetrievalEvaluator` — `save_best_model` now saves actual best |
| BUG-02 | CRITICAL | Full seed setup (`random`, `numpy`, `torch`, `PYTHONHASHSEED`) before any op |
| BUG-03 | CRITICAL | `df_idx` column tracks stable positional index through all sampling |
| BUG-04 | CRITICAL | Pair-2 augmentation uses `body_text` first sentence, not scheme name again |
| BUG-05 | HIGH | Only `object` dtype columns are string-cast; numeric columns untouched |
| BUG-06 | HIGH | Replaced deprecated `model.fit()` + checkpoint args with `SentenceTransformerTrainer` |
| BUG-07 | HIGH | `np.ascontiguousarray(..., dtype=np.float32)` before every `index.add/search` |
| BUG-08 | HIGH | `os.makedirs(SAVE_DIR, exist_ok=True)` before `faiss.write_index` |
| BUG-09 | MEDIUM | name_col picks shortest avg-length keyword match, not first match |
| BUG-10 | MEDIUM | Updated to `SentenceTransformerTrainer` + `HFDataset` (v3 API) |
| BUG-11 | MEDIUM | `search_schemes` uses `None` defaults resolved at call time |
| BUG-12 | MEDIUM | All metadata values explicitly cast to plain Python types before `json.dump` |
| BUG-13 | LOW | Removed unused `import re` |
| BUG-14 | LOW | `make_archive(name, 'zip', root_dir=SAVE_DIR)` — zip extracts flat |
| BUG-15 | LOW | `sample(min(100, len(df)))` guard prevents crash on small datasets |