This code trains a 4-class encoder-based router, evaluates it on the same test split as Program 1, and logs per-sample predictions and aggregate metrics in the same format.
This will give you a direct, fair comparison against the keyword router.

🎯 Model Choice (Justification)
We use sentence-transformers/all-MiniLM-L6-v2 because it is
fast (important for routing), has strong sentence-level semantics, is widely cited and reviewer-safe, and is encoder-only → a clean contrast vs. the LLM fallback.


In [1]:
# ================================================================
# 📘 Program 2 — Encoder Router (MiniLM)
# ================================================================
# Purpose:
# - Use MiniLM-L6 embeddings + Logistic Regression for routing
# - Evaluate on test split
# - Save predictions + metrics to Google Drive in a reproducible way
# ================================================================

# -----------------------------
# 📌 Step 0 — Setup & Reproducibility
# -----------------------------
import random
import numpy as np
import pandas as pd
from pathlib import Path

# One seed shared by every random number generator this notebook touches,
# so a fresh "Restart & Run All" reproduces the same results.
RANDOM_SEED = 42

# Seed NumPy's global generator and the stdlib generator alike.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

print("Reproducibility seed set:", RANDOM_SEED)



Reproducibility seed set: 42


In [2]:
# -----------------------------
# 📌 Step 1 — Mount Google Drive
# -----------------------------
# Colab-only: attach Drive so inputs can be read and outputs written under it.
from google.colab import drive
drive.mount('/content/drive')

# Project layout on Drive. Splits are read from DATA_DIR; this notebook writes
# its artifacts to MODEL_DIR (classifier) and RESULT_DIR (predictions, metrics).
BASE_DIR = Path("/content/drive/MyDrive/FinGuardSDG")
DATA_DIR = BASE_DIR.joinpath("data", "splits")
MODEL_DIR = BASE_DIR.joinpath("models", "encoder")
RESULT_DIR = BASE_DIR.joinpath("results", "encoder")

# Create the output folders up front; a no-op when they already exist.
RESULT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("Using BASE_DIR:", BASE_DIR)




Mounted at /content/drive
Using BASE_DIR: /content/drive/MyDrive/FinGuardSDG


In [3]:
# -----------------------------
# 📌 Step 2 — Load Split Files
# -----------------------------
# Read the three pre-made splits (train / val / test) in that order.
train_df, val_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "FinGuard_SDG_train.csv",
        "FinGuard_SDG_val.csv",
        "FinGuard_SDG_test.csv",
    )
)

# Quick sanity check on (rows, columns) of each split.
print("Train:", train_df.shape)
print("Val  :", val_df.shape)
print("Test :", test_df.shape)



Train: (812, 7)
Val  : (174, 7)
Test : (174, 7)


In [6]:
# -----------------------------
# 📌 Step 3 — Load MiniLM Encoder
# -----------------------------
from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed every question. The identifier is kept in a
# constant so the exact model behind the saved results is explicit.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

encoder = SentenceTransformer(MODEL_NAME)

print("MiniLM loaded.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

MiniLM loaded.


In [7]:
# -----------------------------
# 📌 Step 4 — Generate Embeddings
# -----------------------------

# Targets: the `category` column of each split, taken as arrays.
y_train, y_val, y_test = (
    split_df["category"].values for split_df in (train_df, val_df, test_df)
)

# Features: one embedding per question, encoded split by split
# (train, then val, then test) with identical settings.
X_train, X_val, X_test = (
    encoder.encode(
        split_df["question_text"].tolist(),
        batch_size=32,
        show_progress_bar=True,
    )
    for split_df in (train_df, val_df, test_df)
)

print("Embedding Shapes:")
print("Train:", X_train.shape)
print("Val  :", X_val.shape)
print("Test :", X_test.shape)


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Embedding Shapes:
Train: (812, 384)
Val  : (174, 384)
Test : (174, 384)


In [8]:
# -----------------------------
# 📌 Step 5 — Train Logistic Regression Router
# -----------------------------

from sklearn.linear_model import LogisticRegression

# Linear classifier fitted on the precomputed train embeddings.
# `fit` returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
    random_state=RANDOM_SEED,
    n_jobs=-1,
).fit(X_train, y_train)

print("Classifier trained.")


Classifier trained.


In [9]:
# -----------------------------
# 📌 Step 6 — Evaluate on Test Set
# -----------------------------

from sklearn.metrics import accuracy_score, classification_report

# Class probabilities are kept alongside the hard predictions; the
# per-sample confidence saved in Step 7 is derived from them.
y_prob = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 0.8735632183908046

Classification Report:

              precision    recall  f1-score   support

    advisory       0.92      0.92      0.92        39
  conceptual       0.71      0.83      0.77        36
         esg       0.86      0.91      0.88        33
quantitative       0.97      0.85      0.90        66

    accuracy                           0.87       174
   macro avg       0.87      0.88      0.87       174
weighted avg       0.88      0.87      0.88       174



In [10]:
# -----------------------------
# 📌 Step 7 — Save Per-Sample Predictions
# -----------------------------

# y_pred / y_prob are positional: entry k belongs to the k-th row of test_df.
# Pair rows and predictions by position. Looking the position up from the index
# label is quadratic and picks the wrong row whenever index labels repeat.
if not (len(test_df) == len(y_pred) == len(y_prob)):
    raise ValueError("test_df, y_pred and y_prob must have the same number of rows")

encoder_results = [
    {
        "id": sample_id,
        "true_label": true_label,
        "predicted_label": pred_label,
        # Confidence = probability of the most likely class for this sample.
        "confidence": float(np.max(class_probs)),
        "router": "encoder_minilm",
    }
    for sample_id, true_label, pred_label, class_probs in zip(
        test_df["id"], test_df["category"], y_pred, y_prob
    )
]

encoder_results_df = pd.DataFrame(encoder_results)

# Save one row per test sample.
pred_path = RESULT_DIR / "encoder_router_predictions.csv"
encoder_results_df.to_csv(pred_path, index=False)

print("Saved predictions to:", pred_path)


Saved predictions to: /content/drive/MyDrive/FinGuardSDG/results/encoder/encoder_router_predictions.csv


In [11]:
# -----------------------------
# 📌 Step 8 — Save Results to Google Drive
# -----------------------------

import json

# Build the report once and reuse it for both the macro F1 and the
# per-class breakdown, instead of recomputing it for each field.
report = classification_report(y_test, y_pred, output_dict=True)

summary = {
    "router": "encoder_minilm",
    "accuracy": float(accuracy_score(y_test, y_pred)),
    "macro_f1": float(report["macro avg"]["f1-score"]),
    # Keep only the four routing classes; aggregate rows such as
    # "macro avg" and "accuracy" are excluded from this section.
    "per_class": {
        k: v for k, v in report.items()
        if k in ["quantitative", "conceptual", "esg", "advisory"]
    }
}

summary_path = RESULT_DIR / "encoder_router_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print("Saved summary to:", summary_path)


Saved summary to: /content/drive/MyDrive/FinGuardSDG/results/encoder/encoder_router_summary.json


In [12]:
# -----------------------------
# 📌 Step 9 — Save the Trained Classifier
# -----------------------------

import joblib

# Only the fitted logistic-regression classifier is written here; the encoder
# itself is not saved by this cell.
clf_path = MODEL_DIR / "encoder_classifier.joblib"
joblib.dump(value=clf, filename=clf_path)

print("Saved classifier to:", clf_path)


Saved classifier to: /content/drive/MyDrive/FinGuardSDG/models/encoder/encoder_classifier.joblib
