Proceed with Program 4 — LLM fallback (Llama2)

🎯 What this experiment proves (paper-ready)

The LLM is not used everywhere.
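It is invoked selectively, only when both conditions hold:
the keyword signal is not confidently quantitative, and the encoder confidence is low.
(Note: the code in this notebook triggers on encoder confidence alone.)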

This yields targeted gains on the hardest, ambiguous queries.

🧠 Design (Very Important)

Trigger condition (conservative): encoder confidence < 0.55.
This was expected to affect ~10–15% of test samples; in the run recorded below it affected 27.01% (47 / 174).

📦 Model Choice (Practical & Reviewer-Safe)

Recommended:
meta-llama/Llama-2-7b-chat-hf (4-bit quantized)

Why:
Widely cited, strong reasoning, fits Colab with 4-bit quantization, and provides a clear contrast with the encoder.

Applies the low-confidence fallback to Llama-2 and aligns with Programs 1–3.

In [None]:
# ================================================================
# 📘 Program 4 — LLM Fallback Router (Llama-2)
# ================================================================
# Purpose:
# - Use MiniLM encoder confidence
# - Route low-confidence queries to Llama-2
# - Save predictions + metrics to Google Drive
# ================================================================
!pip install -q bitsandbytes accelerate

import random
import numpy as np
import pandas as pd
from pathlib import Path

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print("Reproducibility seed set:", RANDOM_SEED)

from google.colab import drive
drive.mount("/content/drive")

BASE_DIR = Path("/content/drive/MyDrive/FinGuardSDG")
DATA_DIR = BASE_DIR / "data" / "splits"
RESULTS_DIR = BASE_DIR / "results" / "llm_fallback"
MODELS_DIR = BASE_DIR / "models" / "encoder"
LLM_CONFIG_DIR = BASE_DIR / "models" / "llm_fallback"

RESULTS_DIR.mkdir(parents=True, exist_ok=True)
LLM_CONFIG_DIR.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hReproducibility seed set: 42
Mounted at /content/drive
BASE_DIR: /content/drive/MyDrive/FinGuardSDG


In [None]:
# ===================================================
# 📌 Step 1 — Load Test Set + Encoder Classifier
# ===================================================
# NOTE:
# The classifier was fitted on MiniLM embeddings (Program 2), so
# inference has to embed the questions with that same encoder.

import joblib
from sentence_transformers import SentenceTransformer

# Held-out test split.
TEST_PATH = DATA_DIR.joinpath("FinGuard_SDG_test.csv")
test_df = pd.read_csv(TEST_PATH)

print("Loaded test set:", test_df.shape)
display(test_df.head())

# Sentence encoder that produces the classifier's input features.
ENCODER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
encoder = SentenceTransformer(ENCODER_MODEL)

# Classifier trained in Program 2.
# NOTE(review): joblib.load unpickles the file, so only load artifacts
# produced by this project.
_classifier_path = MODELS_DIR.joinpath("encoder_classifier.joblib")
clf = joblib.load(_classifier_path)

print("MiniLM encoder + classifier loaded.")


Loaded test set: (174, 7)


Unnamed: 0,id,category,subcategory,question_text,answer_text,difficulty,source
0,Q-TVM-054,quantitative,time_value_of_money,"An investment of ₹1,50,000 earns 9% annually. ...","The future value is ₹2,73,832.14.",1,template
1,Q-EQ-047,quantitative,equity_valuation,A firm trades at a premium despite lower curre...,Investors expect future earnings growth.,2,literature-inspired
2,C-RR-011,conceptual,risk_return_theory,Why are risky assets expected to outperform ri...,Investors demand compensation for risk exposure.,1,literature-inspired
3,C-RR-020,conceptual,risk_return_theory,What limitation does variance have as a risk m...,It treats upside and downside deviations equally.,2,literature-inspired
4,Q-TVM-051,quantitative,time_value_of_money,"An annuity pays ₹48,000 annually for 9 years. ...","The present value is ₹2,87,184.93.",2,template


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

MiniLM encoder + classifier loaded.


In [None]:
# ===================================================
# 📌 Step 2 — Encoder Predictions + Confidence
# ===================================================
# Embed every test question once, then reuse the embeddings for both
# the class probabilities and the hard predictions.
_question_texts = test_df["question_text"].tolist()
embeddings = encoder.encode(
    _question_texts,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True,
)

encoder_preds = clf.predict(embeddings)
probs = clf.predict_proba(embeddings)

# Confidence of a sample = its highest class probability.
encoder_conf = np.max(probs, axis=1)

print("Embeddings shape:", embeddings.shape)


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Embeddings shape: (174, 384)


In [None]:
# ===================================================
# 📌 Step 3 — Load Llama-2 (4-bit)
# ===================================================
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
)

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated
# (the library warns that it will be removed); the supported route is a
# `BitsAndBytesConfig` handed over via `quantization_config`.
# Only `load_in_4bit` is set so the remaining quantization options stay at
# the library defaults, as they did with the deprecated flag.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    # Kept as `torch_dtype` (rather than the newer `dtype` alias) so the
    # cell also runs on transformers releases that predate the rename.
    torch_dtype=torch.float16,
)

# Inference only: switch off dropout and other training-time behaviour.
model.eval()

print("Llama-2 loaded in 4-bit mode.")


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Llama-2 loaded in 4-bit mode.


In [None]:
# ===================================================
# 📌 Step 4 — LLM Routing Prompt + Router
# ===================================================
from typing import Optional

ROUTING_PROMPT = """
You are a financial AI router.

Classify the following question into ONE of these categories:
- quantitative
- conceptual
- esg
- advisory

Question:
"{question}"

Return ONLY the category name.
"""

# Labels the router may return, in the order they are searched for in the
# model's completion.
ROUTER_LABELS = ("quantitative", "conceptual", "esg", "advisory")


def llama_router(question: str, *, max_new_tokens: int = 5) -> Optional[str]:
    """Classify *question* with Llama-2 and return the category label.

    Args:
        question: The question text to insert into ``ROUTING_PROMPT``.
        max_new_tokens: Upper bound on generated tokens. Defaults to 5,
            the value this notebook has always used.

    Returns:
        The first entry of ``ROUTER_LABELS`` found in the model's
        completion, or ``None`` when the completion names none of them.
    """
    prompt = ROUTING_PROMPT.format(question=question)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding keeps the router deterministic
        )

    # For a causal LM, `generate` returns the prompt tokens followed by the
    # continuation. The prompt itself lists every label, so matching against
    # the full decoded sequence always hits "quantitative" first. Decode only
    # the newly generated tokens.
    completion_ids = outputs[0][prompt_len:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True).lower()

    for label in ROUTER_LABELS:
        if label in completion:
            return label

    # The model did not name any known category.
    return None


In [None]:
# ===================================================
# 📌 Step 5 — Hybrid Routing with LLM Fallback
# ===================================================
# Encoder predictions at or above this confidence are accepted as-is;
# everything below is re-classified by the LLM.
CONF_THRESHOLD = 0.55

final_preds = []
llm_used = []

# `encoder_preds` / `encoder_conf` are positional arrays, so walk them in
# lockstep with the question column instead of indexing them with the
# DataFrame's index labels (which only line up for a default RangeIndex).
for question, enc_pred, enc_conf in zip(
    test_df["question_text"], encoder_preds, encoder_conf
):
    if enc_conf >= CONF_THRESHOLD:
        final_preds.append(enc_pred)
        llm_used.append(False)
        continue

    llm_pred = llama_router(question)
    # The router returns None when the LLM names no known category. Keep the
    # encoder's prediction in that case so `final_preds` never holds None,
    # which the downstream sklearn metrics cannot score.
    final_preds.append(enc_pred if llm_pred is None else llm_pred)
    llm_used.append(True)

llm_count = sum(llm_used)
sample_count = len(llm_used)
# Guard against an empty test set.
llm_share = 100 * llm_count / sample_count if sample_count else 0.0

print(
    f"LLM used for {llm_count} / {sample_count} samples "
    f"({llm_share:.2f}%)"
)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM used for 47 / 174 samples (27.01%)


In [None]:
# ===================================================
# 📌 Step 6 — Final Evaluation
# ===================================================
# Score the hybrid router's final predictions against the gold categories.
from sklearn.metrics import accuracy_score, classification_report

y_true = test_df["category"]

_final_accuracy = accuracy_score(y_true, final_preds)
print("Final Accuracy:", _final_accuracy)

print("\nClassification Report:\n")
_report_text = classification_report(y_true, final_preds)
print(_report_text)


Final Accuracy: 0.7931034482758621

Classification Report:

              precision    recall  f1-score   support

    advisory       1.00      0.72      0.84        39
  conceptual       0.92      0.61      0.73        36
         esg       0.86      0.73      0.79        33
quantitative       0.68      0.97      0.80        66

    accuracy                           0.79       174
   macro avg       0.86      0.76      0.79       174
weighted avg       0.83      0.79      0.79       174



In [None]:
# ===================================================
# 📌 Step 7 — Save Predictions
# ===================================================
# One row per test question: gold label, encoder output, final routed
# prediction, and whether the LLM was consulted.
_prediction_columns = {
    "id": test_df["id"],
    "question_text": test_df["question_text"],
    "true_label": y_true,
    "encoder_pred": encoder_preds,
    "encoder_conf": encoder_conf,
    "final_pred": final_preds,
    "llm_used": llm_used,
}
results_df = pd.DataFrame(_prediction_columns)

pred_path = RESULTS_DIR.joinpath("llm_fallback_router_predictions.csv")
results_df.to_csv(pred_path, index=False)

print("Saved predictions to:", pred_path)


Saved predictions to: /content/drive/MyDrive/FinGuardSDG/results/llm_fallback/llm_fallback_router_predictions.csv


In [None]:
# ===================================================
# 📌 Step 8 — Save Summary
# ===================================================
# Headline numbers for this run, written as JSON next to the predictions.
import json

_llm_sample_count = sum(llm_used)

summary = dict(
    router="hybrid_llm_fallback",
    seed=RANDOM_SEED,
    confidence_threshold=CONF_THRESHOLD,
    encoder_model=ENCODER_MODEL,
    llm_model=MODEL_ID,
    samples_total=len(test_df),
    samples_llm=int(_llm_sample_count),
    llm_percent=float(100 * _llm_sample_count / len(llm_used)),
    accuracy=float(accuracy_score(y_true, final_preds)),
)

summary_path = RESULTS_DIR.joinpath("llm_fallback_router_summary.json")
with summary_path.open("w") as summary_file:
    summary_file.write(json.dumps(summary, indent=2))

print("Saved summary to:", summary_path)


Saved summary to: /content/drive/MyDrive/FinGuardSDG/results/llm_fallback/llm_fallback_router_summary.json


In [None]:
# ===================================================
# 📌 Step 9 — Save Router Config
# ===================================================
# Records the settings for this run: models, threshold, prompt text and
# generation settings.
_generation_settings = {
    "max_new_tokens": 5,
    "do_sample": False,
}

config = {
    "seed": RANDOM_SEED,
    "encoder_model": ENCODER_MODEL,
    "encoder_classifier": "encoder_classifier.joblib",
    "llm_model": MODEL_ID,
    "confidence_threshold": CONF_THRESHOLD,
    "prompt": ROUTING_PROMPT.strip(),
    "generation": _generation_settings,
}

config_path = LLM_CONFIG_DIR.joinpath("llm_fallback_router_config.json")
with config_path.open("w") as config_file:
    config_file.write(json.dumps(config, indent=2))

print("Saved LLM router config to:", config_path)


Saved LLM router config to: /content/drive/MyDrive/FinGuardSDG/models/llm_fallback/llm_fallback_router_config.json
