In [33]:
!pip install -q sentence-transformers pandas tqdm torch

In [34]:
import torch
import pandas as pd
from sentence_transformers import CrossEncoder
from tqdm import tqdm

# -------------------------------
# CONFIG
# -------------------------------
MODEL_ID = "cross-encoder/nli-deberta-v3-base"
BATCH_SIZE = 64          # Safe default for T4 / L4
MAX_LENGTH = 384         # Truncates long model answers

In [4]:
# -------------------------------
# LOAD MODEL
# -------------------------------
print(f"🚀 Loading {MODEL_ID}...")
model = CrossEncoder(
    MODEL_ID,
    device="cuda",
    max_length=MAX_LENGTH
)

# Enable FP16 for speed
model.model.half()
model.model.eval()

🚀 Loading cross-encoder/nli-deberta-v3-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [35]:
import re
def normalize_answer(text):
    text = re.sub(r"[\*\-•]", "", text)
    text = text.replace(":", " was")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [36]:
from torch.functional import norm

# -------------------------------
# EVALUATION FUNCTION
# -------------------------------
def evaluate_correctness(csv_path, pred_col, gold_col):
    print(f"\n📂 Loading {csv_path}...")
    df = pd.read_csv(csv_path)

    if pred_col not in df.columns or gold_col not in df.columns:
        raise ValueError(
            f"Missing columns. Found: {df.columns.tolist()}"
        )

    # --------------------------------------------------
    # CORRECT NLI DIRECTION
    # Premise   = GOLD ANSWER
    # Hypothesis = MODEL OUTPUT
    # --------------------------------------------------
    pairs = df[[gold_col, pred_col]].values.tolist()
    norm_pairs = [
    (gold, normalize_answer(pred))
    for gold, pred in zip(df[gold_col], df[pred_col])
]

    print(f"⚡ Running NLI on {len(pairs)} examples...")
    scores = model.predict(
        pairs,
        batch_size=BATCH_SIZE,
        show_progress_bar=True
    )

    norm_scores = model.predict(
        norm_pairs,
        batch_size=BATCH_SIZE,
        show_progress_bar=True
    )

    # -------------------------------
    # LABELS
    # -------------------------------
    labels = scores.argmax(axis=1)
    norm_labels = norm_scores.argmax(axis=1)
    id2label = model.model.config.id2label

    df["nli_label"] = [id2label[l] for l in labels]
    df["nli_label_norm"] = [id2label[l] for l in norm_labels]

    # -------------------------------
    # HALLUCINATION FLAG
    # -------------------------------
    df["hallucinated"] = df["nli_label"] == "contradiction"
    df["hallucinated_norm"] = df["nli_label_norm"] == "contradiction"

    # -------------------------------
    # SAVE
    # -------------------------------
    out_path = csv_path.replace(".csv", "_nli_graded.csv")
    df.to_csv(out_path, index=False)

    # -------------------------------
    # STATS
    # -------------------------------
    total = len(df)
    halluc_rate = df["hallucinated"].mean() * 100
    norm_halluc_rate = df["hallucinated_norm"].mean() * 100

    print("\n✅ DONE")
    print(f"📊 Total rows: {total}")
    print(f"🚨 Hallucination rate: {halluc_rate:.2f}%")
    print(f"🚨 Hallucination rate (post norm): {norm_halluc_rate:.2f}%")
    print(f"💾 Saved to: {out_path}")


In [37]:
main_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/{m}/{m}_results_en.csv"

In [38]:
for m in ["aya","gemma","sarvam"]:
  evaluate_correctness(main_path.format(m=m), "model_response", "Answer")


📂 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_en.csv...
⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 40.00%
🚨 Hallucination rate (post norm): 40.00%
💾 Saved to: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_en_nli_graded.csv

📂 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_en.csv...
⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 27.50%
🚨 Hallucination rate (post norm): 28.50%
💾 Saved to: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_en_nli_graded.csv

📂 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_en.csv...
⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 37.00%
🚨 Hallucination rate (post norm): 37.00%
💾 Saved to: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_en_nli_graded.csv
