In [None]:
!pip install -q sentence-transformers pandas tqdm torch
!pip install -q transformers sentencepiece accelerate


In [None]:
# Authenticate with the Hugging Face Hub before any model download.
# NOTE(review): new_session=False is expected to reuse a token that is already
# saved on the machine instead of prompting again -- confirm against the
# installed huggingface_hub version.
from huggingface_hub import login
login(new_session=False)

In [None]:
import torch
import pandas as pd
from sentence_transformers import CrossEncoder
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# -------------------------------
# CONFIG
# -------------------------------
# Hugging Face checkpoint of the NLI cross-encoder used for grading.
MODEL_ID = "cross-encoder/nli-deberta-v3-base"
# Number of (premise, hypothesis) pairs passed to model.predict per batch.
BATCH_SIZE = 64          # Safe default for T4 / L4
# max_length handed to the CrossEncoder constructor.
MAX_LENGTH = 384         # Truncates long model answers

In [None]:
# -------------------------------
# LOAD MODEL
# -------------------------------
# Prefer the GPU, but fall back to the CPU so this cell still runs on a
# runtime without CUDA (a hard-coded device="cuda" raises there).
NLI_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🚀 Loading {MODEL_ID}...")
model = CrossEncoder(
    MODEL_ID,
    device=NLI_DEVICE,
    max_length=MAX_LENGTH
)

# Enable FP16 for speed -- GPU only; the CPU fallback stays in float32.
if NLI_DEVICE == "cuda":
    model.model.half()

# Switch to inference mode. As the last expression of the cell this also
# displays the model architecture.
model.model.eval()

🚀 Loading cross-encoder/nli-deberta-v3-base...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [None]:
# translation
# trans_MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"

# trans_tokenizer = AutoTokenizer.from_pretrained(trans_MODEL_ID, trust_remote_code=True)
# trans_model = AutoModelForSeq2SeqLM.from_pretrained(
#     trans_MODEL_ID,
#     trust_remote_code=True,
#     torch_dtype=torch.float16,   # SAFE: translation is robust to fp16
# ).cuda()

# trans_model.eval()


NLLB_MODEL_ID = "facebook/nllb-200-distilled-600M"  # smaller/faster
NLLB_BATCH_SIZE = 64     # default number of texts translated per generate() call
NLLB_MAX_LENGTH = 128    # default token limit for both the input and the translation

print(f"🌍 Loading NLLB model: {NLLB_MODEL_ID}...")
trans_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL_ID)

# Load first, then cast: this avoids the `torch_dtype` keyword (reported as
# deprecated by recent transformers releases) without depending on its
# replacement, so the cell works across library versions.
trans_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL_ID)

# Half precision and the GPU are only used when CUDA is actually available;
# otherwise the model stays in float32 on the CPU instead of crashing.
if torch.cuda.is_available():
    trans_model = trans_model.half().cuda()

# Inference mode. As the last expression of the cell this also displays the
# model architecture.
trans_model.eval()

🌍 Loading NLLB model: facebook/nllb-200-distilled-600M...


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [None]:
from tqdm.auto import tqdm
def translate_batch_nllb(texts, src_lang_code, tgt_lang_code="eng_Latn", batch_size=NLLB_BATCH_SIZE, max_length=NLLB_MAX_LENGTH):
    """Translate a list of texts with the NLLB model held in `trans_model`.

    The source language is set on the tokenizer and the target language is
    forced as the first generated token. Writing the language code into the
    input text (as done previously) does not steer the model.

    Args:
        texts: Iterable of texts. Items that are not strings (e.g. NaN from a
            CSV) or that are blank are not sent to the model; their
            translation is the empty string.
        src_lang_code: Language tag of the input texts (e.g. "hin_Deva").
        tgt_lang_code: Language tag to translate into.
        batch_size: Number of texts per generate() call.
        max_length: Token limit used for input truncation and for generation.

    Returns:
        List of translated strings, aligned one-to-one with `texts`.

    Raises:
        ValueError: If `tgt_lang_code` is not a token known to the tokenizer.
    """
    import warnings

    unk_id = trans_tokenizer.unk_token_id

    # The target language token must exist, otherwise generation would be
    # forced to start with <unk> and the output language is undefined.
    tgt_token_id = trans_tokenizer.convert_tokens_to_ids(tgt_lang_code)
    if tgt_token_id == unk_id:
        raise ValueError(f"Unknown target language code: {tgt_lang_code!r}")

    # An unknown source tag does not stop translation, so only warn.
    if trans_tokenizer.convert_tokens_to_ids(src_lang_code) == unk_id:
        warnings.warn(
            f"Source language code {src_lang_code!r} is not known to the "
            "tokenizer; translation quality may suffer."
        )
    trans_tokenizer.src_lang = src_lang_code

    # Keep positions stable: blanks/non-strings map to "" without a model call.
    cleaned = [t if isinstance(t, str) and t.strip() else "" for t in texts]
    outputs = [""] * len(cleaned)
    pending = [i for i, t in enumerate(cleaned) if t]

    for start in tqdm(range(0, len(pending), batch_size), desc="Translating"):
        positions = pending[start:start + batch_size]
        batch = [cleaned[i] for i in positions]

        # Follow the model's device instead of assuming "cuda".
        inputs = trans_tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(trans_model.device)

        with torch.no_grad():
            generated = trans_model.generate(
                **inputs,
                forced_bos_token_id=tgt_token_id,
                max_length=max_length,
                num_beams=2,
                do_sample=False
            )

        decoded = trans_tokenizer.batch_decode(generated, skip_special_tokens=True)
        for position, translation in zip(positions, decoded):
            outputs[position] = translation

    return outputs

In [None]:
import re
def normalize_answer(text):
    """Flatten list-style markup in an answer into plain running text.

    Removes bullet characters (`*`, `-`, `•`), rewrites every ":" as " was",
    collapses whitespace runs to a single space and trims the ends.

    Args:
        text: The answer. None and NaN (blank CSV cells) are treated as an
            empty answer; any other non-string value is converted with str().

    Returns:
        The normalized string.
    """
    # Blank CSV cells are read by pandas as NaN floats; re.sub would raise
    # TypeError on them. NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r"[\*\-•]", "", text)
    text = text.replace(":", " was")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# -------------------------------
# EVALUATION FUNCTION
# -------------------------------
def evaluate_correctness(df, pred_col, gold_col):
    """Grade predictions against gold answers with the NLI cross-encoder.

    Every row is scored twice: once with the raw prediction and once with the
    prediction after `normalize_answer`. The gold answer is the premise and
    the prediction is the hypothesis. A row is flagged as hallucinated when
    the highest-scoring NLI label is "contradiction".

    Nothing is written to disk here; the caller decides where to save.

    Args:
        df: DataFrame holding both columns. It is modified in place: the
            columns `nli_label`, `nli_label_norm`, `hallucinated` and
            `hallucinated_norm` are added or overwritten.
        pred_col: Name of the column with the model outputs.
        gold_col: Name of the column with the reference answers.

    Returns:
        The same DataFrame object, now carrying the four grading columns.

    Raises:
        ValueError: If `pred_col` or `gold_col` is not a column of `df`.
    """
    if pred_col not in df.columns or gold_col not in df.columns:
        raise ValueError(
            f"Missing columns. Found: {df.columns.tolist()}"
        )

    def _as_text(value):
        # Blank CSV cells arrive as NaN and numeric answers as numbers; the
        # tokenizer and normalize_answer both need real strings.
        return "" if pd.isna(value) else str(value)

    gold_texts = [_as_text(v) for v in df[gold_col].tolist()]
    pred_texts = [_as_text(v) for v in df[pred_col].tolist()]

    # --------------------------------------------------
    # CORRECT NLI DIRECTION
    # Premise   = GOLD ANSWER
    # Hypothesis = MODEL OUTPUT
    # --------------------------------------------------
    pairs = [[gold, pred] for gold, pred in zip(gold_texts, pred_texts)]
    norm_pairs = [
        (gold, normalize_answer(pred))
        for gold, pred in zip(gold_texts, pred_texts)
    ]

    print(f"⚡ Running NLI on {len(pairs)} examples...")
    if pairs:
        scores = model.predict(
            pairs,
            batch_size=BATCH_SIZE,
            show_progress_bar=True
        )
        norm_scores = model.predict(
            norm_pairs,
            batch_size=BATCH_SIZE,
            show_progress_bar=True
        )
        labels = scores.argmax(axis=1)
        norm_labels = norm_scores.argmax(axis=1)
    else:
        # Nothing to score: skip the model call, which cannot take an empty list.
        labels, norm_labels = [], []

    # -------------------------------
    # LABELS
    # -------------------------------
    id2label = model.model.config.id2label
    df["nli_label"] = [id2label[int(l)] for l in labels]
    df["nli_label_norm"] = [id2label[int(l)] for l in norm_labels]

    # -------------------------------
    # HALLUCINATION FLAG
    # -------------------------------
    df["hallucinated"] = df["nli_label"] == "contradiction"
    df["hallucinated_norm"] = df["nli_label_norm"] == "contradiction"

    # -------------------------------
    # STATS
    # -------------------------------
    total = len(df)
    # The mean of an empty column is NaN; report 0% for an empty frame instead.
    halluc_rate = df["hallucinated"].mean() * 100 if total else 0.0
    norm_halluc_rate = df["hallucinated_norm"].mean() * 100 if total else 0.0

    print("\n✅ DONE")
    print(f"📊 Total rows: {total}")
    print(f"🚨 Hallucination rate: {halluc_rate:.2f}%")
    print(f"🚨 Hallucination rate (post norm): {norm_halluc_rate:.2f}%")

    return df


In [None]:
# | Language | Code       |
# | -------- | ---------- |
# | Hindi    | `hin_Deva` |
# | Kannada  | `kan_Knda` |
# | Marathi  | `mar_Deva` |
# | Tamil    | `tam_Taml` |

# langs = {"hi":"hin_Deva",
#          "ka":"kan_Knda",
#          "ma":"mar_Deva",
#          "ta":"tam_Taml"}

# Maps the language suffix used in the result file names to the language tag
# handed to the translator. NLLB tags are FLORES-200 codes of the form
# <language>_<script>; the *_Latn variants used before are not among them for
# these four languages, so the native-script codes are used.
langs = {
    "hi": "hin_Deva",
    "ka": "kan_Knda",
    "ma": "mar_Deva",
    "ta": "tam_Taml"
}


In [None]:
# Template for the per-model, per-language result CSVs: {m} is filled with the
# model name and {l} with a key of `langs`.
# NOTE(review): this points under Dataset/IndicSquad, while the English gold
# answers are read from Dataset/IndicQuest and the saved output of the driver
# cell shows IndicQuest paths -- confirm both refer to the same question set.
main_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/selfrag/{m}/{m}_results_{l}.csv"

In [None]:
# for m in ["aya","gemma","sarvam"]:
#   evaluate_correctness(main_path.format(m=m), "model_response", "Answer")

In [None]:
def trans_nli_batch(m,lang,english_df):
    """Translate one model/language result file to English and grade it.

    Reads the CSV located by `main_path` for model `m` and language key
    `lang`, attaches the English gold answers, translates the
    `selfrag_final_answer` column, runs the NLI grading and writes the graded
    frame next to the input file with a `_trans_nli_graded.csv` suffix.

    Args:
        m: Model name substituted into `main_path`.
        lang: Key of `langs`; also substituted into `main_path`.
        english_df: DataFrame whose "Answer" column holds the English gold
            answers, in the same row order as the result CSV.
    """
    print(f"working with {m} {lang}")
    csv_path = main_path.format(m=m, l=lang)
    print(f"🚀 Loading {csv_path}...")

    graded = pd.read_csv(csv_path)
    # Positional copy: .values drops the index, so rows pair up by order.
    graded["en_gold"] = english_df["Answer"].values

    # Translate the model answers to English.
    answers = graded["selfrag_final_answer"].tolist()
    graded["translated_answers"] = translate_batch_nllb(answers, langs[lang])

    # Grade the translations against the English gold answers.
    graded = evaluate_correctness(graded, "translated_answers", "en_gold")

    out_path = csv_path.replace(".csv", "_trans_nli_graded.csv")
    graded.to_csv(out_path, index=False)
    print(f"saving to output path : {out_path}")



In [None]:
# English reference answers. trans_nli_batch copies the "Answer" column into
# each result frame by position, so row order and row count have to match the
# result CSVs.
english_df = pd.read_csv("/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/English.csv")

In [None]:
# Grade every language of each listed model; one graded CSV is written per pair.
for m in ["gemma"]:
    for lang in langs:
        trans_nli_batch(m, lang, english_df)

working with aya hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 24.50%
🚨 Hallucination rate (post norm): 24.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_hi_trans_nli_graded.csv
working with aya ka
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ka.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 14.50%
🚨 Hallucination rate (post norm): 15.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ka_trans_nli_graded.csv
working with aya ma
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ma.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 26.50%
🚨 Hallucination rate (post norm): 26.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ma_trans_nli_graded.csv
working with aya ta
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ta.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 17.00%
🚨 Hallucination rate (post norm): 17.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/aya/aya_results_ta_trans_nli_graded.csv
working with gemma hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 16.00%
🚨 Hallucination rate (post norm): 16.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_hi_trans_nli_graded.csv
working with gemma ka
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ka.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 19.50%
🚨 Hallucination rate (post norm): 19.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ka_trans_nli_graded.csv
working with gemma ma
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ma.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 15.00%
🚨 Hallucination rate (post norm): 15.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ma_trans_nli_graded.csv
working with gemma ta
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ta.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 16.50%
🚨 Hallucination rate (post norm): 17.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/gemma/gemma_results_ta_trans_nli_graded.csv
working with sarvam hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/selfrag/sarvam/sarvam_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# trans_nli_batch("sarvam","ta",english_df)