In [1]:
!pip install -q sentence-transformers pandas tqdm torch
!pip install -q transformers sentencepiece accelerate


In [2]:
from huggingface_hub import login
login(new_session=False)

In [3]:
import torch
import pandas as pd
from sentence_transformers import CrossEncoder
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# -------------------------------
# CONFIG
# -------------------------------
MODEL_ID = "cross-encoder/nli-deberta-v3-base"
BATCH_SIZE = 64          # Safe default for T4 / L4
MAX_LENGTH = 384         # Truncates long model answers

In [4]:
# -------------------------------
# LOAD MODEL
# -------------------------------
print(f"🚀 Loading {MODEL_ID}...")
model = CrossEncoder(
    MODEL_ID,
    device="cuda",
    max_length=MAX_LENGTH
)

# Enable FP16 for speed
model.model.half()
model.model.eval()

🚀 Loading cross-encoder/nli-deberta-v3-base...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [5]:
# translation
trans_MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"

trans_tokenizer = AutoTokenizer.from_pretrained(trans_MODEL_ID, trust_remote_code=True)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(
    trans_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,   # SAFE: translation is robust to fp16
).cuda()

trans_model.eval()

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/759k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

IndicTransForConditionalGeneration(
  (model): IndicTransModel(
    (encoder): IndicTransEncoder(
      (embed_tokens): Embedding(122706, 1024, padding_idx=1)
      (embed_positions): IndicTransSinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-17): 18 x IndicTransEncoderLayer(
          (self_attn): IndicTransAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05

In [15]:
from tqdm.auto import tqdm
def translate_batch(texts, src_lang, batch_size=64, max_length=256):
    outputs = []

    # for i in range(0, len(texts), batch_size):
    for i in tqdm(
    range(0, len(texts), batch_size),
    desc="Translating",
    total=(len(texts) + batch_size - 1) // batch_size
):
        batch = texts[i:i+batch_size]

        batch = [
            f"{src_lang} eng_Latn {t}" if isinstance(t, str) else ""
            for t in batch
        ]

        inputs = trans_tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to("cuda")

        with torch.no_grad():
            generated = trans_model.generate(
                **inputs,
                max_length=max_length,
                num_beams=1,        # ⚡ fastest
                do_sample=False,
                # use_cache=False      # Fix: Disable cache to avoid AttributeError with IndicTrans2
            )

        decoded = trans_tokenizer.batch_decode(
            generated, skip_special_tokens=True
        )

        outputs.extend(decoded)

    return outputs

In [7]:
import re
def normalize_answer(text):
    text = re.sub(r"[\*\-•]", "", text)
    text = text.replace(":", " was")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [8]:
# -------------------------------
# EVALUATION FUNCTION
# -------------------------------
def evaluate_correctness(df, pred_col, gold_col):
    # print(f"\n📂 Loading {csv_path}...")
    # df = pd.read_csv(csv_path)

    if pred_col not in df.columns or gold_col not in df.columns:
        raise ValueError(
            f"Missing columns. Found: {df.columns.tolist()}"
        )

    # --------------------------------------------------
    # CORRECT NLI DIRECTION
    # Premise   = GOLD ANSWER
    # Hypothesis = MODEL OUTPUT
    # --------------------------------------------------
    pairs = df[[gold_col, pred_col]].values.tolist()
    norm_pairs = [
    (gold, normalize_answer(pred))
    for gold, pred in zip(df[gold_col], df[pred_col])
]

    print(f"⚡ Running NLI on {len(pairs)} examples...")
    scores = model.predict(
        pairs,
        batch_size=BATCH_SIZE,
        show_progress_bar=True
    )

    norm_scores = model.predict(
        norm_pairs,
        batch_size=BATCH_SIZE,
        show_progress_bar=True
    )

    # -------------------------------
    # LABELS
    # -------------------------------
    labels = scores.argmax(axis=1)
    norm_labels = norm_scores.argmax(axis=1)
    id2label = model.model.config.id2label

    df["nli_label"] = [id2label[l] for l in labels]
    df["nli_label_norm"] = [id2label[l] for l in norm_labels]

    # -------------------------------
    # HALLUCINATION FLAG
    # -------------------------------
    df["hallucinated"] = df["nli_label"] == "contradiction"
    df["hallucinated_norm"] = df["nli_label_norm"] == "contradiction"

    # -------------------------------
    # SAVE
    # -------------------------------
    # out_path = csv_path.replace(".csv", "_nli_graded.csv")
    # df.to_csv(out_path, index=False)

    # -------------------------------
    # STATS
    # -------------------------------
    total = len(df)
    halluc_rate = df["hallucinated"].mean() * 100
    norm_halluc_rate = df["hallucinated_norm"].mean() * 100

    print("\n✅ DONE")
    print(f"📊 Total rows: {total}")
    print(f"🚨 Hallucination rate: {halluc_rate:.2f}%")
    print(f"🚨 Hallucination rate (post norm): {norm_halluc_rate:.2f}%")
    # print(f"💾 Saved to: {out_path}")

    return df


In [9]:
# | Language | Code       |
# | -------- | ---------- |
# | Hindi    | `hin_Deva` |
# | Kannada  | `kan_Knda` |
# | Marathi  | `mar_Deva` |
# | Tamil    | `tam_Taml` |

langs = {"hi":"hin_Deva",
         "ka":"kan_Knda",
         "ma":"mar_Deva",
         "ta":"tam_Taml"}


In [10]:
main_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/{m}/{m}_results_{l}.csv"

In [11]:
# for m in ["aya","gemma","sarvam"]:
#   evaluate_correctness(main_path.format(m=m), "model_response", "Answer")

In [12]:
def trans_nli_batch(m,lang,english_df):
  # english_df = pd.read_csv("/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/English.csv")
  # english_gold_ans = english_df["Answer"].tolist()

  print(f"working with {m} {lang}")
  print(f"🚀 Loading {main_path.format(m=m,l=lang)}...")
  csv_path = main_path.format(m=m,l=lang)
  lang_df = pd.read_csv(csv_path)
  lang_df["en_gold"] = english_df["Answer"].values

  # translate
  lang_df["translated_answers"]=translate_batch(lang_df["model_response"].tolist(), langs[lang])

  # NLI
  lang_df = evaluate_correctness(lang_df, "translated_answers", "en_gold")

  out_path = csv_path.replace(".csv", "_trans_nli_graded.csv")
  lang_df.to_csv(out_path, index=False)
  print(f"saving to output path : {out_path}")



In [13]:
english_df = pd.read_csv("/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/English.csv")

In [16]:
for m in ["aya","gemma","sarvam"]:
  for lang in langs.keys():
    trans_nli_batch(m,lang,english_df)

working with aya hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 42.00%
🚨 Hallucination rate (post norm): 42.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_hi_trans_nli_graded.csv
working with aya ka
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ka.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 34.00%
🚨 Hallucination rate (post norm): 34.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ka_trans_nli_graded.csv
working with aya ma
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ma.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 40.50%
🚨 Hallucination rate (post norm): 40.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ma_trans_nli_graded.csv
working with aya ta
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ta.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 44.00%
🚨 Hallucination rate (post norm): 44.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/aya/aya_results_ta_trans_nli_graded.csv
working with gemma hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 39.00%
🚨 Hallucination rate (post norm): 39.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_hi_trans_nli_graded.csv
working with gemma ka
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ka.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 34.00%
🚨 Hallucination rate (post norm): 34.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ka_trans_nli_graded.csv
working with gemma ma
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ma.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 37.00%
🚨 Hallucination rate (post norm): 38.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ma_trans_nli_graded.csv
working with gemma ta
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ta.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 32.50%
🚨 Hallucination rate (post norm): 30.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_ta_trans_nli_graded.csv
working with sarvam hi
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_hi.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 37.50%
🚨 Hallucination rate (post norm): 37.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_hi_trans_nli_graded.csv
working with sarvam ka
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ka.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 37.50%
🚨 Hallucination rate (post norm): 37.50%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ka_trans_nli_graded.csv
working with sarvam ma
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ma.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 37.00%
🚨 Hallucination rate (post norm): 37.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ma_trans_nli_graded.csv
working with sarvam ta
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ta.csv...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]

⚡ Running NLI on 200 examples...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 200
🚨 Hallucination rate: 40.00%
🚨 Hallucination rate (post norm): 40.00%
saving to output path : /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/sarvam/sarvam_results_ta_trans_nli_graded.csv
