In [2]:
from huggingface_hub import login
login(new_session=False)

import torch
import pandas as pd
from sentence_transformers import CrossEncoder
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re
import json
from google.colab import drive

In [3]:
NLLB_MODEL_ID = "facebook/nllb-200-distilled-600M"  # smaller/faster
NLLB_BATCH_SIZE = 64
NLLB_MAX_LENGTH = 128

NLI_MODEL_ID = "cross-encoder/nli-deberta-v3-base"
NLI_BATCH_SIZE = 64
NLI_MAX_LENGTH = 384

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# -------------------------------
# LOAD NLI MODEL
# -------------------------------
print(f"🚀 Loading NLI model: {NLI_MODEL_ID}...")
nli_model = CrossEncoder(NLI_MODEL_ID, device="cuda", max_length=NLI_MAX_LENGTH)
nli_model.model.half()
nli_model.model.eval()

# -------------------------------
# LOAD NLLB MODEL
# -------------------------------
print(f"🌍 Loading NLLB model: {NLLB_MODEL_ID}...")
nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL_ID)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    NLLB_MODEL_ID,
    torch_dtype=torch.float16
).cuda()
nllb_model.eval()


🚀 Loading NLI model: cross-encoder/nli-deberta-v3-base...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

🌍 Loading NLLB model: facebook/nllb-200-distilled-600M...


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [43]:
def normalize_answer(text):
    test = str(text)
    # text = re.sub(r"[\*\-•]", "", text)
    # text = text.replace(":", " was")
    # text = re.sub(r"\s+", " ", text)
    return text.strip()

def translate_batch_nllb(texts, src_lang_code, tgt_lang_code="eng_Latn", batch_size=NLLB_BATCH_SIZE, max_length=NLLB_MAX_LENGTH):
    outputs = []

    # Prepend the target language to each text
    texts_with_lang = [f"{tgt_lang_code}: {t}" if isinstance(t, str) else "" for t in texts]

    for i in tqdm(range(0, len(texts_with_lang), batch_size), desc="Translating", total=(len(texts_with_lang)+batch_size-1)//batch_size):
        batch = texts_with_lang[i:i+batch_size]

        inputs = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to("cuda")

        with torch.no_grad():
            generated = nllb_model.generate(
                **inputs,
                max_length=max_length,
                num_beams=2,
                do_sample=False
            )

        decoded = nllb_tokenizer.batch_decode(generated, skip_special_tokens=True)
        outputs.extend(decoded)

    return outputs




def evaluate_correctness(df, pred_col, gold_col):
    if pred_col not in df.columns or gold_col not in df.columns:
        raise ValueError(f"Missing columns. Found: {df.columns.tolist()}")

    # --- FIX: Ensure inputs are strings and fill NaNs ---
    df[pred_col] = df[pred_col].fillna("").astype(str)
    df[gold_col] = df[gold_col].fillna("").astype(str)

    # Correct NLI: Premise = GOLD, Hypothesis = MODEL OUTPUT
    pairs = df[[gold_col, pred_col]].values.tolist()
    # norm_pairs = [(gold, normalize_answer(pred)) for gold, pred in pairs]

    print(f"⚡ Running NLI on {len(pairs)} examples...")
    scores = nli_model.predict(pairs, batch_size=NLI_BATCH_SIZE, show_progress_bar=True)
    # norm_scores = nli_model.predict(norm_pairs, batch_size=NLI_BATCH_SIZE, show_progress_bar=True)

    labels = scores.argmax(axis=1)
    # norm_labels = norm_scores.argmax(axis=1)
    id2label = nli_model.model.config.id2label

    df["nli_label"] = [id2label[l] for l in labels]
    # df["nli_label_norm"] = [id2label[l] for l in norm_labels]

    df["hallucinated"] = df["nli_label"] == "contradiction"
    # df["hallucinated_norm"] = df["nli_label_norm"] == "contradiction"

    total = len(df)
    halluc_rate = df["hallucinated"].mean() * 100
    # norm_halluc_rate = df["hallucinated_norm"].mean() * 100

    print("\n✅ DONE")
    print(f"📊 Total rows: {total}")
    print(f"🚨 Hallucination rate: {halluc_rate:.2f}%")
    # print(f"🚨 Hallucination rate (post norm): {norm_halluc_rate:.2f}%")
    return df

In [7]:
drive.mount('/content/drive')

langs = {
    "hi": "hin_Latn",
    "ka": "kan_Latn",
    "ma": "mar_Latn",
    "ta": "tam_Latn"
}

models = ["aya", "gemma", "sarvam"]

# main_path = "/content/drive/MyDrive/Project/Dataset/IndicSquad/Results/Vanilla-t200/{m}/{m}_results_{l}.csv"
# translated_path = "/content/drive/MyDrive/Project/Dataset/IndicSquad/Results/Translated/{m}/{m}_results_{l}_translated_en.csv"

main_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/{m}/{m}_results_{l}.csv"
translated_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/{m}/{m}_results_{l}_translated_en.csv"

# -------------------------------
# LOAD ENGLISH GOLD ANSWERS
# -------------------------------
json_path = "/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/cleaned_data/english_cleaned.json"
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

rows = []
for article in data["data"]:
    for para in article.get("paragraphs", []):
        for qa in para.get("qas", []):
            qid = qa.get("id")
            answer = qa["answers"][0]["text"] if qa.get("answers") else ""
            rows.append({"id": qid, "Answer": answer})

english_df = pd.DataFrame(rows)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def translate_and_nli(m, lang_code, english_df):
    csv_path = main_path.format(m=m, l=lang_code)
    print(f"🚀 Loading {csv_path}")
    df = pd.read_csv(csv_path)

    # --- SAFELY ADD GOLD ANSWERS ---
    df = df.merge(english_df, on="id", how="left")
    df.rename(columns={"Answer": "en_gold"}, inplace=True)

    # --- TRANSLATION ---
    print(f"🌍 Translating {m} {lang_code} with NLLB...")
    df["translated_answers"] = translate_batch_nllb(
        df["model_answer"].tolist(),
        src_lang_code=langs[lang_code]
    )

    # Save translated CSV
    out_trans_path = translated_path.format(m=m, l=lang_code)
    df.to_csv(out_trans_path, index=False)
    print(f"💾 Translated CSV saved: {out_trans_path}")

    # --- NLI ---
    print(f"⚡ Running NLI on {m} {lang_code}...")
    df = evaluate_correctness(df, "translated_answers", "en_gold")

    out_nli_path = out_trans_path.replace(".csv", "_nli_graded.csv")
    df.to_csv(out_nli_path, index=False)
    print(f"💾 NLI graded CSV saved: {out_nli_path}")


In [9]:
for m in models:
    for lang_code in langs.keys():
        translate_and_nli(m, lang_code, english_df)


🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_hi.csv
🌍 Translating aya hi with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_hi_translated_en.csv
⚡ Running NLI on aya hi...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 37.16%
🚨 Hallucination rate (post norm): 36.89%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_hi_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_ka.csv
🌍 Translating aya ka with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ka_translated_en.csv
⚡ Running NLI on aya ka...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 39.00%
🚨 Hallucination rate (post norm): 37.89%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ka_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_ma.csv
🌍 Translating aya ma with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ma_translated_en.csv
⚡ Running NLI on aya ma...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 37.28%
🚨 Hallucination rate (post norm): 37.37%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ma_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_ta.csv
🌍 Translating aya ta with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ta_translated_en.csv
⚡ Running NLI on aya ta...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 39.32%
🚨 Hallucination rate (post norm): 37.69%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/aya/aya_results_ta_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_hi.csv
🌍 Translating gemma hi with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_hi_translated_en.csv
⚡ Running NLI on gemma hi...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 16.53%
🚨 Hallucination rate (post norm): 16.78%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_hi_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_ka.csv
🌍 Translating gemma ka with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ka_translated_en.csv
⚡ Running NLI on gemma ka...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 16.48%
🚨 Hallucination rate (post norm): 16.53%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ka_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_ma.csv
🌍 Translating gemma ma with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ma_translated_en.csv
⚡ Running NLI on gemma ma...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 14.49%
🚨 Hallucination rate (post norm): 14.83%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ma_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_ta.csv
🌍 Translating gemma ta with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ta_translated_en.csv
⚡ Running NLI on gemma ta...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 20.80%
🚨 Hallucination rate (post norm): 21.00%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/gemma/gemma_results_ta_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_hi.csv
🌍 Translating sarvam hi with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_hi_translated_en.csv
⚡ Running NLI on sarvam hi...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 34.80%
🚨 Hallucination rate (post norm): 34.85%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_hi_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_ka.csv
🌍 Translating sarvam ka with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ka_translated_en.csv
⚡ Running NLI on sarvam ka...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 34.28%
🚨 Hallucination rate (post norm): 33.64%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ka_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_ma.csv
🌍 Translating sarvam ma with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ma_translated_en.csv
⚡ Running NLI on sarvam ma...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 34.90%
🚨 Hallucination rate (post norm): 35.07%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ma_translated_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_ta.csv
🌍 Translating sarvam ta with NLLB...


Translating:   0%|          | 0/93 [00:00<?, ?it/s]

💾 Translated CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ta_translated_en.csv
⚡ Running NLI on sarvam ta...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 36.81%
🚨 Hallucination rate (post norm): 36.02%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Translated-full/sarvam/sarvam_results_ta_translated_en_nli_graded.csv


english

In [30]:
def nli_for_english_csv(m, lang_code="en", english_df=english_df):
    # Load the model CSV (already in English)
    csv_path = main_path.format(m=m, l=lang_code)
    print(f"🚀 Loading {csv_path}")
    df = pd.read_csv(csv_path)

    # Merge with English gold answers by ID to ensure alignment
    df = df.merge(english_df, on="id", how="left")
    df.rename(columns={"Answer": "en_gold"}, inplace=True)

    # Run NLI directly
    print(f"⚡ Running NLI on {m} {lang_code}...")
    df = evaluate_correctness(df, "model_answer", "en_gold")

    # Save NLI-graded CSV
    out_nli_path = csv_path.replace(".csv", "_nli_graded.csv")
    df.to_csv(out_nli_path, index=False)
    print(f"💾 NLI graded CSV saved: {out_nli_path}")


In [44]:
for m in models:
    english_df['Answer'] = english_df['Answer'].astype('string')
    nli_for_english_csv(m, lang_code="en", english_df=english_df)


🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_en.csv
⚡ Running NLI on aya en...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 3.24%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/aya/aya_results_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_en.csv
⚡ Running NLI on gemma en...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 4.07%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/gemma/gemma_results_en_nli_graded.csv
🚀 Loading /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_en.csv
⚡ Running NLI on sarvam en...
⚡ Running NLI on 5928 examples...


Batches:   0%|          | 0/93 [00:00<?, ?it/s]


✅ DONE
📊 Total rows: 5928
🚨 Hallucination rate: 8.86%
💾 NLI graded CSV saved: /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicSquad/Results/Vanilla/sarvam/sarvam_results_en_nli_graded.csv


In [35]:
english_df.dtypes

Unnamed: 0,0
id,object
Answer,string[python]


In [34]:
english_df

Unnamed: 0,id,Answer
0,56ddde6b9a695914005b9628,France
1,56ddde6b9a695914005b9629,10th and 11th centuries
2,56ddde6b9a695914005b962a,"Denmark, Iceland and Norway"
3,56ddde6b9a695914005b962b,Rollo
4,56ddde6b9a695914005b962c,10th century
...,...,...
5923,5737aafd1c456719005744fb,kilogram-force
5924,5737aafd1c456719005744fc,kilopond
5925,5737aafd1c456719005744fd,slug
5926,5737aafd1c456719005744fe,kip


In [39]:
has_nan = english_df['Answer'].isnull().values.any()
print(has_nan)

False


In [41]:
total_rows = len(english_df)
start_idx = int(total_rows * 0.25)
end_idx = int(total_rows * 0.28)

print(f"Inspecting rows from {start_idx} to {end_idx} (Total rows: {total_rows})")

# Slice the dataframe
subset = english_df.iloc[start_idx:end_idx]

# Check for nulls or non-strings
print("\nChecking for null values in 'Answer':")
nulls = subset[subset['Answer'].isnull()]
display(nulls)

print("\nDisplaying the subset:")
display(subset[['id', 'Answer']])

Inspecting rows from 1482 to 1659 (Total rows: 5928)

Checking for null values in 'Answer':


Unnamed: 0,id,Answer



Displaying the subset:


Unnamed: 0,id,Answer
1482,57265e11708984140094c3bd,recover market share
1483,57265e11708984140094c3be,nearly $40 per barrel
1484,5725b7f389a1e219009abd5d,a body of treaties and legislation
1485,5725b7f389a1e219009abd5e,Treaties establishing the European Union
1486,5725b7f389a1e219009abd5f,regulations and directives
...,...,...
1654,5726a5525951b619008f78e1,workforce consultation in businesses
1655,5726a638dd62a815002e8bf6,"France, Italy, Belgium, the Netherlands, Luxem..."
1656,5726a638dd62a815002e8bf7,1951
1657,5726a638dd62a815002e8bf8,cartels




No charts were generated by quickchart


In [42]:
# Check for values that are strictly NOT python strings in the subset
non_strings = subset[subset['Answer'].apply(lambda x: not isinstance(x, str))]

print(f"Rows with non-string values in the subset: {len(non_strings)}")
if not non_strings.empty:
    display(non_strings)
    print("\nTypes found in these rows:")
    print(non_strings['Answer'].apply(type).unique())
else:
    print("All values in the subset's 'Answer' column are strings.")

# Quick check on the entire dataframe to be sure
print("\n--- Global Type Check ---")
unique_types = english_df['Answer'].apply(type).unique()
print(f"Unique types in entire english_df['Answer']: {unique_types}")

Rows with non-string values in the subset: 0
All values in the subset's 'Answer' column are strings.

--- Global Type Check ---
Unique types in entire english_df['Answer']: [<class 'str'>]
