Data Loading

In [None]:
!pip install -q "transformers>=4.40" datasets accelerate scikit-learn pandas


In [None]:
import pandas as pd

# Name of the CSV file as it was uploaded to the notebook runtime.
CSV_PATH = "xfever_01pct_langbalanced.csv"   # uploaded file name

# Load the whole table once; later cells add derived columns to this frame.
df = pd.read_csv(CSV_PATH)
# Sanity checks: row count, a preview, and the per-language / per-split sizes.
print("Size:", len(df))
print(df.head())
print("\nLanguages:\n", df["language"].value_counts())
print("\nSplits:\n", df["split"].value_counts())


Size: 10070
  language  split                                              claim  \
0       en  train  Horrible Bosses is a film that is categorized ...   
1       en  train                  Robbie Collin edited The Crimson.   
2       en    dev  The Godfather Part II featured an Academy Awar...   
3       en   test  In the 1970's, Samsung entered the shipbuildin...   
4       en  train                       The Beatles started in 1980.   

      label                                       evidence_ids  
0  SUPPORTS  Horrible Bosses is a 2011 American black comed...  
1   REFUTES  Collin studied aesthetics and the philosophy o...  
2  SUPPORTS  Pacino won the BAFTA Award for Best Actor and ...  
3   REFUTES  Samsung entered the electronics industry in th...  
4   REFUTES  After their break-up in 1970, they each enjoye...  

Languages:
 language
en    2014
fr    2014
id    2014
ja    2014
zh    2014
Name: count, dtype: int64

Splits:
 split
train    8865
test      630
dev       575
Name

Label Cleaning & Encoding

In [None]:
from typing import Dict

def normalize_label(raw: str) -> str:
    """Collapse a raw verdict string into ``SUPPORTS``, ``REFUTES`` or ``NEI``.

    Matching ignores case and surrounding whitespace.  ``None`` and every
    value that is not a recognised SUPPORTS/REFUTES spelling (the explicit
    "NOT ENOUGH INFO" / "NEI" forms included) come back as ``"NEI"``.
    """
    canonical = {
        "SUPPORTS": "SUPPORTS",
        "SUPPORTED": "SUPPORTS",
        "REFUTES": "REFUTES",
        "REFUTED": "REFUTES",
    }
    token = "" if raw is None else str(raw).strip().upper()
    # Anything outside the table -- NEI spellings included -- maps to NEI.
    return canonical.get(token, "NEI")

df["label_norm"] = df["label"].apply(normalize_label)

def build_label_maps(labels):
    """Build forward and inverse mappings between label names and integer ids.

    Args:
        labels: Iterable of label strings; duplicates are ignored.

    Returns:
        ``(label2id, id2label)``.  Ids are assigned by position in the sorted
        list of distinct labels, and ``id2label`` is the exact inverse of
        ``label2id``.
    """
    uniq = sorted(set(labels))
    label2id: Dict[str, int] = {lab: i for i, lab in enumerate(uniq)}
    # enumerate() yields (index, label).  Unpacking it as (label, index) made
    # id2label a second name->id dict rather than the id->name inverse.
    id2label: Dict[int, str] = {i: lab for i, lab in enumerate(uniq)}
    return label2id, id2label

label2id, id2label = build_label_maps(df["label_norm"])
print(label2id)

df["labels"] = df["label_norm"].map(label2id)


{'NEI': 0, 'REFUTES': 1, 'SUPPORTS': 2}


In [None]:
df["split"].value_counts()


Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,8865
test,630
dev,575


In [None]:
# 1. No missing claims
df["claim"].isna().sum()


np.int64(0)

In [None]:
# 2. Label distribution overall
df["label_norm"].value_counts()

Unnamed: 0_level_0,count
label_norm,Unnamed: 1_level_1
SUPPORTS,5410
REFUTES,2505
NEI,2155


In [None]:
# 3. Label distribution in test only

df_test = df[df["split"] == "test"]
df_test["label_norm"].value_counts()

Unnamed: 0_level_0,count
label_norm,Unnamed: 1_level_1
SUPPORTS,245
NEI,205
REFUTES,180


Train/Dev/Test Split & HF Dataset Conversion

In [None]:
from datasets import Dataset

# Partition the frame using its pre-assigned split column.
df_train = df[df["split"] == "train"]
df_dev   = df[df["split"] == "dev"]
df_test  = df[df["split"] == "test"]

print("Train:", len(df_train), "Dev:", len(df_dev), "Test:", len(df_test))

# Claim-only training needs just the text and the integer label.
# preserve_index=False keeps the filtered frames' non-contiguous pandas index
# from being stored in each Dataset as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(df_train[["claim", "labels"]], preserve_index=False)
dev_ds   = Dataset.from_pandas(df_dev[["claim", "labels"]], preserve_index=False)
test_ds  = Dataset.from_pandas(df_test[["claim", "labels"]], preserve_index=False)


Train: 8865 Dev: 575 Test: 630


Tokenization & Tensor Formatting

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "xlm-roberta-base"   # can change later
MAX_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the ``claim`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``MAX_LENGTH`` tokens.  Returns
    whatever the tokenizer call produces for the batch.
    """
    return tokenizer(
        batch["claim"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

train_ds = train_ds.map(tokenize_batch, batched=True)
dev_ds   = dev_ds.map(tokenize_batch, batched=True)
test_ds  = test_ds.map(tokenize_batch, batched=True)

cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
dev_ds.set_format(type="torch", columns=cols)
test_ds.set_format(type="torch", columns=cols)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/8865 [00:00<?, ? examples/s]

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Model 1: XLM-RoBERTa-Base Training

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Reads ``pred.predictions`` (arg-maxed over the last axis to get predicted
    class ids) and ``pred.label_ids`` (the reference labels).
    """
    gold = pred.label_ids
    guessed = np.argmax(pred.predictions, axis=-1)
    return {
        "accuracy": accuracy_score(gold, guessed),
        "f1_weighted": f1_score(gold, guessed, average="weighted"),
    }

# Load the checkpoint named by MODEL_NAME with a classification head sized to
# the label set; the label maps are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

EPOCHS = 5      # good with GPU
BATCH_SIZE = 16

# Evaluation uses twice the training batch size.
# NOTE(review): no evaluation strategy is set here, so dev_ds is presumably
# not scored during training -- confirm against the installed transformers.
args = TrainingArguments(
    output_dir="./xfever_claimonly_xlmr_colab",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
)


# Wire model, arguments, tokenized splits and the metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
trainer.train()


Step,Training Loss
100,0.9934
200,0.9636
300,0.9008
400,0.8083
500,0.777
600,0.7444
700,0.6521
800,0.6599
900,0.6474
1000,0.6227


TrainOutput(global_step=2775, training_loss=0.48968083080944713, metrics={'train_runtime': 292.5479, 'train_samples_per_second': 151.514, 'train_steps_per_second': 9.486, 'total_flos': 2915625560198400.0, 'train_loss': 0.48968083080944713, 'epoch': 5.0})

In [None]:
print("===== Evaluation on TEST =====")
test_metrics = trainer.evaluate(test_ds)
print("Test metrics:", test_metrics)


===== Evaluation on TEST =====


Test metrics: {'eval_loss': 1.6369948387145996, 'eval_accuracy': 0.6063492063492063, 'eval_f1_weighted': 0.5995740611151779, 'eval_runtime': 1.083, 'eval_samples_per_second': 581.713, 'eval_steps_per_second': 18.467, 'epoch': 5.0}


In [None]:
from sklearn.metrics import classification_report
import numpy as np

preds = trainer.predict(test_ds)
y_pred = np.argmax(preds.predictions, axis=-1)
y_true = preds.label_ids

# Build label order and names from label2id
# Example: label2id = {'NEI': 0, 'REFUTES': 1, 'SUPPORTS': 2}
label_names = sorted(label2id.keys(), key=lambda k: label2id[k])   # ['NEI','REFUTES','SUPPORTS']
label_ids   = [label2id[name] for name in label_names]             # [0,1,2]

print("\nClassification report:")
print(classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=label_names
))



Classification report:
              precision    recall  f1-score   support

         NEI       0.63      0.42      0.51       205
     REFUTES       0.67      0.63      0.65       180
    SUPPORTS       0.56      0.74      0.64       245

    accuracy                           0.61       630
   macro avg       0.62      0.60      0.60       630
weighted avg       0.62      0.61      0.60       630



Model 2: XLM-RoBERTa-Large

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# -------------------------------
# MODEL: xlm-roberta-large
# -------------------------------
# MODEL_NAME still holds "xlm-roberta-base" when this cell runs, so passing it
# here re-trained the base checkpoint under a "large" heading.  Name the large
# checkpoint explicitly instead of reusing (or overwriting) MODEL_NAME.
# xlm-roberta-base and xlm-roberta-large share the same SentencePiece
# vocabulary, so the datasets tokenized earlier remain valid inputs.
LARGE_MODEL_NAME = "xlm-roberta-large"

model = AutoModelForSequenceClassification.from_pretrained(
    LARGE_MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# -------------------------------
# METRICS
# -------------------------------
def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    Same computation as the ``compute_metrics`` defined in the XLM-R base
    cell; running this cell rebinds the name.
    """
    # Predicted class = index of the largest score along the last axis.
    y_pred = np.argmax(pred.predictions, axis=-1)
    y_true = pred.label_ids
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_weighted": f1_score(y_true, y_pred, average="weighted"),
    }

# -------------------------------
# TRAINING ARGS (compatible with your older transformers)
# -------------------------------
EPOCHS = 8        # NOTE(review): this cell runs 8 epochs; the earlier advice was to start at 3 and bump to 4–5 if stable
BATCH_SIZE = 8    # drop to 4 or 2 if you get CUDA OOM on T4

# Checkpoints are written every 500 steps and only the most recent is kept.
args = TrainingArguments(
    output_dir="./xfever_multilingual_xlmr_large",
    learning_rate=1e-5,                         # slightly smaller lr for large model
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,
    report_to="none",
)

# -------------------------------
# TRAINER
# -------------------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -------------------------------
# TRAIN
# -------------------------------
trainer.train()


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,1.051
100,0.9745
150,0.9628
200,1.0135
250,1.0186
300,1.0019
350,1.0164
400,0.9801
450,0.9016
500,0.8854


Step,Training Loss
50,1.051
100,0.9745
150,0.9628
200,1.0135
250,1.0186
300,1.0019
350,1.0164
400,0.9801
450,0.9016
500,0.8854


TrainOutput(global_step=1776, training_loss=0.7588095042082641, metrics={'train_runtime': 573.7637, 'train_samples_per_second': 24.721, 'train_steps_per_second': 3.095, 'total_flos': 933000179263488.0, 'train_loss': 0.7588095042082641, 'epoch': 8.0})

In [None]:
from sklearn.metrics import classification_report

print("===== Evaluation on TEST (multilingual, xlm-roberta-large) =====")
test_metrics = trainer.evaluate(test_ds)
print("Test metrics:", test_metrics)

preds = trainer.predict(test_ds)
y_pred = np.argmax(preds.predictions, axis=-1)
y_true = preds.label_ids

label_names = sorted(label2id.keys(), key=lambda k: label2id[k])
label_ids   = [label2id[name] for name in label_names]

print("\nClassification report (multilingual, large):")
print(classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=label_names,
))


===== Evaluation on TEST (multilingual, xlm-roberta-large) =====


Test metrics: {'eval_loss': 1.0249700546264648, 'eval_accuracy': 0.5873015873015873, 'eval_f1_weighted': 0.5718355718355718, 'eval_runtime': 0.9536, 'eval_samples_per_second': 132.132, 'eval_steps_per_second': 8.389, 'epoch': 8.0}

Classification report (multilingual, large):
              precision    recall  f1-score   support

         NEI       0.64      0.34      0.44        41
     REFUTES       0.61      0.61      0.61        36
    SUPPORTS       0.56      0.78      0.65        49

    accuracy                           0.59       126
   macro avg       0.60      0.58      0.57       126
weighted avg       0.60      0.59      0.57       126



Model 3: mDeBERTa-v3-Base



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# 1) Model name
MODEL_NAME = "microsoft/mdeberta-v3-base"

# 2) Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the ``claim`` field of a batch with the current tokenizer.

    Sequences are truncated and padded to ``MAX_LENGTH`` tokens.  The shared
    constant is used instead of a literal 128 so that training inputs stay the
    same length as the ones ``predict_claim`` builds with this tokenizer.
    """
    return tokenizer(
        batch["claim"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

train_ds = train_ds.map(tokenize_batch, batched=True)
dev_ds   = dev_ds.map(tokenize_batch, batched=True)
test_ds  = test_ds.map(tokenize_batch, batched=True)

cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
dev_ds.set_format(type="torch", columns=cols)
test_ds.set_format(type="torch", columns=cols)

# 3) Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# 4) Metrics
def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    Repeats the metric function from the earlier model cells; running this
    cell rebinds the name.
    """
    # Predicted class = index of the largest score along the last axis.
    y_pred = np.argmax(pred.predictions, axis=-1)
    y_true = pred.label_ids
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_weighted": f1_score(y_true, y_pred, average="weighted"),
    }

# 5) Training arguments (compatible with your older transformers)
EPOCHS = 5
BATCH_SIZE = 16  # drop to 8 if OOM

args = TrainingArguments(
    output_dir="./xfever_multilingual_mdeberta",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# 6) Evaluate
test_metrics = trainer.evaluate(test_ds)
print("Test metrics:", test_metrics)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



Map:   0%|          | 0/1773 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss
50,0.9907
100,0.9202
150,0.8015
200,0.7129
250,0.6236
300,0.5776
350,0.5296
400,0.4347
450,0.4153
500,0.3656


Test metrics: {'eval_loss': 1.0601704120635986, 'eval_accuracy': 0.6349206349206349, 'eval_f1_weighted': 0.6219040608856237, 'eval_runtime': 1.1269, 'eval_samples_per_second': 111.811, 'eval_steps_per_second': 3.55, 'epoch': 5.0}


Dataset Check (a pure "ML fact checker" with no external knowledge)

In [None]:
import torch
import numpy as np

def predict_claim(claim: str):
    """Classify one claim with the current global ``model`` and ``tokenizer``.

    Returns ``(pred_label, probs)``: the name stored in
    ``model.config.id2label`` for the arg-max class, and the softmax
    probabilities of the first row as a NumPy array.
    """
    model.eval()

    encoded = tokenizer(
        claim,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

    # With a GPU present, run there: move the model and every input tensor.
    if torch.cuda.is_available():
        model.to("cuda")
        encoded = {name: tensor.to("cuda") for name, tensor in encoded.items()}

    with torch.no_grad():
        scores = model(**encoded).logits

    probs = torch.softmax(scores, dim=-1).cpu().numpy()[0]
    winner = int(np.argmax(probs))
    return model.config.id2label[winner], probs


def interactive_fact_check():
    """Prompt for claims in a loop and print the classifier's verdict for each.

    Blank input is ignored; typing 'quit' or 'exit' (any case) ends the loop.
    """
    print("Type a claim to fact-check (or 'quit' to exit).")
    stop_words = ("quit", "exit")
    while True:
        claim = input("\nClaim: ").strip()
        if claim.lower() in stop_words:
            print("Bye!")
            return
        if claim:
            label, probs = predict_claim(claim)
            print(f"Model label: {label}")
            print(f"Raw probabilities: {probs}")


In [None]:
interactive_fact_check()

Type a claim to fact-check (or 'quit' to exit).

Claim: Birmingham is a large town.
Model label: SUPPORTS
Raw probabilities: [0.16460861 0.312651   0.5227404 ]

Claim: exit
Bye!


Wikipedia usage


In [None]:
import requests

def wiki_search(query):
    """Run a full-text search against the English Wikipedia Action API.

    Prints the HTTP status and final URL for debugging.  Returns the list
    found under ``data["query"]["search"]``, or ``[]`` when the status is not
    200.  Re-raises whatever ``r.json()`` raised after printing the start of
    the body.

    NOTE: a later cell defines ``wiki_search`` again (that version sends the
    ``HEADERS`` User-Agent); once that cell has run it replaces this one.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
    }

    # No custom headers are sent by this version.
    r = requests.get(url, params=params)

    print("Status code:", r.status_code)
    print("Final URL:", r.url)

    # If not OK, show some of the text to debug and return empty list
    if r.status_code != 200:
        print("Response text (first 500 chars):")
        print(r.text[:500])
        return []

    try:
        data = r.json()
    except Exception as e:
        # Show what came back before propagating the parse failure.
        print("Could not parse JSON. Raw text (first 500 chars):")
        print(r.text[:500])
        raise e

    return data["query"]["search"]


In [None]:
import urllib.parse

def wiki_summary(title, timeout=10):
    """Fetch the summary extract of an English Wikipedia page.

    Args:
        title: Page title, e.g. as returned by ``wiki_search``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``extract`` field of the REST summary response, or ``""`` when the
        request fails, the status is not 200, the body is not JSON, or the
        response has no extract.
    """
    # safe="" percent-encodes "/" as well; with the default it is left as a
    # path separator and titles such as "AC/DC" resolve to the wrong route.
    encoded = urllib.parse.quote(title, safe="")
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{encoded}"

    try:
        r = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures are reported the same way as HTTP failures.
        print("Summary error:", exc)
        return ""

    if r.status_code != 200:
        print("Summary error:", r.status_code, r.text[:200])
        return ""

    try:
        data = r.json()
    except ValueError:
        print("Summary error: response was not valid JSON")
        return ""
    return data.get("extract", "")


In [None]:
import requests

HEADERS = {
    # Put *any* identifying string + (optional) email/URL
    "User-Agent": "SuprajaFactChecker/1.0 (contact: suprajabalerao36@gmail.com)"
}


In [None]:
def wiki_search(query, timeout=10):
    """Run a full-text search against the English Wikipedia Action API.

    Sends the module-level ``HEADERS`` (User-Agent) with the request and
    prints the HTTP status and final URL for debugging.

    Args:
        query: Search string passed as ``srsearch``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list of search hits, or ``[]`` when the request fails, the status
        is not 200, the body is not JSON, or the payload has no search results.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
    }

    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        print("Search request failed:", exc)
        return []

    print("Status code:", r.status_code)
    print("Final URL:", r.url)

    if r.status_code != 200:
        print("Response text (first 500 chars):")
        print(r.text[:500])
        return []

    try:
        data = r.json()
    except ValueError:
        print("Could not parse JSON. Raw text (first 500 chars):")
        print(r.text[:500])
        return []

    # API-level errors arrive as JSON without a "query" key; treat as no hits
    # instead of raising KeyError.
    return data.get("query", {}).get("search", [])



In [None]:
results = wiki_search("Taj Mahal")
title = results[0]["title"]
summary = wiki_summary(title)
print("Title:", title)
print("Summary:", summary[:500])


Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=Taj+Mahal&format=json
Title: Taj Mahal
Summary: The Taj Mahal is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Uttar Pradesh, India. It was commissioned in 1631 by the fifth Mughal emperor, Shah Jahan, to house the tomb of his beloved wife, Mumtaz Mahal; it also houses the tomb of Shah Jahan himself. The tomb is the centrepiece of a 17-hectare (42-acre) complex, which includes a mosque and a guest house, and is set in formal gardens bounded on three sides by a crenellated wall.


In [None]:
import re

def parse_location_claim(claim: str):
    """
    Very simple parser for claims like 'X is in Y'.

    The claim is lower-cased before matching, and trailing full stops and
    whitespace are removed from the location.

    Returns (X, Y), or (None, None) if the claim doesn't match or the
    location is empty once cleaned.
    """
    s = claim.strip().lower()
    # pattern: "<something> is in <something>"
    m = re.match(r"(.+?)\s+is in\s+(.+)", s)
    if not m:
        return None, None
    entity = m.group(1).strip()
    # Strip dots and whitespace together so "paris ." yields "paris" rather
    # than "paris " with a dangling space.
    location = m.group(2).strip().rstrip(". \t")
    if not location:
        # e.g. "X is in ." -- an empty location would substring-match anything
        # in later comparisons.
        return None, None
    return entity, location


In [None]:
print(parse_location_claim("The Taj Mahal is in Hyderabad"))
print(parse_location_claim("Eiffel Tower is in Paris"))


('the taj mahal', 'hyderabad')
('eiffel tower', 'paris')


In [None]:
def get_wiki_evidence_for_entity(entity: str, max_chars: int = 600):
    """Return ``(title, summary)`` for the top Wikipedia search hit of ``entity``.

    Gives ``(None, "")`` when the search returns nothing.  A summary longer
    than ``max_chars`` is cut to that length and suffixed with "...".
    """
    hits = wiki_search(entity)
    if not hits:
        return None, ""

    # Only the first (highest-ranked) hit is used.
    top_title = hits[0]["title"]
    text = wiki_summary(top_title)

    clipped = text[:max_chars] + "..." if len(text) > max_chars else text
    return top_title, clipped


In [None]:
def extract_location_from_summary(summary: str):
    """
    Tries to extract a location phrase from the Wikipedia summary.
    This is heuristic and not perfect.

    The cue patterns are tried in order and the first one that matches wins.
    Returns the captured phrase with surrounding whitespace removed, or None
    when nothing matches or the summary is empty/None.
    """
    if not summary:
        return None

    # Look for patterns like 'located in X', 'is in X'.
    # \b anchors each cue to a whole word; without it the bare "in " cue also
    # fired on the tail of words such as "Berlin " or "begin ".
    patterns = [
        r"\blocated in ([A-Z][a-zA-Z\s,-]+)",
        r"\bis in ([A-Z][a-zA-Z\s,-]+)",
        r"\bin ([A-Z][a-zA-Z\s,-]+),",   # e.g., 'in Agra, India'
    ]
    for pat in patterns:
        m = re.search(pat, summary)
        if m:
            return m.group(1).strip()
    return None


In [None]:
def _location_phrase_matches(claimed: str, found: str) -> bool:
    """Return True when either phrase occurs in the other on whole-word boundaries."""
    import re  # local import keeps this cell runnable on its own

    a = claimed.strip().lower()
    b = found.strip().lower()
    if not a or not b:
        return False

    def contains(haystack: str, needle: str) -> bool:
        # Lookarounds rather than \b so phrases that start or end with
        # punctuation still anchor correctly.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        return re.search(pattern, haystack) is not None

    return contains(b, a) or contains(a, b)


def fact_check_location_claim_with_wiki(claim: str):
    """Check an 'X is in Y' claim against the Wikipedia summary for X.

    Returns a dict with keys ``claim``, ``label`` (SUPPORTS / REFUTES / NEI),
    ``reason``, ``entity``, ``claimed_location``, ``wiki_title``,
    ``wiki_location`` and ``evidence``.  The label is NEI when the claim does
    not parse, when no summary is found, or when no location can be extracted
    from the summary.

    Locations are compared as whole-word phrases.  Plain substring matching
    let a claimed location such as "a" count as SUPPORTS for "agra".
    """
    entity, claimed_location = parse_location_claim(claim)

    # One result skeleton, filled in as the pipeline progresses.
    result = {
        "claim": claim,
        "label": "NEI",
        "reason": "",
        "entity": entity,
        "claimed_location": claimed_location,
        "wiki_title": None,
        "wiki_location": None,
        "evidence": "",
    }

    # An empty location is treated as unparseable; it would otherwise match
    # every extracted location.
    if entity is None or not claimed_location:
        result.update(
            reason="Claim not in 'X is in Y' format.",
            entity=None,
            claimed_location=None,
        )
        return result

    # Get Wikipedia evidence
    wiki_title, summary = get_wiki_evidence_for_entity(entity)
    if not summary:
        result["reason"] = "No Wikipedia summary found."
        return result

    # Extract actual location from summary
    wiki_location = extract_location_from_summary(summary)
    result.update(wiki_title=wiki_title, wiki_location=wiki_location, evidence=summary)

    if wiki_location is None:
        result["reason"] = "Wikipedia summary did not clearly mention a location."
    elif _location_phrase_matches(claimed_location, wiki_location):
        result["label"] = "SUPPORTS"
        result["reason"] = f"Summary indicates the location as '{wiki_location}'."
    else:
        result["label"] = "REFUTES"
        result["reason"] = (
            f"Summary indicates the location as '{wiki_location}', not '{claimed_location}'."
        )

    return result


In [None]:
tests = [
    "The Taj Mahal is in Hyderabad",
    "The Taj Mahal is in Agra",
    "Eiffel Tower is in Paris",
    "Eiffel Tower is in Berlin",
]

for c in tests:
    res = fact_check_location_claim_with_wiki(c)
    print("\nClaim:", res["claim"])
    print("Label:", res["label"])
    print("Reason:", res["reason"])
    print("Wiki title:", res["wiki_title"])
    print("Wiki location:", res["wiki_location"])


Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=the+taj+mahal&format=json

Claim: The Taj Mahal is in Hyderabad
Label: REFUTES
Reason: Summary indicates the location as 'Agra, Uttar Pradesh', not 'hyderabad'.
Wiki title: Taj Mahal
Wiki location: Agra, Uttar Pradesh
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=the+taj+mahal&format=json

Claim: The Taj Mahal is in Agra
Label: SUPPORTS
Reason: Summary indicates the location as 'Agra, Uttar Pradesh'.
Wiki title: Taj Mahal
Wiki location: Agra, Uttar Pradesh
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=eiffel+tower&format=json

Claim: Eiffel Tower is in Paris
Label: SUPPORTS
Reason: Summary indicates the location as 'Paris'.
Wiki title: Eiffel Tower
Wiki location: Paris
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=eiffel+tower&format=json



In [None]:
def interactive_fact_checker():
    """Console loop for the location-claim Wikipedia checker.

    Reads claims from stdin until 'quit' or 'exit' is typed, runs each through
    ``fact_check_location_claim_with_wiki`` and prints every field of the
    result followed by the evidence text.
    """
    print("🔍 Real-Time Wikipedia Fact Checker")
    print("Type a claim like: The Taj Mahal is in Hyderabad")
    print("Type 'quit' to exit.\n")

    # (caption, result key) pairs, in the order they are printed.
    report_rows = (
        ("Claim:", "claim"),
        ("Label:", "label"),
        ("Reason:", "reason"),
        ("Entity:", "entity"),
        ("Claimed Location:", "claimed_location"),
        ("Wiki Title:", "wiki_title"),
        ("Extracted Wiki Location:", "wiki_location"),
    )

    while True:
        claim = input("Enter a claim: ").strip()
        if claim.lower() in ["quit", "exit"]:
            print("Goodbye!")
            break

        if not claim:
            print("Please type a claim.\n")
            continue

        result = fact_check_location_claim_with_wiki(claim)

        print("\n--- Fact Check Result ---")
        for caption, key in report_rows:
            print(caption, result[key])
        print("\nEvidence snippet:")
        print(result["evidence"])
        print("\n-------------------------\n")


In [None]:
interactive_fact_checker()


🔍 Real-Time Wikipedia Fact Checker
Type a claim like: The Taj Mahal is in Hyderabad
Type 'quit' to exit.

Enter a claim: The Taj Mahal is in Hyderabad
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=the+taj+mahal&format=json

--- Fact Check Result ---
Claim: The Taj Mahal is in Hyderabad
Label: REFUTES
Reason: Summary indicates the location as 'Agra, Uttar Pradesh', not 'hyderabad'.
Entity: the taj mahal
Claimed Location: hyderabad
Wiki Title: Taj Mahal
Extracted Wiki Location: Agra, Uttar Pradesh

Evidence snippet:
The Taj Mahal is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Uttar Pradesh, India. It was commissioned in 1631 by the fifth Mughal emperor, Shah Jahan, to house the tomb of his beloved wife, Mumtaz Mahal; it also houses the tomb of Shah Jahan himself. The tomb is the centrepiece of a 17-hectare (42-acre) complex, which includes a mosque and a guest house, and is set in formal gardens bounded

**NLI Model + General Wikipedia Fact Checking**

In [None]:
# ============================================
# 1️⃣ Load NLI model (multilingual XNLI)
# ============================================
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

NLI_MODEL_NAME = "joeddav/xlm-roberta-large-xnli"

nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nli_model.to(device)
print("NLI model loaded on:", device)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NLI model loaded on: cuda


In [None]:
# ============================================
# 2️⃣ nli_fact_label()  (evidence + claim → SUPPORTS / REFUTES / NEI)
# ============================================
def nli_fact_label(claim: str, evidence: str):
    """Label a claim against one evidence passage with the NLI model.

    The evidence is encoded as the first sequence (premise) and the claim as
    the second (hypothesis).  Returns ``(label, raw_label, probs)``: the
    SUPPORTS / REFUTES / NEI verdict, the model's own name for the arg-max
    class, and the softmax probabilities as a NumPy array.
    """
    pair = nli_tokenizer(
        evidence,
        claim,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        scores = nli_model(**pair).logits[0]

    probs = torch.softmax(scores, dim=-1).cpu().numpy()
    raw_label = nli_model.config.id2label[int(np.argmax(probs))]

    # Map the model's class name onto the fact-checking vocabulary; anything
    # that is neither entailment nor contradiction counts as NEI.
    tag = raw_label.upper()
    label = "NEI"
    if "ENTAIL" in tag:
        label = "SUPPORTS"
    elif "CONTRADICT" in tag:
        label = "REFUTES"

    return label, raw_label, probs


In [None]:
# ============================================
# 3️⃣ get_wiki_evidences_for_claim()
#     (uses your working wiki_search & wiki_summary)
# ============================================
def get_wiki_evidences_for_claim(claim: str, top_k: int = 3, max_chars: int = 700):
    """Collect ``(title, summary)`` pairs for the first ``top_k`` search hits.

    The claim itself is the search query.  Hits whose summary comes back empty
    are dropped; summaries longer than ``max_chars`` are cut to that length and
    suffixed with "...".  Relies on ``wiki_search`` and ``wiki_summary``.
    """
    collected = []
    for hit in wiki_search(claim)[:top_k]:
        page_title = hit["title"]
        text = wiki_summary(page_title)
        if text:
            if len(text) > max_chars:
                text = text[:max_chars] + "..."
            collected.append((page_title, text))
    return collected


In [None]:
# ============================================
# 4️⃣ fact_check_with_wiki_nli()
#     (Wikipedia + NLI fact checker)
# ============================================
def fact_check_with_wiki_nli(claim: str, top_k: int = 3, min_decisive_prob: float = 0.5):
    """
    Wikipedia + NLI fact checker:
    1. Search Wikipedia for relevant pages.
    2. Retrieve summaries.
    3. Use multilingual NLI (XLM-R) to check SUPPORTS / REFUTES / NEI.

    Args:
        claim: Claim text; also used as the Wikipedia search query.
        top_k: Number of search hits to retrieve and score.
        min_decisive_prob: Minimum top-class probability for a SUPPORTS or
            REFUTES verdict to outrank neutral passages.

    Returns:
        Dict with keys ``claim``, ``label``, ``reason``, ``evidence_title``,
        ``evidence_text``, ``nli_raw_label`` and ``nli_probs``.

    Evidence selection: a passage that entails or contradicts the claim with
    at least ``min_decisive_prob`` confidence is preferred over any neutral
    passage; within the same tier the higher top-class probability wins.
    Choosing purely by top-class probability let a confidently "neutral"
    unrelated page hide real supporting or refuting evidence.
    """

    evidences = get_wiki_evidences_for_claim(claim, top_k=top_k)

    if not evidences:
        return {
            "claim": claim,
            "label": "NEI",
            "reason": "No relevant Wikipedia pages found.",
            "evidence_title": None,
            "evidence_text": "",
            "nli_raw_label": None,
            "nli_probs": None
        }

    best = None
    best_rank = None

    for title, text in evidences:
        label, raw_label, probs = nli_fact_label(claim, text)
        score = float(probs.max())  # how confident the model is

        decisive = label != "NEI" and score >= min_decisive_prob
        rank = (decisive, score)
        # Strict ">" keeps the earliest (highest-ranked search hit) on ties.
        if best is None or rank > best_rank:
            best_rank = rank
            best = {
                "title": title,
                "text": text,
                "label": label,
                "raw_label": raw_label,
                "probs": probs,
                "score": score
            }

    final_label = best["label"]

    if final_label == "SUPPORTS":
        reason = "Wikipedia evidence entails the claim."
    elif final_label == "REFUTES":
        reason = "Wikipedia evidence contradicts the claim."
    else:
        reason = "Wikipedia evidence does not clearly entail or contradict the claim."

    return {
        "claim": claim,
        "label": final_label,
        "reason": reason,
        "evidence_title": best["title"],
        "evidence_text": best["text"],
        "nli_raw_label": best["raw_label"],
        "nli_probs": best["probs"]
    }


In [None]:
# ============================================
# 5️⃣ lookup_dataset_label()
#     (exact match of claim in XFEVER dataframe df)
# ============================================
def lookup_dataset_label(claim: str):
    """
    Find the first XFEVER row whose claim equals `claim`.

    Both sides are compared after stripping surrounding whitespace and
    lower-casing. Reads the module-level dataframe `df`.

    Returns
    -------
    (label, row)
        `label` is the matching row's 'label_norm' value and `row` is that
        row (a pandas Series). (None, None) when nothing matches.

    Raises
    ------
    ValueError
        If `df` lacks the 'claim' or 'label_norm' column.
    """
    absent = [col for col in ("claim", "label_norm") if col not in df.columns]
    if absent:
        raise ValueError("df must contain 'claim' and 'label_norm' columns.")

    wanted = claim.strip().lower()
    normalized_claims = df["claim"].str.strip().str.lower()
    hits = df[normalized_claims == wanted]

    if len(hits) == 0:
        return None, None  # Not found

    # Several rows may match; only the first one is reported.
    first_hit = hits.iloc[0]
    return first_hit["label_norm"], first_hit


In [None]:
# ============================================
# 6️⃣ fact_check_dataset_then_wiki()
#     (dataset first → else Wikipedia + NLI)
# ============================================
def fact_check_dataset_then_wiki(claim: str, top_k: int = 3):
    """
    Hybrid fact check: dataset lookup first, Wikipedia + NLI as fallback.

    * If lookup_dataset_label(claim) finds a row, its label is returned
      together with the row's index, split and language; the evidence / NLI
      fields are set to None.
    * Otherwise the result of fact_check_with_wiki_nli(claim, top_k=top_k)
      is returned.

    In both cases the returned dict carries a "source" key: "dataset" or
    "wikipedia+nli".
    """
    label_from_dataset, matched_row = lookup_dataset_label(claim)

    # No dataset hit: defer to the Wikipedia + NLI pipeline and tag its result.
    if label_from_dataset is None:
        outcome = fact_check_with_wiki_nli(claim, top_k=top_k)
        outcome["source"] = "wikipedia+nli"
        return outcome

    # Dataset hit: describe where the label came from.
    outcome = {"source": "dataset", "claim": claim, "label": label_from_dataset}
    outcome["reason"] = "Found exact match in XFEVER dataset."
    outcome["dataset_index"] = int(matched_row.name)
    outcome["dataset_split"] = matched_row.get("split", None)
    outcome["dataset_language"] = matched_row.get("language", None)
    for unused_key in ("evidence_title", "evidence_text", "nli_raw_label", "nli_probs"):
        outcome[unused_key] = None
    return outcome


In [None]:
# ============================================
# 7️⃣ interactive_fact_checker()
#     (user types claim; pipeline chooses dataset or wiki)
# ============================================
def interactive_fact_checker():
    """
    Run a read-eval-print loop for the hybrid fact checker.

    Prompts for a claim, passes it to fact_check_dataset_then_wiki() and
    prints the result. Blank input is ignored. The loop ends when the user
    types 'quit' or 'exit' (case-insensitive), or when the prompt is
    aborted (end of input or an interrupt), so leaving the session never
    produces a traceback.

    Returns None; all results are written with print().
    """
    print("🧠 Hybrid Fact Checker (Dataset → Wikipedia + NLI)")
    print("Type a claim. Type 'quit' to exit.\n")

    while True:
        try:
            claim = input("Enter a claim: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt is treated as a
            # normal exit instead of propagating the exception.
            print("\nGoodbye!")
            break

        if claim.lower() in {"quit", "exit"}:
            print("Goodbye!")
            break
        if not claim:
            continue

        result = fact_check_dataset_then_wiki(claim)

        print("\n--- Fact Check Result ---")
        print("Claim:", result["claim"])
        print("Source used:", result["source"])
        print("Label:", result["label"])
        print("Reason:", result["reason"])

        # The dataset branch reports provenance; any other source reports
        # the evidence and the raw NLI label.
        if result["source"] == "dataset":
            print("Dataset index:", result["dataset_index"])
            print("Split:", result["dataset_split"])
            print("Language:", result["dataset_language"])
        else:
            print("Evidence title:", result["evidence_title"])
            print("\nEvidence snippet:")
            print(result["evidence_text"])
            print("\nRaw NLI label:", result["nli_raw_label"])

        print("-------------------------\n")


In [None]:
# ✅ Run the interactive checker.
# This cell blocks on input() until 'quit' or 'exit' is typed at the prompt.
interactive_fact_checker()


🧠 Hybrid Fact Checker (Dataset → Wikipedia + NLI)
Type a claim. Type 'quit' to exit.

Enter a claim: The Eiffel Tower was built in 1889
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=The+Eiffel+Tower+was+built+in+1889&format=json

--- Fact Check Result ---
Claim: The Eiffel Tower was built in 1889
Source used: wikipedia+nli
Label: SUPPORTS
Reason: Wikipedia evidence entails the claim.
Evidence title: Eiffel Tower replicas and derivatives

Evidence snippet:
As one of the most iconic and recognizable structures in the world, the Eiffel Tower, completed in 1889, has been the inspiration for the creation of over 50 similar towers around the world. Most are not exact replicas, though there are many that resemble it closely, while others look slightly different. The Eiffel Tower has also inspired other towers that are not close resembling replicas and therefore are not listed here, for example the Blackpool Tower.

Raw NLI label: entailment
-

In [None]:
# Start another interactive session to try a different claim.
interactive_fact_checker()

🧠 Hybrid Fact Checker (Dataset → Wikipedia + NLI)
Type a claim. Type 'quit' to exit.

Enter a claim: Sanskrit is 5000 years old language
Status code: 200
Final URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=Sanskrit+is+5000+years+old+language&format=json

--- Fact Check Result ---
Claim: Sanskrit is 5000 years old language
Source used: wikipedia+nli
Label: NEI
Reason: Wikipedia evidence does not clearly entail or contradict the claim.
Evidence title: Languages of India

Evidence snippet:
Languages of India belong to several language families, the major ones being the Indo-Aryan languages spoken by 78.05% of Indians and the Dravidian languages spoken by 19.64% of Indians; both families together are sometimes known as Indic languages. Languages spoken by the remaining 2.31% of the population belong to the Austroasiatic, Sino–Tibetan, Tai–Kadai, Andamanese, and a few other minor language families and isolates. According to the People's Linguistic Survey of India