In [None]:
!pip install -q pandas scikit-learn nltk rank_bm25 jsonlines

In [None]:
import jsonlines
import pandas as pd
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from rank_bm25 import BM25Okapi

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

 STEP 1: Load JSON or JSONL Data Correctly ----

In [None]:

def load_data(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            first_char = f.read(1)
            if first_char == "[":
                f.seek(0)  # Reset file pointer
                data = pd.read_json(f)  # Load JSON array
            else:
                f.seek(0)
                data = []
                with jsonlines.open(file_path) as reader:
                    for obj in reader:
                        data.append(obj)
                data = pd.DataFrame(data)  # Convert to DataFrame
        if data.empty:
            print(f"⚠️ Warning: {file_path} is empty or improperly formatted!")
        return data
    except Exception as e:
        print(f"🚨 Error loading {file_path}: {e}")
        return pd.DataFrame([])  # Return empty DataFrame on failure


In [None]:
train_df = load_data("train.json")
test_df = load_data("dev.json")

In [None]:
import json
import pandas as pd

# Load train.json
with open("train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Convert to DataFrame
train_df = pd.DataFrame(train_data)

# Count occurrences of each label
label_counts = train_df["label"].value_counts()

# Display results
print("📊 Label Distribution in train.json:")
print(label_counts)


📊 Label Distribution in train.json:
label
Refuted                               1742
Supported                              849
Not Enough Evidence                    282
Conflicting Evidence/Cherrypicking     195
Name: count, dtype: int64


In [None]:
import json
import pandas as pd

# Load train.json
with open("dev.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Convert to DataFrame
train_df = pd.DataFrame(train_data)

# Count occurrences of each label
label_counts = train_df["label"].value_counts()

# Display results
print("📊 Label Distribution in dev.json:")
print(label_counts)


📊 Label Distribution in dev.json:
label
Refuted                               305
Supported                             122
Conflicting Evidence/Cherrypicking     38
Not Enough Evidence                    35
Name: count, dtype: int64


In [None]:
total_claims = len(train_df)
print(f"📌 Total number of claims in train.json: {total_claims}")


📌 Total number of claims in train.json: 500


In [None]:
total_claims = len(test_df)
print(f"📌 Total number of claims in test.json: {total_claims}")

📌 Total number of claims in test.json: 500


STEP 2: Extract Evidence from Available Fields

In [None]:

def extract_evidence(row):
    if isinstance(row.get("fact_checking_article"), str) and len(row["fact_checking_article"]) > 10:
        return row["fact_checking_article"]
    elif isinstance(row.get("questions"), list) and len(row["questions"]) > 0:
        return " ".join([q.get("question", "") for q in row["questions"]])
    elif isinstance(row.get("justification"), str) and len(row["justification"]) > 10:
        return row["justification"]
    return "No evidence available"


In [None]:
# Apply evidence extraction
train_df["evidence"] = train_df.apply(extract_evidence, axis=1)
test_df["evidence"] = test_df.apply(extract_evidence, axis=1)


In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Tokenize evidence
train_df["tokenized_evidence"] = train_df["evidence"].apply(lambda x: nltk.word_tokenize(str(x).lower()))
test_df["tokenized_evidence"] = test_df["evidence"].apply(lambda x: nltk.word_tokenize(str(x).lower()))


STEP 3: BM25 Evidence Retrieval

In [None]:

bm25 = BM25Okapi(train_df["tokenized_evidence"])

def retrieve_best_evidence(claim):
    tokenized_claim = nltk.word_tokenize(claim.lower())
    scores = bm25.get_scores(tokenized_claim)
    best_evidence_idx = scores.argmax()
    return train_df.iloc[best_evidence_idx]["evidence"] if scores.max() > 0 else "No evidence found"


In [None]:
# Retrieve evidence for test set
test_df["retrieved_evidence"] = test_df["claim"].apply(retrieve_best_evidence)


In [None]:
# Check if all keys from test.json are converted to columns in test_df
import json
# Load test.json to inspect its structure
test_json_path = "/content/dev.json"

try:
    with open(test_json_path, "r", encoding="utf-8") as f:
        test_data = json.load(f)  # Load test.json as a list of dictionaries

    # Convert to DataFrame
    test_df_fixed = pd.DataFrame(test_data)

    # Compare columns from JSON with test_df
    json_keys = set(test_data[0].keys()) if len(test_data) > 0 else set()
    df_columns = set(test_df_fixed.columns)

    missing_keys = json_keys - df_columns

    # Display results
    print(f"🛠 Columns in test_df: {df_columns}")
    print(f"🔍 Keys in test.json: {json_keys}")
    print(f"🚨 Missing keys in test_df: {missing_keys}" if missing_keys else "✅ All keys from test.json are in test_df!")

except Exception as e:
    print(f"🚨 Error loading test.json: {e}")


🛠 Columns in test_df: {'original_claim_url', 'justification', 'claim', 'claim_date', 'label', 'reporting_source', 'claim_types', 'questions', 'fact_checking_strategies', 'cached_original_claim_url', 'required_reannotation', 'fact_checking_article', 'speaker', 'location_ISO_code'}
🔍 Keys in test.json: {'original_claim_url', 'justification', 'claim', 'claim_date', 'label', 'reporting_source', 'claim_types', 'questions', 'fact_checking_strategies', 'cached_original_claim_url', 'required_reannotation', 'fact_checking_article', 'speaker', 'location_ISO_code'}
✅ All keys from test.json are in test_df!


STEP 4: Encode Labels for Model Training

In [None]:

label_map = {"Supported": 0, "Refuted": 1, "Not Enough Evidence": 2, "Conflicting Evidence/Cherrypicking": 3}
train_df["label_encoded"] = train_df["label"].map(label_map)
test_df["label_encoded"] = test_df["label"].map(label_map)

# Remove NaN values from labels
train_df = train_df.dropna(subset=["label_encoded"])
test_df = test_df.dropna(subset=["label_encoded"])

In [None]:
print(test_df.columns)

Index(['claim', 'required_reannotation', 'label', 'justification',
       'claim_date', 'speaker', 'original_claim_url', 'fact_checking_article',
       'reporting_source', 'location_ISO_code', 'claim_types',
       'fact_checking_strategies', 'questions', 'cached_original_claim_url',
       'evidence', 'tokenized_evidence', 'retrieved_evidence',
       'label_encoded'],
      dtype='object')


In [None]:
print(train_df.columns)  # List all column names


Index(['claim', 'required_reannotation', 'label', 'justification',
       'claim_date', 'speaker', 'original_claim_url', 'fact_checking_article',
       'reporting_source', 'location_ISO_code', 'claim_types',
       'fact_checking_strategies', 'questions', 'cached_original_claim_url',
       'evidence', 'tokenized_evidence', 'label_encoded'],
      dtype='object')


In [None]:
print("Columns in train_df:", test_df.columns)
print(test_df[["label", "label_encoded"]].head())

Columns in train_df: Index(['claim', 'required_reannotation', 'label', 'justification',
       'claim_date', 'speaker', 'original_claim_url', 'fact_checking_article',
       'reporting_source', 'location_ISO_code', 'claim_types',
       'fact_checking_strategies', 'questions', 'cached_original_claim_url',
       'evidence', 'tokenized_evidence', 'retrieved_evidence',
       'label_encoded'],
      dtype='object')
     label  label_encoded
0  Refuted              1
1  Refuted              1
2  Refuted              1
3  Refuted              1
4  Refuted              1


In [None]:
print(test_df["label"].unique())  # Check unique text labels
print(test_df["label_encoded"].unique())  # Check unique encoded labels


['Refuted' 'Supported' 'Not Enough Evidence'
 'Conflicting Evidence/Cherrypicking']
[1 0 2 3]


In [None]:
print(test_df["label"].value_counts())  # Show all unique label texts in test set


label
Refuted                               305
Supported                             122
Conflicting Evidence/Cherrypicking     38
Not Enough Evidence                    35
Name: count, dtype: int64


In [None]:
print(f"🚨 Missing labels in test_df: {test_df['label'].isna().sum()}")


🚨 Missing labels in test_df: 0


In [None]:
test_df = test_df.dropna(subset=["label"])


In [None]:
import json
import pandas as pd

# Load test.json again
with open("dev.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)  # Ensure full dataset is loaded
print("✅ Reloaded test data!")


✅ Reloaded test data!


In [None]:
label_map = {
    "Supported": 0,
    "Refuted": 1,
    "Not Enough Evidence": 2,
    "Conflicting Evidence/Cherrypicking": 3  # Ensure exact match
}

test_df.loc[:, "label_encoded"] = test_df["label"].map(label_map)
print("✅ Applied label encoding!")


✅ Applied label encoding!


In [None]:
print("📊 Unique Labels in test_df['label']:", test_df["label"].unique())
print("📊 Unique Encoded Labels:", test_df["label_encoded"].unique())


📊 Unique Labels in test_df['label']: ['Refuted' 'Supported' 'Not Enough Evidence'
 'Conflicting Evidence/Cherrypicking']
📊 Unique Encoded Labels: [1 0 2 3]


In [None]:
import json
import pandas as pd

# Load train.json again
with open("dev.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

train_df = pd.DataFrame(train_data)  # Ensure all data is loaded
print(f"✅ Loaded dev.json with {len(test_df)} samples.")


✅ Loaded dev.json with 500 samples.


In [None]:
print("📊 Unique Labels in train_df['label']:", train_df["label"].unique())


📊 Unique Labels in train_df['label']: ['Refuted' 'Supported' 'Not Enough Evidence'
 'Conflicting Evidence/Cherrypicking']


In [None]:
print("📊 Unique Labels in train_df['label']:", test_df["label"].unique())


📊 Unique Labels in train_df['label']: ['Refuted' 'Supported' 'Not Enough Evidence'
 'Conflicting Evidence/Cherrypicking']


In [None]:
# Define correct label mapping
label_map = {
    "Supported": 0,
    "Refuted": 1,
    "Not Enough Evidence": 2,
    "Conflicting Evidence/Cherrypicking": 3  # Ensure correct mapping
}

# Apply encoding safely
train_df.loc[:, "label_encoded"] = train_df["label"].map(label_map)

# Verify encoding
print("✅ Label encoding applied correctly!")
print(train_df["label_encoded"].unique())  # Should now show `[0, 1, 2, 3]`


✅ Label encoding applied correctly!
[1 0 2 3]


In [None]:
print("📊 Columns in train_df:", train_df.columns)
print("📊 Columns in test_df:", test_df.columns)


📊 Columns in train_df: Index(['claim', 'required_reannotation', 'label', 'justification',
       'claim_date', 'speaker', 'original_claim_url', 'fact_checking_article',
       'reporting_source', 'location_ISO_code', 'claim_types',
       'fact_checking_strategies', 'questions', 'cached_original_claim_url',
       'label_encoded'],
      dtype='object')
📊 Columns in test_df: Index(['claim', 'required_reannotation', 'label', 'justification',
       'claim_date', 'speaker', 'original_claim_url', 'fact_checking_article',
       'reporting_source', 'location_ISO_code', 'claim_types',
       'fact_checking_strategies', 'questions', 'cached_original_claim_url',
       'label_encoded'],
      dtype='object')


In [None]:
# Define label mapping
label_map = {
    "Supported": 0,
    "Refuted": 1,
    "Not Enough Evidence": 2,
    "Conflicting Evidence/Cherrypicking": 3  # Ensure exact match
}

# Apply encoding safely
train_df.loc[:, "label_encoded"] = train_df["label"].map(label_map)
test_df.loc[:, "label_encoded"] = test_df["label"].map(label_map)

# Verify encoding
print("✅ Label encoding applied correctly!")
print(train_df["label_encoded"].unique())  # Should show `[0, 1, 2, 3]`
print(test_df["label_encoded"].unique())  # Should also show `[0, 1, 2, 3]`


✅ Label encoding applied correctly!
[1 0 2 3]
[1 0 2 3]


STEP 5: Feature Extraction using TF-IDF

In [None]:

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df["claim"])
X_test = vectorizer.transform(test_df["claim"])
y_train = train_df["label_encoded"]
y_test = test_df["label_encoded"]


STEP 6: Train Logistic Regression Model

In [None]:

clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [None]:

print("🔍 Model Evaluation:")
print(classification_report(y_test, y_pred))

In [None]:
# ---- STEP 7: Evaluate Model Performance ----
print("🔍 Model Evaluation:")
print(classification_report(y_test, y_pred))

🔍 Model Evaluation:
              precision    recall  f1-score   support

           0       0.98      0.48      0.64       122
           1       0.71      1.00      0.83       305
           2       1.00      0.29      0.44        35
           3       0.00      0.00      0.00        38

    accuracy                           0.75       500
   macro avg       0.67      0.44      0.48       500
weighted avg       0.74      0.75      0.69       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ev2r for LR


In [None]:
# ---- STEP 8: Convert Predictions to JSON for Ev2R Scoring ----
reverse_label_map = {v: k for k, v in label_map.items()}

# Ensure 'retrieved_evidence' column exists before proceeding
if 'retrieved_evidence' not in test_df.columns:
    # If the column doesn't exist, create it and fill it (you might want to call your retrieve_best_evidence function here).
    # Example: Fill with a default value
    test_df['retrieved_evidence'] = 'No evidence found'  # Or call retrieve_best_evidence here to populate it
    print("⚠️ 'retrieved_evidence' column was missing. It has been added with a default value.")

# Prepare JSON structure for evaluation
predictions_json = [
    {"claim": row["claim"], "evidence": [{"question": "Claim validation?", "answer": row["retrieved_evidence"], "url": "https://example.com"}], "pred_label": reverse_label_map[pred]}
    for row, pred in zip(test_df.to_dict(orient="records"), y_pred)
]

# Save predictions
with open("dev_veracity_prediction.json", "w") as f:
    json.dump(predictions_json, f, indent=4)

print("✅ Predictions saved as dev_veracity_prediction.json")

⚠️ 'retrieved_evidence' column was missing. It has been added with a default value.
✅ Predictions saved as dev_veracity_prediction.json


In [None]:
import os
print(os.listdir("/content/"))  # List available files


['.config', 'dev_veracity_prediction.json', 'train.json', 'evaluate_veracity.py', 'dev.json', 'sample_data']


In [None]:
import json

# Load dev predictions (predicted labels & evidence)
with open("dev_veracity_prediction.json", "r") as f:
    predictions = json.load(f)

# Print sample predictions
print(json.dumps(predictions[:10], indent=4))  # View first 5 predictions


[
    {
        "claim": "In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.",
        "evidence": [
            {
                "question": "Claim validation?",
                "answer": "No evidence found",
                "url": "https://example.com"
            }
        ],
        "pred_label": "Refuted"
    },
    {
        "claim": "Trump Administration claimed songwriter Billie Eilish Is Destroying Our Country In Leaked Documents",
        "evidence": [
            {
                "question": "Claim validation?",
                "answer": "No evidence found",
                "url": "https://example.com"
            }
        ],
        "pred_label": "Refuted"
    },
    {
        "claim": "Due to Imran Khan's criticism of Macron's comments on Islam, French authorities cancelled the visas of 183 Pakistani citizens and deported 118 from the country.",
        "evidence": [
            {
                "question": "Claim validation?",
          

In [None]:
import sys
sys.path.append("/content/evaluate_veracity.py") #code given in the averitec website to compute this score

In [None]:
from evaluate_veracity import AVeriTeCEvaluator

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

ev2r score for LR

In [None]:
scorer = AVeriTeCEvaluator()

# Load ground truth data for comparison
with open("dev.json", "r") as f:
    ground_truth_data = json.load(f)

# Compute scores against the ground truth
q_score = scorer.evaluate_questions_only(predictions, ground_truth_data)
print(f"📊 Question-Only Score: {q_score:.3f}")

qa_score = scorer.evaluate_questions_and_answers(predictions, ground_truth_data)
print(f"📊 Question-Answer Score: {qa_score:.3f}")


📊 Question-Only Score: 0.030
📊 Question-Answer Score: 0.020


In [None]:
import json

# Load existing predictions
with open("dev_veracity_prediction.json", "r") as f:
    predictions = json.load(f)

# Convert predictions to a dictionary for fast lookup
predicted_claims = {pred["claim"]: pred for pred in predictions}

# Load ground truth labels
with open("dev.json", "r") as f:
    references = json.load(f)

# Check for missing claims and add default predictions
missing_claims = [ref for ref in references if ref["claim"] not in predicted_claims]

for missing in missing_claims:
    predictions.append({
        "claim": missing["claim"],
        "pred_label": "Not Enough Evidence",  # Default label
        "evidence": [{"question": "N/A", "answer": "No evidence found", "url": "N/A"}]
    })

# Save fixed predictions
with open("dev_veracity_prediction_fixed.json", "w") as f:
    json.dump(predictions, f, indent=4)

print(f"✅ Added {len(missing_claims)} missing predictions. New file saved: dev_veracity_prediction_fixed.json")


✅ Added 0 missing predictions. New file saved: dev_veracity_prediction_fixed.json


In [None]:
from evaluate_veracity import AVeriTeCEvaluator

# Load the fixed predictions
with open("dev_veracity_prediction_fixed.json") as f:
    predictions = json.load(f)

# Load reference labels again
with open("dev.json") as f:
    references = json.load(f)

# Initialize evaluator
scorer = AVeriTeCEvaluator()

# Compute AVeriTeC Ev2R Score
ev2r_score = scorer.evaluate_averitec_score(predictions, references)

print("📊 AVeriTeC Ev2R Scores:")
for i, level in enumerate(scorer.averitec_reporting_levels):
    print(f" * Score @ {level}: {ev2r_score[i]:.3f}")


📊 AVeriTeC Ev2R Scores:
 * Score @ 0.1: 0.006
 * Score @ 0.2: 0.000
 * Score @ 0.25: 0.000
 * Score @ 0.3: 0.000
 * Score @ 0.4: 0.000
 * Score @ 0.5: 0.000


In [None]:
print(test_df[["claim", "retrieved_evidence"]].head(10))

                                               claim retrieved_evidence
0  In a letter to Steve Jobs, Sean Connery refuse...  No evidence found
1  Trump Administration claimed songwriter Billie...  No evidence found
2  Due to Imran Khan's criticism of Macron's comm...  No evidence found
3  UNESCO declared Nadar community as the most an...  No evidence found
4  Republican Matt Gaetz was part of a company th...  No evidence found
5  The United States of America and its Western a...  No evidence found
6  More than 225,000 people dead, 225,000. The es...  No evidence found
7  Why should you pay more taxes than Donald Trum...  No evidence found
8  You’re watching the cheaters and all those peo...  No evidence found
9  You see the number today? 33.1 GDP. The bigges...  No evidence found


In [None]:
from sklearn.metrics import classification_report

true_labels = [ref["label"] for ref in references]
pred_labels = [pred["pred_label"] for pred in predictions]

print("🔍 Classification Report:")
print(classification_report(true_labels, pred_labels))


🔍 Classification Report:
                                    precision    recall  f1-score   support

Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        38
               Not Enough Evidence       1.00      0.29      0.44        35
                           Refuted       0.71      1.00      0.83       305
                         Supported       0.98      0.48      0.64       122

                          accuracy                           0.75       500
                         macro avg       0.67      0.44      0.48       500
                      weighted avg       0.74      0.75      0.69       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
for pred, ref in zip(predictions[:5], references[:5]):
    print(f"🔍 CLAIM: {pred['claim']}")
    # Access the 'evidence' list and extract the answer from the first item
    retrieved_evidence = pred['evidence'][0]['answer'] if pred['evidence'] else "No evidence found"
    print(f"🔹 Retrieved Evidence: {retrieved_evidence}")
    print(f"✅ Reference Evidence: {ref['fact_checking_article']}\n")

🔍 CLAIM: In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.
🔹 Retrieved Evidence: No evidence found
✅ Reference Evidence: https://web.archive.org/web/20201130144023/https://checkyourfact.com/2020/11/03/fact-check-sean-connery-letter-steve-jobs-apple-1998/

🔍 CLAIM: Trump Administration claimed songwriter Billie Eilish Is Destroying Our Country In Leaked Documents
🔹 Retrieved Evidence: No evidence found
✅ Reference Evidence: https://web.archive.org/web/20201103001419/https://leadstories.com/hoax-alert/2020/11/fact-check-trump-administration-did-not-claim-songwriter-billie-eilish-was-destroying-country-in-leaked-documents.html

🔍 CLAIM: Due to Imran Khan's criticism of Macron's comments on Islam, French authorities cancelled the visas of 183 Pakistani citizens and deported 118 from the country.
🔹 Retrieved Evidence: No evidence found
✅ Reference Evidence: https://web.archive.org/web/20210629013122/https://www.indiatoday.in/fact-check/story/fact-check-fake-t

train xgb model

In [None]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load Train & Test Data
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

train_df = load_data("train.json")
test_df = load_data("dev.json")

# Ensure column names are correct
print("Train Columns:", train_df.columns)
print("Test Columns:", test_df.columns)

# Split train data for validation
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df["label"])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df["claim"])
X_val = vectorizer.transform(val_df["claim"])
X_test = vectorizer.transform(test_df["claim"])

y_train = train_df["label_encoded"]
y_val = val_df["label_encoded"]


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric="mlogloss")
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_val)
print("📊 XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



📊 XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.21      0.24       170
           1       0.57      0.76      0.66       349
           2       0.21      0.05      0.09        56
           3       0.21      0.08      0.11        39

    accuracy                           0.50       614
   macro avg       0.32      0.27      0.27       614
weighted avg       0.44      0.50      0.45       614



ev2r scores for XGB

In [None]:
import nltk
from rank_bm25 import BM25Okapi

# ... (your previous code to load data and prepare train_df) ...

# Tokenize evidence for BM25
train_df["tokenized_evidence"] = train_df["questions"].apply(lambda x: nltk.word_tokenize(str(x).lower()))

# Create BM25 index
bm25 = BM25Okapi(train_df["tokenized_evidence"])

def retrieve_best_evidence(claim):
    tokenized_claim = nltk.word_tokenize(claim.lower())
    scores = bm25.get_scores(tokenized_claim)
    best_evidence_idx = scores.argmax()
    return train_df.iloc[best_evidence_idx]["questions"] if scores.max() > 0 else "No evidence found"

# Apply retrieval to test_df
test_df["retrieved_evidence"] = test_df["claim"].apply(retrieve_best_evidence)


In [None]:
reverse_label_map = {v: k for k, v in label_map.items()}

predictions_json = [
    {
        "claim": row["claim"],
        "evidence": [
            {
                "question": "Claim validation?",
                "answer": " ".join([str(item) for item in row["retrieved_evidence"]]) if isinstance(row["retrieved_evidence"], list) else str(row["retrieved_evidence"]), # Convert each item to string before joining
                "url": row["fact_checking_article"] if pd.notna(row["fact_checking_article"]) else "No URL available"
            }
        ],
        "pred_label": reverse_label_map[pred]
    }
    for row, pred in zip(test_df.to_dict(orient="records"), xgb_model.predict(X_test))  # Using XGBoost predictions
]

with open("devv_veracity_prediction.json", "w") as f:
    json.dump(predictions_json, f, indent=4)

print("✅ Predictions saved as devv_veracity_prediction.json") # corrected the output file name

✅ Predictions saved as devv_veracity_prediction.json


In [None]:
from evaluate_veracity import AVeriTeCEvaluator

# Load reference labels (if available)
with open("dev.json") as f:
    references = json.load(f)

scorer = AVeriTeCEvaluator()
averitec_score = scorer.evaluate_averitec_score(predictions_json, references)

# Print scores
for i, level in enumerate(scorer.averitec_reporting_levels):
    print(f"📊 Ev2R Score @ {level}: {averitec_score[i]}")


📊 Ev2R Score @ 0.1: 0.746
📊 Ev2R Score @ 0.2: 0.514
📊 Ev2R Score @ 0.25: 0.402
📊 Ev2R Score @ 0.3: 0.34
📊 Ev2R Score @ 0.4: 0.19
📊 Ev2R Score @ 0.5: 0.134


Train Random Forest Classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_val)
print("📊 Random Forest Classification Report:\n", classification_report(y_val, y_pred_rf))


📊 Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.02      0.03       170
           1       0.57      1.00      0.73       349
           2       0.00      0.00      0.00        56
           3       0.00      0.00      0.00        39

    accuracy                           0.57       614
   macro avg       0.39      0.25      0.19       614
weighted avg       0.60      0.57      0.42       614



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
