# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme

We perform **individual model training** for both the evidence retrieval and the classification tasks. All trained models are saved under the Pretrained_Model directory.
- The training script and log for the evidence retrieval model (Sentence Transformer) are saved in the evidence_retrieval_model_training.py file.
- The training scripts and logs for all classification models (T5, BERT, deBERTa) are saved under the /Classification_Model_Training directory.

To execute the **overall processing pipeline**, simply run this file to generate the prediction files under the /Evaluation directory:
- new_BERT_prediction.json: prediction generated with BERT model
- deBERTa_prediction.json: prediction generated with deBERTa model

Finally, to **evaluate the prediction files**, run the commands below:

```bash
cd Evaluation
python eval.py --predictions ./BERT_prediction.json --groundtruth ../data/dev-claims.json
python eval.py --predictions ./deBERTa_prediction.json --groundtruth ../data/dev-claims.json
```


# 1. Dataset Processing

Basic Loader for Claims and Evidences

In [None]:
import json
# import random
import numpy as np
from typing import Dict, List, Tuple
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_recall_fscore_support
import faiss

# === Paths and model identifiers ===
TRAIN_CLAIMS_PATH = "data/train-claims.json"
DEV_CLAIMS_PATH = "data/dev-claims.json"
EVIDENCE_PATH = "data/evidence.json"
# Sentence-transformer checkpoint loaded for retrieval in the evaluation cell.
FINETUNED_MODEL_PATH = "Pretrained_Model/bge-finetuned"
BASE_MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Retrieval predictions are written to this path and later passed to
# run_evaluation as the predicted-evidence file for the classifiers.
PREDICTIONS_SAVE_PATH = "Evidence_Prediction/MyPredictions"

# === Retrieval hyperparameters ===
BATCH_SIZE = 16
TOP_K = 5 # Neighbours requested per claim: min 1 evidence and max 5 evidences are kept
SIMILARITY_THRESHOLD = 0.90 # Cut-off compared against the FAISS search score when selecting evidences

# === Loaders ===
def load_claims(path: str) -> Dict:
    """Read the claims JSON file at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 (the encoding JSON is specified
    in) so that non-ASCII claim text loads identically on every platform
    instead of depending on the locale's default codec.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def load_evidences(path: str) -> Dict:
    """Read the evidence JSON file at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so that non-ASCII evidence text
    does not depend on the platform's default codec.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


Claim-Predicted Evidence Loader for Classification Task

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import time

# Combined predicted evidence ID, dev-claim and evidence to construct data loader for classification task
class DynamicEvidenceDataset(Dataset):
    """Dataset that pairs each claim with the text of its predicted evidences.

    Items are enumerated in the key order of the predicted-evidence file, and
    the dataset length is the number of claims in that file, so indexing and
    ``len()`` always agree. Each item is the tokenizer output for
    ``"CLAIM: <claim> [SEP] EVIDENCE: <ev1> [SEP] <ev2> ..."`` plus an integer
    ``labels`` entry taken from the claim's ``claim_label``.

    Args:
        predicted_evidence_path: JSON file mapping claim id -> {"evidences": [...]}.
        claim_path: JSON file mapping claim id -> record with ``claim_text``
            and ``claim_label``.
        evidence_path: JSON file mapping evidence id -> evidence text.
        tokenizer: callable invoked with the text and ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword arguments.
        max_len: value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, predicted_evidence_path, claim_path, evidence_path, tokenizer, max_len=512):
        self.predicted_evidence_ID = self.load_data(predicted_evidence_path)
        self.claim_data = self.load_data(claim_path)
        self.evidence_data = self.load_data(evidence_path)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
        # Freeze the iteration order once: avoids rebuilding the key list on
        # every __getitem__ call and gives __len__ the same source of truth.
        self.claim_ids = list(self.predicted_evidence_ID)

    def load_data(self, path):
        """Return the parsed JSON content of ``path`` (decoded as UTF-8)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def __len__(self):
        # Must match what __getitem__ can index, i.e. the predicted claims.
        return len(self.claim_ids)

    def __getitem__(self, idx):
        """Return the tokenized claim/evidence pair and label for position ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the predicted claims.
            KeyError: if the claim id is missing from the claims file or its
                label is not one of the four known labels.
        """
        claim_id = self.claim_ids[idx]
        pred_evidences_ID = self.predicted_evidence_ID[claim_id].get('evidences', [])

        # Fetch claim text
        claim_record = self.claim_data[claim_id]
        claim_text = claim_record['claim_text']

        # Fetch evidence texts; unknown evidence ids contribute an empty string
        evidence_texts = [self.evidence_data.get(e_id, "") for e_id in pred_evidences_ID]
        evidence = " [SEP] ".join(evidence_texts)

        # Construct input text -- explicit marker with "Claim" and "Evidence"
        inputs = self.tokenizer("CLAIM: " + claim_text + " [SEP] EVIDENCE: " + evidence,
                                truncation=True, padding='max_length',
                                max_length=self.max_len, return_tensors='pt')

        # The tokenizer returns a leading batch dimension of 1; drop it so the
        # DataLoader can stack items itself.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs['labels'] = self.label_map[claim_record['claim_label']]
        return inputs



def create_dataloader(predicted_evidence_path, claim_path, evidence_path, tokenizer, batch_size=16, max_len=512):
    """Return a fixed-order DataLoader over a DynamicEvidenceDataset.

    The dataset is built from the three JSON paths and the tokenizer;
    shuffling is disabled so batches follow the dataset's own ordering.
    """
    return DataLoader(
        DynamicEvidenceDataset(
            predicted_evidence_path, claim_path, evidence_path, tokenizer, max_len
        ),
        batch_size=batch_size,
        shuffle=False,
    )

# 2. Model Implementation

Evidence Retrieval

In [None]:
def predictions(cand_map: Dict[str, List[str]]) -> Dict[str, dict]:
    """Wrap each claim's evidence-id list in the prediction-file record shape.

    Every ``claim_id -> [evidence ids]`` entry becomes
    ``claim_id -> {"evidences": [evidence ids]}``.
    """
    wrapped: Dict[str, dict] = {}
    for claim_id, evidence_ids in cand_map.items():
        wrapped[claim_id] = {"evidences": evidence_ids}
    return wrapped

def evaluate(pred: dict, actual: dict):
    """Compute micro-averaged retrieval recall, precision and F1.

    For every claim in ``actual`` the gold evidence set is compared with the
    predicted evidence set (empty when the claim is absent from ``pred``), and
    true positives, false positives and false negatives are summed over all
    claims before the ratios are taken.

    Every predicted evidence id that is not gold for its claim counts as a
    false positive, including ids that are not gold for any claim at all.
    Discarding those ids before scoring would remove false positives and
    overstate precision.

    Args:
        pred: claim id -> {"evidences": [evidence ids]}.
        actual: claim id -> record containing an ``"evidences"`` list.

    Returns:
        ``(recall, precision, f1)`` as floats; each is 0.0 when its
        denominator is zero.

    Raises:
        KeyError: if a record in ``actual`` has no ``"evidences"`` entry.
    """
    true_pos = false_pos = false_neg = 0

    for claim_id, gold_record in actual.items():
        gold = set(gold_record["evidences"])
        predicted = set(pred.get(claim_id, {}).get("evidences", []))

        hits = len(gold & predicted)
        true_pos += hits
        false_pos += len(predicted) - hits
        false_neg += len(gold) - hits

    prec = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    rec = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

    return rec, prec, f1


def faiss_candidates(
    claims: Dict[str, dict], evidences: Dict[str, str],
    model: SentenceTransformer, top_k: int, device: str = 'cpu'
) -> Dict[str, List[str]]:
    """Retrieve evidence ids for every claim with a FAISS HNSW index.

    Claims and non-empty evidences are embedded with ``model`` using
    normalized embeddings. The evidences are indexed in an ``IndexHNSWFlat``
    and the ``top_k`` nearest evidences are searched per claim. Neighbours
    whose cosine similarity is at least ``SIMILARITY_THRESHOLD`` are kept;
    if none qualifies, the single nearest neighbour is kept so that each
    claim gets at least one evidence whenever the index returned any hit.

    Args:
        claims: claim id -> record with a ``claim_text`` entry.
        evidences: evidence id -> evidence text; empty texts are skipped.
        model: sentence-transformer used for encoding.
        top_k: number of neighbours requested per claim.
        device: device string passed to ``model.encode`` (default ``'cpu'``).

    Returns:
        claim id -> list of selected evidence ids, nearest first.
    """
    claim_ids = list(claims)
    claim_texts = [claims[cid]["claim_text"] for cid in claim_ids]

    # Drop evidences with empty text; keep ids and texts aligned by position.
    usable = [(eid, txt) for eid, txt in evidences.items() if txt]
    evidence_ids = [eid for eid, _ in usable]
    evidence_texts = [txt for _, txt in usable]

    # Nothing to search for or nothing to search in: no candidates.
    if not claim_ids or not evidence_ids:
        return {cid: [] for cid in claim_ids}

    emb_e = model.encode(evidence_texts, batch_size=64, show_progress_bar=True,
                         convert_to_numpy=True, normalize_embeddings=True, device=device)

    emb_c = model.encode(claim_texts, batch_size=64, show_progress_bar=True,
                         convert_to_numpy=True, normalize_embeddings=True, device=device)

    d = emb_e.shape[1]
    index = faiss.IndexHNSWFlat(d, 32)
    index.hnsw.efConstruction = 200

    print("Adding embeddings to FAISS index in batches")

    total = len(emb_e)
    next_decile = 1  # next 10% step that has not been reported yet

    for i in range(0, total, 100):
        try:
            batch = emb_e[i:i+100]
            index.add(batch)
        except Exception as e:
            # Best effort: keep whatever was indexed so far and search that.
            print(f"Failed at batch {i}: {e}")
            break
        current = i + len(batch)
        # Report when a 10% boundary is crossed; batch ends rarely land
        # exactly on a boundary, so an equality test would stay silent.
        reached = current * 10 // total
        if reached >= next_decile:
            print(f"{int(current / total * 100)}% complete ({current} / {total})", flush=True)
            next_decile = reached + 1

    if index.ntotal == total:
        print(f"Finished adding all vectors")
    else:
        print(f"Warning: only {index.ntotal} / {total} vectors were added to the index")

    print("Performing FAISS search (per claim)")

    I_all = []

    for emb in tqdm(emb_c, desc="Searching claims"):
        sq_dists, I = index.search(np.expand_dims(emb, axis=0), top_k)
        I_all.append(
            _select_evidence_rows(I[0], sq_dists[0], len(evidence_ids), SIMILARITY_THRESHOLD)
        )

    print(f"Search done")

    # Create dict structure for the results
    cand_map: Dict[str, List[str]] = {
        cid: [evidence_ids[j] for j in rows]
        for cid, rows in zip(claim_ids, I_all)
    }

    return cand_map


def _select_evidence_rows(rows, sq_dists, n_rows, threshold):
    """Return index rows with cosine similarity >= ``threshold``, else the nearest valid row.

    ``IndexHNSWFlat`` built without a metric argument uses L2 and reports
    squared L2 distances (smaller is closer). For unit-length vectors
    ``||a - b||^2 = 2 - 2 * cos(a, b)``, so the distance is converted to a
    cosine similarity before the threshold is applied. Rows outside
    ``[0, n_rows)`` are dropped: FAISS pads missing results with ``-1``,
    which would otherwise index the last evidence.
    """
    valid = [
        (int(row), 1.0 - float(dist) / 2.0)
        for row, dist in zip(rows, sq_dists)
        if 0 <= row < n_rows
    ]
    chosen = [row for row, similarity in valid if similarity >= threshold]
    if not chosen and valid:
        # Results arrive nearest-first, so the first valid row is the best one.
        chosen = [valid[0][0]]
    return chosen


Claim Classification

In [None]:
import json
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification
import time

# Run trained classification model on development set
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect predicted and true labels.

    Batches are moved to the device that already holds the model's parameters
    (CPU when the model has none), so the function works on CPU-only hosts as
    well as on GPU. The predicted class is the argmax of ``outputs.logits``
    along dimension 1.

    Args:
        model: callable module returning an object with a ``logits`` attribute
            when called as ``model(input_ids, attention_mask=...)``.
        dataloader: iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        ``(all_preds, all_labels, inference_time)``: two lists of ints in
        batch order and the elapsed wall-clock time in seconds.
    """
    model.eval()

    # Follow the model instead of assuming CUDA is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_preds = []
    all_labels = []
    start_time = time.time()

    with torch.no_grad():
        for batch in dataloader:

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only recorded, never fed to the model, so they do
            # not need a round trip through the compute device.
            all_labels.extend(batch['labels'].cpu().tolist())

    end_time = time.time()
    inference_time = end_time - start_time

    return all_preds, all_labels, inference_time


# Evaluation pipeline
def run_evaluation(predicted_evidence_path, claim_path, evidence_path, model, tokenizer,output_path, batch_size=16, max_len=512):
    """Classify every claim, report metrics and write the prediction file.

    Builds the dataloader with ``create_dataloader``, runs ``evaluate_model``,
    computes accuracy and weighted precision/recall/F1, then writes a JSON
    file mapping each claim id of the predicted-evidence file to its
    evidences and predicted label.

    Args:
        predicted_evidence_path: JSON file mapping claim id -> {"evidences": [...]}.
        claim_path: claims JSON file passed on to the dataloader.
        evidence_path: evidence JSON file passed on to the dataloader.
        model: classification model passed to ``evaluate_model``.
        tokenizer: tokenizer passed to the dataloader.
        output_path: destination of the prediction JSON; missing parent
            directories are created.
        batch_size: dataloader batch size.
        max_len: tokenizer ``max_length``.

    Raises:
        ValueError: if the number of predictions differs from the number of
            claims in the predicted-evidence file, since labels could then
            not be matched to claim ids by position.
    """
    import os  # local import: only needed to prepare the output directory

    label_map = {0: 'SUPPORTS', 1: 'REFUTES', 2: 'NOT_ENOUGH_INFO', 3: 'DISPUTED'}

    dataloader = create_dataloader(predicted_evidence_path, claim_path, evidence_path, tokenizer, batch_size, max_len)
    preds, labels, inference_time = evaluate_model(model, dataloader)

    # Calculate metrics. zero_division=0 keeps the score sklearn would report
    # for a class that is never predicted, without the undefined-metric warning.
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)

    # Prepare prediction output
    with open(predicted_evidence_path, 'r', encoding='utf-8') as f:
        predicted_evidence_ID = json.load(f)

    # Labels are matched to claim ids purely by position, so the two
    # sequences must have the same length.
    if len(preds) != len(predicted_evidence_ID):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(predicted_evidence_ID)} claims "
            f"in {predicted_evidence_path}"
        )

    output_data = {}
    for idx, claim_id in enumerate(predicted_evidence_ID):
        output_data[claim_id] = {
            "evidences": predicted_evidence_ID[claim_id]["evidences"],
            "claim_label": label_map[int(preds[idx])]
        }

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)

    print(f"Classification predictions saved to {output_path}")
    print(f"Classification Accuracy: {accuracy:.4f}")
    print(f"Classification Precision: {precision:.4f}")
    print(f"Classification Recall: {recall:.4f}")
    print(f"Classification F1 Score: {f1:.4f}")
    print(f"Total Classification Inference Time: {inference_time:.2f} seconds")



# 3. Testing and Evaluation

Evidence Retrieval

In [None]:
import os  # used only to create the predictions directory before saving

# Load the fine-tuned sentence-transformer used for retrieval
model = SentenceTransformer(FINETUNED_MODEL_PATH)

dev_claims = load_claims(DEV_CLAIMS_PATH)
evidences = load_evidences(EVIDENCE_PATH)

cand_map = faiss_candidates(dev_claims, evidences, model, TOP_K)
pred = predictions(cand_map) # Convert to appropriate predictions structure

# Save predictions file. The parent directory is created first so that a
# missing folder cannot discard the result of the long encoding step.
if PREDICTIONS_SAVE_PATH:
    _save_dir = os.path.dirname(PREDICTIONS_SAVE_PATH)
    if _save_dir:
        os.makedirs(_save_dir, exist_ok=True)
    with open(PREDICTIONS_SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(pred, f, indent=2)
    print(f"Saved predictions to {PREDICTIONS_SAVE_PATH}")

recall, precision, f1 = evaluate(pred, dev_claims) # evaluate results
print(f"Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

'''
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18888/18888 [6:02:03<00:00,  4.86it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.04it/s]
Adding embeddings to FAISS index in batches
10% complete (120882 / 1208827)
20% complete (241765 / 1208827)
30% complete (362648 / 1208827)
40% complete (483530 / 1208827)
50% complete (604413 / 1208827)
60% complete (725296 / 1208827)
70% complete (846179 / 1208827)
80% complete (967062 / 1208827)
90% complete (1087945 / 1208827)
100% complete (1208827 / 1208827)
Finished adding all vectors
Performing FAISS search (per claim)
Searching claims: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 154/154 [00:56<00:00,  2.75it/s]
Search done
Saved predictions to MyPredictions
Recall: 0.0896, Precision: 0.1800, F1: 0.1176
'''


Claim Classification with BERT model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Fine-tuned BERT weights (a state dict) passed to load_model_and_tokenizer below.
# NOTE(review): this path starts with "Pretrained_model" while FINETUNED_MODEL_PATH
# uses "Pretrained_Model" -- confirm the directory name on case-sensitive filesystems.
STATE_DICT_PATH = "Pretrained_model/new_bert_model_Autocast_explicitMarker_LR5e05.pt"
# Destination of the BERT classification predictions written by run_evaluation.
OUTPUT_PATH = "Prediction/bert_prediction.json"

# Dataloader batch size and tokenizer max_length forwarded to run_evaluation.
batch_size = 16
max_len = 512

# Load trained Bert Model and Tokenizer
def load_model_and_tokenizer(state_dict_path):
    """Build the BERT classifier and tokenizer, optionally loading fine-tuned weights.

    The model is placed on CUDA when it is available and on CPU otherwise,
    and the checkpoint is mapped to that same device while loading, so a
    checkpoint saved on a GPU can still be loaded on a CPU-only host.

    Args:
        state_dict_path: path to a saved state dict; when falsy the base
            ``bert-base-uncased`` weights with a fresh 4-label head are kept.

    Returns:
        ``(model, tokenizer)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
    if state_dict_path:
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        state_dict = torch.load(state_dict_path, map_location=device)
        model.load_state_dict(state_dict)
        print("successfully loaded finetuned BERT model")
    model.to(device)
    return model, tokenizer

# Load model and tokenizer.
# This rebinds the module-level `model`, which held the SentenceTransformer
# retriever in the evidence retrieval cell.
model, tokenizer = load_model_and_tokenizer(STATE_DICT_PATH)

# Run classification evaluation with BERT Model: reads the retrieval predictions
# from PREDICTIONS_SAVE_PATH and writes labelled predictions to OUTPUT_PATH.
run_evaluation(PREDICTIONS_SAVE_PATH, DEV_CLAIMS_PATH, EVIDENCE_PATH, model, tokenizer,OUTPUT_PATH, batch_size, max_len)

'''
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
successfully loaded finetuned BERT model
Classification Predictions saved to Prediction/bert_prediction.json
Classification Accuracy: 0.4610
Classification Precision: 0.3464
Classification Recall: 0.4610
Classification F1 Score: 0.3806
Total Classification Inference Time: 4.49 seconds
'''

Claim Classification with deBERTa-v3 model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fine-tuned deBERTa-v3 weights (a state dict) passed to load_model_and_tokenizer below.
# These assignments overwrite the STATE_DICT_PATH / OUTPUT_PATH / batch_size values
# set in the BERT cell.
# NOTE(review): this path starts with "Pretrained_model" while FINETUNED_MODEL_PATH
# uses "Pretrained_Model" -- confirm the directory name on case-sensitive filesystems.
STATE_DICT_PATH = "Pretrained_model/deBERTa_v3_best_model.pt"
# Destination of the deBERTa classification predictions written by run_evaluation.
OUTPUT_PATH = "Prediction/deBERTa_prediction.json"

# Dataloader batch size and tokenizer max_length forwarded to run_evaluation.
batch_size = 4
max_len = 512

# Load trained deBERTa Model and Tokenizer
def load_model_and_tokenizer(state_dict_path):
    """Build the deBERTa-v3 classifier and tokenizer, optionally loading fine-tuned weights.

    The model is placed on CUDA when it is available and on CPU otherwise,
    and the checkpoint is mapped to that same device while loading, so a
    checkpoint saved on a GPU can still be loaded on a CPU-only host.

    Args:
        state_dict_path: path to a saved state dict; when falsy the base
            ``microsoft/deberta-v3-base`` weights with a fresh 4-label head
            are kept.

    Returns:
        ``(model, tokenizer)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
    model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=4)
    if state_dict_path:
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        state_dict = torch.load(state_dict_path, map_location=device)
        model.load_state_dict(state_dict)
        print("successfully loaded finetuned deBERTa model")
    model.to(device)
    return model, tokenizer

# Load model and tokenizer.
# This rebinds the module-level `model` and `tokenizer` set by the BERT cell.
model, tokenizer = load_model_and_tokenizer(STATE_DICT_PATH)

# Run classification evaluation with deBERTa Model: reads the retrieval predictions
# from PREDICTIONS_SAVE_PATH and writes labelled predictions to OUTPUT_PATH.
run_evaluation(PREDICTIONS_SAVE_PATH, DEV_CLAIMS_PATH, EVIDENCE_PATH, model, tokenizer,OUTPUT_PATH, batch_size, max_len)

'''
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
successfully loaded finetuned deBERTa model
Classification Predictions saved to Prediction/deBERTa_prediction.json
Classification Accuracy: 0.4481
Classification Precision: 0.3525
Classification Recall: 0.4481
Classification F1 Score: 0.3858
Total Classification Inference Time: 7.49 seconds
'''

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*