In [2]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

class DynamicEvidenceDataset(Dataset):
    """Claim + retrieved-evidence dataset for sequence classification.

    Items are assembled on demand: the claim text is looked up in the claims
    file, the texts of its evidence ids are looked up in the evidence file, and
    both are concatenated into one string that is tokenized to a fixed length.

    The dataset is driven by the eval file: its key order defines the item
    order and its number of keys defines the dataset length.

    Args:
        eval_path: JSON file mapping claim id -> dict that may hold an
            'evidences' list of evidence ids.
        claim_path: JSON file mapping claim id -> dict with 'claim_text' and
            'claim_label'.
        evidence_path: JSON file mapping evidence id -> evidence text.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping of
            tensors with a leading batch dimension of 1.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, eval_path, claim_path, evidence_path, tokenizer, max_len=512):
        self.eval_data = self.load_data(eval_path)
        self.claim_data = self.load_data(claim_path)
        self.evidence_data = self.load_data(evidence_path)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
        # Snapshot the claim ids once: gives O(1) positional lookup in
        # __getitem__ and a single source of truth for __len__.
        self.claim_ids = list(self.eval_data.keys())

    def load_data(self, path):
        """Read and return the JSON document stored at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def __len__(self):
        # Must match what __getitem__ indexes (the eval file), not the claims
        # file: the two files are not guaranteed to have the same size.
        return len(self.claim_ids)

    def __getitem__(self, idx):
        """Return the tokenized claim/evidence pair at position ``idx``.

        Returns:
            dict with the tokenizer's outputs (batch dimension squeezed away)
            plus 'labels', the integer class id of the claim.

        Raises:
            KeyError: if the claim id is absent from the claims file, or its
                'claim_label' is not one of the four known labels.
        """
        claim_id = self.claim_ids[idx]
        claim = self.claim_data[claim_id]
        evidence_ids = self.eval_data[claim_id].get('evidences', [])

        # Unknown evidence ids contribute an empty string rather than failing.
        evidence_texts = [self.evidence_data.get(e_id, "") for e_id in evidence_ids]
        evidence = " [SEP] ".join(evidence_texts)

        # Explicit CLAIM:/EVIDENCE: markers are part of the input format.
        text = "CLAIM: " + claim['claim_text'] + " [SEP] EVIDENCE: " + evidence
        encoded = self.tokenizer(text,
                                 truncation=True, padding='max_length',
                                 max_length=self.max_len, return_tensors='pt')

        # return_tensors='pt' yields shape (1, max_len); drop the batch axis so
        # the DataLoader can stack items itself.
        item = {k: v.squeeze(0) for k, v in encoded.items()}
        item['labels'] = self.label_map[claim['claim_label']]
        return item


def create_dataloader(eval_path, claim_path, evidence_path, tokenizer, batch_size=16, max_len=512):
    """Wrap a DynamicEvidenceDataset in a sequential DataLoader.

    Shuffling is disabled, so batches come out in the dataset's own order.

    Args:
        eval_path, claim_path, evidence_path: JSON inputs forwarded to
            DynamicEvidenceDataset.
        tokenizer: Tokenizer forwarded to DynamicEvidenceDataset.
        batch_size: Number of items per batch.
        max_len: Fixed token length forwarded to DynamicEvidenceDataset.

    Returns:
        torch.utils.data.DataLoader over the constructed dataset.
    """
    return DataLoader(
        DynamicEvidenceDataset(eval_path, claim_path, evidence_path, tokenizer, max_len),
        batch_size=batch_size,
        shuffle=False,
    )


def evaluate_model(model, dataloader):
    """Run inference over ``dataloader`` and collect predictions and labels.

    Batches are moved to whatever device the model's parameters live on, so
    the function works unchanged on GPU and CPU.

    Args:
        model: torch.nn.Module called as
            ``model(input_ids, attention_mask=attention_mask)`` whose result
            exposes a ``logits`` attribute of shape (batch, num_classes).
        dataloader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        (all_preds, all_labels, inference_time): two flat lists of integer
        class ids in iteration order, and the wall-clock seconds spent in the
        loop (this includes the time the dataloader takes to produce batches).
    """
    model.eval()

    # Follow the model's device; only guess when the model has no parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    all_preds = []
    all_labels = []
    start_time = time.time()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            # .tolist() copies to host (synchronising with the GPU) and yields
            # plain Python ints.
            all_preds.extend(preds.tolist())
            all_labels.extend(batch['labels'].tolist())

    inference_time = time.time() - start_time

    return all_preds, all_labels, inference_time


def run_evaluation(eval_path, claim_path, evidence_path, model, tokenizer, output_path, batch_size=16, max_len=512):
    """Evaluate ``model`` on the eval set, save its predictions and print metrics.

    Args:
        eval_path: JSON file mapping claim id -> dict with an 'evidences' list;
            its key order is the order predictions are produced in.
        claim_path: JSON file with claim texts and gold labels.
        evidence_path: JSON file mapping evidence id -> text.
        model: Sequence classifier passed to evaluate_model.
        tokenizer: Tokenizer passed to the dataset.
        output_path: Destination JSON file; receives, per claim id, the
            evidence ids from the eval file and the predicted label name.
        batch_size: Batch size for inference.
        max_len: Fixed token length for tokenization.

    Raises:
        ValueError: if the number of predictions differs from the number of
            claims in the eval file, so predictions cannot be paired with ids.
    """
    label_map = {0: 'SUPPORTS', 1: 'REFUTES', 2: 'NOT_ENOUGH_INFO', 3: 'DISPUTED'}

    dataloader = create_dataloader(eval_path, claim_path, evidence_path, tokenizer, batch_size, max_len)
    preds, labels, inference_time = evaluate_model(model, dataloader)

    # Weighted averages: per-class scores weighted by class support.
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    with open(eval_path, 'r') as f:
        eval_data = json.load(f)

    # Predictions are paired with claim ids purely by position, so a length
    # mismatch would attach labels to the wrong claims.
    if len(preds) != len(eval_data):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(eval_data)} claims in {eval_path}"
        )

    # Prepare output dictionary (key: claim_id)
    output_data = {
        claim_id: {
            # .get mirrors the dataset, which also tolerates a missing list.
            "evidences": entry.get("evidences", []),
            "claim_label": label_map[int(pred)],
        }
        for (claim_id, entry), pred in zip(eval_data.items(), preds)
    }

    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=4)

    print(f"Predictions saved to {output_path}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Total Inference Time: {inference_time:.2f} seconds")



BERT Model Evaluation

In [9]:
from transformers import BertTokenizer, BertForSequenceClassification

# --- BERT run configuration -------------------------------------------------
# Fine-tuned weights (a state dict) loaded on top of the base BERT checkpoint.
state_dict_path = "/kaggle/input/bert_base_pretrained/pytorch/default/1/baseline_bert_model_Autocast_explicitMarker_LR5e05.pt"
# Evidence predictions per claim; the file has no extension but is parsed as JSON.
eval_path = "/kaggle/input/evidence-prediction/MyPredictions"
# Claims with their text and gold label.
claim_path = "/kaggle/input/dev-claims/dev-claims.json"
# Evidence id -> evidence text lookup.
evidence_path = "/kaggle/input/evidence/evidence.json"
# Where the predicted labels are written as JSON.
output_path = "/kaggle/working/BERT_prediction.json"
# Inference batch size handed to the DataLoader.
batch_size = 16
# Every input is truncated/padded to this many tokens.
max_len = 512

def load_model_and_tokenizer(state_dict_path, model_name="bert-base-uncased", num_labels=4):
    """Build a BERT sequence classifier and optionally load fine-tuned weights.

    The tokenizer and the model architecture come from ``model_name``; the
    fine-tuned weights, if any, come from the state-dict file.

    Args:
        state_dict_path: Path to a file saved with ``torch.save(state_dict)``.
            A falsy value skips loading and leaves the classification head
            randomly initialised.
        model_name: Hugging Face checkpoint name or local directory.
        num_labels: Number of output classes of the classification head.

    Returns:
        (model, tokenizer), with the model moved to CUDA when available and
        otherwise left on the CPU.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # The base checkpoint has no classifier head, so transformers warns that
    # it is newly initialised; the state dict loaded below overwrites it.
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    if state_dict_path:
        # map_location='cpu' makes the load independent of the device the
        # checkpoint was saved from; the model is moved afterwards.
        # NOTE(security): torch.load unpickles the file - only load trusted
        # checkpoints (consider weights_only=True where the torch version
        # supports it).
        state_dict = torch.load(state_dict_path, map_location='cpu')
        # Strict by default: raises if the keys do not match the model.
        model.load_state_dict(state_dict)
        print("successfully loaded model")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    return model, tokenizer


# Instantiate the fine-tuned BERT classifier together with its tokenizer.
model, tokenizer = load_model_and_tokenizer(state_dict_path)

# Score the dev claims, write the predictions file and print the metrics.
run_evaluation(
    eval_path=eval_path,
    claim_path=claim_path,
    evidence_path=evidence_path,
    model=model,
    tokenizer=tokenizer,
    output_path=output_path,
    batch_size=batch_size,
    max_len=max_len,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


successfully loaded model
Predictions saved to /kaggle/working/BERT_prediction.json
Accuracy: 0.4675
Precision: 0.3534
Recall: 0.4675
F1 Score: 0.4024
Total Inference Time: 4.70 seconds


DeBERTa Model Evaluation

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- DeBERTa run configuration (rebinds the names used by the BERT cell) ----
# Fine-tuned weights (a state dict) loaded on top of the base DeBERTa-v3 checkpoint.
state_dict_path = "/kaggle/input/deberta-v3-pretrained/pytorch/default/1/deBERTa_v3_best_model.pt"
# Evidence predictions per claim; the file has no extension but is parsed as JSON.
eval_path = "/kaggle/input/evidence-prediction/MyPredictions"
# Claims with their text and gold label.
claim_path = "/kaggle/input/dev-claims/dev-claims.json"
# Evidence id -> evidence text lookup.
evidence_path = "/kaggle/input/evidence/evidence.json"
# Where the predicted labels are written as JSON.
output_path = "/kaggle/working/deBERTa_prediction.json"
# Inference batch size handed to the DataLoader.
# NOTE(review): smaller than the BERT run's 16, presumably for GPU memory - confirm.
batch_size = 4
# Every input is truncated/padded to this many tokens.
max_len = 512

def load_model_and_tokenizer(state_dict_path, model_name="microsoft/deberta-v3-base", num_labels=4):
    """Build a DeBERTa-v3 sequence classifier and optionally load fine-tuned weights.

    The tokenizer and the model architecture come from ``model_name``; the
    fine-tuned weights, if any, come from the state-dict file. This definition
    replaces any earlier ``load_model_and_tokenizer`` in the kernel.

    Args:
        state_dict_path: Path to a file saved with ``torch.save(state_dict)``.
            A falsy value skips loading and leaves the newly initialised
            layers untrained.
        model_name: Hugging Face checkpoint name or local directory.
        num_labels: Number of output classes of the classification head.

    Returns:
        (model, tokenizer), with the model moved to CUDA when available and
        otherwise left on the CPU.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The base checkpoint has no pooler/classifier weights, so transformers
    # warns that they are newly initialised; the state dict loaded below
    # overwrites them.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    if state_dict_path:
        # map_location='cpu' makes the load independent of the device the
        # checkpoint was saved from; the model is moved afterwards.
        # NOTE(security): torch.load unpickles the file - only load trusted
        # checkpoints (consider weights_only=True where the torch version
        # supports it).
        state_dict = torch.load(state_dict_path, map_location='cpu')
        # Strict by default: raises if the keys do not match the model.
        model.load_state_dict(state_dict)
        print("successfully loaded model")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    return model, tokenizer


# Instantiate the fine-tuned DeBERTa classifier together with its tokenizer.
model, tokenizer = load_model_and_tokenizer(state_dict_path)

# Score the dev claims, write the predictions file and print the metrics.
run_evaluation(
    eval_path=eval_path,
    claim_path=claim_path,
    evidence_path=evidence_path,
    model=model,
    tokenizer=tokenizer,
    output_path=output_path,
    batch_size=batch_size,
    max_len=max_len,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


successfully loaded model
Predictions saved to /kaggle/working/deBERTa_prediction.json
Accuracy: 0.4481
Precision: 0.3525
Recall: 0.4481
F1 Score: 0.3858
Total Inference Time: 7.30 seconds
