# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those at the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import os
import json

# Dataset locations; every file lives directly under the shared data directory.
data_file = "data"
train_file, dev_file, evidence_file, test_file = (
    os.path.join(data_file, file_name)
    for file_name in (
        'train-claims.json',
        'dev-claims.json',
        'evidence.json',
        'test-claims-unlabelled.json',
    )
)

def remove_short_evidences(evidences, min_length=5):
    """Return a copy of ``evidences`` without the very short passages.

    Args:
        evidences: Mapping from evidence id to evidence text.
        min_length: Minimum number of characters a text needs in order to be
            kept (texts of exactly this length are kept).

    Returns:
        A new dict, in the original order, holding only the kept entries.
        The number of dropped entries is printed as a side effect.
    """
    kept = {}
    for evidence_id, passage in evidences.items():
        if len(passage) < min_length:
            continue
        kept[evidence_id] = passage
    dropped = len(evidences) - len(kept)
    print(f"Removed {dropped} evidences shorter than {min_length}")
    return kept

# Load every split explicitly as UTF-8. Without `encoding=`, open() falls back
# to the platform default (e.g. cp1252 on Windows), which can fail or silently
# mis-decode non-ASCII characters in the claim/evidence texts.

# Training claims: ids in file order, their texts, and id -> position lookup.
with open(train_file, 'r', encoding='utf-8') as f:
    tr_claims = json.load(f)
tr_numbers = list(tr_claims.keys())
tr_texts = [tr_claims[claim_id]['claim_text'] for claim_id in tr_numbers]
claim_number_to_tr_id = {claim_id: i for i, claim_id in enumerate(tr_numbers)}

# Development claims.
with open(dev_file, 'r', encoding='utf-8') as f:
    dev_claims = json.load(f)
dev_numbers = list(dev_claims.keys())
dev_texts = [dev_claims[claim_id]['claim_text'] for claim_id in dev_numbers]

# Evidence corpus, with very short passages removed before indexing.
with open(evidence_file, 'r', encoding='utf-8') as f:
    evidences = json.load(f)
evidences = remove_short_evidences(evidences)
evi_numbers = list(evidences.keys())
evidences_texts = [evidences[evidence_id] for evidence_id in evi_numbers]
# Two-way mapping between an evidence id string and its row position.
evi_number_to_evi_id = {evi_number: i for i, evi_number in enumerate(evi_numbers)}
evi_id_to_evi_number = {i: evi_number for i, evi_number in enumerate(evi_numbers)}

# Unlabelled test claims.
with open(test_file, 'r', encoding='utf-8') as f:
    test_claims = json.load(f)
ts_numbers = list(test_claims.keys())
ts_texts = [test_claims[claim_id]['claim_text'] for claim_id in ts_numbers]

print("Train claims:", len(tr_claims))
print("Dev claims:", len(dev_claims))
print("Evidences:", len(evidences))
print("Test claims:", len(test_claims))

Removed 564 evidences shorter than 5
Train claims: 1228
Dev claims: 154
Evidences: 1208263
Test claims: 153


In [2]:
# Hugging Face checkpoint name; the tokenizer below is loaded from it.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
import torch
from transformers import AutoTokenizer, AutoModel
# TODO: test other models

# Shared tokenizer used by the embedding helpers further down.
# NOTE(review): `max_length=512` is passed as a from_pretrained kwarg; the
# documented knob for the truncation limit is `model_max_length` - confirm
# this argument has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=512)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [3]:
#######TFIDF functions#######
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib

lemmatizer = WordNetLemmatizer()
# Build the stopword set once. The original fetched the stopword list again
# for every single token and then did a linear membership test on that list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def word_tokenize_and_lemmatize(text):
    """Tokenize ``text`` and return lemmatized, stopword-free alphabetic tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic or that are English stopwords are dropped.
    Each remaining token is lemmatized as a verb first and, only when that
    leaves it unchanged, as a noun.

    NOTE: the original fallback condition was ``token != token`` (always
    False), so the noun pass never ran. Any TF-IDF artefacts saved with the
    old tokenizer should be regenerated so vectors and vocabulary agree.

    Args:
        text: Raw input string.

    Returns:
        List of lemma strings in their original order.
    """
    def lemmatize_word(token):
        lemma = lemmatizer.lemmatize(token, pos='v')
        if lemma == token:
            # Verb lemmatization changed nothing: try the noun reading.
            lemma = lemmatizer.lemmatize(token, pos='n')
        return lemma

    return [
        lemmatize_word(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in _ENGLISH_STOPWORDS
    ]

def save_tfidf_model(text_list, path_name):
    """Fit a TF-IDF vectorizer on a text collection and persist the results.

    Args:
        text_list: Mapping from text id to raw text (a dict, despite the
            name); the texts are vectorized in the mapping's own order.
        path_name: Path prefix. The matrix is written to
            ``<path_name>_vectors.npz`` and the fitted vectorizer to
            ``<path_name>_vectorizer.pkl``.
    """
    documents = list(text_list.values())

    vectorizer = TfidfVectorizer(tokenizer=word_tokenize_and_lemmatize, max_features=2000)
    tfidf_matrix = vectorizer.fit_transform(documents)

    vector_path = str(path_name + "_vectors.npz")
    model_path = str(path_name + "_vectorizer.pkl")
    sparse.save_npz(vector_path, tfidf_matrix)
    joblib.dump(vectorizer, model_path)
    print(f"Saved TF-IDF vectors to '{vector_path}' and vectorizer to '{model_path}'")

def load_tfidf(path_name):
    """Load a TF-IDF matrix and its vectorizer saved under a path prefix.

    Args:
        path_name: Path prefix; ``<path_name>_vectors.npz`` and
            ``<path_name>_vectorizer.pkl`` are read.

    Returns:
        Tuple ``(tfidf_matrix, vectorizer)``.
    """
    vector_path, model_path = (
        str(path_name + suffix) for suffix in ("_vectors.npz", "_vectorizer.pkl")
    )
    tfidf_matrix = sparse.load_npz(vector_path)
    # joblib.load unpickles the file: only load artefacts you produced yourself.
    vectorizer = joblib.load(model_path)
    print(f"Loaded TF-IDF matrix from '{vector_path}' and vectorizer from '{model_path}'")
    return tfidf_matrix, vectorizer

[nltk_data] Downloading package stopwords to C:\Users\Salist
[nltk_data]     desk2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Salist
[nltk_data]     desk2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Salist
[nltk_data]     desk2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#run save tfidf
# save_tfidf_model(evidences, "data/evidences")

#load tfidf
# tfidf_vectors, tfidf_vectorizer = load_tfidf("data/evidences")

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
def get_topk_evidence_numbers(tr_claims_vectors, evidences_vectors, k=100, chunk_size=32):
    """Return, for every claim, the ids of its ``k`` most similar evidences.

    Similarity is cosine similarity between the claim and evidence vectors.
    Claims are processed ``chunk_size`` rows at a time so that only a
    ``chunk_size x n_evidences`` block of similarities is held in memory; the
    original built the whole ``n_claims x n_evidences`` dense matrix at once,
    which for ~1.2k claims and ~1.2M evidences is on the order of 10 GB.
    Each row is ranked independently, so the result does not depend on
    ``chunk_size``.

    Row positions are translated through the module-level
    ``evi_id_to_evi_number``; it must describe the same evidence ordering as
    ``evidences_vectors`` (that mapping is rebuilt later in the notebook).

    Args:
        tr_claims_vectors: Claim vectors, one row per claim.
        evidences_vectors: Evidence vectors, one row per evidence.
        k: Number of evidences to keep per claim.
        chunk_size: Number of claim rows scored per similarity call.

    Returns:
        List with one entry per claim: evidence ids ordered from most to
        least similar.
    """
    n_claims = tr_claims_vectors.shape[0]
    topk_evi_num = []
    for start in range(0, n_claims, chunk_size):
        sim_chunk = cosine_similarity(
            tr_claims_vectors[start:start + chunk_size], evidences_vectors
        )
        for row in sim_chunk:
            if k < len(row):
                # Partial selection of the k largest, then sort only those k.
                idx = np.argpartition(row, -k)[-k:]
                idx = idx[np.argsort(row[idx])[::-1]]
            else:
                idx = np.argsort(row)[::-1]
            topk_evi_num.append([evi_id_to_evi_number[j] for j in idx[:k]])
    print(f"Collected top {k} evidence ids for each claim, total claims: {len(topk_evi_num)}")
    return topk_evi_num

def get_all_filterd_evi_ids(*claims_dicts, topk_evi_num):
    """Collect every evidence id that is either gold or a retrieved candidate.

    Args:
        *claims_dicts: Any number of claim mappings (claim id -> claim dict).
            A claim's ``'evidences'`` list, when present, contributes its ids.
        topk_evi_num: Iterable of per-claim candidate id lists.

    Returns:
        Set of evidence ids from both sources.
    """
    gold_ids = {
        evi_num
        for claims in claims_dicts
        for claim in claims.values()
        for evi_num in claim.get('evidences', [])
    }
    candidate_ids = {evi_num for candidates in topk_evi_num for evi_num in candidates}
    return gold_ids | candidate_ids

# evidences_vectors, evidences_vectorizer = load_tfidf("data/evidences")
# tr_claims_vectors = evidences_vectorizer.transform([tr_claims[id]['claim_text'] for id in tr_numbers])
# topk_evi_num = get_topk_evidence_numbers(tr_claims_vectors, evidences_vectors, k=100)
# filtered_evi_ids = get_all_filterd_evi_ids(tr_claims, dev_claims, topk_evi_num=topk_evi_num)
# print(f"Filtered evidence ids: {len(filtered_evi_ids)}")
# with open(os.path.join("data", "topk_evi_num.json"), "w", encoding="utf-8") as f:
#     json.dump(list(topk_evi_num), f, ensure_ascii=False, indent=2)
# with open(os.path.join("data", "filtered_evi_ids.json"), "w", encoding="utf-8") as f:
#     json.dump(list(filtered_evi_ids), f, ensure_ascii=False, indent=2)

In [6]:
# Per-train-claim TF-IDF candidate lists, written as UTF-8 by the cell above.
with open("data/topk_evi_num.json", 'r', encoding="utf-8") as f:
    topk_evi_num = json.load(f)
# Filter evidences by high similarity from TFIDF: keep gold evidences of the
# train/dev claims plus every TF-IDF candidate. The original also loaded
# data/filtered_evi_ids.json first, but that value was overwritten on the very
# next statement, so the dead read (and its file dependency) is dropped.
filtered_evi_ids = get_all_filterd_evi_ids(tr_claims, dev_claims, topk_evi_num=topk_evi_num)
print(f"Filtered evidence ids: {len(filtered_evi_ids)}")
# Sort so the id <-> row-position mapping is identical on every run; iterating
# a set of strings directly gives an order that changes between interpreter runs.
evi_numbers = sorted(filtered_evi_ids)
evidences_texts = [evidences[evidence_id] for evidence_id in evi_numbers]
evi_number_to_evi_id = {evi_number: i for i, evi_number in enumerate(evi_numbers)}
evi_id_to_evi_number = {i: evi_number for i, evi_number in enumerate(evi_numbers)}

Filtered evidence ids: 48302


In [None]:
#######sbert functions#######
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

def generate_batched_embeddings_ts(model, texts, batch_size=32):
    """Embed ``texts`` in batches without tracking gradients.

    The model is switched to eval mode. Each batch is tokenized with the
    module-level ``tokenizer``, run through ``model`` on the model's device,
    and the hidden state at sequence position 0 is kept (presumably the
    [CLS] token - confirm for the chosen checkpoint).

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        texts: Sequence of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A CPU tensor with one (un-normalized) embedding row per text.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
            encoded = encoded.to(model.device)
            hidden_states = model(**encoded).last_hidden_state
            # Move each batch to the CPU straight away to keep device memory flat.
            chunks.append(hidden_states[:, 0, :].cpu())
            torch.cuda.empty_cache()
    return torch.cat(chunks)

# Returns the evidence-retrieval F1 score of every claim.
def evaluate_retrival(claims, top_evidence_id, evidence_numbers=None):
    """Score predicted evidences against each claim's gold evidences.

    Fixes over the original:
      * gold ids were converted to ``int`` and then looked up among the
        predicted id *strings*, so no prediction could ever match;
      * precision was divided by ``len(pred_evidence_num[i])`` - the length
        of one id string (or an IndexError) - instead of the number of
        predictions.

    Args:
        claims: Mapping claim id -> claim dict with an ``'evidences'`` list of
            gold evidence ids. Iteration order must match ``top_evidence_id``.
        top_evidence_id: Per-claim sequences of predicted row positions.
        evidence_numbers: Sequence translating a row position into an
            evidence id. Defaults to the module-level ``evi_numbers``.

    Returns:
        List of F1 scores (floats), one per claim, in ``claims`` order.
    """
    if evidence_numbers is None:
        evidence_numbers = evi_numbers

    claims_f1 = []
    for i, claim in enumerate(claims.values()):
        true_evidence_num = claim['evidences']
        pred_evidence_num = [evidence_numbers[idx] for idx in top_evidence_id[i]]
        pred_lookup = set(pred_evidence_num)

        correct = sum(1 for true_evidence in true_evidence_num if true_evidence in pred_lookup)

        fscore = 0.0
        if correct > 0:
            recall = correct / len(true_evidence_num)
            precision = correct / len(pred_evidence_num)
            fscore = (2 * precision * recall) / (precision + recall)
        claims_f1.append(fscore)
    return claims_f1

# Returns the average retrieval F1 over the given claims.
def calc_f1(claim_texts_indices, claims, evidences_text_indices, model, bathch_size=32):
    """Embed claims and evidences, pick evidences per claim, and average F1.

    For each claim, evidences are kept when their cosine similarity exceeds
    0.8 and lies within 4% of the claim's best similarity. More than five
    survivors are cut to the five most similar; with no survivor, the three
    most similar are used instead.

    Unlike the original, the model is put back into whichever mode
    (train/eval) it was in on entry rather than always being left in train
    mode, and the fallback ``topk`` size is clamped to the number of
    evidences so very small evidence sets do not raise.

    Args:
        claim_texts_indices: Claim texts, ordered like ``claims``.
        claims: Mapping claim id -> claim dict with gold ``'evidences'``.
        evidences_text_indices: Evidence texts, ordered like the module-level
            ``evi_numbers`` used by ``evaluate_retrival``.
        model: Encoder passed to ``generate_batched_embeddings_ts``.
        bathch_size: Batch size for embedding (the misspelt name is kept for
            backward compatibility with keyword callers).

    Returns:
        Mean of the per-claim F1 scores.
    """
    was_training = model.training
    model.eval()
    try:
        claim_embeddings = generate_batched_embeddings_ts(model, claim_texts_indices, batch_size=bathch_size)
        claim_embeddings_norm = F.normalize(claim_embeddings, p=2, dim=1)

        evidences_embeddings = generate_batched_embeddings_ts(model, evidences_text_indices, batch_size=bathch_size)
        evidences_embeddings_norm = F.normalize(evidences_embeddings, p=2, dim=1)

        # Rows are unit-length, so the dot product is the cosine similarity.
        cos_similarities = torch.matmul(claim_embeddings_norm, evidences_embeddings_norm.T)
        n_evidences = cos_similarities.shape[1]

        top_k_indices = []
        similarity_threshold = 0.8
        for sim_row in cos_similarities:
            max_sim = torch.max(sim_row)
            indices = torch.where(
                (sim_row > similarity_threshold) & ((max_sim - sim_row) < 0.04*max_sim)
            )[0].cpu().numpy()
            if len(indices) > 5:
                indices = torch.topk(sim_row, 5).indices.cpu().numpy()
            elif len(indices) == 0:
                indices = torch.topk(sim_row, min(3, n_evidences)).indices.cpu().numpy()
            top_k_indices.append(indices)

        claims_f1 = evaluate_retrival(claims, top_k_indices)
    finally:
        # Restore the caller's mode even if evaluation fails part-way.
        model.train(was_training)
    return np.mean(claims_f1)

# Get L2-normalized embeddings from the model (gradients are tracked).
def get_normalized_embeddings(texts, model, tokenizer):
    """Encode ``texts`` in one forward pass and return unit-length embeddings.

    Args:
        texts: List of strings, tokenized together with padding/truncation.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer returning tensors movable with ``.to``.

    Returns:
        Tensor on the model's device holding, per text, the hidden state at
        sequence position 0 scaled to unit L2 norm.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(model.device)
    first_token_states = model(**encoded).last_hidden_state[:, 0, :]
    return F.normalize(first_token_states, p=2, dim=1)

In [8]:
import random
# Returns every gold evidence index used by the given training claims.
def get_real_evidences(batch_claims):
    """Gather the distinct gold evidences of a batch and each claim's slots.

    Evidence ids are translated through the module-level
    ``evi_number_to_evi_id``. Each distinct evidence gets one slot, numbered
    in order of first appearance.

    Args:
        batch_claims: Sequence of claim dicts with an ``'evidences'`` list.

    Returns:
        Tuple ``(evi_indices, claims_pos_evidence_indices)``: the distinct
        evidence indices in first-seen order, and for every claim the slot
        numbers (positions inside ``evi_indices``) of its gold evidences.
    """
    slot_of = {}  # evidence index -> slot; dict order doubles as the slot order
    claims_pos_evidence_indices = []
    for claim in batch_claims:
        claim_slots = []
        for evi_number in claim["evidences"]:
            evi_index = evi_number_to_evi_id[evi_number]
            claim_slots.append(slot_of.setdefault(evi_index, len(slot_of)))
        claims_pos_evidence_indices.append(claim_slots)
    return list(slot_of), claims_pos_evidence_indices

# Returns each claim's gold (positive) evidence slots plus sampled negatives.
def get_pos_evi_indices(batch_claims, neg_size=50, claim_indices=None):
    """Build positive and negative evidence slots for a batch of train claims.

    Negatives are drawn at random from each claim's TF-IDF candidate list in
    the module-level ``topk_evi_num``, excluding that claim's gold evidences.

    Fixes over the original:
      * candidates were read with ``topk_evi_num[i]`` where ``i`` is the
        claim's position inside the (shuffled) batch, so claims received the
        candidate lists of unrelated claims. The row is now resolved per
        claim (see ``claim_indices``);
      * ``random.sample`` raised ValueError when ``neg_size`` exceeded the
        candidate list; the sample size is now clamped;
      * gold evidences are removed before sampling, so a claim gets
        ``neg_size`` negatives whenever enough candidates exist.

    Args:
        batch_claims: Sequence of claim dicts with ``'evidences'`` (and
            ``'claim_text'`` when ``claim_indices`` is omitted).
        neg_size: Maximum number of negatives per claim.
        claim_indices: Optional row of each claim in ``topk_evi_num``. When
            omitted, rows are looked up by claim text in the module-level
            ``tr_texts`` (assumed to be in the same order as
            ``topk_evi_num``); a claim whose text is not found falls back to
            its batch position, as before.

    Returns:
        Tuple ``(claims_pos_evidence_indices, claims_neg_evidence_indices,
        batch_all_evi_indices)``. The first two hold, per claim, positions
        inside ``batch_all_evi_indices``, which lists the distinct evidence
        indices of the batch in first-seen order. A claim's negative list can
        be empty if all of its candidates are gold.
    """
    if claim_indices is None:
        text_to_row = {}
        for row, text in enumerate(tr_texts):
            text_to_row.setdefault(text, row)
        claim_indices = [
            text_to_row.get(claim.get("claim_text"), position)
            for position, claim in enumerate(batch_claims)
        ]

    slot_of = {}  # evidence index -> slot; dict order doubles as the slot order

    def slot_for(evi_number):
        evi_index = evi_number_to_evi_id[evi_number]
        return slot_of.setdefault(evi_index, len(slot_of))

    claims_pos_evidence_indices = []
    claims_neg_evidence_indices = []
    for claim, row in zip(batch_claims, claim_indices):
        gold = claim["evidences"]
        claims_pos_evidence_indices.append([slot_for(evi_number) for evi_number in gold])

        gold_lookup = set(gold)
        candidates = [evi_number for evi_number in topk_evi_num[row] if evi_number not in gold_lookup]
        sampled = random.sample(candidates, min(neg_size, len(candidates)))
        claims_neg_evidence_indices.append([slot_for(evi_number) for evi_number in sampled])

    batch_all_evi_indices = list(slot_of)
    return claims_pos_evidence_indices, claims_neg_evidence_indices, batch_all_evi_indices

# Loss based on the similarity of a claim to its positive and negative evidences.
def contrastive_loss(claim_embedding, pos_evi_embeddings, neg_evi_embeddings, temperature=0.1):
    """Compute the contrastive loss for a single claim.

    The value is ``-log(sum_pos exp(s/T) / (sum_pos exp(s/T) + sum_neg exp(s/T)))``
    where ``s`` are the dot products between the claim and each evidence and
    ``T`` is ``temperature``. It is evaluated as a difference of two
    log-sum-exp terms, which is the same quantity but does not overflow to
    ``inf/inf = nan`` when ``s/T`` is large (small temperatures or
    un-normalized embeddings), as the direct ``exp`` form did.

    Args:
        claim_embedding: 1-D tensor for the claim.
        pos_evi_embeddings: 2-D tensor, one row per positive evidence.
        neg_evi_embeddings: 2-D tensor, one row per negative evidence
            (may have zero rows).
        temperature: Softmax temperature dividing the similarities.

    Returns:
        Scalar (0-dim) loss tensor.
    """
    pos_logits = (torch.matmul(claim_embedding, pos_evi_embeddings.T) / temperature).reshape(-1)
    neg_logits = (torch.matmul(claim_embedding, neg_evi_embeddings.T) / temperature).reshape(-1)
    all_logits = torch.cat([pos_logits, neg_logits])
    # -log(P / (P + N)) == logsumexp(all) - logsumexp(pos)
    return torch.logsumexp(all_logits, dim=0) - torch.logsumexp(pos_logits, dim=0)

In [9]:
#########RUN ON COLAB##########
########Training setup#########
import random
from transformers import get_cosine_schedule_with_warmup
# Training budget; both values are placeholders to be raised for real runs.
max_epochs = 10  ######Change Later######
max_steps = 100  ######Change Later######
# Hyper-parameters read by the training loop below.
batch_size = 20  # claims per optimisation step
learning_rate = 1e-5  # peak learning rate given to the optimizer
test_period = 100  # evaluate on the dev set every `test_period` steps
log_period = 20  # record and print the loss every `log_period` steps
neg_size = 50  # negatives requested per claim from get_pos_evi_indices

# Encoder to fine-tune; it stays on the CPU unless the line below is enabled.
model= AutoModel.from_pretrained(model_name)
# model = model.cuda()
# NOTE(review): torch.optim.Adam applies `weight_decay` as an L2 term on the
# gradient; AdamW is the decoupled variant - confirm which one is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01)

# Cosine learning-rate schedule with a warm-up over the first 10% of max_steps.
num_warmup_steps = int(0.1 * max_steps)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=max_steps
)

def save_training_log(loss_record, f1_record):
    """Write the training curves and the run's hyper-parameters to JSON.

    The log is saved as ``results/<model>_dev_log.json`` where ``<model>`` is
    the last path component of the module-level ``model_name``. The original
    used ``split('/')[1]``, which raised IndexError for model ids without an
    organisation prefix (e.g. ``"bert-base-uncased"``).

    Args:
        loss_record: List of logged training losses.
        f1_record: List of dev-set F1 scores recorded during training.
    """
    params = {
        "batch_size": batch_size,
        "model_name": model_name,
        "base_lr": learning_rate,
        "negative_evidence_size": neg_size
    }
    log = {
        "params": params,
        "loss_record": loss_record,
        "f1_record": f1_record
    }
    # [-1] works both with and without an "org/" prefix in the model id.
    log_name = f"{model_name.split('/')[-1]}_dev_log.json"
    print(f"Saving training log to results/{log_name}")
    os.makedirs("results", exist_ok=True)
    with open(os.path.join("results", log_name), "w", encoding="utf-8") as f:
        json.dump(log, f, ensure_ascii=False, indent=2)

            

In [None]:
#########RUN ON COLAB##########
# Contrastive fine-tuning loop.
# Fixes over the original:
#   * `step > max_steps` allowed max_steps + 1 optimisation steps, one more
#     than the scheduler was configured for;
#   * `break` only left the batch loop, so every remaining epoch still
#     printed its header and reshuffled the data before stopping again;
#   * the per-claim loop reused `i`, shadowing the batch offset;
#   * index tensors are built as int64 so an empty negative list does not
#     produce a float tensor that cannot be used for indexing.
step = 0
model.train()
max_f1 = 0.0
loss_record = []
f1_record = []
for epoch in range(max_epochs):
    if step >= max_steps:
        break
    print(f"Epoch {epoch+1}/{max_epochs}")
    random.shuffle(tr_numbers)

    for batch_start in range(0, len(tr_numbers), batch_size):
        if step >= max_steps:
            break
        step += 1
        if step % 10 == 0:
            print(f"Step {step}")
        batch_claim_ids = tr_numbers[batch_start:batch_start + batch_size]  # claim ids
        batch_claims = [tr_claims[claim_id] for claim_id in batch_claim_ids]
        # Positions of the claims in the original (unshuffled) training order.
        batch_tr_indices = [claim_number_to_tr_id[claim_id] for claim_id in batch_claim_ids]

        claim_text = [tr_texts[tr_index] for tr_index in batch_tr_indices]
        norm_claim_embeddings = get_normalized_embeddings(claim_text, model, tokenizer)

        # Negatives: evidences with high TF-IDF similarity that are not gold.
        # TODO: adjust negative evidence, e.g. mix TF-IDF negatives with random ones.
        # TODO: try using model predictions to pick positive/negative evidences.
        pos_evi_id, neg_evi_id, batch_all_evi_ids = get_pos_evi_indices(batch_claims, neg_size)
        batch_evi_texts = [evidences_texts[evi_index] for evi_index in batch_all_evi_ids]

        norm_evi_embeddings = get_normalized_embeddings(batch_evi_texts, model, tokenizer)

        claim_losses = []
        for claim_pos, claim_embedding in enumerate(norm_claim_embeddings):
            pos_rows = torch.tensor(pos_evi_id[claim_pos], dtype=torch.long)
            neg_rows = torch.tensor(neg_evi_id[claim_pos], dtype=torch.long)
            claim_losses.append(
                contrastive_loss(
                    claim_embedding,
                    norm_evi_embeddings[pos_rows],
                    norm_evi_embeddings[neg_rows],
                )
            )
        loss = torch.mean(torch.stack(claim_losses))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        if step % log_period == 0:
            loss_record.append(loss.item())
            print(f"Step {step}, Loss: {loss.item()}")
        # Skip evaluation during the first 10% of the step budget.
        if step % test_period == 0 and step > max_steps*0.1:
            print("Evaluating on dev set...")
            # calc_f1 already returns the mean; float() keeps the log JSON-friendly.
            avg_f1 = float(calc_f1(dev_texts, dev_claims, evidences_texts, model))
            f1_record.append(avg_f1)
            print(f"Avg F1 on dev set: {avg_f1}, History Best: {max_f1}")
            if avg_f1 > max_f1:
                max_f1 = avg_f1
                print(f"New best F1: {max_f1}, model saved.")
                os.makedirs("results", exist_ok=True)
                torch.save(model.state_dict(), os.path.join("results", "best_model.pth"))

save_training_log(loss_record, f1_record)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# Load the best checkpoint. A file in the working directory is used when
# present (the original behaviour); otherwise fall back to the location the
# training loop actually writes to. map_location lets a checkpoint saved on a
# GPU machine be loaded on a CPU-only one.
checkpoint_path = "best_model.pth"
if not os.path.exists(checkpoint_path):
    checkpoint_path = os.path.join("results", "best_model.pth")
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu", weights_only=True))
model.eval()

# Embed test claims and the filtered evidence pool, then L2-normalize so the
# dot product below is the cosine similarity.
ts_claims_embeddings = generate_batched_embeddings_ts(model, ts_texts, batch_size=batch_size)
ts_evi_embeddings = generate_batched_embeddings_ts(model, evidences_texts, batch_size=batch_size*10)
norm_ts_claim_embeddings = F.normalize(ts_claims_embeddings, p=2, dim=1)
norm_evi_embeddings = F.normalize(ts_evi_embeddings, p=2, dim=1)

similarities = torch.matmul(norm_ts_claim_embeddings, norm_evi_embeddings.T)

# Same selection rule as calc_f1 (which drives model selection): keep
# evidences above the threshold and within 4% of the best score, cap at the
# five most similar, and fall back to the top three when nothing qualifies.
# The original omitted the cap here, so test predictions could be selected
# differently from the dev evaluation.
top_k_indices = []
similarity_threshold = 0.8
n_evidences = similarities.shape[1]
for i in range(similarities.shape[0]):
    sim_row = similarities[i]
    max_sim = torch.max(sim_row)
    indices = torch.where(
        (sim_row > similarity_threshold) & ((max_sim - sim_row) < 0.04*max_sim)
    )[0].cpu().numpy()
    if len(indices) > 5:
        indices = torch.topk(sim_row, 5).indices.cpu().numpy()
    elif len(indices) == 0:
        indices = torch.topk(sim_row, min(3, n_evidences)).indices.cpu().numpy()
    top_k_indices.append(indices)

# Save the results
test_claims_ids = list(test_claims.keys())
results = test_claims  # same object: predictions are written into test_claims
for i, claim_id in enumerate(test_claims_ids):
    # Translate predicted row positions back into evidence ids.
    pred_evi_ids = top_k_indices[i]
    pred_evi_numbers = [evi_numbers[evidence_id] for evidence_id in pred_evi_ids]

    # Store the results; the label is a fixed placeholder.
    results[claim_id]['evidences'] = pred_evi_numbers
    results[claim_id]['claim_label'] = "SUPPORTS"


# Save the results to a JSON file (create the folder first so open() cannot
# fail on a missing directory).
os.makedirs("result", exist_ok=True)
output_file = os.path.join("result", "test_claims_retrieved_bert.json")
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(results, f, indent=4)