# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

**MelMoxue Bruce**

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file*

**This file includes the deep learning methods (BiLSTM, BiGRU and Transformer encoders) used to find the relevant evidence.**

**These models must be trained before they can be used for retrieval.**

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# Read files

In [1]:
import json
import os

# Set paths
data_dir = "data"
train_claims_file = os.path.join(data_dir, "train-claims.json")
dev_claims_file = os.path.join(data_dir, "dev-claims.json")
test_claims_file = os.path.join(data_dir, "test-claims-unlabelled.json")
evidence_file = os.path.join(data_dir, "evidence.json")


def _load_json(path):
    """Read the JSON file at ``path`` as UTF-8 and return the decoded object."""
    # An explicit encoding keeps the result independent of the platform's
    # default locale, which may not be UTF-8 and can then garble or reject
    # non-ASCII characters in the corpus.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load train claims
train_claims = _load_json(train_claims_file)
train_ids = list(train_claims.keys())
train_texts = [train_claims[claim_id]['claim_text'] for claim_id in train_ids]
# claim id -> position of that claim in train_texts (order as loaded from the file)
claim_id_to_train_inidce = {claim_id: i for i, claim_id in enumerate(train_ids)}

print(f"Loaded {len(train_claims)} train claims.")

# Load dev claims
dev_claims = _load_json(dev_claims_file)
dev_ids = list(dev_claims.keys())
dev_texts = [dev_claims[claim_id]['claim_text'] for claim_id in dev_ids]

print(f"Loaded {len(dev_claims)} dev claims.")

# Load test claims
test_claims = _load_json(test_claims_file)
test_texts = [test_claims[claim_id]['claim_text'] for claim_id in test_claims.keys()]

print(f"Loaded {len(test_claims)} test claims.")

# Load evidence texts (a mapping of evidence id -> evidence text)
evidence = _load_json(evidence_file)

evidence_ids = list(evidence.keys())
evidence_texts = [evidence[evidence_id] for evidence_id in evidence_ids]
# evidence id -> position of that passage in evidence_ids / evidence_texts
evidence_id_to_train_index = {evidence_id: i for i, evidence_id in enumerate(evidence_ids)}
print(f"Loaded {len(evidence)} evidence documents.")

Loaded 1228 train claims.
Loaded 154 dev claims.
Loaded 153 test claims.
Loaded 1208827 evidence documents.


## Preprocess text data

In [2]:
import nltk
# nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def yield_tokens(data):
    """Lazily yield one list of word tokens per text in ``data``.

    Each text is lower-cased before it is passed to ``word_tokenize``.
    """
    token_lists = (word_tokenize(text.lower()) for text in data)
    yield from token_lists

# build the vocabulary from the data
from collections import Counter
def build_vocab_from_iterator(iterator, min_freq=5, special_tokens=("<pad>", "<unk>", "<cls>")):
    """Build a vocabulary from an iterator of token lists.

    Args:
        iterator: iterable yielding lists (or other iterables) of tokens.
        min_freq: minimum corpus frequency a token needs to get its own index.
        special_tokens: tokens that always occupy the first indices, in the
            order given, regardless of their corpus frequency.

    Returns:
        A ``(vocab, idx_to_token)`` pair: ``vocab`` maps token -> index and
        ``idx_to_token`` is the inverse mapping. Indices are contiguous,
        starting at 0.
    """
    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    # Special tokens come first; setdefault keeps the ids contiguous even if
    # the caller lists the same special token twice.
    vocab = {}
    for special_token in special_tokens:
        vocab.setdefault(special_token, len(vocab))

    for token, freq in counter.items():
        # A corpus token that is spelled like a special token must not take
        # over that token's reserved index, which would leave a hole in the
        # id space.
        if freq >= min_freq and token not in vocab:
            vocab[token] = len(vocab)

    # index to token mapping
    idx_to_token = {idx: token for token, idx in vocab.items()}
    return vocab, idx_to_token

# Build one shared vocabulary over the training claims and the evidence passages
corpus_token_lists = yield_tokens(train_texts + evidence_texts)
vocab, idx_to_token = build_vocab_from_iterator(corpus_token_lists)

print(f"Vocabulary size: {len(vocab)}")

def process_text(text, vocab):
    """Map ``text`` to a list of vocabulary indices.

    The result always starts with the ``<cls>`` index, followed by one index
    per lower-cased word token; tokens missing from ``vocab`` map to ``<unk>``.
    """
    indices = [vocab["<cls>"]]
    for token in word_tokenize(text.lower()):
        indices.append(vocab.get(token, vocab["<unk>"]))
    return indices


# Encode the three claim splits as lists of vocabulary-index sequences
train_texts_indices, dev_texts_indices, test_texts_indices = (
    [process_text(claim, vocab) for claim in split]
    for split in (train_texts, dev_texts, test_texts)
)

# The evidence corpus is large, so show a progress bar while encoding it
evidence_texts_indices = []
for passage in tqdm(evidence_texts):
    evidence_texts_indices.append(process_text(passage, vocab))

Vocabulary size: 112508


100%|██████████| 1208827/1208827 [01:16<00:00, 15805.83it/s]


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## This section defines three common deep learning encoders for NLP (LSTM, GRU and Transformer) and shows how to train a retrieval model with them.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Bidirectional LSTM text encoder (two layers and dropout by default)
class BiLSTMEncoder(nn.Module):
    """Encode a padded batch of token-index sequences into fixed-size vectors.

    The final hidden states of the last LSTM layer (forward and backward
    direction) are concatenated and linearly projected to ``hidden_dim``.
    The padding index is read from the module-level ``vocab``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        pad_index = vocab['<pad>']
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
        # Input is the forward and backward states side by side
        self.fc = nn.Linear(2 * hidden_dim, hidden_dim)

    def forward(self, x, lengths):
        """Return one embedding per row of ``x``.

        Args:
            x: padded batch of token indices, batch dimension first.
            lengths: unpadded length of each row of ``x``.
        """
        token_vectors = self.embedding(x)

        # Packing lets the LSTM stop at each sequence's true end
        packed_input = nn.utils.rnn.pack_padded_sequence(
            token_vectors, lengths, batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _) = self.lstm(packed_input)

        # The last two entries are the top layer's forward and backward states
        forward_state, backward_state = final_hidden[-2], final_hidden[-1]
        sentence_state = torch.cat([forward_state, backward_state], dim=1)

        # Project to the final embedding space
        return self.fc(sentence_state)
    

# Bidirectional GRU text encoder (two layers and dropout by default)
class BiGRUEncoder(nn.Module):
    """Encode a padded batch of token-index sequences into fixed-size vectors.

    The final hidden states of the last GRU layer (forward and backward
    direction) are concatenated and linearly projected to ``hidden_dim``.
    The padding index is read from the module-level ``vocab``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=vocab['<pad>']
        )
        recurrent_options = dict(
            num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout
        )
        self.gru = nn.GRU(embedding_dim, hidden_dim, **recurrent_options)
        # Input is the forward and backward states side by side
        self.fc = nn.Linear(hidden_dim + hidden_dim, hidden_dim)

    def forward(self, x, lengths):
        """Return one embedding per row of ``x``.

        Args:
            x: padded batch of token indices, batch dimension first.
            lengths: unpadded length of each row of ``x``.
        """
        # Packing lets the GRU stop at each sequence's true end
        packed_input = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x), lengths, batch_first=True, enforce_sorted=False
        )
        _, final_hidden = self.gru(packed_input)

        # The last two entries are the top layer's forward and backward states
        top_layer_states = (final_hidden[-2, :, :], final_hidden[-1, :, :])
        sentence_state = torch.cat(top_layer_states, dim=1)

        # Project to the final embedding space
        projected = self.fc(sentence_state)
        return projected
    
# Transformer Encoder with cls token in the first position
class TransformerEncoder(nn.Module):
    """Encode a padded batch of token-index sequences with a Transformer.

    Token embeddings are summed with a learned positional table and passed
    through ``num_layers`` encoder layers; the output at position 0 (where
    ``process_text`` places ``<cls>``) is projected to ``hidden_dim``.

    Note that ``hidden_dim`` is used both as the feed-forward width of each
    encoder layer and as the size of the returned embedding. The padding
    index is read from the module-level ``vocab``.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, hidden_dim, dropout=0.2):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<pad>'])
        # Learned positions, zero-initialised. The table holds 1000 positions,
        # so an input longer than 1000 tokens fails at the addition in forward.
        self.positional_encoding = nn.Parameter(torch.zeros(1, 1000, embedding_dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, hidden_dim)  # Project to final embedding space

    def forward(self, x, lengths):
        """Return one embedding per row of ``x``.

        Args:
            x: padded batch of token indices, batch dimension first.
            lengths: unpadded length of each row of ``x``, or ``None`` to
                attend over every position (the previous behaviour).
        """
        seq_len = x.size(1)
        embedded = self.embedding(x) + self.positional_encoding[:, :seq_len, :]

        # Hide padding from self-attention. Without the mask, real tokens
        # attend to pad positions and a text's embedding changes with the
        # amount of padding its batch happens to need.
        padding_mask = None
        if lengths is not None:
            lengths_tensor = torch.as_tensor(lengths, device=x.device)
            positions = torch.arange(seq_len, device=x.device)
            # True marks a padded position that attention must ignore
            padding_mask = positions.unsqueeze(0) >= lengths_tensor.unsqueeze(1)

        output = self.transformer_encoder(embedded, src_key_padding_mask=padding_mask)

        # Use the first token (CLS token) as the representation for the entire sequence
        cls_output = output[:, 0, :]
        final_embedding = self.fc(cls_output)
        return final_embedding


The next cell defines the loss function for one claim with multiple positive and negative evidence passages.

In [4]:
def contrastive_loss(claim_embedding, pos_evidence_embeddings, neg_evidence_embeddings, temperature=0.1):
    """Compute the contrastive loss for a claim.

    The loss is ``-log(sum_pos / (sum_pos + sum_neg))`` where each sum runs
    over ``exp(similarity / temperature)`` between the claim and the positive
    (respectively negative) evidence embeddings.

    Args:
        claim_embedding: embedding of the claim.
        pos_evidence_embeddings: one row per positive evidence embedding.
        neg_evidence_embeddings: one row per negative evidence embedding;
            may have zero rows, in which case the loss is 0.
        temperature: divisor applied to the similarities before exponentiation.

    Returns:
        A scalar tensor holding the loss.
    """
    # Scaled similarities, flattened so the reductions cover every entry
    pos_logits = (torch.matmul(claim_embedding, pos_evidence_embeddings.T) / temperature).reshape(-1)
    neg_logits = (torch.matmul(claim_embedding, neg_evidence_embeddings.T) / temperature).reshape(-1)

    # -log(P / (P + N)) == logsumexp(all) - logsumexp(pos). Working in log
    # space avoids the inf / inf = nan that plain exp() produces once the
    # scaled similarities become large.
    log_pos = torch.logsumexp(pos_logits, dim=0)
    log_all = torch.logsumexp(torch.cat((pos_logits, neg_logits)), dim=0)
    return log_all - log_pos

In [5]:
"""
test code
It will calcualte the score in dev dataset.
"""
from tqdm import tqdm

def generate_all_embeddings(model, evidence_texts_indices, batch_size=32):
    """Embed every index sequence with ``model`` and return them as one tensor.

    Args:
        model: callable taking ``(padded_batch, lengths)`` and returning one
            embedding row per sequence. It is called under ``torch.no_grad()``;
            switching it to eval mode is left to the caller.
        evidence_texts_indices: list of token-index lists (claims or evidence).
        batch_size: number of sequences encoded per forward pass.

    Returns:
        A CPU tensor with one row per input sequence, in input order. An empty
        input yields an empty tensor.
    """
    pad_index = vocab['<pad>']
    batch_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(evidence_texts_indices), batch_size)):
            batch = evidence_texts_indices[start:start + batch_size]

            # Remember the true lengths before padding to a rectangular batch
            lengths = [len(text) for text in batch]
            padded = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(text) for text in batch],
                batch_first=True, padding_value=pad_index)

            # Keep the result as a tensor: converting over a million
            # embeddings to nested Python lists and back costs far more
            # memory and time than concatenating the batches once at the end.
            batch_embeddings.append(model(padded, lengths).cpu())

    if not batch_embeddings:
        return torch.tensor([])
    return torch.cat(batch_embeddings, dim=0)


import numpy as np
def get_test_f_scores(test_texts_indices, test_claims, test_ids, evidence_texts_indices, evidence_indice_to_claim_id, model, batch_size=32, top_k=5):
    """Return the mean evidence-retrieval F-score over a set of labelled claims.

    Every claim and every evidence passage is embedded with ``model``; for each
    claim the ``top_k`` passages with the highest cosine similarity are the
    prediction, which is compared with the claim's gold ``"evidences"`` list.

    Args:
        test_texts_indices: token-index lists of the claims, aligned with ``test_ids``.
        test_claims: mapping claim id -> record with an ``"evidences"`` list.
        test_ids: claim ids, in the same order as ``test_texts_indices``.
        evidence_texts_indices: token-index lists of all evidence passages.
        evidence_indice_to_claim_id: sequence mapping an evidence row index to
            its evidence id.
        model: encoder used for both claims and evidence.
        batch_size: claim batch size; evidence is encoded in batches ten times larger.
        top_k: number of passages retrieved per claim (clamped to the number
            of available passages).

    Returns:
        The mean per-claim F-score, or ``0.0`` when ``test_ids`` is empty.
    """
    if len(test_ids) == 0:
        return 0.0

    # Remember the caller's mode so evaluation does not silently flip a model
    # that was already in eval mode back to training mode.
    was_training = model.training
    model.eval()
    try:
        # Generate test claim embeddings
        test_claim_embeddings = generate_all_embeddings(model, test_texts_indices, batch_size=batch_size)
        norm_test_claim_embeddings = F.normalize(test_claim_embeddings, p=2, dim=1)

        # Generate evidence embeddings
        evidence_embeddings = generate_all_embeddings(model, evidence_texts_indices, batch_size=batch_size*10)
        norm_evidence_embeddings = F.normalize(evidence_embeddings, p=2, dim=1)

        # Cosine similarity of every claim against every passage
        similarities = torch.matmul(norm_test_claim_embeddings, norm_evidence_embeddings.T)

        # torch.topk raises if asked for more entries than there are columns
        k = min(top_k, similarities.size(1))
        top_k_indices = torch.topk(similarities, k, dim=1).indices.numpy()

        # Compute F-scores
        f_scores = []
        for i, claim_id in tqdm(enumerate(test_ids)):
            # Sets make repeated gold ids count once in both overlap and recall
            true_evidence_ids = set(test_claims[claim_id]["evidences"])
            predicted_evidence_ids = [evidence_indice_to_claim_id[evidence_index]
                                      for evidence_index in top_k_indices[i]]

            # Compute precision and recall
            true_positives = len(true_evidence_ids & set(predicted_evidence_ids))
            precision = true_positives / len(predicted_evidence_ids) if len(predicted_evidence_ids) > 0 else 0.0
            recall = true_positives / len(true_evidence_ids) if len(true_evidence_ids) > 0 else 0.0

            # Compute F-score
            f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
            f_scores.append(f_score)
    finally:
        model.train(was_training)
    return np.mean(f_scores)



In [None]:
"""
train code
"""
max_epochs = 1
max_steps = 120  # training stops once the step counter exceeds this value
batch_size = 32
embedding_dim = 128
hidden_dim = 128
num_layers = 2
num_heads = 4
dropout = 0.2

learning_rate = 0.001

test_interval = 21  # Evaluate on the dev set (and checkpoint) every 21 steps
log_interval = 20   # Print the training loss every 20 steps

model = BiGRUEncoder(len(vocab), embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
# model = BiLSTMEncoder(len(vocab), embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
# model = TransformerEncoder(len(vocab), embedding_dim, num_heads, num_layers * 2, hidden_dim, dropout=dropout)

# Define optimizer (the loss is contrastive_loss, applied per claim below)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

import random

step = 0
model.train()
max_score = 0.0
for epoch in range(max_epochs):
    print(f"Epoch {epoch + 1}/{max_epochs}")
    # Shuffles train_ids in place; claim_id_to_train_inidce still maps each id
    # to its row in train_texts_indices, so lookups below stay aligned.
    random.shuffle(train_ids)
    if step > max_steps:
        break

    for batch_start in range(0, len(train_ids), batch_size):
        if step > max_steps:
            break

        step += 1
        print(f"Step {step}")
        batch_ids = train_ids[batch_start:batch_start + batch_size]
        batch_claims = [train_claims[claim_id] for claim_id in batch_ids]
        batch_indices = [claim_id_to_train_inidce[claim_id] for claim_id in batch_ids]

        # Get claim embeddings
        claim_texts = [train_texts_indices[text_index] for text_index in batch_indices]
        claim_lengths = [len(text) for text in claim_texts]
        claim_batch_indices = torch.nn.utils.rnn.pad_sequence([torch.tensor(text) for text in claim_texts],
                                                              batch_first=True, padding_value=vocab['<pad>'])

        claim_embeddings = model(claim_batch_indices, claim_lengths)

        norm_claim_embeddings = F.normalize(claim_embeddings, p=2, dim=1)

        # Collect the distinct gold evidence passages of the whole batch.
        # evidence_indices[r] is the global index of the passage stored in row r
        # of the batch evidence matrix; evidence_row is the inverse lookup.
        evidence_indices = []
        evidence_row = {}
        pos_evidence_positive_indices = []  # per claim: rows of its gold passages
        for claim in batch_claims:
            positive_evidence_indices = []
            for evidence_id in claim["evidences"]:
                evidence_global_index = evidence_id_to_train_index[evidence_id]
                if evidence_global_index not in evidence_row:
                    evidence_row[evidence_global_index] = len(evidence_indices)
                    evidence_indices.append(evidence_global_index)
                positive_evidence_indices.append(evidence_row[evidence_global_index])
            pos_evidence_positive_indices.append(positive_evidence_indices)

        cur_evidence_indices = [evidence_texts_indices[evidence_indice] for evidence_indice in evidence_indices]

        # Get evidence embeddings
        evidence_lengths = [len(text) for text in cur_evidence_indices]
        evidence_batch_indices = torch.nn.utils.rnn.pad_sequence([torch.tensor(text) for text in cur_evidence_indices],
                                                                 batch_first=True, padding_value=vocab['<pad>'])

        evidence_embeddings = model(evidence_batch_indices, evidence_lengths)
        norm_evidence_embeddings = F.normalize(evidence_embeddings, p=2, dim=1)

        # In-batch negatives: for each claim, every batch passage that is not
        # one of its own gold passages counts as a negative.
        claim_losses = []
        for claim_pos, claim_embedding in enumerate(norm_claim_embeddings):
            positive_rows = pos_evidence_positive_indices[claim_pos]
            positive_row_set = set(positive_rows)
            negative_rows = [row for row in range(len(evidence_indices)) if row not in positive_row_set]

            # dtype is explicit because torch.tensor([]) defaults to float,
            # which cannot be used as an index when a row list is empty.
            pos_evidence_embeddings = norm_evidence_embeddings[torch.tensor(positive_rows, dtype=torch.long)]
            neg_evidence_embeddings = norm_evidence_embeddings[torch.tensor(negative_rows, dtype=torch.long)]

            # Compute contrastive loss
            claim_losses.append(contrastive_loss(claim_embedding, pos_evidence_embeddings, neg_evidence_embeddings))
        loss = torch.mean(torch.stack(claim_losses))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        if step % log_interval == 0:
            print(f"Loss: {loss.item()}")
        if step % test_interval == 0:
            # Evaluate the model on the dev set; this re-encodes the whole
            # evidence corpus, so it dominates the running time.
            f_score = get_test_f_scores(dev_texts_indices, dev_claims, dev_ids, evidence_texts_indices, evidence_ids, model, batch_size=batch_size)
            print(f"F-score on dev set: {f_score}")
            # Save the model
            torch.save(model.state_dict(), f"model_epoch_{epoch + 1}_step_{step}.pth")
            if f_score > max_score:
                max_score = f_score
                print(f"New best F-score: {max_score}")
                torch.save(model.state_dict(), "best_model.pth")


Epoch 1/1
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
Step 11
Step 12
Step 13
Step 14
Step 15
Step 16
Step 17
Step 18
Step 19
Step 20
Loss: 3.5200395584106445
Step 21


100%|██████████| 5/5 [00:00<00:00, 41.26it/s]
100%|██████████| 3778/3778 [04:09<00:00, 15.17it/s]
154it [00:00, 144275.81it/s]


F-score on dev set: 0.005009276437847867
New best F-score: 0.005009276437847867
Step 22
Step 23
Step 24
Step 25
Step 26
Step 27
Step 28
Step 29
Step 30
Step 31
Step 32
Step 33
Step 34
Step 35
Step 36
Step 37
Step 38
Step 39


: 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [7]:
# Load the best model "best_model.pth"
model.load_state_dict(torch.load("best_model.pth", weights_only=True))
model.eval()

test_claims_embeddings = generate_all_embeddings(model, test_texts_indices, batch_size=batch_size)
test_evidence_embeddings = generate_all_embeddings(model, evidence_texts_indices, batch_size=batch_size*10)

norm_test_claim_embeddings = F.normalize(test_claims_embeddings, p=2, dim=1)
norm_evidence_embeddings = F.normalize(test_evidence_embeddings, p=2, dim=1)

# Compute cosine similarities
similarities = torch.matmul(norm_test_claim_embeddings, norm_evidence_embeddings.T)

# Get top-k evidence indices for each claim
top_k = 5
top_k_indices = torch.topk(similarities, top_k, dim=1).indices
top_k_indices = top_k_indices.numpy()

# Build the results. test_texts_indices was created by iterating
# test_claims.keys(), so row i of top_k_indices belongs to test_claims_ids[i].
test_claims_ids = list(test_claims.keys())
# Copy every claim record instead of aliasing test_claims, so the loaded data
# is not modified and re-running this cell starts from the same input.
results = {claim_id: dict(test_claims[claim_id]) for claim_id in test_claims_ids}
for i, claim_id in enumerate(test_claims_ids):
    # Translate the retrieved row indices into evidence ids
    predicted_evidence_indices = [evidence_ids[evidence_index] for evidence_index in top_k_indices[i]]

    # Store the results
    results[claim_id]['evidences'] = predicted_evidence_indices


# Save the results to a JSON file
# NOTE(review): the file name says "lstm" whichever encoder was trained above;
# kept unchanged in case other code reads this path.
output_file = os.path.join(data_dir, "test_claims_retrieved_lstm.json")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)


100%|██████████| 5/5 [00:00<00:00, 394.21it/s]
100%|██████████| 3778/3778 [00:43<00:00, 87.83it/s] 


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*