# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

**We use pytorch, nltk, scikit-learn in this project.**

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# 0.Importpreparation in advance

## import

In [None]:
import torch
import json
import nltk
import string
import re
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import os
import subprocess

from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


## test cuda 

In [None]:
print("CUDA available:", torch.cuda.is_available())
print("Current CUDA device:", torch.cuda.current_device())
print("Device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

## read files

In [None]:

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
random.seed(42)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

with open('data/train-claims.json', 'r') as input_file:
    train_claims = json.load(input_file)

# Read in development data (claim)
with open('data/dev-claims.json', 'r') as input_file:
    dev_claims = json.load(input_file)

# Read in test data (claim)
with open('data/test-claims-unlabelled.json', 'r') as input_file:
    test_claims = json.load(input_file)

# Read in evidence data
with open('data/evidence.json', 'r') as input_file:
    evidences = json.load(input_file)

#EDA
claim_count = 0
evi_count = 0
claim_length = []
evidence_count = []
evidence_length = []
labels = []

for key,value in train_claims.items():
    claim_count+=1
    claim_length.append(len(value["claim_text"]))
    evidence_count.append(len(value["evidences"]))
    evidence_length += [len(evidences[x]) for x in value["evidences"]]
    labels.append(value["claim_label"])

for key,value in evidences.items():
    evi_count+=1

print("claim count: ",claim_count)
print("evidence count: ",evi_count)
print("max claim length: ",max(claim_length))
print("min claim length: ",min(claim_length))
print("mean claim length: ",mean(claim_length))
print("max evidence count: ",max(evidence_count))
print("min evidence count: ",min(evidence_count))
print("mean evidence count: ",mean(evidence_count))
print("max evidence length: ",max(evidence_length))
print("min evidence length: ",min(evidence_length))
print("mean evidence length: ",mean(evidence_length))
print(Counter(labels))

inside = 0
outside = 0

train_evi_id = []
for claim_id,claim_value in train_claims.items():
    train_evi_id=train_evi_id+claim_value['evidences']

for claim_id,claim_value in dev_claims.items():
    test_evi_id=claim_value['evidences']
    for e in test_evi_id:
        if e in train_evi_id:
            inside += 1
        else:
            outside += 1
print("Dev evi inside train evi", inside)
print("Dev evi outside train evi", outside)

full_evidence_id = list(evidences.keys())
full_evidence_text  = list(evidences.values())
train_claim_id = list(train_claims.keys())
train_claim_text  = [ v["claim_text"] for v in train_claims.values()]
print("Train claim count: ",len(train_claim_id))

# TASK 1:   PRIDICT EVI FOR CLAIM

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## PreProcess for evidence and claims

### preprocessing function

In [6]:
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

def preprocessing_text(text):

    # Convert to lowercase
    text = text.lower()

    # Lemmatize the text
    text = lemmatize_text(text)

    # Remove leading/trailing whitespaces
    text = text.strip()

    return text

def preprocess_claim_data(claim_data, existed_evidences_id=None):
    claim_data_text = []
    claim_data_id = []
    claim_data_label = []
    claim_evidences = []
    for key in claim_data.keys():
        claim_data[key]["claim_text"] = preprocessing_text(claim_data[key]["claim_text"])
        claim_data_text.append(claim_data[key]["claim_text"])
        claim_data_id.append(key)
        if "claim_label" in claim_data[key]:
            claim_data_label.append(claim_data[key]["claim_label"])
        else:
            claim_data_label.append(None)
        if existed_evidences_id and "evidences" in claim_data[key]:
            valid_evidences = [existed_evidences_id[i] for i in claim_data[key]["evidences"] if i in existed_evidences_id]
            claim_evidences.append(valid_evidences)
        else:
            claim_evidences.append([])
    return claim_data_text, claim_data_id, claim_data_label, claim_evidences


def preprocess_evi_data(evi_data):
    cleaned_evidence_text = []
    cleaned_evidence_id = []
    for key, value in evi_data.items():
        cleaned_text = preprocessing_text(value)
        cleaned_evidence_text.append(cleaned_text)
        cleaned_evidence_id.append(key)
    return cleaned_evidence_text, cleaned_evidence_id

In [8]:
cleaned_evidence_text, cleaned_evidence_id = preprocess_evi_data(evidences)

evidences_id_dict = {evidence_id: idx for idx, evidence_id in enumerate(cleaned_evidence_id)}

train_claim_text, train_claim_id, train_claim_label, train_claim_evidences = preprocess_claim_data(train_claims, evidences_id_dict)

dev_claim_text, dev_claim_id, dev_claim_label, dev_claim_evidences = preprocess_claim_data(dev_claims, evidences_id_dict)

test_claim_text, test_claim_id, _, _ = preprocess_claim_data(test_claims)

### tfidf retrieval

In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(cleaned_evidence_text)

def fit_transform_tfidf(train_texts, dev_texts, test_texts, evidence_texts):
    train_tfidf = vectorizer.transform(train_texts)
    dev_tfidf = vectorizer.transform(dev_texts)
    test_tfidf = vectorizer.transform(test_texts)
    evidence_tfidf = vectorizer.transform(evidence_texts)

    return train_tfidf, dev_tfidf, test_tfidf, evidence_tfidf

train_tfidf, dev_tfidf, test_tfidf, evidence_tfidf = fit_transform_tfidf(train_claim_text, dev_claim_text, test_claim_text, cleaned_evidence_text)

## calculate similarity

Similarity calculation and sorting

In [None]:
# use cos similarity to calculate the similarity between the claim and evidence
train_cos_sims = cosine_similarity(train_tfidf, evidence_tfidf)
dev_cos_sims = cosine_similarity(dev_tfidf, evidence_tfidf)
test_cos_sims = cosine_similarity(test_tfidf, evidence_tfidf)

# sort the evidence and choose first 10000 evidence for each claim based on the similarity
def sort_evidence(cos_sims):
    top_ids = np.argsort(-cos_sims, axis=1)[:, :10000]
    return top_ids.tolist()

In [13]:
dev_sort_evidences = sort_evidence(dev_cos_sims)
test_sort_evidences = sort_evidence(test_cos_sims)
train_sort_evidences = sort_evidence(train_cos_sims)

### construct word frequency list and filter

In [None]:
min_count = 5
wordcount = defaultdict(int)
idxword = ["<cls>",  "<sep>", "<pad>", "<unk>"]

for texts in train_claim_text + cleaned_evidence_text:
    for word in texts.split():
        wordcount[word] += 1

for word, count in wordcount.items():
    if count > min_count:
        idxword.append(word)

wordidx = {value: i for i, value in enumerate(idxword)}

def text_to_idx(text_data, wordidx_):
    idx_data = []
    unk_idx = wordidx_["<unk>"]
    for texts in text_data:
        temp_idx = [wordidx_.get(word, unk_idx) for word in texts.split()]
        idx_data.append(temp_idx)
    return idx_data

def construct_input_text(text_idx, padding_len, wordidx_):
    idx_data = []
    cls_idx = wordidx_["<cls>"]
    sep_idx = wordidx_["<sep>"]
    pad_idx = wordidx_["<pad>"]

    for texts in text_idx:
        if len(texts) < padding_len:
            padded_text = [cls_idx] + texts + [sep_idx] + [pad_idx] * (padding_len - len(texts))
        else:
            padded_text = [cls_idx] + texts[:padding_len] + [sep_idx]
        idx_data.append(padded_text)

    return idx_data

In [15]:
train_text_idx = text_to_idx(train_claim_text, wordidx)
dev_text_idx = text_to_idx(dev_claim_text, wordidx)
test_text_idx = text_to_idx(test_claim_text, wordidx)
evidences_text_idx = text_to_idx(cleaned_evidence_text, wordidx)

train_max_freq = max([len(i) for i in train_text_idx])
dev_max_freq = max([len(i) for i in dev_text_idx])
test_max_freq = max([len(i) for i in test_text_idx])
evidences_max_freq = max([len(i) for i in evidences_text_idx])

print(train_max_freq, dev_max_freq, test_max_freq, evidences_max_freq)

text_pad_len = 50
evidences_pad_len = 80

train_input = construct_input_text(train_text_idx, text_pad_len, wordidx)
dev_input = construct_input_text(dev_text_idx, text_pad_len, wordidx)
test_input = construct_input_text(test_text_idx, text_pad_len, wordidx)
evidences_input = construct_input_text(evidences_text_idx, evidences_pad_len, wordidx)

vocab_size = len(idxword)
print(vocab_size)
print(len(train_input[0]), len(evidences_input[0]))

## Construct TrainDataset

In [20]:
class TrainDataset(Dataset):
    def __init__(self, text_data, evidence_data, sorted_evidences, evidence_label, negative_num):
        self.text_data = text_data
        self.evidence_data = evidence_data
        self.sorted_evidences = sorted_evidences
        self.evidence_label = evidence_label
        self.negative_num = negative_num
        self.evidence_len = len(evidence_data[0])
        self.text_len = len(text_data[0])

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        positive_evidences = self.evidence_label[idx]
        negative_evidences = random.sample(self.sorted_evidences[idx][30: self.negative_num * 10], self.negative_num)
        return [self.text_data[idx], negative_evidences, positive_evidences]

    def collate_fn(self, batch):
        queries, queries_pos, evidences, temp_labels = [], [], [], []

        for i, j, k in batch:
            queries = queries + [i]
            queries_pos = queries_pos + [list(range(self.text_len))]
            temp_labels.append(k)
            evidences.extend(k + j)

        evidences = list(set(evidences))

        evidences2idx = {evid: i for i, evid in enumerate(evidences)}
        labels = [[evidences2idx[evid] for evid in pos_evidences] for pos_evidences in temp_labels]

        evidences = [self.evidence_data[i] for i in evidences]
        evidences_pos = [list(range(self.evidence_len)) for _ in range(len(evidences))]


        batch_encoding = {
            "queries": torch.LongTensor(queries),
            "evidences": torch.LongTensor(evidences),
            "queries_pos": torch.LongTensor(queries_pos),
            "evidences_pos": torch.LongTensor(evidences_pos),
            "labels": labels
        }

        return batch_encoding

In [21]:
train_set = TrainDataset(train_input, evidences_input, train_sort_evidences, train_claim_evidences, negative_num=10)
dataloader = DataLoader(train_set, batch_size=5, shuffle=True, num_workers=0, collate_fn=train_set.collate_fn)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Construct Encoder

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_emb, embed_dim, hidden_size, num_layers, max_position=180, dropout=0.2):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_emb, embed_dim)
        self.pos_embedding = nn.Embedding(max_position, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text_data, position_text):
        text_x = self.embedding(text_data) + self.pos_embedding(position_text)
        text_x = self.dropout(text_x)
        x_encoded, _ = self.encoder(text_x)
        x_encoded = self.dropout(x_encoded)
        return x_encoded

In [23]:
lstm_encoder = Encoder(vocab_emb=vocab_size, embed_dim=512, hidden_size=512, num_layers=6, max_position=180)
lstm_encoder.cuda()

Encoder(
  (embedding): Embedding(90097, 512)
  (pos_embedding): Embedding(180, 512)
  (encoder): LSTM(512, 512, num_layers=6, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

### Set model hyperparameters and save pathTraining

In [24]:
torch.manual_seed(41)
torch.cuda.manual_seed_all(41)
random.seed(41)

weight_decay = 1e-4
encoder_optimizer = optim.Adam(lstm_encoder.parameters(), weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    encoder_optimizer,
    mode='min',
    factor=0.1,
    patience=10,
    threshold=0.0001,
    threshold_mode='rel',
    cooldown=0,
    min_lr=1e-8,
    eps=1e-08
)

max_lr = 1e-3
for param_group in encoder_optimizer.param_groups:
    param_group['lr'] = max_lr

accumulate_step = 3
grad_norm = 0.5
warmup_steps = 500
report_freq = 10
eval_interval = 50
save_dir = "save_model"

### validate function to evaluate each epoch

In [25]:
def cal_fscore(evidence_correct, label, pred_evidences):
    if evidence_correct > 0:
        evidence_recall = float(evidence_correct) / len(label)
        evidence_precision = float(evidence_correct) / len(pred_evidences)
        evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
        print(evidence_fscore)
    else:
        evidence_fscore = 0
    return evidence_fscore

def get_embeddings(text_indices, encoder_model, batch_size=800):
    embeddings = []
    text_len = len(text_indices[0])

    for start_idx in range(0, len(text_indices), batch_size):
        end_idx = min(start_idx + batch_size, len(text_indices))
        cur_query = torch.LongTensor(text_indices[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach().cpu()
        embeddings.append(query_embedding)

    return torch.cat(embeddings, dim=0).t()

In [26]:
retrieval_num = 5
dev_candis_num = 10

def validate(dev_text_idx, evidence_text_idx, dev_sort_evidences, dev_claim_evidences, encoder_model):
    # get evidence embeddings

    encoder_model.eval()
    start_idx = 0
    batch_size = 800
    evidence_embeddings = get_embeddings(evidence_text_idx, encoder_model, batch_size)

    text_len = len(dev_text_idx[0])

    f_scores = []

    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + batch_size, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidence_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()
            select_ids = topk_ids[:retrieval_num]

            evidence_correct = 0
            pred_evidences = [dev_sort_evidences[start_idx+i][j] for j in select_ids]
            label = dev_claim_evidences[start_idx+i]
            for evidence_id in label:
                if evidence_id in pred_evidences:
                    evidence_correct += 1
            evidence_fscore = cal_fscore(evidence_correct, label, pred_evidences)
            f_scores = f_scores + [evidence_fscore]

        start_idx = end_idx

    fscore = np.mean(f_scores)
    encoder_model.train()
    print("\n\nEvidence Retrieval F-score: %.3f\n\n" % fscore)
    return fscore

### Start training

In [29]:
encoder_optimizer.zero_grad()
step_cnt, all_step_cnt, avg_loss, maximum_f_score = 0, 0, 0, 0
n_epochs = 5
for epoch in range(n_epochs):
    epoch_step = 0

    for (i, batch) in enumerate(tqdm(dataloader)):

        query_embeddings = lstm_encoder(batch["queries"].cuda(), batch["queries_pos"].cuda())
        evidence_embeddings = lstm_encoder(batch["evidences"].cuda(), batch["evidences_pos"].cuda())

        query_embeddings = query_embeddings[:, -1, :]
        evidence_embeddings = evidence_embeddings[:, -1, :]

        assert query_embeddings.size(1) == evidence_embeddings.size(1), "Embedding dimensions not match"

        query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
        evidence_embeddings = torch.nn.functional.normalize(evidence_embeddings, p=2, dim=1)

        cos_sims = torch.mm(query_embeddings, evidence_embeddings.t())
        scores = cos_sims / 0.1
        loss = []
        start_idx = 0
        criterion = torch.nn.CrossEntropyLoss()
        for idx, labels in enumerate(batch["labels"]):
            labels = torch.LongTensor(labels).cuda()
            cur_loss = criterion(scores[idx].unsqueeze(0).repeat(len(labels), 1), labels)
        loss = loss + [cur_loss]
        loss = torch.stack(loss).mean()

        # cos_sims = torch.mm(query_embeddings, evidence_embeddings.t())
        # scores = - torch.nn.functional.log_softmax(cos_sims / 0.1 + 1e-10, dim=1)
        # loss = []
        # start_idx = 0
        # for idx, label in enumerate(batch["labels"]):
        #     label = torch.LongTensor(label).cuda()
        #     cur_loss = torch.mean(torch.index_select(scores[idx], 0, label))
        #     loss.append(cur_loss)
        # loss = torch.stack(loss).mean()

        loss = loss / accumulate_step
        loss.backward()

        avg_loss += loss.item()

        step_cnt += 1
        if step_cnt == accumulate_step:
            # updating
            if grad_norm > 0:
                nn.utils.clip_grad_norm_(lstm_encoder.parameters(), grad_norm)

            step_cnt = 0
            epoch_step += 1
            all_step_cnt += 1

            # adjust learning rate
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-5

            encoder_optimizer.step()
            encoder_optimizer.zero_grad()

        if all_step_cnt % report_freq == 0 and step_cnt == 0:
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-5

            # report stats
            print("\n")
            print("epoch: %d, epoch_step: %d, avg loss: %.6f" % (epoch + 1, epoch_step, avg_loss / report_freq))
            print(f"learning rate: {lr:.6f}")
            print("\n")
            avg_loss = 0
        del loss, cos_sims, query_embeddings, evidence_embeddings

        if all_step_cnt % eval_interval == 0 and all_step_cnt != 0 and step_cnt == 0:
            # evaluate the model as a scorer
            print("\nEvaluate:\n")

            f_score = validate(dev_input, evidences_input, dev_sort_evidences, dev_claim_evidences, lstm_encoder)

            if f_score > maximum_f_score:
                maximum_f_score = f_score
                os.makedirs(save_dir, exist_ok=True)
                torch.save(lstm_encoder.state_dict(), os.path.join(os.path.abspath(save_dir), "best_model.bin"))
                print()
                print()
                print("best val loss - epoch: %d, epoch_step: %d" % (epoch, epoch_step))
                print("maxi_f_score", f_score, "\n\n")

[34m[1mwandb[0m: Currently logged in as: [33mcyl101397[0m ([33mnlp_project_5pmteam7[0m). Use [1m`wandb login --relogin`[0m to force relogin


 13%|█▎        | 32/246 [00:03<00:17, 11.92it/s]



epoch: 1, epoch_step: 10, avg loss: 4.199338
learning rate: 0.000020




 25%|██▌       | 62/246 [00:05<00:14, 12.35it/s]



epoch: 1, epoch_step: 20, avg loss: 4.205086
learning rate: 0.000040




 37%|███▋      | 92/246 [00:07<00:12, 12.36it/s]



epoch: 1, epoch_step: 30, avg loss: 4.227416
learning rate: 0.000060




 50%|████▉     | 122/246 [00:10<00:10, 11.81it/s]



epoch: 1, epoch_step: 40, avg loss: 4.170825
learning rate: 0.000080




 60%|██████    | 148/246 [00:12<00:08, 12.20it/s]



epoch: 1, epoch_step: 50, avg loss: 4.187892
learning rate: 0.000100



Evaluate:



 60%|██████    | 148/246 [00:29<00:08, 12.20it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.25
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.22222222222222224
0.33333333333333337
0.33333333333333337
0.28571428571428575
0.25
0.20000000000000004
0.20000000000000004
0.25
0.20000000000000004
0.20000000000000004
0.20000000000000004


Evidence Retrieval F-score: 0.072




 62%|██████▏   | 152/246 [03:38<33:55, 21.65s/it]



best val loss - epoch: 0, epoch_step: 50
maxi_f_score 0.07155225726654299 




 74%|███████▍  | 182/246 [03:40<00:11,  5.49it/s]



epoch: 1, epoch_step: 60, avg loss: 4.410746
learning rate: 0.000120




 86%|████████▌ | 212/246 [03:43<00:02, 12.38it/s]



epoch: 1, epoch_step: 70, avg loss: 4.249171
learning rate: 0.000140




 98%|█████████▊| 242/246 [03:45<00:00, 12.65it/s]



epoch: 1, epoch_step: 80, avg loss: 4.197874
learning rate: 0.000160




100%|██████████| 246/246 [03:45<00:00,  1.09it/s]
 11%|█         | 26/246 [00:02<00:17, 12.50it/s]



epoch: 2, epoch_step: 8, avg loss: 4.144449
learning rate: 0.000180




 21%|██        | 52/246 [00:04<00:15, 12.19it/s]



epoch: 2, epoch_step: 18, avg loss: 4.195648
learning rate: 0.000200



Evaluate:



 21%|██        | 52/246 [00:24<00:15, 12.19it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.5
0.33333333333333337
0.25
0.25
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.6
0.25
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.20000000000000004
0.22222222222222224
0.25
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.4000000000000001
0.20000000000000004
0.25
0.8000000000000002
0.20000000000000004


Evidence Retrieval F-score: 0.086




 23%|██▎       | 56/246 [03:31<1:09:11, 21.85s/it]



best val loss - epoch: 1, epoch_step: 18
maxi_f_score 0.08641001855287572 




 35%|███▍      | 86/246 [03:34<00:29,  5.44it/s]  



epoch: 2, epoch_step: 28, avg loss: 4.197912
learning rate: 0.000220




 46%|████▋     | 114/246 [03:36<00:11, 11.29it/s]



epoch: 2, epoch_step: 38, avg loss: 4.203417
learning rate: 0.000240




 59%|█████▉    | 146/246 [03:39<00:08, 11.98it/s]



epoch: 2, epoch_step: 48, avg loss: 4.183603
learning rate: 0.000260




 72%|███████▏  | 176/246 [03:41<00:05, 12.29it/s]



epoch: 2, epoch_step: 58, avg loss: 4.200558
learning rate: 0.000280




 82%|████████▏ | 202/246 [03:44<00:03, 12.14it/s]



epoch: 2, epoch_step: 68, avg loss: 4.212100
learning rate: 0.000300



Evaluate:



 82%|████████▏ | 202/246 [03:54<00:03, 12.14it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.6
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.20000000000000004
0.25
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.4000000000000001
0.20000000000000004
0.20000000000000004
0.8000000000000002
0.20000000000000004


Evidence Retrieval F-score: 0.088




 84%|████████▎ | 206/246 [07:09<14:26, 21.66s/it]



best val loss - epoch: 1, epoch_step: 68
maxi_f_score 0.08832199546485263 




 96%|█████████▌| 236/246 [07:12<00:01,  5.28it/s]



epoch: 2, epoch_step: 78, avg loss: 4.192981
learning rate: 0.000320




100%|██████████| 246/246 [07:13<00:00,  1.76s/it]
  8%|▊         | 20/246 [00:01<00:19, 11.76it/s]



epoch: 3, epoch_step: 6, avg loss: 4.165759
learning rate: 0.000340




 20%|██        | 50/246 [00:04<00:16, 12.05it/s]



epoch: 3, epoch_step: 16, avg loss: 4.208925
learning rate: 0.000360




 33%|███▎      | 80/246 [00:06<00:13, 12.25it/s]



epoch: 3, epoch_step: 26, avg loss: 4.232152
learning rate: 0.000380




 43%|████▎     | 106/246 [00:08<00:11, 11.78it/s]



epoch: 3, epoch_step: 36, avg loss: 4.189977
learning rate: 0.000400



Evaluate:



 43%|████▎     | 106/246 [00:20<00:11, 11.78it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.22222222222222224
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.4000000000000001
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.4000000000000001
0.22222222222222224
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.20000000000000004
0.20000000000000004
0.25
0.20000000000000004
0.8000000000000002
0.20000000000000004


Evidence Retrieval F-score: 0.091




 45%|████▍     | 110/246 [03:35<49:22, 21.78s/it]  



best val loss - epoch: 2, epoch_step: 36
maxi_f_score 0.09095547309833026 




 57%|█████▋    | 140/246 [03:38<00:19,  5.42it/s]



epoch: 3, epoch_step: 46, avg loss: 4.203455
learning rate: 0.000420




 69%|██████▉   | 170/246 [03:40<00:06, 12.25it/s]



epoch: 3, epoch_step: 56, avg loss: 4.195634
learning rate: 0.000440




 81%|████████▏ | 200/246 [03:43<00:03, 12.49it/s]



epoch: 3, epoch_step: 66, avg loss: 4.228290
learning rate: 0.000460




 93%|█████████▎| 230/246 [03:45<00:01, 12.40it/s]



epoch: 3, epoch_step: 76, avg loss: 4.188524
learning rate: 0.000480




100%|██████████| 246/246 [03:46<00:00,  1.08it/s]
  4%|▍         | 10/246 [00:00<00:19, 12.33it/s]



epoch: 4, epoch_step: 4, avg loss: 4.188038
learning rate: 0.000500



Evaluate:



  4%|▍         | 10/246 [00:13<00:19, 12.33it/s]

0.28571428571428575
0.28571428571428575
0.25
0.22222222222222224
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.25
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.25
0.33333333333333337
0.4000000000000001
0.4000000000000001
0.22222222222222224
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.25
0.4000000000000001
0.20000000000000004
0.20000000000000004
0.6
0.20000000000000004


Evidence Retrieval F-score: 0.084




 18%|█▊        | 44/246 [03:30<00:37,  5.44it/s]  



epoch: 4, epoch_step: 14, avg loss: 4.193564
learning rate: 0.000520




 30%|███       | 74/246 [03:33<00:13, 12.42it/s]



epoch: 4, epoch_step: 24, avg loss: 4.225201
learning rate: 0.000540




 42%|████▏     | 104/246 [03:35<00:11, 12.51it/s]



epoch: 4, epoch_step: 34, avg loss: 4.206849
learning rate: 0.000560




 54%|█████▍    | 134/246 [03:37<00:09, 12.33it/s]



epoch: 4, epoch_step: 44, avg loss: 4.216021
learning rate: 0.000580




 65%|██████▌   | 160/246 [03:40<00:06, 12.36it/s]



epoch: 4, epoch_step: 54, avg loss: 4.235664
learning rate: 0.000600



Evaluate:



 65%|██████▌   | 160/246 [03:53<00:06, 12.36it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.25
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.25
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.4000000000000001
0.20000000000000004
0.22222222222222224
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.25
0.4000000000000001
0.20000000000000004
0.20000000000000004
0.8000000000000002
0.20000000000000004


Evidence Retrieval F-score: 0.091




 79%|███████▉  | 194/246 [07:02<00:09,  5.58it/s]



epoch: 4, epoch_step: 64, avg loss: 4.187320
learning rate: 0.000620




 91%|█████████ | 224/246 [07:05<00:01, 12.54it/s]



epoch: 4, epoch_step: 74, avg loss: 4.235623
learning rate: 0.000640




100%|██████████| 246/246 [07:06<00:00,  1.74s/it]
  3%|▎         | 8/246 [00:00<00:18, 12.71it/s]



epoch: 5, epoch_step: 2, avg loss: 4.202347
learning rate: 0.000660




 15%|█▌        | 38/246 [00:02<00:16, 12.88it/s]



epoch: 5, epoch_step: 12, avg loss: 4.193675
learning rate: 0.000680




 26%|██▌       | 64/246 [00:05<00:14, 12.39it/s]



epoch: 5, epoch_step: 22, avg loss: 4.184556
learning rate: 0.000700



Evaluate:



 26%|██▌       | 64/246 [00:17<00:14, 12.39it/s]

0.5714285714285715
0.28571428571428575
0.5
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.25
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.33333333333333337
0.25
0.5
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.25
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.25
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.22222222222222224
0.20000000000000004
0.4000000000000001
0.22222222222222224
0.25
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.25
0.20000000000000004
0.20000000000000004
0.25
0.6


Evidence Retrieval F-score: 0.085




 40%|███▉      | 98/246 [03:28<00:26,  5.52it/s]  



epoch: 5, epoch_step: 32, avg loss: 4.216505
learning rate: 0.000720




 52%|█████▏    | 128/246 [03:30<00:09, 12.44it/s]



epoch: 5, epoch_step: 42, avg loss: 4.183382
learning rate: 0.000740




 64%|██████▍   | 158/246 [03:33<00:07, 12.33it/s]



epoch: 5, epoch_step: 52, avg loss: 4.215314
learning rate: 0.000760




 76%|███████▋  | 188/246 [03:35<00:04, 12.50it/s]



epoch: 5, epoch_step: 62, avg loss: 4.230684
learning rate: 0.000780




 87%|████████▋ | 214/246 [03:37<00:02, 12.14it/s]



epoch: 5, epoch_step: 72, avg loss: 4.191739
learning rate: 0.000800



Evaluate:



 87%|████████▋ | 214/246 [03:57<00:02, 12.14it/s]

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.6
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.4000000000000001
0.20000000000000004
0.22222222222222224
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.4000000000000001
0.20000000000000004
0.20000000000000004
1.0
0.20000000000000004


Evidence Retrieval F-score: 0.093




 89%|████████▊ | 218/246 [06:58<09:51, 21.13s/it]



best val loss - epoch: 4, epoch_step: 72
maxi_f_score 0.09290352504638219 




100%|██████████| 246/246 [07:00<00:00,  1.71s/it]



epoch: 5, epoch_step: 82, avg loss: 4.176377
learning rate: 0.000820







In [30]:
torch.cuda.empty_cache()

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### get the best one to evaluate

In [31]:
lstm_encoder.load_state_dict(torch.load(os.path.join(save_dir, "best_model.bin")))
lstm_encoder.cuda().eval()

Encoder(
  (embedding): Embedding(90097, 512)
  (pos_embedding): Embedding(180, 512)
  (encoder): LSTM(512, 512, num_layers=6, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [32]:
evidence_embeddings = []
start_idx = 0
batch_size = 800
evidence_len = len(evidences_input[0])

while start_idx < len(evidences_input):
    end_idx = min(start_idx + batch_size, len(evidences_input))

    cur_evidence = torch.LongTensor(evidences_input[start_idx:end_idx]).view(-1, evidence_len).cuda()
    cur_evidence_pos = torch.LongTensor([list(range(evidence_len)) for _ in range(end_idx - start_idx)]).cuda()
    start_idx = end_idx

    cur_embedding = lstm_encoder(cur_evidence, cur_evidence_pos)
    cur_embedding = cur_embedding[:, -1, :].detach()
    cur_embedding_cpu = F.normalize(cur_embedding, p=2, dim=1).cpu()  # for cosine similarity
    evidence_embeddings = evidence_embeddings + [cur_embedding_cpu]

    del cur_embedding, cur_evidence, cur_evidence_pos

evidence_embeddings = torch.cat(evidence_embeddings, dim=0).t()


In [33]:
torch.cuda.empty_cache()

### evaluation function

In [34]:
def evaluate(dev_text_idx, evidence_embeddings, dev_sort_evidences, dev_claim_evidences, model):
    # get evidence embeddings
    model.eval()

    text_len = len(dev_text_idx[0])
    f = []

    start_idx = 0

    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + 200, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidence_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()
            select_ids = topk_ids[:retrieval_num]

            evidence_correct = 0
            pred_evidences = [dev_sort_evidences[start_idx+i][j] for j in select_ids]

            label = dev_claim_evidences[start_idx+i]

            for evidence_id in label:
                if evidence_id in pred_evidences:
                    evidence_correct += 1
            evidence_fscore = cal_fscore(evidence_correct, label, pred_evidences)
            f.append(evidence_fscore)

        start_idx = end_idx

    fscore = np.mean(f)
    print("\n\nEvidence Retrieval F-score: %.3f\n\n" % fscore)
    return fscore

In [35]:
fscore = evaluate(dev_input, evidence_embeddings, dev_sort_evidences, dev_claim_evidences, lstm_encoder)
print(fscore)

0.28571428571428575
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.25
0.22222222222222224
0.20000000000000004
0.28571428571428575
0.4000000000000001
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.28571428571428575
0.20000000000000004
0.20000000000000004
0.5
0.33333333333333337
0.25
0.5
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.6
0.5
0.20000000000000004
0.20000000000000004
0.20000000000000004
0.28571428571428575
0.33333333333333337
0.20000000000000004
0.6
0.22222222222222224
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.28571428571428575
0.25
0.4000000000000001
0.20000000000000004
0.20000000000000004
1.0
0.20000000000000004


Evidence Retrieval F-score: 0.093


0.09290352504638219


# 4. Make prediction 

### prediction function

In [36]:
def do_prediction(dev_text_idx, evidences_embeddings, dev_sort_evidences, cleaned_evidence_id, encoder_model):
    # get evidence embeddings
    text_len = len(dev_text_idx[0])
    encoder_model.eval()

    start_idx = 0
    prediction = []
    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + 200, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidences_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()

            pred_evidences = [cleaned_evidence_id[dev_sort_evidences[start_idx+i][j]] for j in topk_ids[:retrieval_num]]
            prediction.append(pred_evidences)

        start_idx = end_idx
    return prediction

In [37]:
pred_dev_claims = dict()
pred_test_claims = dict()
dev_evidence_ids = do_prediction(dev_input, evidence_embeddings, dev_sort_evidences, cleaned_evidence_id, lstm_encoder)
test_evidence_ids = do_prediction(test_input, evidence_embeddings, test_sort_evidences, cleaned_evidence_id, lstm_encoder)

with open("data/dev-claims.json", "r") as f:
    dev_claims = json.load(f)
with open("data/test-claims-unlabelled.json", "r") as f:
    test_claims = json.load(f)

for idx, evidence_ids in enumerate(dev_evidence_ids):
    pred_dev_claims[dev_claim_id[idx]] = dev_claims[dev_claim_id[idx]]
    pred_dev_claims[dev_claim_id[idx]]['evidences'] = evidence_ids


for idx, evidence_ids in enumerate(test_evidence_ids):
    pred_test_claims[test_claim_id[idx]] = test_claims[test_claim_id[idx]]
    pred_test_claims[test_claim_id[idx]]['evidences'] = evidence_ids

with open("data/dev_predict.json", "w") as f:
    json.dump(pred_dev_claims, f)
with open("data/test-claims-unlabelled.json", "w") as f:
    json.dump(pred_test_claims, f)

# Task2: CLS LABEL FOR CLAIM

# 1.Preprocessing

In [None]:
dev_cls_data = []
test_cls_data = []
all_max_len = 580

for idx, dev_text in enumerate(dev_text_idx):
    temp_data = {"label": dev_claim_label[idx]}
    temp_text = [wordidx["<cls>"]] + dev_text_idx[idx][:60]
    for i in dev_evidence_ids[idx]:
        temp_text = temp_text + [wordidx["<sep>"]] + evidences_text_idx[evidences_id_dict[i]][:100]
    temp_text.append(wordidx["<sep>"])
    if len(temp_text) < all_max_len:
        temp_text = temp_text + [wordidx["<pad>"]] * (all_max_len - len(temp_text))
    temp_data['text'] = temp_text
    dev_cls_data.append(temp_data)

for idx, test_text in enumerate(test_text_idx):
    temp_data = {}
    temp_text = [wordidx["<cls>"]] + test_text_idx[idx][:60]
    for i in test_evidence_ids[idx]:
        temp_text = temp_text + [wordidx["<sep>"]] + evidences_text_idx[evidences_id_dict[i]][:100]
    temp_text.append(wordidx["<sep>"])
    if len(temp_text) < all_max_len:
        temp_text = temp_text + [wordidx["<pad>"]] * (all_max_len - len(temp_text))
    temp_data['text'] = temp_text
    test_cls_data.append(temp_data)
with open("dev_cls_data.json", "w") as f:
    json.dump(dev_cls_data, f)
with open("test_cls_data.json", "w") as f:
    json.dump(test_cls_data, f)

In [39]:
with open("dev_cls_data.json", "r") as f:
    dev_cls_data = json.load(f)
with open("test_cls_data.json", "r") as f:
    test_cls_data = json.load(f)
with open("pred_train_negative_evidences.json", "r") as f:
    train_negative_evidences = json.load(f)

labels = ["SUPPORTS", "NOT_ENOUGH_INFO", "REFUTES", "DISPUTED"]
labels_id = {"SUPPORTS": 0, "NOT_ENOUGH_INFO": 1, "REFUTES": 2, "DISPUTED": 3}

### Construct TrainDataset

In [40]:
text_max_len = 60
evidence_max_len = 100
all_max_len = 580
retrieval_num = 5

class TrainDataset(Dataset):
    def __init__(self, text_data, evidence_data, positive_evidences, negative_evidences, cls_label, cls_idx, sep_idx, pad_idx, evidence_num=5):
        self.text_data = text_data
        self.evidence_data = evidence_data
        self.negative_evidences = negative_evidences
        self.evidence_num = evidence_num
        self.positive_evidences = positive_evidences
        self.cls_idx = cls_idx
        self.sep_idx = sep_idx
        self.pad_idx = pad_idx

        self.cls_label = [labels_id[i] for i in cls_label]

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        positive_evidences = self.positive_evidences[idx]
        negative_evidences = self.negative_evidences[idx]
        cls_label = self.cls_label[idx]
        return [self.text_data[idx][:text_max_len], positive_evidences, negative_evidences, self.cls_label[idx]]

    def collate_fn(self, batch):
        queries, queries_pos, labels = [], [], []

        for i, j, h, k in batch:
            l = len(j)
            temp_text = [self.cls_idx]
            temp_text = temp_text + i
            for p in j:
                temp_text = temp_text + [self.sep_idx]
                temp_text = temp_text + self.evidence_data[p][:evidence_max_len]
            if self.evidence_num > l:
                n = random.sample(h, self.evidence_num - l)
                for p in n:
                    temp_text = temp_text + [self.sep_idx]
                    temp_text = temp_text + self.evidence_data[p][:evidence_max_len]
            temp_text = temp_text + [self.sep_idx]
            if len(temp_text) < all_max_len:
                temp_text = temp_text + [self.pad_idx] * (all_max_len - len(temp_text))

            queries = queries + [temp_text]
            queries_pos = queries_pos + [list(range(all_max_len))]
            labels = labels + [k]

        batch_encoding = {"queries": torch.LongTensor(queries),
                          "queries_pos": torch.LongTensor(queries_pos),
                          "labels": torch.LongTensor(labels)}

        return batch_encoding

In [43]:
labels = [i["label"] for i in dev_cls_data]
dev_outputs = [labels_id[i] for i in labels]
dev_inputs = [i['text'] for i in dev_cls_data]
test_inputs = [i['text'] for i in test_cls_data]

train_set = TrainDataset(train_text_idx, evidences_text_idx, train_claim_evidences, train_negative_evidences, train_claim_label, wordidx["<cls>"], wordidx["<sep>"], wordidx["<pad>"], evidence_num=retrieval_num)
dataloader = DataLoader(train_set, batch_size=10, shuffle=True, num_workers=0, collate_fn=train_set.collate_fn)

print(Counter(train_claim_label))

Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})


# 2. Model Implementation

### construct cls encoder

In [44]:
class CLS(nn.Module):
    def __init__(self, vocab_emb, embed_dim, hidden_size, output_size, num_layers, max_position=all_max_len):
        super(CLS, self).__init__()

        self.embedding = nn.Embedding(vocab_emb, embed_dim)
        self.pos_embedding = nn.Embedding(max_position, embed_dim)

        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.hidden_layer = nn.Linear(hidden_size * 2, hidden_size)
        self.cls = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)
        self.hidden_size = hidden_size

    def forward(self, text_data, position_text):
        text_x = self.embedding(text_data) + self.pos_embedding(position_text) * 0.01
        x_encoded,_ = self.encoder(text_x)
        x_cls = x_encoded[:, 0, :]
        x_hidden = F.tanh(self.hidden_layer(x_cls))
        x_hidden = self.dropout(x_hidden)
        cls_res = self.cls(x_hidden)
        return cls_res


In [45]:
cls_model = CLS(vocab_emb=len(idxword), embed_dim=256, hidden_size=256, output_size=4, num_layers=7, max_position=700).cuda()

### Set model hyperparameters

In [46]:
weight_decay = 0.02
encoder_optimizer = optim.AdamW(cls_model.parameters(), weight_decay=weight_decay)
max_lr = 1e-2
for param_group in encoder_optimizer.param_groups:
    param_group['lr'] = max_lr
warmup_steps = 300

### validate function to evaluate each epoch

In [47]:
def validate(dev_input, dev_output, cls_model_):
    # get evidence embeddings
    start_idx = 0
    cls_model.eval()
    l = len(dev_output)
    acc = []
    correct_count = 0
    while start_idx < l:
        end_idx = min(start_idx + 50, l)

        cur_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, len(dev_input[0])).cuda()
        cur_pos = torch.LongTensor([list(range(len(dev_input[0]))) for _ in range(end_idx - start_idx)]).cuda()

        cur_res = cls_model_(cur_input, cur_pos)
        cur_res = torch.argmax(cur_res, 1)
        cur_res = cur_res.tolist()

        for i, j in zip(cur_res, dev_output[start_idx: end_idx]):
            if i == j:
                correct_count += 1

        del cur_input, cur_pos
        start_idx = end_idx
    acc = correct_count / l
    print("\n\nClassification Accuracy: %.3f\n\n" % acc)
    cls_model.train()
    return acc

### Start training

In [49]:
encoder_optimizer.zero_grad()
step_cnt, all_step_cnt, avg_loss, maximum_f_score = 0, 0, 0, 0
ce_fn = nn.CrossEntropyLoss(torch.FloatTensor([0.2, 0.3, 0.5, 1.]).cuda())

n_epochs = 5
for epoch in range(n_epochs):
    epoch_step = 0

    for (i, batch) in enumerate(tqdm(dataloader)):

        step_cnt += 1
        cur_res = cls_model(batch["queries"].cuda(), batch["queries_pos"].cuda())
        loss = ce_fn(cur_res, batch["labels"].cuda()) / 2
        loss.backward()

        avg_loss += loss.item()
        if step_cnt == 2:
            # updating
            nn.utils.clip_grad_norm_(cls_model.parameters(), 4)

            step_cnt = 0
            epoch_step += 1
            all_step_cnt += 1

            # adjust learning rate
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6

            encoder_optimizer.step()
            encoder_optimizer.zero_grad()

        if all_step_cnt % 10 == 0 and step_cnt == 0:
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6

            # report stats
            print("\n\nEpoch: %d, Epoch_step: %d, Avg loss: %.6f" % (epoch + 1, epoch_step, avg_loss / 10))
            print(f"Learning rate: {lr:.6f}\n\n")

            avg_loss = 0
        del loss, cur_res

        if all_step_cnt % 50 == 0 and all_step_cnt != 0 and step_cnt == 0:
            # evaluate the model as a scorer
            print("\nEvaluate:\n")

            accuracy = validate(dev_inputs, dev_outputs, cls_model)

            if accuracy > maximum_f_score:
                maximum_f_score = accuracy
                torch.save(cls_model.state_dict(), os.path.join("save_model", "best_cls_model.bin"))
                print("\n\nBest val loss - epoch: %d, Epoch_step: %d" % (epoch, epoch_step))
                print("Max_accuracy", accuracy, "\n\n")

0,1
F_score,▁▆▆▇▅▇▅█
Learning_rate,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
Loss,▂▃▃▂▂█▄▂▁▂▂▃▂▂▃▂▂▃▃▂▃▂▃▂▂▂▃▃▃▃▂▃▃▂▂▃▂▃▃▂

0,1
F_score,0.0929
Learning_rate,0.00082
Loss,4.17638


 16%|█▋        | 20/123 [00:04<00:20,  5.11it/s]



Epoch: 1, Epoch_step: 10, Avg loss: 2.304117
Learning rate: 0.000333




 33%|███▎      | 40/123 [00:08<00:16,  5.11it/s]



Epoch: 1, Epoch_step: 20, Avg loss: 1.507000
Learning rate: 0.000667




 49%|████▉     | 60/123 [00:11<00:12,  5.22it/s]



Epoch: 1, Epoch_step: 30, Avg loss: 1.571600
Learning rate: 0.001000




 65%|██████▌   | 80/123 [00:15<00:08,  5.11it/s]



Epoch: 1, Epoch_step: 40, Avg loss: 1.815439
Learning rate: 0.001333




 80%|████████  | 99/123 [00:19<00:04,  5.14it/s]



Epoch: 1, Epoch_step: 50, Avg loss: 1.595932
Learning rate: 0.001667



Evaluate:



Classification Accuracy: 0.442




Best val loss - epoch: 0, Epoch_step: 50

 81%|████████▏ | 100/123 [00:20<00:09,  2.49it/s]


Max_accuracy 0.44155844155844154 




 98%|█████████▊| 120/123 [00:24<00:00,  5.04it/s]



Epoch: 1, Epoch_step: 60, Avg loss: 1.494501
Learning rate: 0.002000




100%|██████████| 123/123 [00:24<00:00,  4.93it/s]
 14%|█▍        | 17/123 [00:03<00:20,  5.05it/s]



Epoch: 2, Epoch_step: 9, Avg loss: 1.544255
Learning rate: 0.002333




 30%|███       | 37/123 [00:07<00:16,  5.10it/s]



Epoch: 2, Epoch_step: 19, Avg loss: 1.640329
Learning rate: 0.002667




 46%|████▋     | 57/123 [00:11<00:13,  5.01it/s]



Epoch: 2, Epoch_step: 29, Avg loss: 1.484893
Learning rate: 0.003000




 62%|██████▏   | 76/123 [00:15<00:09,  5.01it/s]



Epoch: 2, Epoch_step: 39, Avg loss: 1.468899
Learning rate: 0.003333



Evaluate:



 63%|██████▎   | 77/123 [00:15<00:14,  3.15it/s]



Classification Accuracy: 0.266




 79%|███████▉  | 97/123 [00:19<00:05,  5.05it/s]



Epoch: 2, Epoch_step: 49, Avg loss: 1.429720
Learning rate: 0.003667




 95%|█████████▌| 117/123 [00:23<00:01,  5.05it/s]



Epoch: 2, Epoch_step: 59, Avg loss: 1.400015
Learning rate: 0.004000




100%|██████████| 123/123 [00:24<00:00,  4.96it/s]
 11%|█▏        | 14/123 [00:02<00:21,  5.02it/s]



Epoch: 3, Epoch_step: 7, Avg loss: 1.365819
Learning rate: 0.004333




 28%|██▊       | 34/123 [00:06<00:17,  5.01it/s]



Epoch: 3, Epoch_step: 17, Avg loss: 1.404617
Learning rate: 0.004667




 43%|████▎     | 53/123 [00:10<00:13,  5.04it/s]



Epoch: 3, Epoch_step: 27, Avg loss: 1.392720
Learning rate: 0.005000



Evaluate:



 44%|████▍     | 54/123 [00:11<00:21,  3.26it/s]



Classification Accuracy: 0.117




 60%|██████    | 74/123 [00:15<00:09,  5.08it/s]



Epoch: 3, Epoch_step: 37, Avg loss: 1.388385
Learning rate: 0.005333




 76%|███████▋  | 94/123 [00:19<00:05,  4.99it/s]



Epoch: 3, Epoch_step: 47, Avg loss: 1.398821
Learning rate: 0.005667




 93%|█████████▎| 114/123 [00:22<00:01,  5.08it/s]



Epoch: 3, Epoch_step: 57, Avg loss: 1.390976
Learning rate: 0.006000




100%|██████████| 123/123 [00:24<00:00,  4.97it/s]
  9%|▉         | 11/123 [00:02<00:22,  5.03it/s]



Epoch: 4, Epoch_step: 6, Avg loss: 1.395507
Learning rate: 0.006333




 24%|██▍       | 30/123 [00:05<00:18,  5.07it/s]



Epoch: 4, Epoch_step: 16, Avg loss: 1.383334
Learning rate: 0.006667



Evaluate:



 25%|██▌       | 31/123 [00:06<00:28,  3.28it/s]



Classification Accuracy: 0.266




 41%|████▏     | 51/123 [00:10<00:14,  5.11it/s]



Epoch: 4, Epoch_step: 26, Avg loss: 1.381854
Learning rate: 0.007000




 58%|█████▊    | 71/123 [00:14<00:10,  5.10it/s]



Epoch: 4, Epoch_step: 36, Avg loss: 1.386146
Learning rate: 0.007333




 74%|███████▍  | 91/123 [00:18<00:06,  5.11it/s]



Epoch: 4, Epoch_step: 46, Avg loss: 1.384133
Learning rate: 0.007667




 90%|█████████ | 111/123 [00:22<00:02,  5.10it/s]



Epoch: 4, Epoch_step: 56, Avg loss: 1.386935
Learning rate: 0.008000




100%|██████████| 123/123 [00:24<00:00,  5.03it/s]
  6%|▌         | 7/123 [00:01<00:22,  5.13it/s]



Epoch: 5, Epoch_step: 4, Avg loss: 1.396879
Learning rate: 0.008333



Evaluate:



  7%|▋         | 8/123 [00:01<00:35,  3.24it/s]



Classification Accuracy: 0.266




 23%|██▎       | 28/123 [00:05<00:18,  5.12it/s]



Epoch: 5, Epoch_step: 14, Avg loss: 1.388349
Learning rate: 0.008667




 39%|███▉      | 48/123 [00:09<00:14,  5.12it/s]



Epoch: 5, Epoch_step: 24, Avg loss: 1.388666
Learning rate: 0.009000




 55%|█████▌    | 68/123 [00:13<00:10,  5.13it/s]



Epoch: 5, Epoch_step: 34, Avg loss: 1.383945
Learning rate: 0.009333




 72%|███████▏  | 88/123 [00:17<00:06,  5.15it/s]



Epoch: 5, Epoch_step: 44, Avg loss: 1.388509
Learning rate: 0.009667




 87%|████████▋ | 107/123 [00:21<00:03,  5.11it/s]



Epoch: 5, Epoch_step: 54, Avg loss: 1.397145
Learning rate: 0.010000



Evaluate:



 88%|████████▊ | 108/123 [00:21<00:04,  3.30it/s]



Classification Accuracy: 0.266




100%|██████████| 123/123 [00:24<00:00,  4.98it/s]


# 3. Make Prediction

In [50]:
def do_cls_predict(dev_input, cls_model_):
    # get evidence embeddings
    start_idx = 0
    cls_model.eval()

    cls_res = []
    while start_idx < len(dev_input):
        end_idx = min(start_idx + 50, len(dev_input))

        temp_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, len(dev_input[0])).cuda()
        temp_pos = torch.LongTensor([list(range(len(dev_input[0]))) for _ in range(end_idx - start_idx)]).cuda()

        temp_res = cls_model_(temp_input, temp_pos)
        temp_res = torch.argmax(temp_res, 1).tolist()


        cls_res = cls_res + temp_res

        start_idx = end_idx
        del temp_input, temp_pos

    return cls_res

In [51]:
torch.cuda.empty_cache()

In [53]:
cls_model.load_state_dict(torch.load(os.path.join("save_model", "best_cls_model.bin")))

dev_classes = do_cls_predict(dev_inputs, cls_model)
test_classes = do_cls_predict(test_inputs, cls_model)

In [54]:
with open("data/dev_predict.json", "r") as f:
    pred_dev_claims = json.load(f)
with open("data/test-claims-unlabelled.json", "r") as f:
    pred_test_claims = json.load(f)

for i, j in zip(dev_claim_id, dev_classes):
    claim_label = labels[j]
    evidences = pred_dev_claims[i]['evidences']
    pred_dev_claims[i] = {'claim_text': pred_dev_claims[i]['claim_text'], 'claim_label': claim_label, 'evidences': evidences}

for i, j in zip(test_claim_id, test_classes):
    claim_label = labels[j]
    evidences = pred_test_claims[i]['evidences']
    pred_test_claims[i] = {'claim_text': pred_test_claims[i]['claim_text'], 'claim_label': claim_label, 'evidences': evidences}

with open("data/dev_predict.json", "w") as f:
    json.dump(pred_dev_claims, f)
with open("data/test-claims-unlabelled.json", "w") as f:
    json.dump(pred_test_claims, f)

# 4. Evaluate whole project

In [56]:
output = subprocess.check_output("python eval.py --predictions data/dev_predict.json --groundtruth data/dev-claims.json", shell=True)
output_str = output.decode('utf-8')

# Split the output into lines
output_lines = output_str.strip().split('\n')

# Format the output
formatted_lines = []
for line in output_lines:
    metric, value = line.split('=')
    metric = metric.strip()
    value = value.strip()
    formatted_line = f"{metric}: {value}"
    formatted_lines.append(formatted_line)

# Join the formatted lines into a single string
formatted_output = '\n'.join(formatted_lines)
print(formatted_output)

Evidence Retrieval F-score (F): 0.0929035250463822
Claim Classification Accuracy (A): 0.44155844155844154
Harmonic Mean of F and A: 0.15350890539643472


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*