# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

**We use pytorch, nltk, scikit-learn in this project.**

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## PreProcess for evidence and claims

### preprocessing function

In [3]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [None]:
print("Current CUDA device:", torch.cuda.current_device())
print("Device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Current CUDA device: 0
Device count: 1
Device name: NVIDIA GeForce RTX 4070 Ti SUPER


### read files

In [9]:
import json
import nltk
import string
import re
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from statistics import mean
import torch

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
random.seed(42)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

with open('data/train-claims.json', 'r') as input_file:
    train_claims = json.load(input_file)

# Read in development data (claim)
with open('data/dev-claims.json', 'r') as input_file:
    dev_claims = json.load(input_file)

# Read in test data (claim)
with open('data/test-claims-unlabelled.json', 'r') as input_file:
    test_claims = json.load(input_file)

# Read in evidence data
with open('data/evidence.json', 'r') as input_file:
    evidences = json.load(input_file)

#EDA
claim_count = 0
evi_count = 0
claim_length = []
evidence_count = []
evidence_length = []
labels = []

for key,value in train_claims.items():
    claim_count+=1
    claim_length.append(len(value["claim_text"]))
    evidence_count.append(len(value["evidences"]))
    evidence_length += [len(evidences[x]) for x in value["evidences"]]
    labels.append(value["claim_label"])

for key,value in evidences.items():
    evi_count+=1

print("claim count: ",claim_count)
print("evidence count: ",evi_count)
print("max claim length: ",max(claim_length))
print("min claim length: ",min(claim_length))
print("mean claim length: ",mean(claim_length))
print("max evidence count: ",max(evidence_count))
print("min evidence count: ",min(evidence_count))
print("mean evidence count: ",mean(evidence_count))
print("max evidence length: ",max(evidence_length))
print("min evidence length: ",min(evidence_length))
print("mean evidence length: ",mean(evidence_length))
print(Counter(labels))

inside = 0
outside = 0

train_evi_id = []
for claim_id,claim_value in train_claims.items():
    train_evi_id=train_evi_id+claim_value['evidences']

for claim_id,claim_value in dev_claims.items():
    test_evi_id=claim_value['evidences']
    for e in test_evi_id:
        if e in train_evi_id:
            inside += 1
        else:
            outside += 1
print("Dev evi inside train evi", inside)
print("Dev evi outside train evi", outside)

full_evidence_id = list(evidences.keys())
full_evidence_text  = list(evidences.values())
train_claim_id = list(train_claims.keys())
train_claim_text  = [ v["claim_text"] for v in train_claims.values()]
print("Train claim count: ",len(train_claim_id))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


claim count:  1228
evidence count:  1208827
max claim length:  332
min claim length:  26
mean claim length:  122.95521172638436
max evidence count:  5
min evidence count:  1
mean evidence count:  3.3566775244299674
max evidence length:  1979
min evidence length:  13
mean evidence length:  173.5
Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})
Dev evi inside train evi 163
Dev evi outside train evi 328
Train claim count:  1228


In [5]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [6]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text


def preprocessing_text(text):

     # Convert to lowercase
    text = text.lower()

     # Lemmatize the text
    text = lemmatize_text(text)

    # Remove leading/trailing whitespaces
    text = text.strip()
    
    return text

In [7]:
def preprocess_claim_data(claim_data, existed_evidences_id=None):
    claim_data_text = []
    claim_data_id = []
    claim_data_label = []
    claim_evidences = []
    for key in claim_data.keys():
        claim_data[key]["claim_text"] = preprocessing_text(claim_data[key]["claim_text"])
        claim_data_text.append(claim_data[key]["claim_text"])
        claim_data_id.append(key)
        if "claim_label" in claim_data[key]:
            claim_data_label.append(claim_data[key]["claim_label"])
        else:
            claim_data_label.append(None)
        if existed_evidences_id and "evidences" in claim_data[key]:
            valid_evidences = [existed_evidences_id[i] for i in claim_data[key]["evidences"] if i in existed_evidences_id]
            claim_evidences.append(valid_evidences)
        else:
            claim_evidences.append([])
    return claim_data_text, claim_data_id, claim_data_label, claim_evidences


def preprocess_evi_data(evi_data):
    cleaned_evidence_text = []
    cleaned_evidence_id = []
    for key, value in evi_data.items():
        cleaned_text = preprocessing_text(value)
        cleaned_evidence_text.append(cleaned_text)
        cleaned_evidence_id.append(key)
    return cleaned_evidence_text, cleaned_evidence_id

In [10]:
cleaned_evidence_text, cleaned_evidence_id = preprocess_evi_data(evidences)

evidences_id_dict = {evidence_id: idx for idx, evidence_id in enumerate(cleaned_evidence_id)}

train_claim_text, train_claim_id, train_claim_label, train_claim_evidences = preprocess_claim_data(train_claims, evidences_id_dict)

dev_claim_text, dev_claim_id, dev_claim_label, dev_claim_evidences = preprocess_claim_data(dev_claims, evidences_id_dict)

test_claim_text, test_claim_id, _, _ = preprocess_claim_data(test_claims)

In [11]:
print("Number of claims after removing non-English:", len(train_claim_evidences))
print("Number of claims after preprocessing:", len(train_claim_text))

Number of claims after removing non-English: 1228
Number of claims after preprocessing: 1228


In [12]:
print(train_claim_text[0:10])

['not only is there no scientific evidence that co2 is a pollutant , higher co2 concentration actually help ecosystem support more plant and animal life .', 'el niño drove record high in global temperature suggesting rise may not be down to man-made emission .', 'in 1946 , pdo switched to a cool phase .', 'weather channel co-founder john coleman provided evidence that convincingly refutes the concept of anthropogenic global warming .', '`` january 2008 capped a 12 month period of global temperature drop on all of the major well respected indicator .', 'the last time the planet wa even four degree warmer , peter brannen point out in the end of the world , his new history of the planet ’ s major extinction event , the ocean were hundred of foot higher .', 'tree-ring proxy reconstruction are reliable before 1960 , tracking closely with the instrumental record and other independent proxy .', 'under the most ambitious scenario , they found a strong likelihood that antarctica would remain fa

### tfidf retrieval

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(cleaned_evidence_text)

def fit_transform_tfidf(train_texts, dev_texts, test_texts, evidence_texts):
    train_tfidf = vectorizer.transform(train_texts)
    dev_tfidf = vectorizer.transform(dev_texts)
    test_tfidf = vectorizer.transform(test_texts)
    evidence_tfidf = vectorizer.transform(evidence_texts)

    return train_tfidf, dev_tfidf, test_tfidf, evidence_tfidf

train_tfidf, dev_tfidf, test_tfidf, evidence_tfidf = fit_transform_tfidf(train_claim_text, dev_claim_text, test_claim_text, cleaned_evidence_text)

In [14]:
train_cos_sims = cosine_similarity(train_tfidf, evidence_tfidf)
dev_cos_sims = cosine_similarity(dev_tfidf, evidence_tfidf)
test_cos_sims = cosine_similarity(test_tfidf, evidence_tfidf)
print(train_cos_sims.shape)

(1228, 54272)


In [15]:
def test_retrieval_topk(k, cur_scores, cur_labels):
    ACC = []
    top_ids = torch.topk(torch.FloatTensor(cur_scores), k, -1).indices.tolist()
    for i in range(len(cur_labels)):
        no_recall_count = 0
        recall_count = 0
        for cur_ in cur_labels[i]:
            if cur_ in top_ids[i]:
                recall_count += 1
            else:
                no_recall_count += 1
        if recall_count + no_recall_count == 0:
            no_recall_count = 1e-9  # to avoid division by zero
        ACC.append(recall_count / (recall_count + no_recall_count))
    return sum(ACC) / len(ACC)

topK = 30
print(test_retrieval_topk(topK, train_cos_sims, train_claim_evidences))
print(test_retrieval_topk(topK, dev_cos_sims, dev_claim_evidences))

0.2799809989142237
0.32932900432900436


In [16]:
def sort_evidence_candidates(cos_sims):
    top_ids = np.argsort(-cos_sims, axis=1)[:, :10000]
    return top_ids.tolist()

In [17]:
dev_sort_evidences = sort_evidence_candidates(dev_cos_sims)
test_sort_evidences = sort_evidence_candidates(test_cos_sims)
train_sort_evidences = sort_evidence_candidates(train_cos_sims)

### construct vocab and indexing

In [18]:
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import random

min_count = 5
wordcount = defaultdict(int)
idxword = ["<cls>",  "<sep>", "<pad>", "<unk>"]

for texts in train_claim_text + cleaned_evidence_text:
    for word in texts.split():
        wordcount[word] += 1

for word, count in wordcount.items():
    if count > min_count:
        idxword.append(word)

wordidx = {value: i for i, value in enumerate(idxword)}

In [19]:
def convert2idx(text_data, wordidx_):
    idx_data = []
    unk_idx = wordidx_["<unk>"]
    for texts in text_data:
        temp_idx = [wordidx_.get(word, unk_idx) for word in texts.split()]
        idx_data.append(temp_idx)
    return idx_data

In [20]:
train_text_idx = convert2idx(train_claim_text, wordidx)
dev_text_idx = convert2idx(dev_claim_text, wordidx)
test_text_idx = convert2idx(test_claim_text, wordidx)
evidences_text_idx = convert2idx(cleaned_evidence_text, wordidx)

In [21]:
print(max([len(i) for i in train_text_idx]), max([len(i) for i in dev_text_idx]), max([len(i) for i in test_text_idx]), max([len(i) for i in evidences_text_idx]))

76 73 60 298


In [22]:
def construct_input_text(text_idx, padding_len, wordidx_):
    idx_data = []
    cls_idx = wordidx_["<cls>"]
    sep_idx = wordidx_["<sep>"]
    pad_idx = wordidx_["<pad>"]

    for texts in text_idx:
        if len(texts) < padding_len:
            padded_text = [cls_idx] + texts + [sep_idx] + [pad_idx] * (padding_len - len(texts))
        else:
            padded_text = [cls_idx] + texts[:padding_len] + [sep_idx]
        idx_data.append(padded_text)

    return idx_data

In [23]:
text_pad_len = 50
evidences_pad_len = 80
train_input = construct_input_text(train_text_idx, text_pad_len, wordidx)
dev_input = construct_input_text(dev_text_idx, text_pad_len, wordidx)
test_input = construct_input_text(test_text_idx, text_pad_len, wordidx)
evidences_input = construct_input_text(evidences_text_idx, evidences_pad_len, wordidx)

In [24]:
print(len(train_input[0]), len(evidences_input[0]))

52 82


In [25]:
vocab_size = len(idxword)
print(vocab_size)

11792


In [39]:
class TrainDataset(Dataset):
    def __init__(self, text_data, evidence_data, sorted_evidences, evidence_label, negative_num):
        self.text_data = text_data
        self.evidence_data = evidence_data
        self.sorted_evidences = sorted_evidences
        self.evidence_label = evidence_label
        self.negative_num = negative_num
        self.evidence_len = len(evidence_data[0])
        self.text_len = len(text_data[0])

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        positive_evidences = self.evidence_label[idx]
        negative_evidences = random.sample(self.sorted_evidences[idx][30: self.negative_num * 10], self.negative_num)
        return [self.text_data[idx], negative_evidences, positive_evidences]

    def collate_fn(self, batch):
        queries, queries_pos, evidences, temp_labels = [], [], [], []

        for i, j, k in batch:
            queries = queries + [i]
            queries_pos = queries_pos + [list(range(self.text_len))]
            temp_labels.append(k)
            evidences.extend(k + j)

        evidences = list(set(evidences))

        evidences2idx = {evid: i for i, evid in enumerate(evidences)}
        labels = [[evidences2idx[evid] for evid in pos_evidences] for pos_evidences in temp_labels]

        evidences = [self.evidence_data[i] for i in evidences]
        evidences_pos = [list(range(self.evidence_len)) for _ in range(len(evidences))]


        batch_encoding = {
            "queries": torch.LongTensor(queries),
            "evidences": torch.LongTensor(evidences),
            "queries_pos": torch.LongTensor(queries_pos),
            "evidences_pos": torch.LongTensor(evidences_pos),
            "labels": labels
        }

        return batch_encoding

In [40]:
train_set = TrainDataset(train_input, evidences_input, train_sort_evidences, train_claim_evidences, negative_num=10)
dataloader = DataLoader(train_set, batch_size=5, shuffle=True, num_workers=0, collate_fn=train_set.collate_fn)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [50]:
class Encoder(nn.Module):
    def __init__(self, vocab_emb, embed_dim, hidden_size, num_layers, max_position=180, dropout=0.2):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_emb, embed_dim)
        self.pos_embedding = nn.Embedding(max_position, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text_data, position_text):
        text_x = self.embedding(text_data) + self.pos_embedding(position_text)
        text_x = self.dropout(text_x)
        x_encoded, _ = self.encoder(text_x)
        x_encoded = self.dropout(x_encoded)
        return x_encoded

In [51]:
lstm_encoder = Encoder(vocab_emb=vocab_size, embed_dim=512, hidden_size=512, num_layers=6, max_position=180)
lstm_encoder.cuda()

Encoder(
  (embedding): Embedding(11792, 512)
  (pos_embedding): Embedding(180, 512)
  (encoder): LSTM(512, 512, num_layers=7, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

### Training

In [52]:
torch.manual_seed(41)
torch.cuda.manual_seed_all(41)
random.seed(41)

weight_decay = 1e-4
encoder_optimizer = optim.Adam(lstm_encoder.parameters(), weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', factor=0.1, patience=10, verbose=True, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=1e-8, eps=1e-08)

max_lr = 1e-5
for param_group in encoder_optimizer.param_groups:
    param_group['lr'] = max_lr

accumulate_step = 3
grad_norm = 0.5
warmup_steps = 500
report_freq = 10
eval_interval = 50
save_dir = "model_ckpts"

In [53]:
def cal_fscore(evidence_correct, label, pred_evidences):
    if evidence_correct > 0:
        evidence_recall = float(evidence_correct) / len(label)
        evidence_precision = float(evidence_correct) / len(pred_evidences)
        evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
        print(evidence_fscore)
    else:
        evidence_fscore = 0
    return evidence_fscore

In [54]:
retrieval_num = 5
dev_candis_num = 10

def validate(dev_text_idx, evidence_text_idx, dev_sort_evidences, dev_claim_evidences, encoder_model):
    # get evidence embeddings

    encoder_model.eval()
    start_idx = 0
    batch_size = 900
    evidence_embeddings = get_embeddings(evidence_text_idx, encoder_model, batch_size)

    text_len = len(dev_text_idx[0])

    f_scores = []

    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + batch_size, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidence_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()
            select_ids = topk_ids[:retrieval_num]

            evidence_correct = 0
            pred_evidences = [dev_sort_evidences[start_idx+i][j] for j in select_ids]
            label = dev_claim_evidences[start_idx+i]
            for evidence_id in label:
                if evidence_id in pred_evidences:
                    evidence_correct += 1
            evidence_fscore = cal_fscore(evidence_correct, label, pred_evidences)
            f_scores = f_scores + [evidence_fscore]

        start_idx = end_idx

    fscore = np.mean(f_scores)
    encoder_model.train()
    print("\n\nEvidence Retrieval F-score: %.3f\n\n" % fscore)
    return fscore

def get_embeddings(text_indices, encoder_model, batch_size=900):
    embeddings = []
    text_len = len(text_indices[0])

    for start_idx in range(0, len(text_indices), batch_size):
        end_idx = min(start_idx + batch_size, len(text_indices))
        cur_query = torch.LongTensor(text_indices[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach().cpu()
        embeddings.append(query_embedding)

    return torch.cat(embeddings, dim=0).t()


In [None]:
%env WANDB_NOTEBOOK_NAME Mon5PMGroup7_COMP90042_Project_2024_Test.ipynb

In [None]:
import subprocess

def run_command(command):
    result = subprocess.run(command, shell=True, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}\n{result.stderr}")
    else:
        print(f"Command succeeded: {command}\n{result.stdout}")
    return result

required_packages = ["wandb"]
run_command(f"pip install {' '.join(required_packages)}")

In [55]:
# start training
import wandb
import os

wandb.init(project="nlp", name="dpr")

from tqdm import tqdm

encoder_optimizer.zero_grad()
step_cnt, all_step_cnt, avg_loss, maximum_f_score = 0, 0, 0, 0
n_epochs = 5
for epoch in range(n_epochs):
    epoch_step = 0

    for (i, batch) in enumerate(tqdm(dataloader)):

        query_embeddings = lstm_encoder(batch["queries"].cuda(), batch["queries_pos"].cuda())
        evidence_embeddings = lstm_encoder(batch["evidences"].cuda(), batch["evidences_pos"].cuda())

        query_embeddings = query_embeddings[:, -1, :]
        evidence_embeddings = evidence_embeddings[:, -1, :]

        assert query_embeddings.size(1) == evidence_embeddings.size(1), "Embedding dimensions do not match!"

        query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
        evidence_embeddings = torch.nn.functional.normalize(evidence_embeddings, p=2, dim=1)

        cos_sims = torch.mm(query_embeddings, evidence_embeddings.t())
        scores = cos_sims / 0.1
        loss = []
        start_idx = 0
        criterion = torch.nn.CrossEntropyLoss()
        for idx, labels in enumerate(batch["labels"]):
            labels = torch.LongTensor(labels).cuda()
            cur_loss = criterion(scores[idx].unsqueeze(0).repeat(len(labels), 1), labels)
        loss = loss + [cur_loss]
        loss = torch.stack(loss).mean()

        # cos_sims = torch.mm(query_embeddings, evidence_embeddings.t())
        # scores = - torch.nn.functional.log_softmax(cos_sims / 0.1 + 1e-10, dim=1)
        # loss = []
        # start_idx = 0
        # for idx, label in enumerate(batch["labels"]):
        #     label = torch.LongTensor(label).cuda()
        #     cur_loss = torch.mean(torch.index_select(scores[idx], 0, label))
        #     loss.append(cur_loss)
        # loss = torch.stack(loss).mean()
        
        loss = loss / accumulate_step
        loss.backward()

        avg_loss += loss.item()

        step_cnt += 1
        if step_cnt == accumulate_step:
            # updating
            if grad_norm > 0:
                nn.utils.clip_grad_norm_(lstm_encoder.parameters(), grad_norm)

            step_cnt = 0
            epoch_step += 1
            all_step_cnt += 1

            # adjust learning rate
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-5

            encoder_optimizer.step()
            encoder_optimizer.zero_grad()

        if all_step_cnt % report_freq == 0 and step_cnt == 0:
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-5

            wandb.log({"learning_rate": lr}, step=all_step_cnt)
            wandb.log({"loss": avg_loss / report_freq}, step=all_step_cnt)
            
            # report stats
            print("\n")
            print("epoch: %d, epoch_step: %d, avg loss: %.6f" % (epoch + 1, epoch_step, avg_loss / report_freq))
            print(f"learning rate: {lr:.6f}")
            print("\n")
            avg_loss = 0
        del loss, cos_sims, query_embeddings, evidence_embeddings

        if all_step_cnt % eval_interval == 0 and all_step_cnt != 0 and step_cnt == 0:
            # evaluate the model as a scorer
            print("\nEvaluate:\n")

            f_score = validate(dev_input, evidences_input, dev_sort_evidences, dev_claim_evidences, lstm_encoder)
            wandb.log({"f_score": f_score}, step=all_step_cnt)
            
            if f_score > maximum_f_score:
                maximum_f_score = f_score
                os.makedirs(save_dir, exist_ok=True)
                torch.save(lstm_encoder.state_dict(), os.path.join(os.path.abspath(save_dir), "best_ckpt.bin"))
                print()
                print()
                print("best val loss - epoch: %d, epoch_step: %d" % (epoch, epoch_step))
                print("maximum_f_score", f_score, "\n\n")

  3%|▎         | 7/246 [00:03<02:13,  1.79it/s]


KeyboardInterrupt: 

In [61]:
torch.cuda.empty_cache()

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [63]:
lstm_encoder.load_state_dict(torch.load(os.path.join(save_dir, "best_ckpt.bin")))
lstm_encoder.cuda().eval()

Encoder(
  (embedding): Embedding(11792, 512)
  (pos_embedding): Embedding(180, 512)
  (encoder): LSTM(512, 512, num_layers=7, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [64]:
evidence_embeddings = []
start_idx = 0
batch_size = 900
evidence_len = len(evidences_input[0])

while start_idx < len(evidences_input):
    end_idx = min(start_idx + batch_size, len(evidences_input))

    cur_evidence = torch.LongTensor(evidences_input[start_idx:end_idx]).view(-1, evidence_len).cuda()
    cur_evidence_pos = torch.LongTensor([list(range(evidence_len)) for _ in range(end_idx - start_idx)]).cuda()
    start_idx = end_idx

    cur_embedding = lstm_encoder(cur_evidence, cur_evidence_pos)
    cur_embedding = cur_embedding[:, -1, :].detach()
    cur_embedding_cpu = F.normalize(cur_embedding, p=2, dim=1).cpu()  # for cosine similarity
    evidence_embeddings = evidence_embeddings + [cur_embedding_cpu]

    del cur_embedding, cur_evidence, cur_evidence_pos

evidence_embeddings = torch.cat(evidence_embeddings, dim=0).t()


In [65]:
torch.cuda.empty_cache()

In [66]:
def validate_(dev_text_idx, evidence_embeddings, dev_sort_evidences, dev_claim_evidences, encoder_model):
    # get evidence embeddings
    encoder_model.eval()

    text_len = len(dev_text_idx[0])
    f = []

    start_idx = 0

    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + 200, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidence_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()
            select_ids = topk_ids[:retrieval_num]

            evidence_correct = 0
            pred_evidences = [dev_sort_evidences[start_idx+i][j] for j in select_ids]

            label = dev_claim_evidences[start_idx+i]

            for evidence_id in label:
                if evidence_id in pred_evidences:
                    evidence_correct += 1
            evidence_fscore = cal_fscore(evidence_correct, label, pred_evidences)
            f.append(evidence_fscore)

        start_idx = end_idx
        
    fscore = np.mean(f)
    print("\n\nEvidence Retrieval F-score: %.3f\n\n" % fscore)
    return fscore

In [67]:
fscore = validate_(dev_input, evidence_embeddings, dev_sort_evidences, dev_claim_evidences, lstm_encoder)
print(fscore)

0.33333333333333337
0.25
0.20000000000000004
0.4000000000000001
0.20000000000000004
0.28571428571428575
0.25
0.33333333333333337
0.28571428571428575
0.22222222222222224
0.33333333333333337
0.25
0.33333333333333337
0.28571428571428575
0.28571428571428575
0.28571428571428575
0.4000000000000001
0.5714285714285715
0.28571428571428575
0.33333333333333337
0.33333333333333337
0.33333333333333337
0.6
0.25
0.25
0.33333333333333337
0.20000000000000004
0.33333333333333337
0.4000000000000001
0.25
0.20000000000000004
0.22222222222222224
0.25
0.33333333333333337
0.28571428571428575
0.33333333333333337
0.33333333333333337
0.25
0.4000000000000001
0.28571428571428575
0.4000000000000001
0.20000000000000004
0.33333333333333337
0.28571428571428575


Evidence Retrieval F-score: 0.088


0.08779633065347352


In [68]:
def evidence_predicts(dev_text_idx, evidences_embeddings, dev_sort_evidences, cleaned_evidence_id, encoder_model):
    # get evidence embeddings
    text_len = len(dev_text_idx[0])
    encoder_model.eval()

    start_idx = 0
    preds = []
    while start_idx < len(dev_text_idx):
        end_idx = min(start_idx + 200, len(dev_text_idx))

        cur_query = torch.LongTensor(dev_text_idx[start_idx:end_idx]).view(-1, text_len).cuda()
        cur_query_pos = torch.LongTensor([list(range(text_len)) for _ in range(end_idx - start_idx)]).cuda()

        query_embedding = encoder_model(cur_query, cur_query_pos)
        query_embedding = query_embedding[:, -1, :].detach()
        query_embedding = F.normalize(query_embedding, p=2, dim=1).cpu()

        scores = torch.mm(query_embedding, evidences_embeddings)

        for i in range(scores.size(0)):
            new_score = torch.index_select(scores[i], 0, torch.LongTensor(dev_sort_evidences[start_idx+i][:dev_candis_num]))
            topk_ids = torch.argsort(new_score).tolist()

            pred_evidences = [cleaned_evidence_id[dev_sort_evidences[start_idx+i][j]] for j in topk_ids[:retrieval_num]]
            preds.append(pred_evidences)

        start_idx = end_idx
    return preds

In [69]:
pred_dev_claims = dict()
pred_test_claims = dict()
dev_evidences_ids = evidence_predicts(dev_input, evidence_embeddings, dev_sort_evidences, cleaned_evidence_id, lstm_encoder)
test_evidences_ids = evidence_predicts(test_input, evidence_embeddings, test_sort_evidences, cleaned_evidence_id, lstm_encoder)
with open("data/dev-claims.json", "r") as f:
    dev_claims = json.load(f)
with open("data/test-claims-unlabelled.json", "r") as f:
    test_claims = json.load(f)

for idx, evidence_ids in enumerate(dev_evidences_ids):
    pred_dev_claims[dev_claim_id[idx]] = dev_claims[dev_claim_id[idx]]
    pred_dev_claims[dev_claim_id[idx]]['evidences'] = evidence_ids


for idx, evidence_ids in enumerate(test_evidences_ids):
    pred_test_claims[test_claim_id[idx]] = test_claims[test_claim_id[idx]]
    pred_test_claims[test_claim_id[idx]]['evidences'] = evidence_ids


In [70]:
with open("data/dev_predict.json", "w") as f:
    json.dump(pred_dev_claims, f)
with open("data/test-claims-unlabelled.json", "w") as f:
    json.dump(pred_test_claims, f)

In [71]:
train_evidences_ids = evidence_predicts(train_input, evidence_embeddings, train_sort_evidences, cleaned_evidence_id, lstm_encoder)

pred_train_negative_evidences = []
for idx, evidence_ids in enumerate(train_evidences_ids):
    temp_ = []
    for i in evidence_ids:
        if evidences_id_dict[i] not in train_claim_evidences[idx]:
            temp_ = temp_ + [evidences_id_dict[i]]
    pred_train_negative_evidences.append(temp_)

In [72]:
## save prediction data
with open("pred_train_negative_evidences.json", "w") as f:
    json.dump(pred_train_negative_evidences, f)

In [73]:
## save cls data

dev_cls_data = []
test_cls_data = []
all_max_len = 580

for idx, dev_text in enumerate(dev_text_idx):
    cur_data = {"label": dev_claim_label[idx]}
    temp_text = [wordidx["<cls>"]] + dev_text_idx[idx][:60]
    for i in dev_evidences_ids[idx]:
        temp_text = temp_text + [wordidx["<sep>"]] + evidences_text_idx[evidences_id_dict[i]][:100]
    temp_text.append(wordidx["<sep>"])
    if len(temp_text) < all_max_len:
        temp_text = temp_text + [wordidx["<pad>"]] * (all_max_len - len(temp_text))
    cur_data['text'] = temp_text
    dev_cls_data.append(cur_data)

for idx, dev_text in enumerate(test_text_idx):
    cur_data = {}
    temp_text = [wordidx["<cls>"]] + test_text_idx[idx][:60]
    for i in test_evidences_ids[idx]:
        temp_text = temp_text + [wordidx["<sep>"]] + evidences_text_idx[evidences_id_dict[i]][:100]
    temp_text.append(wordidx["<sep>"])
    if len(temp_text) < all_max_len:
        temp_text = temp_text + [wordidx["<pad>"]] * (all_max_len - len(temp_text))
    cur_data['text'] = temp_text
    test_cls_data.append(cur_data)
with open("dev_cls_data.json", "w") as f:
    json.dump(dev_cls_data, f)
with open("test_cls_data.json", "w") as f:
    json.dump(test_cls_data, f)

Task2

Preprocessing

In [74]:
import json

with open("dev_cls_data.json", "r") as f:
    dev_cls_data = json.load(f)
with open("test_cls_data.json", "r") as f:
    test_cls_data = json.load(f)
with open("pred_train_negative_evidences.json", "r") as f:
    train_negative_evidences = json.load(f)

id2labels = ["SUPPORTS", "NOT_ENOUGH_INFO", "REFUTES", "DISPUTED"]
labels2id = {"SUPPORTS": 0, "NOT_ENOUGH_INFO": 1, "REFUTES": 2, "DISPUTED": 3}

In [85]:
text_max_len = 60
evidence_max_len = 100
all_max_len = 580
retrieval_num = 5

class TrainDataset(Dataset):
    def __init__(self, text_data, evidence_data, positive_evidences, negative_evidences, cls_label, cls_idx, sep_idx, pad_idx, evidence_num=5):
        self.text_data = text_data
        self.evidence_data = evidence_data
        self.negative_evidences = negative_evidences
        self.evidence_num = evidence_num
        self.positive_evidences = positive_evidences
        self.cls_idx = cls_idx
        self.sep_idx = sep_idx
        self.pad_idx = pad_idx

        self.cls_label = [labels2id[i] for i in cls_label]

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        positive_evidences = self.positive_evidences[idx]
        negative_evidences = self.negative_evidences[idx]
        cls_label = self.cls_label[idx]
        return [self.text_data[idx][:text_max_len], positive_evidences, negative_evidences, self.cls_label[idx]]

    def collate_fn(self, batch):
        queries, queries_pos, labels = [], [], []

        for i, j, h, k in batch:
            l = len(j)
            temp_text = [self.cls_idx]
            temp_text = temp_text + i
            for p in j:
                temp_text = temp_text + [self.sep_idx]
                temp_text = temp_text + self.evidence_data[p][:evidence_max_len]
            if self.evidence_num > l:
                n = random.sample(h, self.evidence_num - l)
                for p in n:
                    temp_text = temp_text + [self.sep_idx]
                    temp_text = temp_text + self.evidence_data[p][:evidence_max_len]
            temp_text = temp_text + [self.sep_idx]
            if len(temp_text) < all_max_len:
                temp_text = temp_text + [self.pad_idx] * (all_max_len - len(temp_text))

            queries = queries + [temp_text]
            queries_pos = queries_pos + [list(range(all_max_len))]
            labels = labels + [k]

        batch_encoding = {"queries": torch.LongTensor(queries),
                          "queries_pos": torch.LongTensor(queries_pos),
                          "labels": torch.LongTensor(labels)}

        return batch_encoding

In [86]:
labels = [i["label"] for i in dev_cls_data]
dev_outputs = [labels2id[i] for i in labels]
dev_inputs = [i['text'] for i in dev_cls_data]
test_inputs = [i['text'] for i in test_cls_data]

In [87]:
train_set = TrainDataset(train_text_idx, evidences_text_idx, train_claim_evidences, train_negative_evidences, train_claim_label, wordidx["<cls>"], wordidx["<sep>"], wordidx["<pad>"], evidence_num=retrieval_num)
dataloader = DataLoader(train_set, batch_size=10, shuffle=True, num_workers=0, collate_fn=train_set.collate_fn)

In [88]:
print(Counter(train_claim_label))

Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})


In [89]:
class CLS(nn.Module):
    def __init__(self, vocab_emb, embed_dim, hidden_size, output_size, num_layers, max_position=all_max_len):
        super(CLS, self).__init__()

        self.embedding = nn.Embedding(vocab_emb, embed_dim)
        self.pos_embedding = nn.Embedding(max_position, embed_dim)

        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.hidden_layer = nn.Linear(hidden_size * 2, hidden_size)
        self.cls = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.hidden_size = hidden_size

    def forward(self, text_data, position_text):
        text_x = self.embedding(text_data) + self.pos_embedding(position_text) * 0.01
        x_encoded,_ = self.encoder(text_x)
        x_cls = x_encoded[:, 0, :]
        x_hidden = F.tanh(self.hidden_layer(x_cls))
        x_hidden = self.dropout(x_hidden)
        cls_res = self.cls(x_hidden)
        return cls_res


In [90]:
cls_model = CLS(vocab_emb=len(idxword), embed_dim=256, hidden_size=256, output_size=4, num_layers=7, max_position=700).cuda()

In [91]:
encoder_optimizer = optim.AdamW(cls_model.parameters())
max_lr = 1e-2
for param_group in encoder_optimizer.param_groups:
    param_group['lr'] = max_lr
warmup_steps = 300

In [92]:
def validate(dev_input, dev_output, cls_model_):
    # get evidence embeddings
    start_idx = 0
    cls_model.eval()
    l = len(dev_output)
    acc = []
    correct_count = 0
    while start_idx < l:
        end_idx = min(start_idx + 50, l)

        cur_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, len(dev_input[0])).cuda()
        cur_pos = torch.LongTensor([list(range(len(dev_input[0]))) for _ in range(end_idx - start_idx)]).cuda()

        cur_res = cls_model_(cur_input, cur_pos)
        cur_res = torch.argmax(cur_res, 1)
        cur_res = cur_res.tolist()

        for i, j in zip(cur_res, dev_output[start_idx: end_idx]):
            if i == j:
                correct_count += 1

        del cur_input, cur_pos
        start_idx = end_idx
    acc = correct_count / l
    print("\n\nClassification Accuracy: %.3f\n\n" % acc)
    cls_model.train()
    return acc

In [93]:
%env WANDB_NOTEBOOK_NAME Mon5PMGroup7_COMP90042_Project_2024_Test.ipynb

env: WANDB_NOTEBOOK_NAME=Mon5PMGroup7_COMP90042.ipynb


In [97]:
import wandb
wandb.init(project="nlp", name="cls")

encoder_optimizer.zero_grad()
step_cnt, all_step_cnt, avg_loss, maximum_f_score = 0, 0, 0, 0
ce_fn = nn.CrossEntropyLoss(torch.FloatTensor([0.2, 0.3, 0.5, 1.]).cuda())

n_epochs = 5
for epoch in range(n_epochs):
    epoch_step = 0

    for (i, batch) in enumerate(tqdm(dataloader)):

        step_cnt += 1
        cur_res = cls_model(batch["queries"].cuda(), batch["queries_pos"].cuda())
        loss = ce_fn(cur_res, batch["labels"].cuda()) / 2
        loss.backward()

        avg_loss += loss.item()
        if step_cnt == 2:
            # updating
            nn.utils.clip_grad_norm_(cls_model.parameters(), 4)

            step_cnt = 0
            epoch_step += 1
            all_step_cnt += 1

            # adjust learning rate
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6

            encoder_optimizer.step()
            encoder_optimizer.zero_grad()

        if all_step_cnt % 10 == 0 and step_cnt == 0:
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6

            wandb.log({"Learning_rate": lr}, step=all_step_cnt)
            wandb.log({"Loss": avg_loss / report_freq}, step=all_step_cnt)
            
            # report stats
            print("\n\nEpoch: %d, Epoch_step: %d, Avg loss: %.6f" % (epoch + 1, epoch_step, avg_loss / 10))
            print(f"Learning rate: {lr:.6f}\n\n")

            avg_loss = 0
        del loss, cur_res

        if all_step_cnt % 50 == 0 and all_step_cnt != 0 and step_cnt == 0:
            # evaluate the model as a scorer
            print("\nEvaluate:\n")

            f_score = validate(dev_inputs, dev_outputs, cls_model)
            wandb.log({"Acc": f_score}, step=all_step_cnt)
            
            if f_score > maximum_f_score:
                maximum_f_score = f_score
                torch.save(cls_model.state_dict(), os.path.join("model_ckpts", "best_cls_ckpt.bin"))
                print("\n\nBest val loss - epoch: %d, Epoch_step: %d" % (epoch, epoch_step))
                print("Maximum_f_score", f_score, "\n\n")

 16%|█▋        | 20/123 [00:16<01:24,  1.21it/s]



epoch: 1, epoch_step: 10, avg loss: 1.410911
learning rate: 0.000033




 33%|███▎      | 40/123 [00:32<01:06,  1.25it/s]



epoch: 1, epoch_step: 20, avg loss: 1.400776
learning rate: 0.000067




 49%|████▉     | 60/123 [00:47<00:48,  1.29it/s]



epoch: 1, epoch_step: 30, avg loss: 1.377988
learning rate: 0.000100




 65%|██████▌   | 80/123 [01:03<00:32,  1.32it/s]



epoch: 1, epoch_step: 40, avg loss: 1.403825
learning rate: 0.000133




 80%|████████  | 99/123 [01:18<00:18,  1.30it/s]



epoch: 1, epoch_step: 50, avg loss: 1.378702
learning rate: 0.000167



Evaluate:



Classification Accuracy: 0.312




 81%|████████▏ | 100/123 [01:22<00:39,  1.73s/it]



best val loss - epoch: 0, epoch_step: 50
maximum_f_score 0.3116883116883117 




 98%|█████████▊| 120/123 [01:38<00:02,  1.15it/s]



epoch: 1, epoch_step: 60, avg loss: 1.397753
learning rate: 0.000200




100%|██████████| 123/123 [01:40<00:00,  1.22it/s]
  3%|▎         | 4/123 [00:04<02:25,  1.22s/it]


KeyboardInterrupt: 

In [98]:
def predict(dev_input, cls_model_):
    # get evidence embeddings
    start_idx = 0
    cls_model.eval()

    cls_res = []
    correct_count = 0
    while start_idx < len(dev_input):
        end_idx = min(start_idx + 50, len(dev_input))

        cur_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, len(dev_input[0])).cuda()
        cur_pos = torch.LongTensor([list(range(len(dev_input[0]))) for _ in range(end_idx - start_idx)]).cuda()

        cur_res = cls_model_(cur_input, cur_pos)
        cur_res = torch.argmax(cur_res, 1).tolist()


        cls_res = cls_res + cur_res

        start_idx = end_idx
        del cur_input, cur_pos

    return cls_res

In [99]:
torch.cuda.empty_cache()

In [100]:
cls_model.load_state_dict(torch.load(os.path.join("model_ckpts", "best_cls_ckpt.bin")))

dev_classes = predict(dev_inputs, cls_model)
test_classes = predict(test_inputs, cls_model)

In [101]:
with open("data/dev_predict.json", "r") as f:
    pred_dev_claims = json.load(f)
with open("data/test-claims-unlabelled.json", "r") as f:
    pred_test_claims = json.load(f)

for i, j in zip(dev_claim_id, dev_classes):
    claim_label = id2labels[j]
    evidences = pred_dev_claims[i]['evidences']
    pred_dev_claims[i] = {'claim_text': pred_dev_claims[i]['claim_text'], 'claim_label': claim_label, 'evidences': evidences}

for i, j in zip(test_claim_id, test_classes):
    claim_label = id2labels[j]
    evidences = pred_test_claims[i]['evidences']
    pred_test_claims[i] = {'claim_text': pred_test_claims[i]['claim_text'], 'claim_label': claim_label, 'evidences': evidences}

with open("data/dev_predict.json", "w") as f:
    json.dump(pred_dev_claims, f)
with open("data/test-claims-unlabelled.json", "w") as f:
    json.dump(pred_test_claims, f)

In [102]:
print(Counter(dev_classes))

Counter({1: 123, 0: 30, 3: 1})


In [103]:
print(Counter(test_classes))

Counter({1: 122, 0: 29, 3: 2})


In [105]:
import subprocess

output = subprocess.check_output("python eval.py --predictions data/dev_predict.json --groundtruth data/dev-claims.json", shell=True)
output_str = output.decode('utf-8')

# Split the output into lines
output_lines = output_str.strip().split('\n')

# Format the output
formatted_lines = []
for line in output_lines:
    metric, value = line.split('=')
    metric = metric.strip()
    value = value.strip()
    formatted_line = f"{metric}: {value}"
    formatted_lines.append(formatted_line)

# Join the formatted lines into a single string
formatted_output = '\n'.join(formatted_lines)
print(formatted_output)

Evidence Retrieval F-score (F): 0.08067924139352713
Claim Classification Accuracy (A): 0.3116883116883117
Harmonic Mean of F and A: 0.1281796944764043


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*