# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

**We use keras, tensorflow, nltk, scikit-learn in this project.**

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## PreProcess for evidence and claims

### read files

In [1]:
import json

dev_cls_data = json.load(open("dev_cls_data.json", "r"))
test_cls_data = json.load(open("test_cls_data.json", "r"))

dev_ids = json.load(open("temp_data/dev_ids.json", "r"))
test_ids = json.load(open("temp_data/test_ids.json", "r"))

train_text_idx = json.load(open("temp_data/train_text_idx.json", "r"))
evidences_text_idx = json.load(open("temp_data/evidences_text_idx.json", "r"))

text_max_len = 60
evidence_max_len = 100
all_max_len = 580
retrieval_num = 5

id2labels = ["SUPPORTS", "NOT_ENOUGH_INFO", "REFUTES", "DISPUTED"]
labels2id = {"SUPPORTS": 0, "NOT_ENOUGH_INFO": 1, "REFUTES": 2, "DISPUTED": 3}

train_labels = json.load(open("temp_data/train_labels.json", "r"))
train_evidences = json.load(open("temp_data/train_evidences.json", "r"))

idx2word = json.load(open("temp_data/idx2word.json", "r"))
word2idx = json.load(open("temp_data/word2idx.json", "r"))

train_negative_evidences = json.load(open("pred_train_negative_evidences.json", "r"))
evidences_id_dict = json.load(open("temp_data/evidences_id_dict.json", "r"))


In [2]:
from torch.utils.data import Dataset
import random

class TrainDataset(Dataset):
    def __init__(self, text_data, evidence_data, positive_evidences, negative_evidences, cls_label, cls_idx, sep_idx, pad_idx, evidence_num=5):
        self.text_data = text_data
        self.evidence_data = evidence_data
        
        self.negative_evidences = negative_evidences

        self.cls_label = [labels2id[i] for i in cls_label]
        self.evidence_num = evidence_num
        self.positive_evidences = positive_evidences
        
        self.cls_idx = cls_idx
        self.sep_idx = sep_idx
        self.pad_idx = pad_idx
        
    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        return [self.text_data[idx][:text_max_len], self.positive_evidences[idx], self.negative_evidences[idx], self.cls_label[idx]]

    def collate_fn(self, batch):
        queries = []
        queries_pos = []
        labels = []
        
        for i, j, h, k in batch:
            temp_text = [self.cls_idx]
            temp_text.extend(i)
            for p in j:
                temp_text.append(self.sep_idx)
                temp_text.extend(self.evidence_data[p][:evidence_max_len])
            if self.evidence_num > len(j):
                n = random.sample(h, self.evidence_num - len(j))
                for p in n:
                    temp_text.append(self.sep_idx)
                    temp_text.extend(self.evidence_data[p][:evidence_max_len])
            temp_text.append(self.sep_idx)
            if len(temp_text) < all_max_len:
                temp_text.extend([self.pad_idx] * (all_max_len - len(temp_text)))
                
            queries.append(temp_text)
            queries_pos.append(list(range(all_max_len)))
            labels.append(k)    

        batch_encoding = {}
        batch_encoding["queries"] = torch.LongTensor(queries)        
        batch_encoding["queries_pos"] = torch.LongTensor(queries_pos)
        batch_encoding["labels"] = torch.LongTensor(labels)
        
        return batch_encoding

In [3]:
dev_inputs = [i['text'] for i in dev_cls_data]
test_inputs = [i['text'] for i in test_cls_data]
dev_outputs = [labels2id[i["label"]] for i in dev_cls_data]

In [4]:
train_set = TrainDataset(train_text_idx, evidences_text_idx, train_evidences, train_negative_evidences, train_labels, word2idx["<cls>"], word2idx["<sep>"], word2idx["<pad>"], evidence_num=retrieval_num)
from torch.utils.data import DataLoader

dataloader = DataLoader(train_set, batch_size=10, shuffle=True, num_workers=4, collate_fn=train_set.collate_fn)

In [5]:
from collections import Counter
print(Counter(train_labels))

Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [6]:
# from workshop
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class CLS(nn.Module):
    def __init__(self, vocab_emb, embed_dim, hidden_size, output_size, nhead, num_layers, max_position=all_max_len):
        super(CLS, self).__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_emb, embed_dim)
        self.pos_embedding = nn.Embedding(max_position, embed_dim)
        
        # encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, batch_first=True)
        # self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, norm=nn.LayerNorm(hidden_size))
        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=2)
        self.hidden_layer = nn.Linear(hidden_size, hidden_size // 2)
        self.cls = nn.Linear(hidden_size // 2, output_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, text_data, position_text):
        mask_ = text_data == 0
        # print(text_data.size(), position_text.size())
        text_x = self.embedding(text_data) + self.pos_embedding(position_text) * 0.01
        #x_encoded = self.encoder(text_x, src_key_padding_mask=mask_)
        x_encoded,_ = self.encoder(text_x)
        x_cls = x_encoded[:, 0, :]
        x_hidden = F.tanh(self.hidden_layer(x_cls))
        self.dropout(x_hidden)
        cls_res = self.cls(x_hidden)
        return cls_res


In [7]:
cls_model = CLS(vocab_emb=len(idx2word), embed_dim=512, hidden_size=512, output_size=4, nhead=8, num_layers=6, max_position=700)
cls_model.cuda()

CLS(
  (embedding): Embedding(86627, 512)
  (pos_embedding): Embedding(700, 512)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (hidden_layer): Linear(in_features=512, out_features=256, bias=True)
  (cls): Linear(in_features=256, out_features=4, bias=True)
  (dropout): Dr

### Training

In [8]:
# from workshop but need to change because I add some speciall setting

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
random.seed(42)

encoder_optimizer = optim.Adam(cls_model.parameters())
max_lr = 1e-3
for param_group in encoder_optimizer.param_groups:
    param_group['lr'] = max_lr
accumulate_step = 2
grad_norm = 4
warmup_steps = 300
report_freq = 10
eval_interval = 50
save_dir = "model_ckpts"

In [9]:
def validate(dev_input, dev_output, cls_model_):
    # get evidence embeddings
    start_idx = 0
    batch_size = 50
    pos_len = len(dev_input[0])
    cls_model.eval()

    acc = []
    correct_count = 0
    while start_idx < len(dev_output):
        end_idx = min(start_idx + batch_size, len(dev_output))
        
        cur_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, pos_len).cuda()
        cur_pos = torch.LongTensor([list(range(pos_len)) for _ in range(end_idx - start_idx)]).cuda()

        cur_res = cls_model_(cur_input, cur_pos)
        cur_res = torch.argmax(cur_res, 1).tolist()
        
        del cur_input, cur_pos
        
        for i, j in zip(cur_res, dev_output[start_idx: end_idx]):
            if i == j:
                correct_count += 1
        
        start_idx = end_idx
    acc = correct_count / len(dev_output)
    print("\n")
    print("Classification Accuracy: %.3f" % acc)
    print("\n")
    
    cls_model.train()
    return acc

In [10]:
%env WANDB_NOTEBOOK_NAME MelMoxue_NLP_CLS.ipynb

env: WANDB_NOTEBOOK_NAME=MelMoxue_NLP_CLS.ipynb


In [11]:
# start training
import wandb
import os
wandb.init(project="nlp", name="cls")

from tqdm import tqdm
import numpy as np

encoder_optimizer.zero_grad()
step_cnt = 0
all_step_cnt = 0
avg_loss = 0
maximum_f_score = 0
ce_fn = nn.CrossEntropyLoss(torch.FloatTensor([0.2, 0.3, 0.5, 1.]).cuda())

for epoch in range(5): 
    epoch_step = 0

    for (i, batch) in enumerate(tqdm(dataloader)):
        
        step_cnt += 1
        
        # forward pass
            
        cur_res = cls_model(batch["queries"].cuda(), batch["queries_pos"].cuda())

        loss = ce_fn(cur_res, batch["labels"].cuda())
        loss = loss / accumulate_step
        loss.backward()

        avg_loss += loss.item()
        if step_cnt == accumulate_step:
            # updating
            if grad_norm > 0:
                nn.utils.clip_grad_norm_(cls_model.parameters(), grad_norm)

            step_cnt = 0
            epoch_step += 1
            all_step_cnt += 1
            
            # adjust learning rate
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6
                
            encoder_optimizer.step()
            encoder_optimizer.zero_grad()
        
        if all_step_cnt % report_freq == 0 and step_cnt == 0:
            if all_step_cnt <= warmup_steps:
                lr = all_step_cnt * (max_lr - 2e-8) / warmup_steps + 2e-8
            else:
                lr = max_lr - (all_step_cnt - warmup_steps) * 1e-6

            wandb.log({"learning_rate": lr}, step=all_step_cnt)
            wandb.log({"loss": avg_loss / report_freq}, step=all_step_cnt)
            
            # report stats
            print("\n")
            print("epoch: %d, epoch_step: %d, avg loss: %.6f" % (epoch + 1, epoch_step, avg_loss / report_freq))
            print(f"learning rate: {lr:.6f}")
            print("\n")

            avg_loss = 0
        del loss, cur_res

        if all_step_cnt % eval_interval == 0 and all_step_cnt != 0 and step_cnt == 0:
            # evaluate the model as a scorer
            print("\nEvaluate:\n")
            
            f_score = validate(dev_inputs, dev_outputs, cls_model)
            wandb.log({"acc": f_score}, step=all_step_cnt)

            if f_score > maximum_f_score:
                maximum_f_score = f_score
                torch.save(cls_model.state_dict(), os.path.join(save_dir, "best_cls_ckpt.bin"))
                # torch.save(last_evidence_embeddings, os.path.join(save_dir, "evidence_embeddings"))
                print("\n")
                print("best val loss - epoch: %d, epoch_step: %d" % (epoch, epoch_step))
                print("maximum_f_score", f_score)
                print("\n")

[34m[1mwandb[0m: Currently logged in as: [33mbruce[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112690433381228, max=1.0…

 18%|█▊        | 22/123 [00:01<00:04, 22.13it/s]



epoch: 1, epoch_step: 10, avg loss: 1.916283
learning rate: 0.000033


 35%|███▍      | 43/123 [00:02<00:03, 23.82it/s]



epoch: 1, epoch_step: 20, avg loss: 1.431737
learning rate: 0.000067


 52%|█████▏    | 64/123 [00:03<00:02, 23.89it/s]



epoch: 1, epoch_step: 30, avg loss: 1.427802
learning rate: 0.000100


 67%|██████▋   | 82/123 [00:03<00:01, 23.94it/s]



epoch: 1, epoch_step: 40, avg loss: 1.413639
learning rate: 0.000133


 79%|███████▉  | 97/123 [00:04<00:01, 23.99it/s]



epoch: 1, epoch_step: 50, avg loss: 1.413316
learning rate: 0.000167



Evaluate:



Classification Accuracy: 0.117


 84%|████████▎ | 103/123 [00:05<00:01, 10.78it/s]



best val loss - epoch: 0, epoch_step: 50
maximum_f_score 0.11688311688311688


100%|██████████| 123/123 [00:06<00:00, 19.32it/s]




epoch: 1, epoch_step: 60, avg loss: 1.445513
learning rate: 0.000200


 15%|█▌        | 19/123 [00:01<00:04, 22.51it/s]



epoch: 2, epoch_step: 9, avg loss: 1.421385
learning rate: 0.000233


 33%|███▎      | 40/123 [00:01<00:03, 23.87it/s]



epoch: 2, epoch_step: 19, avg loss: 1.422743
learning rate: 0.000267


 50%|████▉     | 61/123 [00:02<00:02, 23.86it/s]



epoch: 2, epoch_step: 29, avg loss: 1.418498
learning rate: 0.000300


 62%|██████▏   | 76/123 [00:03<00:01, 23.98it/s]



epoch: 2, epoch_step: 39, avg loss: 1.416530
learning rate: 0.000333



Evaluate:



Classification Accuracy: 0.266


 64%|██████▍   | 79/123 [00:04<00:04,  9.71it/s]



best val loss - epoch: 1, epoch_step: 39
maximum_f_score 0.2662337662337662


 81%|████████▏ | 100/123 [00:04<00:01, 21.41it/s]



epoch: 2, epoch_step: 49, avg loss: 1.414500
learning rate: 0.000367


 98%|█████████▊| 121/123 [00:05<00:00, 23.59it/s]



epoch: 2, epoch_step: 59, avg loss: 1.385892
learning rate: 0.000400


100%|██████████| 123/123 [00:06<00:00, 20.47it/s]
 13%|█▎        | 16/123 [00:00<00:04, 21.53it/s]



epoch: 3, epoch_step: 7, avg loss: 1.371151
learning rate: 0.000433


 30%|███       | 37/123 [00:01<00:03, 23.76it/s]



epoch: 3, epoch_step: 17, avg loss: 1.388361
learning rate: 0.000467


 42%|████▏     | 52/123 [00:02<00:02, 23.82it/s]



epoch: 3, epoch_step: 27, avg loss: 1.480666
learning rate: 0.000500



Evaluate:



Classification Accuracy: 0.266


 62%|██████▏   | 76/123 [00:03<00:02, 22.90it/s]



epoch: 3, epoch_step: 37, avg loss: 1.394980
learning rate: 0.000533


 79%|███████▉  | 97/123 [00:04<00:01, 23.80it/s]



epoch: 3, epoch_step: 47, avg loss: 1.391847
learning rate: 0.000567


 96%|█████████▌| 118/123 [00:05<00:00, 23.81it/s]



epoch: 3, epoch_step: 57, avg loss: 1.401002
learning rate: 0.000600


100%|██████████| 123/123 [00:05<00:00, 21.81it/s]
 11%|█         | 13/123 [00:00<00:05, 20.74it/s]



epoch: 4, epoch_step: 6, avg loss: 1.431069
learning rate: 0.000633


 23%|██▎       | 28/123 [00:01<00:04, 23.37it/s]



epoch: 4, epoch_step: 16, avg loss: 1.388191
learning rate: 0.000667



Evaluate:


 28%|██▊       | 34/123 [00:01<00:05, 17.66it/s]



Classification Accuracy: 0.266


 45%|████▍     | 55/123 [00:02<00:02, 23.11it/s]



epoch: 4, epoch_step: 26, avg loss: 1.413971
learning rate: 0.000700


 59%|█████▉    | 73/123 [00:03<00:02, 23.65it/s]



epoch: 4, epoch_step: 36, avg loss: 1.395924
learning rate: 0.000733


 76%|███████▋  | 94/123 [00:04<00:01, 23.82it/s]



epoch: 4, epoch_step: 46, avg loss: 1.422175
learning rate: 0.000767


 93%|█████████▎| 115/123 [00:05<00:00, 23.76it/s]



epoch: 4, epoch_step: 56, avg loss: 1.428711
learning rate: 0.000800


100%|██████████| 123/123 [00:05<00:00, 21.87it/s]
  6%|▌         | 7/123 [00:00<00:07, 15.75it/s]



epoch: 5, epoch_step: 4, avg loss: 1.431863
learning rate: 0.000833



Evaluate:



Classification Accuracy: 0.117


 24%|██▍       | 30/123 [00:01<00:04, 22.60it/s]



epoch: 5, epoch_step: 14, avg loss: 1.440198
learning rate: 0.000867


 41%|████▏     | 51/123 [00:02<00:03, 23.93it/s]



epoch: 5, epoch_step: 24, avg loss: 1.398574
learning rate: 0.000900


 59%|█████▊    | 72/123 [00:03<00:02, 23.93it/s]



epoch: 5, epoch_step: 34, avg loss: 1.424734
learning rate: 0.000933


 73%|███████▎  | 90/123 [00:04<00:01, 23.91it/s]



epoch: 5, epoch_step: 44, avg loss: 1.405719
learning rate: 0.000967


 85%|████████▌ | 105/123 [00:04<00:00, 24.00it/s]



epoch: 5, epoch_step: 54, avg loss: 1.447049
learning rate: 0.001000



Evaluate:


 90%|█████████ | 111/123 [00:05<00:00, 18.00it/s]



Classification Accuracy: 0.117


100%|██████████| 123/123 [00:05<00:00, 21.10it/s]


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [12]:
def predict(dev_input, cls_model_):
    # get evidence embeddings
    start_idx = 0
    batch_size = 50
    pos_len = len(dev_input[0])
    cls_model.eval()

    cls_res = []
    correct_count = 0
    while start_idx < len(dev_input):
        end_idx = min(start_idx + batch_size, len(dev_input))
        
        cur_input = torch.LongTensor(dev_input[start_idx:end_idx]).view(-1, pos_len).cuda()
        cur_pos = torch.LongTensor([list(range(pos_len)) for _ in range(end_idx - start_idx)]).cuda()

        cur_res = cls_model_(cur_input, cur_pos)
        cur_res = torch.argmax(cur_res, 1).tolist()
        
        del cur_input, cur_pos
        
        cls_res.extend(cur_res)
        
        start_idx = end_idx

    return cls_res

In [13]:
torch.cuda.empty_cache()

In [14]:
import os
cls_model.load_state_dict(torch.load(os.path.join(save_dir, "best_cls_ckpt.bin")))

dev_classes = predict(dev_inputs, cls_model)
test_classes = predict(test_inputs, cls_model)

In [15]:
pred_dev_claims = json.load(open("pred_dev_claims_retrieval.json", "r"))
pred_test_claims = json.load(open("pred_test_claims_retrieval.json", "r"))

for i, j in zip(dev_ids, dev_classes):
    pred_dev_claims[i]['claim_label'] = id2labels[j]

for i, j in zip(test_ids, test_classes):
    pred_test_claims[i]['claim_label'] = id2labels[j]
    

In [16]:
## save cls data
json.dump(pred_dev_claims, open("pred_dev_claims.json", "w"))
json.dump(pred_test_claims, open("pred_test_claims.json", "w"))

In [17]:
from collections import Counter
print(Counter(dev_classes))

Counter({1: 154})


In [18]:
print(Counter(test_classes))

Counter({1: 153})


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [14]:
# python eval.py --predictions pred_dev_claims.json --groundtruth data/dev-claims.json