In [16]:
import os
import json
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertTokenizer, BertModel
from transformers import logging

In [17]:
with open('/kaggle/input/2022-inlp-final/train.json') as f:
    train_data = json.load(f)
with open('/kaggle/input/2022-inlp-final/valid.json') as f:
    valid_data = json.load(f)
with open('/kaggle/input/2022-inlp-final/test.json') as f:
    test_data = json.load(f)
with open('/kaggle/input/2022-inlp-final/train_evidence_v3.json') as f:
    train_data_evidence = json.load(f)
with open('/kaggle/input/2022-inlp-final/valid_evidence_v3.json') as f:
    valid_data_evidence = json.load(f)
with open('/kaggle/input/2022-inlp-final/test_evidence_v3.json') as f:
    test_data_evidence = json.load(f)

    
print(f"Training Dataset Size: {len(train_data)}")
print(f"Validation Dataset Size: {len(valid_data)}")
print(f"Testing Dataset Size: {len(test_data)}")

Training Dataset Size: 16894
Validation Dataset Size: 2360
Testing Dataset Size: 2360


In [18]:
logging.set_verbosity_error()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

class_num = 3
article_num = 3
batch_size = 11
sequence_length = 512

num_epoch = 3
show_freq = 20
DEVICE = 'cuda'

def url_processing(url):
    url = url.replace('/', ' ')
    url = url.replace('.', ' ')
    url = url.replace('-', ' ')
    url = url.replace('?', ' ')
    url = url.replace('&', ' ')
    url = url.replace('=', ' ')
    url = url.replace('_', ' ')
    url = url.replace('%', ' ')
    url = url.replace('+', ' ')
    url = url.replace('#', ' ')
    return url

def get_dataloader(all_data, all_evidence, shuffle=True, mode="train"):
    
    all_input_ids = list()
    all_input_mask = list()
    all_segment_ids = list()
    all_label_ids = list()
    for data in tqdm(all_data):
        claim = data['metadata']['claim']
        file_id = str(data['metadata']['id'])

        if mode != "test":
            label = data['label']['rating']
        
        claimant = "" if data['metadata']['claimant'] is None else data['metadata']['claimant']
        
        premise_article_url = '/'.join(list(data['metadata']['premise_articles']))
        premise_article_url = url_processing(premise_article_url)
        
        evidence = "" if len(all_evidence[file_id]) == 0 else " . ".join(all_evidence[file_id])
        
        sentences = [claimant, premise_article_url, evidence]
        
        encoding = tokenizer(
            text = [claim] * article_num, 
            text_pair = sentences + [""] * (article_num - len(sentences)),
            return_tensors = "pt",
            padding = 'max_length',
            truncation = 'only_second',
            max_length = sequence_length
        )

        all_input_ids.append(encoding.input_ids.unsqueeze(0))
        all_input_mask.append(encoding.attention_mask.unsqueeze(0))
        all_segment_ids.append(encoding.token_type_ids.unsqueeze(0))
        
        if mode != "test":
            all_label_ids.append(torch.tensor(label).unsqueeze(0))

    all_input_ids = torch.cat(all_input_ids)
    all_input_mask = torch.cat(all_input_mask)
    all_segment_ids = torch.cat(all_segment_ids)
    
    if mode != "test":
        all_label_ids = torch.cat(all_label_ids)
    
    if mode != "test":
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
        
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    
    return loader

In [19]:
train_loader = get_dataloader(train_data, train_data_evidence, mode="train")
valid_loader = get_dataloader(valid_data, valid_data_evidence, mode="valid")

100%|██████████| 16894/16894 [02:46<00:00, 101.65it/s]
100%|██████████| 2360/2360 [00:24<00:00, 97.73it/s] 


In [20]:
class BertForFactChecking(nn.Module):
    def __init__(self):
        super(BertForFactChecking, self).__init__()
        
        self.bert = bert_model
        self.dropout = nn.Dropout()
        self.dense1 = nn.Linear(768, 300)
        self.dense2 = nn.Linear(300, 300)
        self.classifier = nn.Linear(300, class_num)

    def forward(self, input_ids, attention_mask, token_type_ids):
        
        input_ids = input_ids.view(-1, input_ids.size(-1))
        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.dense1(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.dense2(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, article_num, class_num)
        
        return reshaped_logits.sum(1)

In [25]:
model = BertForFactChecking()

for name, param in model.named_parameters():
    if name.startswith("bert.embeddings."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.0."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.1."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.2."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.3."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.4."):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer.5."):
        param.requires_grad = False

for name, param in model.named_parameters():
    print(name, param.requires_grad)
    
model = model.to(DEVICE)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [26]:
from sklearn.utils.class_weight import compute_class_weight

ylabels = []
for task in train_data:
    ylabels.append(task['label']['rating'])
ylabels = np.array(ylabels)
print(ylabels)

class_weights = compute_class_weight('balanced',classes=np.unique(ylabels),y=ylabels)
class_weights = torch.tensor(class_weights,dtype=torch.float)
class_weights = class_weights.to(DEVICE)
print(class_weights)

[0 1 1 ... 2 2 2]
tensor([0.8045, 0.8045, 1.9459], device='cuda:0')


In [27]:
loss = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

best_valid_f1_score = 0.0
for epoch in range(num_epoch):
    epoch_start_time = time.time()
    train_loss, valid_loss = 0.0, 0.0
    train_count, valid_count = 0.0, 0.0
    train_true, valid_true = list(), list()
    train_pred, valid_pred = list(), list()
    
    model.train()
    for i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(train_loader):
        input_ids = input_ids.to(DEVICE)
        input_mask = input_mask.to(DEVICE)
        segment_ids = segment_ids.to(DEVICE)
        label_ids = label_ids.to(DEVICE)
        
        outputs = model(
            input_ids=input_ids, 
            token_type_ids=segment_ids, 
            attention_mask=input_mask,
        )
        batch_loss = loss(outputs, label_ids)
        
        batch_loss.backward()
        optimizer.step()      
        model.zero_grad()
                
        train_loss += batch_loss.item()
        train_count += input_ids.shape[0]
        train_true += label_ids.tolist()
        train_pred += np.argmax(outputs.cpu().detach().numpy(), axis=1).tolist()
        
        if (i+1) % show_freq == 0 or (i+1) == len(train_loader):
            train_acc = accuracy_score(train_true, train_pred)
            train_f1_score = f1_score(train_true, train_pred, average='macro')
            print('[{:02d}/{:02d} - {:04d}/{:04d}] '.format(epoch+1, num_epoch, i+1, len(train_loader))
                + '{:2.2f} sec '.format(time.time() - epoch_start_time)
                + 'Train Acc: {:3.2f}% Loss: {:3.4f} '.format(train_acc*100, train_loss/train_count)
                + 'F1 Score: {:3.4f}'.format(train_f1_score)
            )
            
    model.eval()    
    for i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(valid_loader):
        input_ids = input_ids.to(DEVICE)
        input_mask = input_mask.to(DEVICE)
        segment_ids = segment_ids.to(DEVICE)
        label_ids = label_ids.to(DEVICE)
        
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids, 
                token_type_ids=segment_ids, 
                attention_mask=input_mask,
            )
            
        batch_loss = loss(outputs, label_ids)
        
        valid_loss += batch_loss.item()
        valid_count += input_ids.shape[0]
        valid_true += label_ids.tolist()
        valid_pred += np.argmax(outputs.cpu().detach().numpy(), axis=1).tolist()
        
        if (i+1) % show_freq == 0 or (i+1) == len(valid_loader):
            valid_acc = accuracy_score(valid_true, valid_pred)
            valid_f1_score = f1_score(valid_true, valid_pred, average='macro')
            print('[{:02d}/{:02d} - {:04d}/{:04d}] '.format(epoch+1, num_epoch, i+1, len(valid_loader))
                + '{:2.2f} sec '.format(time.time() - epoch_start_time)
                + 'Valid Acc: {:3.2f}% Loss: {:3.4f} '.format(valid_acc*100, valid_loss/valid_count)
                + 'F1 Score: {:3.4f}'.format(valid_f1_score)
            )
            
    if best_valid_f1_score < valid_f1_score:
        best_valid_f1_score = valid_f1_score
        torch.save(model.state_dict(), 'bert_weight.pth')
        
print(f"Best Validation F1 Score: {best_valid_f1_score}")
torch.cuda.empty_cache()

[01/03 - 0020/1536] 22.01 sec Train Acc: 42.73% Loss: 0.0975 F1 Score: 0.3586
[01/03 - 0040/1536] 43.98 sec Train Acc: 52.73% Loss: 0.0899 F1 Score: 0.4556
[01/03 - 0060/1536] 66.00 sec Train Acc: 54.70% Loss: 0.0894 F1 Score: 0.4815
[01/03 - 0080/1536] 87.98 sec Train Acc: 55.11% Loss: 0.0877 F1 Score: 0.5015
[01/03 - 0100/1536] 109.98 sec Train Acc: 52.82% Loss: 0.0872 F1 Score: 0.5020
[01/03 - 0120/1536] 131.96 sec Train Acc: 51.74% Loss: 0.0872 F1 Score: 0.4990
[01/03 - 0140/1536] 153.93 sec Train Acc: 51.56% Loss: 0.0873 F1 Score: 0.4944
[01/03 - 0160/1536] 175.93 sec Train Acc: 51.70% Loss: 0.0866 F1 Score: 0.4981
[01/03 - 0180/1536] 197.93 sec Train Acc: 51.57% Loss: 0.0862 F1 Score: 0.4968
[01/03 - 0200/1536] 219.91 sec Train Acc: 51.09% Loss: 0.0868 F1 Score: 0.4925
[01/03 - 0220/1536] 241.91 sec Train Acc: 51.57% Loss: 0.0866 F1 Score: 0.4960
[01/03 - 0240/1536] 263.90 sec Train Acc: 52.20% Loss: 0.0860 F1 Score: 0.5030
[01/03 - 0260/1536] 285.88 sec Train Acc: 52.06% Loss: 0

In [29]:
test_loader = get_dataloader(test_data, test_data_evidence, shuffle=False, mode="test")

100%|██████████| 2360/2360 [00:23<00:00, 100.31it/s]


In [30]:
model = BertForFactChecking()
model = model.to(DEVICE)
model.load_state_dict(torch.load('bert_weight.pth'))
         
test_pred = list()
for (input_ids, input_mask, segment_ids) in tqdm(test_loader):
    input_ids = input_ids.to(DEVICE)
    input_mask = input_mask.to(DEVICE)
    segment_ids = segment_ids.to(DEVICE)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids, 
            token_type_ids=segment_ids, 
            attention_mask=input_mask,
        )

    test_pred += np.argmax(outputs.cpu().detach().numpy(), axis=1).tolist()

100%|██████████| 215/215 [01:55<00:00,  1.87it/s]


In [31]:
df = pd.DataFrame({
    'id': [data['metadata']['id'] for data in test_data],
    'rating': test_pred
})

df.to_csv('submission.csv', index=False)