In [1]:
import numpy as np
import pandas as pd

from utils import preprocess
import model
from model.model import OlidCnnNet

from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"

In [4]:
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')

In [5]:
%%time

tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())

CPU times: total: 1min 22s
Wall time: 1min 23s


In [135]:
tweet_normalized = preprocess.spacy_normalize(tweet_doc, stop_removal=False, lemmatized=False)
tweet_normalized = preprocess.remove_user_mask(tweet_normalized)

In [136]:
vocabulary, documents_padded = preprocess.transform_word_to_vector(tweet_normalized, num_vocab=15000, num_tokens=30)

# sub a

In [137]:
label2id = {'NOT': 0, 'OFF': 1}

In [138]:
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

In [139]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(documents_padded, sub_a_label, test_size=0.2, random_state=5246)

In [149]:
train_dataset = TensorDataset(sentences_train, torch.tensor(labels_train, dtype=torch.float32))
test_dataset = TensorDataset(sentences_test, torch.tensor(labels_test, dtype=torch.float32))

In [150]:
learning_rate = 1e-3
batch_size = 256
epochs = 50

In [151]:
seq_len = documents_padded.size(1)
num_vocab = documents_padded.max().item() + 1
model = OlidCnnNet(seq_len=seq_len, num_vocab=num_vocab, embedding_size=300, conv_out=8)

In [152]:
model

OlidCnnNet(
  (embedding): Embedding(15004, 300, padding_idx=0)
  (conv_1): Conv2d(1, 8, kernel_size=(2, 300), stride=(1, 1))
  (conv_2): Conv2d(1, 8, kernel_size=(3, 300), stride=(1, 1))
  (conv_3): Conv2d(1, 8, kernel_size=(4, 300), stride=(1, 1))
  (pool_1): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
  (pool_2): MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0, dilation=1, ceil_mode=False)
  (pool_3): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=328, out_features=1, bias=True)
)

In [153]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
criterion = nn.BCELoss()

In [154]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [155]:
def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()
    num_batches = len(dataloader)
    train_loss = 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    train_loss /= num_batches
    
    # train accuracy
    model.eval()
    correct = 0
    size = len(dataloader.dataset)
    with torch.no_grad():
        for X, y in dataloader:
            pred = model.inference(X)
            correct += (pred == y).type(torch.float).sum().item()
    correct /= size
    print(f"Train Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f}")
    return train_loss, correct


def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (torch.where(pred > 0.5, 1., 0.) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test  Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, correct

In [156]:
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []
for t in range(epochs):
    print(f"Epoch {t+1}\n----------------------------------")
    train_loss, train_acc = train_loop(train_dataloader, model, criterion, optimizer)
    test_loss, test_acc = test_loop(test_dataloader, model, criterion)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    #scheduler.step()
print("Training Done!")

Epoch 1
----------------------------------
Train Error: 
 Accuracy: 68.0%, Avg loss: 0.650657
Test  Error: 
 Accuracy: 65.8%, Avg loss: 0.635321 

Epoch 2
----------------------------------
Train Error: 
 Accuracy: 70.7%, Avg loss: 0.609450
Test  Error: 
 Accuracy: 65.9%, Avg loss: 0.628140 

Epoch 3
----------------------------------
Train Error: 
 Accuracy: 74.6%, Avg loss: 0.582941
Test  Error: 
 Accuracy: 66.7%, Avg loss: 0.619809 

Epoch 4
----------------------------------
Train Error: 
 Accuracy: 76.3%, Avg loss: 0.548442
Test  Error: 
 Accuracy: 68.3%, Avg loss: 0.615458 

Epoch 5
----------------------------------
Train Error: 
 Accuracy: 82.3%, Avg loss: 0.509792
Test  Error: 
 Accuracy: 68.1%, Avg loss: 0.605996 

Epoch 6
----------------------------------
Train Error: 
 Accuracy: 84.9%, Avg loss: 0.465686
Test  Error: 
 Accuracy: 69.6%, Avg loss: 0.600160 

Epoch 7
----------------------------------
Train Error: 
 Accuracy: 87.8%, Avg loss: 0.427755
Test  Error: 
 Accuracy:

In [157]:
pred = model.inference(sentences_test)
print(metrics.classification_report(pred.detach(), labels_test, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.84      0.77      0.80      1894
         OFF       0.52      0.62      0.57       754

    accuracy                           0.73      2648
   macro avg       0.68      0.70      0.69      2648
weighted avg       0.75      0.73      0.74      2648

