In [1]:
import numpy as np
import pandas as pd

from utils import preprocess
import model
from model.model import OlidCnnNet

from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"

In [4]:
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')

In [5]:
%%time

tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())

CPU times: total: 1min 15s
Wall time: 1min 15s


In [6]:
tweet_normalized = preprocess.spacy_normalize(tweet_doc, stop_removal=False, lemmatized=False)
tweet_normalized = preprocess.remove_user_mask(tweet_normalized)

In [7]:
vocabulary, documents_padded = preprocess.transform_word_to_vector(tweet_normalized, num_vocab=10000, num_tokens=30)

# sub a

In [8]:
label2id = {'NOT': 0, 'OFF': 1}

In [9]:
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

In [10]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(documents_padded, sub_a_label, test_size=0.2, random_state=5246)

In [11]:
train_dataset = TensorDataset(sentences_train, torch.tensor(labels_train, dtype=torch.float32))
test_dataset = TensorDataset(sentences_test, torch.tensor(labels_test, dtype=torch.float32))

In [12]:
learning_rate = 1e-3
batch_size = 64
epochs = 50

In [13]:
seq_len = documents_padded.size(1)
num_vocab = documents_padded.max().item() + 1
model = OlidCnnNet(seq_len=seq_len, num_vocab=num_vocab, embedding_size=16, conv_out=8)

In [14]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
criterion = nn.BCELoss()

In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [16]:
def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()
    num_batches = len(dataloader)
    train_loss = 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    train_loss /= num_batches
    
    # train accuracy
    model.eval()
    correct = 0
    size = len(dataloader.dataset)
    with torch.no_grad():
        for X, y in dataloader:
            pred = model.inference(X)
            correct += (pred == y).type(torch.float).sum().item()
    correct /= size
    print(f"Train Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f}")
    return train_loss, correct


def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (torch.where(pred > 0.5, 1., 0.) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test  Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, correct

In [17]:
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []
for t in range(epochs):
    print(f"Epoch {t+1}\n----------------------------------")
    train_loss, train_acc = train_loop(train_dataloader, model, criterion, optimizer)
    test_loss, test_acc = test_loop(test_dataloader, model, criterion)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    #scheduler.step()
print("Training Done!")

Epoch 1
----------------------------------
Train Error: 
 Accuracy: 66.9%, Avg loss: 0.651658
Test  Error: 
 Accuracy: 66.1%, Avg loss: 0.645647 

Epoch 2
----------------------------------
Train Error: 
 Accuracy: 66.9%, Avg loss: 0.635288
Test  Error: 
 Accuracy: 66.0%, Avg loss: 0.644342 

Epoch 3
----------------------------------
Train Error: 
 Accuracy: 67.0%, Avg loss: 0.626707
Test  Error: 
 Accuracy: 66.0%, Avg loss: 0.642795 

Epoch 4
----------------------------------
Train Error: 
 Accuracy: 67.4%, Avg loss: 0.623462
Test  Error: 
 Accuracy: 65.9%, Avg loss: 0.638567 

Epoch 5
----------------------------------
Train Error: 
 Accuracy: 68.0%, Avg loss: 0.617418
Test  Error: 
 Accuracy: 65.6%, Avg loss: 0.637552 

Epoch 6
----------------------------------
Train Error: 
 Accuracy: 68.3%, Avg loss: 0.611374
Test  Error: 
 Accuracy: 65.5%, Avg loss: 0.636533 

Epoch 7
----------------------------------
Train Error: 
 Accuracy: 68.9%, Avg loss: 0.602513
Test  Error: 
 Accuracy:

In [18]:
pred = model.inference(sentences_test)
print(metrics.classification_report(pred.detach(), labels_test, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.82      0.77      0.79      1856
         OFF       0.53      0.60      0.56       792

    accuracy                           0.72      2648
   macro avg       0.67      0.69      0.68      2648
weighted avg       0.73      0.72      0.73      2648

