In [1]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

import random
from tqdm import tqdm
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from spacy.lang.en import stop_words
from string import punctuation
import emoji
from utils import preprocess

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed_all(seed)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')
ori_train_data.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [24]:
def compute_label(a, b, c):
    if a == 'NOT':
        return a
    elif b == 'UNT':
        return b
    else:
        return c

In [25]:
ori_train_data['label'] = ori_train_data.apply(lambda x: compute_label(x.subtask_a, x.subtask_b, x.subtask_c), axis=1)
count = ori_train_data.groupby('label').size().to_dict()
stop_words = stop_words.STOP_WORDS
stop_words = stop_words.union(set(emoji.get_emoji_unicode_dict('en').values()))
stop_words.add("url")
stop_words = stop_words.union(set(punctuation))

In [26]:
def tweet_tokenization(doc):
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, match_phone_numbers=False)
    tweet_tokenize = [tweet_tokenizer.tokenize(tweet) for tweet in doc]
    return tweet_tokenize

def stop_word_removal(doc, stop_words):
    new_tweets = []
    for t in doc:
        new_words = []
        for word in t:
            if word not in stop_words:
                new_words.append(word)
        new_tweets.append(new_words)

    return new_tweets

def hashtag_removal(doc):
    new_docs = []
    for t in doc:
        new_doc=[]
        for word in t:
            if word[0] == "#":
                new_doc.append(word[1:])
            else:
                new_doc.append(word)
        new_docs.append(new_doc)
    
    return new_docs

tweets = ori_train_data["tweet"]
tweet_tokenization = tweet_tokenization(tweets)
tweet_remove_stop_word = stop_word_removal(tweet_tokenization, stop_words)
tweets_remove = hashtag_removal(tweet_remove_stop_word)
tweets_remove = [" ".join(t) for t in tweets_remove]

## Subtask A

In [31]:
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets_remove, sub_a_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


### Pad 0

In [8]:
class TextDataset(Dataset):
    def __init__(self, sentences, labels, vectorizer):
        self.sentences = sentences
        self.labels = labels
        self.vectorizer = vectorizer
        self.sequences = self.vectorizer.transform(sentences).toarray()

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])


class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()
        c0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()

        x = x.view(batch_size, 1, -1)

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(hn[-1])
        return out

vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(sentences_train)

input_dim = X_train.shape[1]
hidden_dim = 100
output_dim = 2
lr = 0.01
num_epochs = 10


model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_dataset = TextDataset(sentences_train, y_train, vectorizer)
test_dataset = TextDataset(sentences_test, y_test, vectorizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

for epoch in range(num_epochs):
    hn, cn = None, None
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        if hn is not None and cn is not None:
            hn, cn = hn.detach(), cn.detach()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        hn, cn = outputs[1][0].detach(), outputs[1][1].detach()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

hn, cn = None, None

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred = []
    for i, (inputs, labels) in enumerate(test_loader):

        outputs = model(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

Epoch [1/10], Step [100/331], Loss: 0.7050
Epoch [1/10], Step [200/331], Loss: 0.5693
Epoch [1/10], Step [300/331], Loss: 0.4136
Epoch [2/10], Step [100/331], Loss: 0.2369
Epoch [2/10], Step [200/331], Loss: 0.1267
Epoch [2/10], Step [300/331], Loss: 0.2413
Epoch [3/10], Step [100/331], Loss: 0.1408
Epoch [3/10], Step [200/331], Loss: 0.0646
Epoch [3/10], Step [300/331], Loss: 0.0280
Epoch [4/10], Step [100/331], Loss: 0.0337
Epoch [4/10], Step [200/331], Loss: 0.0106
Epoch [4/10], Step [300/331], Loss: 0.0150
Epoch [5/10], Step [100/331], Loss: 0.0999
Epoch [5/10], Step [200/331], Loss: 0.0177
Epoch [5/10], Step [300/331], Loss: 0.0711
Epoch [6/10], Step [100/331], Loss: 0.0022
Epoch [6/10], Step [200/331], Loss: 0.0011
Epoch [6/10], Step [300/331], Loss: 0.0768
Epoch [7/10], Step [100/331], Loss: 0.0086
Epoch [7/10], Step [200/331], Loss: 0.0053
Epoch [7/10], Step [300/331], Loss: 0.0156
Epoch [8/10], Step [100/331], Loss: 0.0016
Epoch [8/10], Step [200/331], Loss: 0.0011
Epoch [8/10

### Pack Sequence

In [37]:
def collate_fn(batch):
    batch.sort(key=lambda x: x[2], reverse=True)
    sequences, labels, lengths = zip(*batch)
    sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    return sequences, torch.tensor(labels), torch.tensor(lengths)

class TextDataset(Dataset):
    def __init__(self, sentences, labels, vectorizer, input_dim):
        self.sentences = sentences
        self.labels = labels
        self.vectorizer = vectorizer
        self.sequences = [torch.tensor(vectorizer.transform([sentence]).toarray()).reshape(-1, input_dim) for sentence in sentences]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sequences[idx], torch.tensor(self.labels[idx]), len(self.sequences[idx])


class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        batch_size = x.size(0)
        h0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()
        c0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()

        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.fc(hn[-1])
        return out

    
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(sentences_train)

input_dim = X_train.shape[1]
hidden_dim = 100
output_dim = 2
lr = 0.001
num_epochs = 3


model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


train_dataset = TextDataset(sentences_train, y_train, vectorizer, input_dim)
test_dataset = TextDataset(sentences_test, y_test, vectorizer, input_dim)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

for epoch in range(num_epochs):
    for i, (inputs, labels, lengths) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.float(), lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred = []
    for i, (inputs, labels, lengths) in enumerate(test_loader):
        outputs = model(inputs.float(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

Epoch [1/3], Step [100/331], Loss: 0.5573
Epoch [1/3], Step [200/331], Loss: 0.5935
Epoch [1/3], Step [300/331], Loss: 0.4664
Epoch [2/3], Step [100/331], Loss: 0.3889
Epoch [2/3], Step [200/331], Loss: 0.4775
Epoch [2/3], Step [300/331], Loss: 0.2932
Epoch [3/3], Step [100/331], Loss: 0.2257
Epoch [3/3], Step [200/331], Loss: 0.2169
Epoch [3/3], Step [300/331], Loss: 0.1594
Test Accuracy: 73.75%
              precision    recall  f1-score   support

         NOT       0.78      0.85      0.81      1750
         OFF       0.64      0.52      0.57       898

    accuracy                           0.74      2648
   macro avg       0.71      0.69      0.69      2648
weighted avg       0.73      0.74      0.73      2648



## Glove Embedding

In [3]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"
TEST_A_DATA_FILE = DATA_DIR + "testset-levela.tsv"
LABEL_A_DATA_FILE = DATA_DIR + "labels-levela.csv"
TEST_B_DATA_FILE = DATA_DIR + "testset-levelb.tsv"
LABEL_B_DATA_FILE = DATA_DIR + "labels-levelb.csv"
TEST_C_DATA_FILE = DATA_DIR + "testset-levelc.tsv"
LABEL_C_DATA_FILE = DATA_DIR + "labels-levelc.csv"

ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')

task_a_test_data = pd.read_csv(TEST_A_DATA_FILE, sep='\t')
task_a_test_label = pd.read_csv(LABEL_A_DATA_FILE, usecols=[1], names=['labela'])

task_b_test_data = pd.read_csv(TEST_B_DATA_FILE, sep='\t')
task_b_test_label = pd.read_csv(LABEL_B_DATA_FILE, usecols=[1], names=['labelb'])

task_c_test_data = pd.read_csv(TEST_C_DATA_FILE, sep='\t')
task_c_test_label = pd.read_csv(LABEL_C_DATA_FILE, usecols=[1], names=['labelc'])

### Task A

In [4]:
tweet_list = ori_train_data['tweet'].to_list()
processed_tweet_list = [preprocess.glove_twitter_preprocess(t) for t in tweet_list]
tokenized_tweet_list = preprocess.nltk_tokenize(processed_tweet_list)
vocabulary, documents_vector = preprocess.transform_word_to_vector(tokenized_tweet_list, num_vocab=20000, padded=False)

pretrained_embedding = preprocess.get_embedding_from_torch_text(vocabulary, "twitter.27B", 25)
pretrained_embedding.shape

torch.Size([19034, 25])

In [5]:
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()
sentences_train, sentences_test, labels_train, labels_test = train_test_split(documents_vector, sub_a_label, test_size=0.2, random_state=5246)
y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

count_not, count_off= np.sum(y_train == 0), np.sum(y_train == 1)
weight_not = len(y_train) / (2 * count_not)
weight_off = len(y_train) / (2 * count_off)
class_weights = torch.FloatTensor([weight_not, weight_off])

In [6]:
def collate_fn(batch):
    batch.sort(key=lambda x: x[2], reverse=True)
    sequences, labels, lengths = zip(*batch)
    sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    return sequences, torch.tensor(labels), torch.tensor(lengths)

class TextDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], torch.tensor(self.labels[idx]), len(self.sentences[idx])

# class LSTMClassifier(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, output_dim, pretrained_embedding):
#         super(LSTMClassifier, self).__init__()

#         self.hidden_dim = hidden_dim
#         self.embedding = nn.Embedding.from_pretrained(pretrained_embedding)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, output_dim)

#     def forward(self, x, lengths):
#         batch_size = x.size(0)
#         x = self.embedding(x)
#         h0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()
#         c0 = torch.zeros(1, batch_size, self.hidden_dim).requires_grad_()

#         x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

#         out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
#         out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
#         out = self.fc(hn[-1])
#         return out

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, pretrained_embedding):
        super(LSTMClassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(pretrained_embedding)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, x, lengths):
        batch_size = x.size(0)
        x = self.embedding(x)
        h0 = torch.zeros(2, batch_size, self.hidden_dim).requires_grad_()
        c0 = torch.zeros(2, batch_size, self.hidden_dim).requires_grad_()

        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.fc(torch.cat((hn[-2], hn[-1]), dim=1))
        return out

In [7]:
embedding_dim = pretrained_embedding.size(1)
hidden_dim = 64
output_dim = 2
lr = 0.001
num_epochs = 10

model = LSTMClassifier(embedding_dim, hidden_dim, output_dim, pretrained_embedding)
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

train_dataset = TextDataset(sentences_train, y_train)
test_dataset = TextDataset(sentences_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

for epoch in range(num_epochs):
    for i, (inputs, labels, lengths) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.long(), lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.classification_report(y_label, y_pred, target_names=['NOT', 'OFF']))

Epoch [1/10], Step [100/331], Loss: 0.5837
Epoch [1/10], Step [200/331], Loss: 0.6062
Epoch [1/10], Step [300/331], Loss: 0.6370
Epoch [2/10], Step [100/331], Loss: 0.5525
Epoch [2/10], Step [200/331], Loss: 0.5903
Epoch [2/10], Step [300/331], Loss: 0.4903
Epoch [3/10], Step [100/331], Loss: 0.5850
Epoch [3/10], Step [200/331], Loss: 0.4008
Epoch [3/10], Step [300/331], Loss: 0.4418
Epoch [4/10], Step [100/331], Loss: 0.5595
Epoch [4/10], Step [200/331], Loss: 0.4050
Epoch [4/10], Step [300/331], Loss: 0.3465
Epoch [5/10], Step [100/331], Loss: 0.4745
Epoch [5/10], Step [200/331], Loss: 0.4319
Epoch [5/10], Step [300/331], Loss: 0.4768
Epoch [6/10], Step [100/331], Loss: 0.4479
Epoch [6/10], Step [200/331], Loss: 0.7108
Epoch [6/10], Step [300/331], Loss: 0.5251
Epoch [7/10], Step [100/331], Loss: 0.3425
Epoch [7/10], Step [200/331], Loss: 0.3834
Epoch [7/10], Step [300/331], Loss: 0.4343
Epoch [8/10], Step [100/331], Loss: 0.3453
Epoch [8/10], Step [200/331], Loss: 0.3376
Epoch [8/10

#### Test task a

In [9]:
tweets_test_a = task_a_test_data["tweet"].tolist()
processed_tweet_list_a = [preprocess.glove_twitter_preprocess(t) for t in tweets_test_a]
tokenized_tweet_list_a = preprocess.nltk_tokenize(processed_tweet_list_a)
# _ , documents_vector_a = preprocess.transform_word_to_vector(tokenized_tweet_list_a, num_vocab=20000, padded=False)
documents_vector_a = preprocess.get_vector_from_vocabulary(vocabulary, tokenized_tweet_list_a)

sub_a_label_test = task_a_test_label['labela'].map(label2id).to_list()
test_a_dataset = TextDataset(documents_vector_a, sub_a_label_test)
test_a_loader = DataLoader(test_a_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_a_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.f1_score(y_label, y_pred, average='micro'))
print(metrics.classification_report(y_label, y_pred, target_names=['NOT', 'OFF']))

Test Accuracy: 77.91%
0.7790697674418605
              precision    recall  f1-score   support

         NOT       0.81      0.91      0.86       620
         OFF       0.65      0.45      0.53       240

    accuracy                           0.78       860
   macro avg       0.73      0.68      0.69       860
weighted avg       0.77      0.78      0.76       860



### Task B

In [20]:
sub_b_idx = list(ori_train_data[ori_train_data['subtask_b'].notna()].index)
tweet_list = ori_train_data.loc[sub_b_idx]['tweet'].to_list()
processed_tweet_list = [preprocess.glove_twitter_preprocess(t) for t in tweet_list]
tokenized_tweet_list = preprocess.nltk_tokenize(processed_tweet_list)
vocabulary, documents_vector = preprocess.transform_word_to_vector(tokenized_tweet_list, num_vocab=20000, padded=False)
pretrained_embedding = preprocess.get_embedding_from_torch_text(vocabulary, "twitter.27B", 25)
pretrained_embedding.shape

torch.Size([10335, 25])

In [21]:
label2id = {'UNT': 0, 'TIN': 1}
sub_b_label = ori_train_data.loc[sub_b_idx]['subtask_b'].map(label2id).to_list()
sentences_train, sentences_test, labels_train, labels_test = train_test_split(documents_vector, sub_b_label, test_size=0.2, random_state=5246)
y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

count_unt, count_tin= np.sum(y_train == 0), np.sum(y_train == 1)
weight_unt = len(y_train) / (2 * count_unt)
weight_tin = len(y_train) / (2 * count_tin)
class_weights = torch.FloatTensor([weight_unt, weight_tin])

In [22]:
embedding_dim = pretrained_embedding.size(1)
hidden_dim = 64
output_dim = 2
lr = 0.001
num_epochs = 10

model = LSTMClassifier(embedding_dim, hidden_dim, output_dim, pretrained_embedding)
# criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(weight=class_weights)
# optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

train_dataset = TextDataset(sentences_train, y_train)
test_dataset = TextDataset(sentences_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

for epoch in range(num_epochs):
    for i, (inputs, labels, lengths) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.long(), lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.classification_report(y_label, y_pred, target_names=['UNT', 'TIN']))


Epoch [1/10], Step [100/110], Loss: 0.6551
Epoch [2/10], Step [100/110], Loss: 0.4863
Epoch [3/10], Step [100/110], Loss: 0.7053
Epoch [4/10], Step [100/110], Loss: 0.5995
Epoch [5/10], Step [100/110], Loss: 0.6077
Epoch [6/10], Step [100/110], Loss: 0.5809
Epoch [7/10], Step [100/110], Loss: 0.5817
Epoch [8/10], Step [100/110], Loss: 0.6481
Epoch [9/10], Step [100/110], Loss: 0.4105
Epoch [10/10], Step [100/110], Loss: 0.3881
Test Accuracy: 75.80%
              precision    recall  f1-score   support

         UNT       0.27      0.49      0.35       117
         TIN       0.91      0.80      0.85       763

    accuracy                           0.76       880
   macro avg       0.59      0.64      0.60       880
weighted avg       0.83      0.76      0.78       880



#### Test task b

In [23]:
tweets_test_b = task_b_test_data["tweet"].tolist()
processed_tweet_list_b = [preprocess.glove_twitter_preprocess(t) for t in tweets_test_b]
tokenized_tweet_list_b = preprocess.nltk_tokenize(processed_tweet_list_b)
documents_vector_b = preprocess.get_vector_from_vocabulary(vocabulary, tokenized_tweet_list_b)

sub_b_label_test = task_b_test_label['labelb'].map(label2id).to_list()
test_b_dataset = TextDataset(documents_vector_b, sub_b_label_test)
test_b_loader = DataLoader(test_b_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_b_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.f1_score(y_label, y_pred, average='micro'))
print(metrics.classification_report(y_label, y_pred, target_names=['UNT', 'TIN']))

Test Accuracy: 41.25%
0.4125
              precision    recall  f1-score   support

         UNT       0.14      0.85      0.25        27
         TIN       0.95      0.36      0.52       213

    accuracy                           0.41       240
   macro avg       0.55      0.60      0.38       240
weighted avg       0.86      0.41      0.49       240



### Task C

In [16]:
sub_c_idx = list(ori_train_data[ori_train_data['subtask_c'].notna()].index)
tweet_list = ori_train_data.loc[sub_c_idx]['tweet'].to_list()
processed_tweet_list = [preprocess.glove_twitter_preprocess(t) for t in tweet_list]
tokenized_tweet_list = preprocess.nltk_tokenize(processed_tweet_list)
vocabulary, documents_vector = preprocess.transform_word_to_vector(tokenized_tweet_list, num_vocab=20000, padded=False)
pretrained_embedding = preprocess.get_embedding_from_torch_text(vocabulary, "twitter.27B", 25)
pretrained_embedding.shape

torch.Size([9679, 25])

In [17]:
label2id = {'IND': 0, 'GRP': 1, 'OTH': 2}
sub_c_label = ori_train_data.loc[sub_c_idx]['subtask_c'].map(label2id).to_list()
sentences_train, sentences_test, labels_train, labels_test = train_test_split(documents_vector, sub_c_label, test_size=0.2, random_state=5246)
y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

count_ind, count_grp, count_oth = np.sum(y_train == 0), np.sum(y_train == 1), np.sum(y_train == 1)
weight_ind = len(y_train) / (3 * count_ind)
weight_grp = len(y_train) / (3 * count_grp)
weight_oth = len(y_train) / (3 * count_oth)
class_weights = torch.FloatTensor([weight_ind, weight_grp, weight_oth])

In [18]:
embedding_dim = pretrained_embedding.size(1)
hidden_dim = 64
output_dim = 3
lr = 0.001
num_epochs = 10

model = LSTMClassifier(embedding_dim, hidden_dim, output_dim, pretrained_embedding)
# criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(weight=class_weights)
# optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

train_dataset = TextDataset(sentences_train, y_train)
test_dataset = TextDataset(sentences_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

for epoch in range(num_epochs):
    for i, (inputs, labels, lengths) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.long(), lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 50 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.classification_report(y_label, y_pred, target_names=['IND', 'GRP', 'OTH']))


Epoch [1/10], Step [50/97], Loss: 0.8945
Epoch [2/10], Step [50/97], Loss: 0.8534
Epoch [3/10], Step [50/97], Loss: 0.8560
Epoch [4/10], Step [50/97], Loss: 1.0413
Epoch [5/10], Step [50/97], Loss: 0.7240
Epoch [6/10], Step [50/97], Loss: 0.9235
Epoch [7/10], Step [50/97], Loss: 0.8876
Epoch [8/10], Step [50/97], Loss: 0.9720
Epoch [9/10], Step [50/97], Loss: 0.9349
Epoch [10/10], Step [50/97], Loss: 0.8611
Test Accuracy: 72.81%
              precision    recall  f1-score   support

         IND       0.84      0.82      0.83       500
         GRP       0.57      0.71      0.63       203
         OTH       0.30      0.12      0.17        73

    accuracy                           0.73       776
   macro avg       0.57      0.55      0.55       776
weighted avg       0.72      0.73      0.72       776



In [19]:
tweets_test_c = task_c_test_data["tweet"].tolist()
processed_tweet_list_c = [preprocess.glove_twitter_preprocess(t) for t in tweets_test_c]
tokenized_tweet_list_c = preprocess.nltk_tokenize(processed_tweet_list_c)
documents_vector_c = preprocess.get_vector_from_vocabulary(vocabulary, tokenized_tweet_list_c)

sub_c_label_test = task_c_test_label['labelc'].map(label2id).to_list()
test_c_dataset = TextDataset(documents_vector_c, sub_c_label_test)
test_c_loader = DataLoader(test_c_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    pred, label = [],[]
    for i, (inputs, labels, lengths) in enumerate(test_c_loader):
        outputs = model(inputs.long(), lengths)
        _, predicted = torch.max(outputs.data, 1)
        pred.append(predicted)
        label.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

y_pred = np.asarray(torch.cat(pred))
y_label = np.asarray(torch.cat(label))
print(metrics.f1_score(y_label, y_pred, average='micro'))
print(metrics.classification_report(y_label, y_pred, target_names=['IND', 'GRP', 'OTH']))

Test Accuracy: 60.09%
0.6009389671361502
              precision    recall  f1-score   support

         IND       0.73      0.62      0.67       100
         GRP       0.55      0.78      0.65        78
         OTH       0.29      0.14      0.19        35

    accuracy                           0.60       213
   macro avg       0.52      0.51      0.50       213
weighted avg       0.59      0.60      0.58       213



In [122]:
zeros_mask = torch.all(pretrained_embedding == 0, dim=1)
zeros_indices = torch.nonzero(zeros_mask)
print('Size of OOV: {}'.format(len(zeros_indices.squeeze())))
[t for i, t in enumerate(vocabulary.get_itos()) if i in zeros_indices.squeeze()]

Size of OOV: 2791


['<PAD>',
 '<UNK>',
 '<SOS>',
 '<EOS>',
 "don't",
 '😂',
 "it's",
 '🇺',
 '🇸',
 '️',
 'kavanaugh',
 "i'm",
 "that's",
 "can't",
 "you're",
 '🤣',
 "doesn't",
 "he's",
 "didn't",
 '😭',
 "isn't",
 "they're",
 'brexit',
 '🤔',
 "she's",
 '😍',
 '👍',
 "won't",
 '🙄',
 "aren't",
 '😘',
 "i've",
 "there's",
 "i'll",
 '😡',
 "wouldn't",
 "let's",
 "what's",
 '🙏',
 '🔥',
 '💕',
 "wasn't",
 '😊',
 '😁',
 '👏',
 '😉',
 'qanon',
 '😆',
 '😀',
 "we're",
 '💜',
 "i'd",
 '💀',
 '😎',
 '💯',
 "trump's",
 "who's",
 '😠',
 "haven't",
 'blasey',
 '💥',
 '💙',
 "you'll",
 "you've",
 '🌹',
 "couldn't",
 '💖',
 "he'll",
 'mbga',
 '😢',
 "shouldn't",
 '😩',
 '🤢',
 '👊',
 "women's",
 '👌',
 'alt-right',
 '🛑',
 '😒',
 '💩',
 '. . .',
 '👀',
 "hasn't",
 '‼',
 '🙏🏻',
 '💞',
 '👇',
 '😜',
 '. .',
 '😅',
 '🇷',
 '🇬',
 "ain't",
 '🤗',
 'mcga',
 '<<number>',
 '🤡',
 '🗽',
 '👏🏻',
 '🇧',
 '😳',
 "y'all",
 "people's",
 '😱',
 "weren't",
 '💛',
 '👍🏼',
 '🙏🏼',
 "we've",
 '🐇',
 "kavanaugh's",
 '😋',
 'declassify',
 "he'd",
 '🙂',
 "we'll",
 '🤮',
 '👇🏼',
 '🤷\u200d♂',
 '