In [1]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

from tqdm import tqdm
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from spacy.lang.en import stop_words
from string import punctuation
import emoji

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Location of the OLID training file, relative to the notebook's working directory.
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"
# The file is tab-separated; read it in full and preview the first rows.
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')
ori_train_data.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [4]:
def compute_label(a, b, c):
    """Collapse the three sub-task annotations of one row into a single label.

    Args:
        a: value of the ``subtask_a`` column.
        b: value of the ``subtask_b`` column.
        c: value of the ``subtask_c`` column.

    Returns:
        ``a`` when it equals ``'NOT'``; otherwise ``b`` when it equals
        ``'UNT'``; otherwise ``c`` unchanged (whatever it holds).
    """
    # The first sub-task decides on its own when the row is marked 'NOT'.
    if a == 'NOT':
        return a
    # Otherwise the second sub-task wins only for 'UNT'; everything else
    # defers to the third sub-task's value.
    return b if b == 'UNT' else c

In [5]:
# Derive one combined label per row from the three sub-task columns.
ori_train_data['label'] = ori_train_data.apply(lambda x: compute_label(x.subtask_a, x.subtask_b, x.subtask_c), axis=1)
# Number of rows per combined label, as a plain dict.
count = ori_train_data.groupby('label').size().to_dict()

# Import the constant under its own name. Reading it through the imported
# `stop_words` module and then rebinding `stop_words` to a set made this cell
# fail with AttributeError the second time it was executed.
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP_WORDS

# Build a fresh set (never mutating spaCy's own constant) containing the
# spaCy stop words, every emoji character, the literal token "url" and all
# ASCII punctuation characters.
stop_words = set(SPACY_STOP_WORDS)
stop_words |= set(emoji.get_emoji_unicode_dict('en').values())
stop_words.add("url")
stop_words |= set(punctuation)

In [6]:
def tweet_tokenization(doc):
    """Tokenise every tweet in ``doc`` with NLTK's ``TweetTokenizer``.

    The tokenizer is configured with ``preserve_case=False``,
    ``strip_handles=True`` and ``match_phone_numbers=False``.

    Args:
        doc: iterable of raw tweet strings.

    Returns:
        A list holding one token list per tweet, in input order.
    """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, match_phone_numbers=False)
    return list(map(tokenizer.tokenize, doc))

def stop_word_removal(doc, stop_words):
    """Drop every token that appears in ``stop_words``.

    Args:
        doc: iterable of token lists (one list per tweet).
        stop_words: container supporting ``in`` with the tokens to discard.

    Returns:
        A new list of token lists; token order is preserved and tweets that
        lose all their tokens remain as empty lists.
    """
    return [
        [token for token in tokens if token not in stop_words]
        for tokens in doc
    ]

def hashtag_removal(doc):
    """Strip one leading ``#`` from every token that has one.

    Args:
        doc: iterable of token lists (one list per tweet).

    Returns:
        A new list of token lists in which each token starting with ``#``
        has that single character removed; all other tokens are unchanged.
    """
    # `startswith` is used instead of `word[0]` so that an empty token is
    # passed through rather than raising IndexError.
    return [
        [word[1:] if word.startswith("#") else word for word in tokens]
        for tokens in doc
    ]

# Preprocessing pipeline: tokenise -> drop stop words -> strip '#' -> re-join.
tweets = ori_train_data["tweet"]
# Store the tokens under a new name. Assigning them to `tweet_tokenization`
# replaced the function with a list, so this cell could only run once per
# kernel session.
tweets_tokenized = tweet_tokenization(tweets)
tweet_remove_stop_word = stop_word_removal(tweets_tokenized, stop_words)
tweets_no_hashtag = hashtag_removal(tweet_remove_stop_word)
# `tweets_remove` holds one space-joined string per tweet.
tweets_remove = [" ".join(t) for t in tweets_no_hashtag]

## Subtask A

In [7]:
# Integer-encode the sub-task A annotation: NOT -> 0, OFF -> 1.
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

# Hold out 20% of the cleaned tweets; the fixed random_state keeps the split
# identical between runs.
split_parts = train_test_split(tweets_remove, sub_a_label, test_size=0.2, random_state=5246)
sentences_train, sentences_test, labels_train, labels_test = split_parts

# Array versions of the label lists for the models and metrics below.
y_train, y_test = np.asarray(labels_train), np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


### Pad 0

In [8]:
class TextDataset(Dataset):
    """Dataset yielding ``(feature_vector, label)`` pairs for sentences.

    All sentences are vectorised once, up front, with the given (already
    fitted) ``vectorizer``. The result is stored exactly as ``transform``
    returns it and only the requested row is converted to a dense array in
    ``__getitem__``. Densifying the whole matrix in the constructor would
    allocate ``len(sentences) * n_features`` values at once.

    Args:
        sentences: sequence of input strings.
        labels: indexable collection of labels aligned with ``sentences``.
        vectorizer: object whose ``transform(sentences)`` returns a row-
            indexable matrix (sparse with ``toarray`` or already dense).
    """

    def __init__(self, sentences, labels, vectorizer):
        self.sentences = sentences
        self.labels = labels
        self.vectorizer = vectorizer
        # Kept in the vectorizer's native format; see __getitem__.
        self.sequences = self.vectorizer.transform(sentences)

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for position ``idx``."""
        rows = self.sequences[idx]
        if hasattr(rows, "toarray"):
            rows = rows.toarray()
        rows = np.asarray(rows)
        # A single sparse-matrix row comes back as (1, n_features); flatten it
        # so an integer index always yields a 1-D feature vector.
        if isinstance(idx, (int, np.integer)):
            rows = rows.reshape(-1)
        return torch.tensor(rows), torch.tensor(self.labels[idx])


class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Each input vector is treated as a sequence of length one: ``forward``
    reshapes ``x`` to ``(batch, 1, features)`` before feeding the LSTM and
    classifies from the final hidden state.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output classes (logits).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for a batch ``x``."""
        batch_size = x.size(0)
        x = x.reshape(batch_size, 1, -1)

        # No explicit initial state: nn.LSTM defaults to zeros created on the
        # input's device and dtype. The previous hand-made torch.zeros state
        # was always CPU/float32, which breaks as soon as the model or input
        # lives elsewhere.
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# Seed PyTorch's global RNG before the model and loaders are created so that
# weight initialisation and the shuffled batch order repeat between runs.
# The value matches the random_state already used for the train/test split.
torch.manual_seed(5246)

# Bag-of-words features; the vocabulary is fitted on the training sentences only.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(sentences_train)

# Hyper-parameters. The LSTM input width equals the vocabulary size.
input_dim = X_train.shape[1]
hidden_dim = 100
output_dim = 2
lr = 0.01
num_epochs = 10


model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Only the training loader shuffles; the test loader keeps dataset order.
train_dataset = TextDataset(sentences_train, y_train, vectorizer)
test_dataset = TextDataset(sentences_test, y_test, vectorizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Standard mini-batch training loop.
#
# The former hn/cn bookkeeping was removed: `outputs` is the logits tensor
# returned by the model, so `outputs[1][0]` / `outputs[1][1]` were individual
# logits of the second sample, not LSTM states. The model's forward() takes
# only the inputs, so those values were never used, and the indexing would
# raise IndexError on a final batch containing a single sample.
model.train()
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Log the current mini-batch loss every 100 steps.
        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# Evaluate on the held-out split: accumulate accuracy batch by batch and keep
# every batch's predictions for the per-class report.
model.eval()
correct = 0
total = 0
pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.float())
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(outputs.data, 1)[1]
        pred.append(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

# The test loader does not shuffle, so the concatenated predictions line up
# with y_test.
y_pred = np.asarray(torch.cat(pred))
print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

Epoch [1/10], Step [100/331], Loss: 0.7050
Epoch [1/10], Step [200/331], Loss: 0.5693
Epoch [1/10], Step [300/331], Loss: 0.4136
Epoch [2/10], Step [100/331], Loss: 0.2369
Epoch [2/10], Step [200/331], Loss: 0.1267
Epoch [2/10], Step [300/331], Loss: 0.2413
Epoch [3/10], Step [100/331], Loss: 0.1408
Epoch [3/10], Step [200/331], Loss: 0.0646
Epoch [3/10], Step [300/331], Loss: 0.0280
Epoch [4/10], Step [100/331], Loss: 0.0337
Epoch [4/10], Step [200/331], Loss: 0.0106
Epoch [4/10], Step [300/331], Loss: 0.0150
Epoch [5/10], Step [100/331], Loss: 0.0999
Epoch [5/10], Step [200/331], Loss: 0.0177
Epoch [5/10], Step [300/331], Loss: 0.0711
Epoch [6/10], Step [100/331], Loss: 0.0022
Epoch [6/10], Step [200/331], Loss: 0.0011
Epoch [6/10], Step [300/331], Loss: 0.0768
Epoch [7/10], Step [100/331], Loss: 0.0086
Epoch [7/10], Step [200/331], Loss: 0.0053
Epoch [7/10], Step [300/331], Loss: 0.0156
Epoch [8/10], Step [100/331], Loss: 0.0016
Epoch [8/10], Step [200/331], Loss: 0.0011
Epoch [8/10

### Pack Sequence

In [9]:
def collate_fn(batch):
    """Collate ``(sequence, label, length)`` samples into padded batch tensors.

    Samples are ordered by decreasing length, then the sequences are
    zero-padded to the longest one.

    Args:
        batch: iterable of ``(sequence, label, length)`` triples, where
            ``sequence`` is a tensor whose first dimension is its length.

    Returns:
        A tuple ``(sequences, labels, lengths)``: the padded sequences with
        the batch dimension first, and two 1-D tensors, all in the same
        (length-descending) sample order.
    """
    # sorted() builds a new list; the caller's `batch` is left untouched
    # (list.sort() used to reorder it in place).
    ordered = sorted(batch, key=lambda sample: sample[2], reverse=True)
    sequences, labels, lengths = zip(*ordered)
    sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    return sequences, torch.tensor(labels), torch.tensor(lengths)

class TextDataset(Dataset):
    """Dataset yielding ``(sequence, label, length)`` triples for sentences.

    The sentences are vectorised with a single ``transform`` call instead of
    one call per sentence, and the result is kept in the format the
    vectorizer returns. A row is converted to a dense tensor and reshaped to
    ``(-1, input_dim)`` only when it is requested.

    Args:
        sentences: sequence of input strings.
        labels: indexable collection of labels aligned with ``sentences``.
        vectorizer: fitted object whose ``transform(sentences)`` returns a
            row-indexable matrix (sparse with ``toarray`` or already dense).
        input_dim: width of one time step; each row is reshaped to
            ``(-1, input_dim)``.
    """

    def __init__(self, sentences, labels, vectorizer, input_dim):
        self.sentences = sentences
        self.labels = labels
        self.vectorizer = vectorizer
        self.input_dim = input_dim
        # One batched transform; rows are densified lazily in __getitem__.
        self.sequences = vectorizer.transform(sentences)

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def _sequence(self, idx):
        """Return row ``idx`` as a dense tensor of shape ``(-1, input_dim)``."""
        row = self.sequences[idx]
        if hasattr(row, "toarray"):
            row = row.toarray()
        return torch.tensor(np.asarray(row)).reshape(-1, self.input_dim)

    def __getitem__(self, idx):
        """Return ``(sequence, label, number_of_time_steps)`` for ``idx``."""
        sequence = self._sequence(idx)
        return sequence, torch.tensor(self.labels[idx]), len(sequence)


class LSTMClassifier(nn.Module):
    """Single-layer LSTM over padded variable-length sequences plus a linear head.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output classes (logits).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        """Classify a padded batch.

        Args:
            x: padded input of shape ``(batch, max_len, input_dim)``.
            lengths: true length of every sequence in ``x`` (tensor or
                sequence of ints), in the same order as the batch.

        Returns:
            Logits of shape ``(batch, output_dim)``, in the input batch order.
        """
        # pack_padded_sequence needs the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False lets callers pass batches in any order; the
        # final hidden state is returned in the original batch order.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )

        # No explicit initial state: nn.LSTM defaults to zeros on the input's
        # device and dtype (the hand-made state was always CPU/float32). The
        # per-step outputs are not needed, so they are not unpacked.
        _, (hn, _) = self.lstm(packed)
        return self.fc(hn[-1])

    
# Seed PyTorch's global RNG before the model and loaders are created so that
# weight initialisation and the shuffled batch order repeat between runs.
# The value matches the random_state already used for the train/test split.
torch.manual_seed(5246)

# Bag-of-words features; the vocabulary is fitted on the training sentences only.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(sentences_train)

# Hyper-parameters. The LSTM input width equals the vocabulary size.
input_dim = X_train.shape[1]
hidden_dim = 100
output_dim = 2
lr = 0.01
num_epochs = 10


model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


# Both loaders use collate_fn to pad each batch and return its lengths; only
# the training loader shuffles.
train_dataset = TextDataset(sentences_train, y_train, vectorizer, input_dim)
test_dataset = TextDataset(sentences_test, y_test, vectorizer, input_dim)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Mini-batch training over the padded batches produced by the loader.
for epoch in range(num_epochs):
    for i, (inputs, labels, lengths) in enumerate(train_loader):
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        outputs = model(inputs.float(), lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Only every 100th step is logged.
        if (i + 1) % 100 != 0:
            continue
        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# Evaluate on the held-out split.
model.eval()
correct = 0
total = 0
pred = []
true = []
with torch.no_grad():
    for inputs, labels, lengths in test_loader:
        outputs = model(inputs.float(), lengths)
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(outputs, 1)[1]
        pred.append(predicted)
        # Keep the labels exactly as the loader delivered them.
        true.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

# collate_fn orders every batch by sequence length, so predictions are not
# guaranteed to follow the dataset order of y_test. Scoring against the
# labels collected from the loader keeps both arrays aligned regardless of
# how the batches were ordered.
y_pred = np.asarray(torch.cat(pred))
y_true = np.asarray(torch.cat(true))
print(metrics.classification_report(y_true, y_pred, target_names=['NOT', 'OFF']))

Epoch [1/10], Step [100/331], Loss: 0.5411
Epoch [1/10], Step [200/331], Loss: 0.5427
Epoch [1/10], Step [300/331], Loss: 0.5888
Epoch [2/10], Step [100/331], Loss: 0.3349
Epoch [2/10], Step [200/331], Loss: 0.2287
Epoch [2/10], Step [300/331], Loss: 0.3291
Epoch [3/10], Step [100/331], Loss: 0.0798
Epoch [3/10], Step [200/331], Loss: 0.0833
Epoch [3/10], Step [300/331], Loss: 0.1384
Epoch [4/10], Step [100/331], Loss: 0.0739
Epoch [4/10], Step [200/331], Loss: 0.2047
Epoch [4/10], Step [300/331], Loss: 0.0385
Epoch [5/10], Step [100/331], Loss: 0.0079
Epoch [5/10], Step [200/331], Loss: 0.0235
Epoch [5/10], Step [300/331], Loss: 0.0422
Epoch [6/10], Step [100/331], Loss: 0.0016
Epoch [6/10], Step [200/331], Loss: 0.0065
Epoch [6/10], Step [300/331], Loss: 0.0069
Epoch [7/10], Step [100/331], Loss: 0.0019
Epoch [7/10], Step [200/331], Loss: 0.3279
Epoch [7/10], Step [300/331], Loss: 0.0037
Epoch [8/10], Step [100/331], Loss: 0.0030
Epoch [8/10], Step [200/331], Loss: 0.0015
Epoch [8/10