In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
from string import punctuation
from collections import Counter
import random
import operator
from tqdm import tqdm
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler



# !pip install transformers
from transformers import BertModel, BertTokenizer

In [None]:
def load_data(path, sample_size=5, cols=['text', 'HOF'], label=None):
    """Load a tab-separated dataset into a pandas DataFrame and preview it.

    Only the columns listed in ``cols`` are read. The function prints the
    number of rows and displays a random sample of them.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated file.
    sample_size : int, default 5
        Number of rows to show in the preview. It is capped at the number of
        rows in the file so that small files do not raise.
    cols : list of str, default ['text', 'HOF']
        Columns to read. The default list is only read, never mutated.
    label : str, optional
        Dataset name used in the printed summary (e.g. ``'train'``).

    Returns
    -------
    pandas.DataFrame
        The loaded data restricted to ``cols``.
    """

    df = pd.read_csv(path, sep='\t', usecols=cols)

    # Avoid printing "the None dataset" when no label was supplied.
    dataset_name = 'dataset' if label is None else f'{label} dataset'
    print(f"\nThere are {df.shape[0]} tweets in the {dataset_name}.")
    print("\nHere's a sample:\n")

    # DataFrame.sample raises if asked for more rows than exist, so cap it.
    preview_rows = min(sample_size, len(df))
    display(df.sample(preview_rows))

    return df

In [None]:
# Colab: read the train and test splits from TSV files in the working directory
train = load_data(path='train.tsv', label='train')
test = load_data(path='test.tsv', label='test')

## Data preprocessing

In [None]:
# Encode the string class labels as integers (0 = Non-Hateful, 1 = Hateful).
# A label missing from the mapping raises KeyError rather than passing through.
label2id = {'Non-Hateful': 0, 'Hateful': 1}
train['HOF'] = train['HOF'].apply(label2id.__getitem__)
test['HOF'] = test['HOF'].apply(label2id.__getitem__)

In [None]:
# Balance the two classes by downsampling the larger one to the size of the
# smaller one, then shuffle the result.
DOWNSAMPLE_SEED = 42  # fixed so the balanced sample is reproducible across runs

train_hateful = train[train['HOF']==1]
train_nonhateful = train[train['HOF']==0]

print('Before downsampling: ')
print(f"Hateful: {len(train_hateful)}")
print(f"Non-Hateful: {len(train_nonhateful)}")

# Use the minority class size so sampling never asks for more rows than exist,
# whichever class happens to be the larger one.
n_per_class = min(len(train_hateful), len(train_nonhateful))
train_hateful = train_hateful.sample(n_per_class, random_state=DOWNSAMPLE_SEED)
train_nonhateful = train_nonhateful.sample(n_per_class, random_state=DOWNSAMPLE_SEED)

# sample(frac=1) shuffles the rows so the classes are interleaved.
train_downsampled = pd.concat([train_hateful, train_nonhateful], axis=0).sample(
    frac=1, random_state=DOWNSAMPLE_SEED
)

print('\nAfter downsampling: ')
print(f"Hateful: {len(train_downsampled[train_downsampled['HOF']==1])}")
print(f"Non-Hateful: {len(train_downsampled[train_downsampled['HOF']==0])}")

## Data splitting

In [None]:
# Split the downsampled training set into train and development sets
train, dev = train_test_split(
    train_downsampled,
    test_size=0.5,  # half of the balanced data is held out as the development set
    stratify=train_downsampled['HOF'],  # keep the class ratio the same in both halves
    random_state=42,  # fixed seed so the split is reproducible across kernel restarts
)

In [None]:
# Clean the tweets' text
def clean_text(tweet):
    """Normalise the raw text of a single tweet.

    User mentions and URLs are replaced by placeholder tokens, newline and tab
    characters (and the literal ``[NEWLINE]`` marker) become spaces, the
    result is stripped of surrounding whitespace and lower-cased.
    """

    # Mask mentions first, then URLs, with their placeholder tokens
    masked = re.sub(r"http\S+", 'URL', re.sub(r"@[A-Za-z0-9_-]+", 'USR', tweet))

    # Turn every line-break / tab marker into a plain space
    for marker in ('\n', '[NEWLINE]', '\t'):
        masked = masked.replace(marker, ' ')

    # Trim the ends, then lowercase (the placeholders are lowercased as well)
    return masked.strip().lower()

# train['cleaned_text'] = train['text'].apply(lambda x: clean_text(x))
# test['cleaned_text'] = test['text'].apply(lambda x: clean_text(x))

## Data encoding and tokenizing

In [None]:
# Define Dataset class which cleans, tokenizes and encodes data
class BERTDataset(Dataset):
    """Dataset of cleaned, BERT-encoded tweets with their labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``'text'`` column (raw tweets) and a ``'HOF'`` column
        (labels).
    max_length : int, default 60
        Maximum encoded length; longer tweets are truncated. The default is
        well below BERT's limit of 512 and was chosen for computational speed.
    tokenizer : optional
        Object exposing ``encode(text, max_length=..., truncation=...)``.
        When omitted, the pretrained ``bert-base-uncased`` tokenizer is
        loaded. Passing one shared tokenizer avoids reloading it for every
        dataset that is built.

    Each item is a ``(token_ids, label)`` pair.
    """

    def __init__(self, data, max_length=60, tokenizer=None):

        # Initialize BERT tokenizer unless the caller supplied one.
        # The cache_dir points at the author's venv, as in the original setup.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased',
                cache_dir=Path.cwd()/'venv/lib/python3.8/site-packages',
            )
        self.tok = tokenizer
        self.max_length = max_length

        # Clean tweets
        self.cleaned_tweets = data['text'].apply(clean_text)

        # Truncate and encode tweets, up to max_length tokens each
        self.tweets = [
            self.tok.encode(text, max_length=max_length, truncation=True)
            for text in self.cleaned_tweets
        ]

        # Store labels
        self.labels = list(data['HOF'])

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair at position ``idx``."""
        tweet = self.tweets[idx]
        label = self.labels[idx]
        return tweet, label

# Inspect an example
# BD = BERTDataset(train.iloc[:5])
# next(iter(BD))

In [None]:
# Define collate function to be passed to DataLoader
def bert_collate(batch):
    """Collate ``(token_ids, label)`` pairs into padded batch tensors.

    Returns a tuple ``(tweets_pad, masks_pad, labels)`` of long tensors:
    the token ids right-padded with zeros to the longest tweet in the batch,
    an attention mask holding 1 on real tokens and 0 on padding, and the
    labels.
    """

    # Split the pairs into their two components
    tweets = [pair[0] for pair in batch]
    labels = torch.tensor([pair[1] for pair in batch]).long()

    # Right-pad every tweet with zeros up to the longest one in the batch
    tweets_pad = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(t, dtype=torch.long) for t in tweets],
        batch_first=True,
        padding_value=0,
    )

    # A position is "real" when its index is below that tweet's length
    lengths = torch.tensor([len(t) for t in tweets])
    positions = torch.arange(tweets_pad.size(1))
    masks_pad = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    return tweets_pad, masks_pad, labels

In [None]:
%%time

# Build one encoded dataset per split (train / dev / test)
train_dataset = BERTDataset(data=train)
dev_dataset = BERTDataset(data=dev)
test_dataset = BERTDataset(data=test)

In [None]:
# Wrap each dataset in a torch.utils.data.DataLoader.
# Only the training loader shuffles (shuffle=True is used instead of passing
# a RandomSampler); dev and test keep their original order.
train_loader = DataLoader(
    train_dataset,
    batch_size=100,
    shuffle=True,
    collate_fn=bert_collate,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=100,
    collate_fn=bert_collate,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=100,
    collate_fn=bert_collate,
)

In [None]:
# Inspect only the first batch(es): printing every batch of the training
# loader would flood the notebook output.
n_batches_to_show = 1

for (idx, batch) in enumerate(train_loader):

    if idx >= n_batches_to_show:
        break

    print(f'\n\n--------------------- Batch {idx} ---------------------\n')

    # Print the text (batch[0] holds the padded, encoded tweets)
    print(f"There are {len(batch[0])} encoded tweets in this batch.")
    print('Tweets (encoded): ', batch[0])

    # Print the label (batch[2] holds the labels; batch[1] is the attention mask)
    print(f"There are {len(batch[2])} encoded labels in this batch. Here they are: ")
    print('Labels: ', batch[2])

In [None]:
# Define BERT classifier
class BERTClassifier(nn.Module):
    """Frozen BERT encoder followed by dropout and a linear classification head.

    Parameters
    ----------
    num_classes : int, default 2
        Number of output logits. The task in this notebook is binary
        (labels 0 and 1), so the head emits two logits by default.
    dropout : float, default 0.2
        Dropout probability applied to the pooled representation.
    """

    def __init__(self, num_classes=2, dropout=0.2):

        # Specify network layers
        # The cache_dir points at the author's venv, as in the original setup
        super().__init__()
        self.bert = BertModel.from_pretrained(
            'bert-base-uncased',
            cache_dir=Path.cwd()/'venv/lib/python3.8/site-packages',
        )

        # Read the encoder width from its config instead of hard-coding 768
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

        # Define dropout
        self.dropout = nn.Dropout(dropout)

        # Freeze BERT layers so that only the linear head is trained
        for p in self.bert.parameters():
            p.requires_grad = False

    def forward(self, tweets, masks):
        """Return class logits for a batch of padded token ids.

        ``tweets`` and ``masks`` are the padded ids and attention mask as
        produced by the collate function (1 on real tokens, 0 on padding).
        """

        # First element of the BERT output: per-token hidden states
        hidden = self.bert(tweets, attention_mask=masks)[0]

        # Mean-pool over real tokens only. A plain mean over the sequence axis
        # would also average padding positions, so the same tweet would get a
        # different representation depending on the padding in its batch.
        weights = masks.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * weights).sum(dim=1)
        counts = weights.sum(dim=1).clamp(min=1)  # guard against all-padding rows
        pooled = summed / counts

        return self.linear(self.dropout(pooled))

In [None]:
# Initialise model
# Instantiating BERTClassifier loads the pretrained 'bert-base-uncased' weights
# in its constructor.
model = BERTClassifier()

In [None]:
# Select the compute device. `device` is not defined in any earlier cell shown
# here, so define it before first use; fall back to the CPU when no GPU is
# available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to device
model = model.to(device)

In [None]:
# Training configuration: number of epochs, loss function and optimiser
epochs = 5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
%%time

# Train model
# range(1, epochs + 1) runs exactly `epochs` passes, numbered from 1 so the
# progress message reads naturally (range(1, epochs) ran one epoch too few).
for epoch_i in range(1, epochs + 1):

    # ========================================
    #               Training
    # ========================================

    # Put model into training mode. This is necessary so that the `Dropout`
    # layers are activated.
    model.train()

    # For each batch of the training data...
    for batch in tqdm(train_loader):

        # Step 1. Since PyTorch accumulates gradients, clear any previously
        # calculated gradients before performing a backward pass.
        # PyTorch doesn't do this automatically because it can be useful while
        # training RNNs.
        optimizer.zero_grad()

        # Step 2. Extract data and move to device.
        tweets, masks, labels = [t.to(device) for t in batch]

        # Step 3. Forward pass - note that calling `model()` will in turn call
        # the model's `forward()` function.
        output = model(tweets, masks)

        # Step 4. Compute loss.
        loss = criterion(output, labels)

        # Step 5. Perform backward pass to calculate gradients wrt each w and b term.
        loss.backward()

        # Step 6. Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Step 7. Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================

    # After the completion of each training epoch, measure our performance on
    # our validation set.
    # Put model into evaluation mode, thereby deactivating Dropout layer.
    model.eval()

    y_true = list()
    y_pred = list()

    with torch.no_grad(): # We no longer need it to store computation graph.
        for batch in dev_loader:
            tweets, masks, labels = [t.to(device) for t in batch]
            output = model(tweets, masks)
            # Predicted class = index of the largest logit
            max_output = output.argmax(dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(max_output.tolist())

    print(f"Accuracy after {epoch_i} epoch(s): {accuracy_score(y_true, y_pred)}")

In [None]:
%time

# ========================================
#               Evaluation
# ========================================

# Evaluate model on test data
model.eval()

y_true = list()
y_pred = list()

with torch.no_grad():
    for batch in test_loader:
        tweets, masks, labels = [t.to(device) for t in batch]
        output = model(tweets, masks)
        max_output = output.argmax(dim=1)
        y_true.extend(labels.tolist())
        y_pred.extend(max_output.tolist())

print('Test accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred)))
print('\nClassification report: \n', classification_report(y_true, y_pred))
print('\nConfusion matrix: \n')
display(pd.DataFrame({"Predicted: Unhateful": confusion_matrix(y_true, y_pred)[:, 0],
              "Predicted: Hateful": confusion_matrix(y_true, y_pred)[:, 1]},
             index=['Actual: Unhateful', 'Actual: Hateful']))