Downloading all necessary libraries and files while declaring all necessary imports. 

In [12]:
# Installing hugging face transformer library
print("Installing BERT embeddings library...\n")
!pip install transformers -q
# !pip install pytorch-lightning
print("\nLibraries Downloaded!", "\n")

# Downloading the needed dataset
# No need to run again
print("Downloading dataset...\n")
!wget 'https://onedrive.live.com/download?cid=D9CC4BA72F6E9232&resid=D9CC4BA72F6E9232%215067&authkey=ADPocfGcOvh1jMk' -O hate_speech_train.csv -q
!ls
print("\nDataset downloaded!")

Installing BERT embeddings library...


Libraries Downloaded! 

Downloading dataset...

hate_speech_train.csv  sample_data

Dataset downloaded!


Importing all necessary dependencies.

In [13]:
#!usr/bin/env python3
#-*- coding: utf-8 -*-

#Dependencies
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd
import time
import torch
import gc
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from transformers import *
from wordcloud import WordCloud
from sklearn.metrics import roc_curve, auc
import copy

We will use CUDA for this project due to the amount of data that needs to be trained.

In [14]:
#Setting device on GPU if available, else CPU, and getting GPU name
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:0')
print('Which device am I using:', device)
if device.type == 'cuda':
    print("GPU Name:", torch.cuda.get_device_name(0))

Which device am I using: cuda:0
GPU Name: Tesla V100-SXM2-16GB


Loading the hate speech dataset.

In [15]:
#Loading the dataset and show a basic description
df = pd.read_csv("hate_speech_train.csv")
print("Toxic Hate Speech Dataset Loaded!")

Toxic Hate Speech Dataset Loaded!


Here, we instantiate a random seed and shuffle the dataset to keep training consistent.

In [16]:
#Sets a random seed and shuffles data to ensure consistency in training
np.random.seed(69)
df = df.sample(frac = 1)
df = df.reset_index(drop = True)

Let's perform some basic data analysis about the dataset, shown below:

In [17]:
#Gives a basic analysis about the dataset
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
#Example of clean comments in the dataset that aren't considered toxic
df[df.toxic == 0].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,b78ce62f8fbb279e,"""I've added the """""""" template to the article M...",0,0,0,0,0,0
2,6a4a28a5f925a4a8,""". \nWhy would Palpatine propagate his deforma...",0,0,0,0,0,0
3,e9d06400c8aaa52c,Delete this page \n\nThis page should be deleted.,0,0,0,0,0,0
4,d1ad0a6d192eaac6,"""\nI'm not going against the Oxford Dictionary...",0,0,0,0,0,0
5,950b9d303b8200f0,"""A presentation at a major NASA conference is ...",0,0,0,0,0,0


In [19]:
#Example of comments with various language that are described as at least toxic
df[df.toxic == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,bcc800b2f3b053dd,"You must really hate freedom, you evil-doing, ...",1,0,0,0,1,0
6,8f11723833cb5fa2,Cheri you are an evil cunt and that Chuck Smit...,1,0,1,0,1,0
13,08acc5d9d599b41c,Mark. Your semen is not viable. Were you to pr...,1,0,0,0,0,0
22,f02c0ca1eac74c1a,""" Only when it suits you will you say hypocr...",1,0,0,0,0,0
57,4536840db0048b47,Message \n\nGet a life- fuck off you mull!\n\n...,1,0,1,0,1,0


Instantiating the target labels in the dataset, batch size, and valid sequence length for a given token. A graph function is created for easier plotting.

In [20]:
#List of the labels in the dataset
target_columns = ["toxic", 
                  "severe_toxic", 
                  "obscene", 
                  "threat", 
                  "insult", 
                  "identity_hate"]

#Establish batch size and sequence length for tokenization
batch_size = 25
valid_len = 256

#plotting graph function, if mode = 1 we are plotting 1 line, if mode = 2 two lines, if 3 we are plotting bar graphs
#plotting graph function, if mode = 1 we are plotting 1 line, if mode = 2 two lines, if 3 we are plotting bar graphs
def plot_graph(mode = 1, **params):
  fig = go.Figure()
  title = params['title'] 
  x_axis = params['x_axis'] 
  y_axis = params['y_axis'] 
  if mode == 1:
    data = params['data']
    data_label = params['data_label'] 
    fig.add_trace(go.Scatter(x = np.arange(1, len(data)+1), y = data, mode = 'lines', name = data_label))
    fig.update_layout(
            title = title,
            xaxis_title = x_axis,
            yaxis_title = y_axis,
            yaxis_tickformat = 'digits',
            width = 600,
            height = 500,
            margin = dict(
              l = 50,
              r = 50,
              b = 100,
              t = 100,
              pad = 4
            ),
        )
    
    fig.show()

  elif mode == 2:
    one_data = params['one_data']
    two_data = params['two_data']
    one_label = params['one_label'] 
    two_label = params['two_label'] 
    fig.add_trace(go.Scatter(x = np.arange(1, len(one_data)+1), y = one_data, mode = 'lines', name = one_label))
    fig.add_trace(go.Scatter(x = np.arange(1, len(two_data)+1), y = two_data, mode = 'lines', name = two_label))
    fig.update_layout(
            title = title,
            xaxis_title = x_axis,
            yaxis_title = y_axis,
            yaxis_tickformat = 'digits',
            # width = 1000,
            # height = 1000,
            
        )
    fig.show()
  else:
    x_axis_data = params['x_axis_data']
    y_axis_data = params['y_axis_data']
    fig = go.Figure([go.Bar(x=x_axis_data, y=y_axis_data)])
    fig.update_layout(
        title = title,
        xaxis_title = x_axis,
        yaxis_title = y_axis,
        yaxis_tickformat = 'digits',
        width = 550,
        height = 500,
        margin = dict(
          l = 50,
          r = 50,
          b = 100,
          t = 100,
          pad = 4
         ),
    )
    fig.show()


Visualization of some toxic comments in the form of a wordcloud.

In [23]:
#Create a wordcloud based of language from a classified toxic comment
threat_context = df[df.toxic == 1]
neg_text = pd.Series(threat_context.comment_text).str.cat(sep=' ')
wordcloud = WordCloud(width = 1000, height = 600, max_font_size = 100).generate(neg_text)

# print("Word Cloud for dataset:")

# Plot the wordcloud itself
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud.recolor(colormap="viridis"), interpolation='bilinear')
# plt.axis("off")
# plt.title("Common Words Associated with Toxic Hate Speech", size = 14)
# plt.show()

Visualization of average comment length frequencies based off of a target label.

In [24]:
#Get the average length of a comment
comment_len = df.comment_text.str.count(' ')
print("Average Comment Length: ", sum(comment_len[:])/len(comment_len[:]))

train_labels = [df[end:end + batch_size][target_columns].reset_index(drop = True) for end in range(0, len(df.index), batch_size)]

#Display word frequencies by a given label through bar chart
# graph needs to be adjusted accordingly since i adjusted trian labels to match train_sets
label_count = 0
for i in range(len(train_labels)):
  label_count += train_labels[i].sum()

plot_graph(mode = 3, title = 'Toxic Label Frequency Across The Entire Dataset', x_axis = 'label', x_axis_data = target_columns, y_axis = 'count', y_axis_data = label_count)


plot_graph(mode = 3, title = 'Toxic vs Non Toxic Frequency Across The Entire Dataset', x_axis = 'General Label', x_axis_data = ['toxic', 'non-toxic'], y_axis = 'count', y_axis_data = [sum(label_count), df.shape[0]-sum(label_count)])


Average Comment Length:  66.86696204197504


Creating the actual training set, validation set, and test set by slicing/indexing the csv

In [None]:
#Get lists of batches for the training, validation, and test set
train_sets = [df[end:end + batch_size].reset_index(drop = True) for end in range(0, 7500, batch_size)]
validation_set = [df[end:end + batch_size].reset_index(drop = True) for end in range(7500, 9000, batch_size)]
test_set = [df[end:end + batch_size].reset_index(drop = True) for end in range(9000, 10500, batch_size)]

Loading BERT.

In [None]:
#Instantiating the BERT model from the hugging transformers library
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
BERT = BertModel.from_pretrained(pretrained_weights)
BERT.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

Creating a function to tokenize comments, which would then be passed into BERT.

In [None]:
def tokenize_and_pad_text(df, max_sequence):
    token_encodings = []
    for text in df.comment_text.values:
      token = tokenizer.encode(text, add_special_tokens = True, max_length = 512, truncation = True)[:max_sequence]
      token_encodings.append(token)

    padded_text = torch.tensor(np.array([token + [0] * (max_sequence - len(token)) for token in token_encodings]), device=device)
    return padded_text

Tokenize the text, and then pass them into BERT to be encoded.

In [None]:
# print(torch.cuda.current_device())
train_indices = [tokenize_and_pad_text(batch, valid_len) for batch in train_sets] #slightly more than average comment length 
val_indices = [tokenize_and_pad_text(batch, valid_len) for batch in validation_set]
test_indices = [tokenize_and_pad_text(batch, valid_len) for batch in test_set]

with torch.no_grad():
    x_train = [BERT(train_inds)[0] for train_inds in train_indices]  
    x_validation = [BERT(val_ind)[0] for val_ind in val_indices]
    x_test = [BERT(test_ind)[0] for test_ind in test_indices]
    
y_train = [torch.tensor(train_set[target_columns].values, dtype=torch.float).to(device) for train_set in train_sets]
y_validation = [torch.tensor(v_set[target_columns].values, dtype=torch.float).to(device) for v_set in validation_set]
y_test = [torch.tensor(t_set[target_columns].values, dtype=torch.float).to(device) for t_set in test_set]


Here are some example outputs after BERT encoding.

In [None]:
#Displaying examples of the output tensors after BERT tokenization encoding.
print(f"X_Train (Token) Tensor Example: {x_train[0][0]}\n")
print(f"Y_Train (Labels) Tensor Example: {y_train[0][0]}\n")
#Confirm the tensor lengths
print(f"X_Train (Token) Length: {len(x_train[0][0])}, Y_Train (Labels) Length: {len(y_train[0][0])}")

X_Train (Token) Tensor Example: tensor([[-0.3831,  0.0452, -0.6090,  ..., -0.3250,  1.0629, -0.3999],
        [ 0.5812,  0.3280,  0.1577,  ...,  0.5463,  1.2730,  0.0710],
        [ 0.9785,  0.0586,  0.3607,  ..., -0.1999,  0.4910,  0.3193],
        ...,
        [ 0.2456,  0.1445,  0.4343,  ..., -0.8573, -0.1843, -1.1921],
        [ 0.2036,  0.1511,  0.4518,  ..., -0.7979, -0.2322, -1.2652],
        [ 0.2825,  0.0185,  0.4872,  ..., -0.7804, -0.2343, -1.4969]],
       device='cuda:0')

Y_Train (Labels) Tensor Example: tensor([0., 0., 0., 0., 0., 0.], device='cuda:0')

X_Train (Token) Length: 256, Y_Train (Labels) Length: 6


Reimplementation of Yoon Kim's CNN for Sentence Text Classification

In [None]:
#Yoon Kim CNN Stuff
class kim_cnn(nn.Module):
  def __init__(self, embedding_num, embedding_dim, dropout = 0.1, kernel_count = 3, kernel_dims = [2, 3, 4], labels = target_columns):
    super().__init__()
    self.dropout = dropout
    self.kernel_count = kernel_count
    self.kernel_dims = kernel_dims
    self.labels = labels
    self.label_count = len(labels)
    self.emb_num = embedding_num
    self.emb_dim = embedding_dim
    
    self.embedding = nn.Embedding(self.emb_num, self.emb_dim)
    self.convs = nn.ModuleList([nn.Conv2d(1, self.kernel_count, (k, self.emb_dim)) for k in self.kernel_dims])
    self.dropout = nn.Dropout(self.dropout)
    self.classifier = nn.Linear(len(self.kernel_dims) * self.kernel_count, len(self.labels))
    self.act = nn.Sigmoid()
    # self.act = nn.ReLU()
    # self.act = nn.Softmax(dim = 1)

  def forward(self, X):
    # (N, W, D) ---> (N, C, W, D)
    X = Variable(X, requires_grad = True)
    X = X.unsqueeze(1)
    # [(N, C, W), ...] * len(kernel_dims)
    X = [F.relu(conv(X)).squeeze(3) for conv in self.convs]
    # concat([(N, C), ...] * len(kernel_dims))
    X = [F.max_pool1d(n, n.size(2)).squeeze(2) for n in X] 
    X = torch.cat(X, 1)
    # (N, len(kernel_dims) * kernel_count)
    X = self.dropout(X)
    #(N, C)
    logits = self.act(self.classifier(X))
    return logits


Instantiating the model and setting the parameters before training.

In [None]:
#Instantiating the model / setting model parameters
kim_model = kim_cnn(
  embedding_num = valid_len, #like the sequence_length value we set last time?
  embedding_dim = x_train[0].shape[2], #BERT embeedings
  dropout= 0.1, 
  kernel_count = 3, 
  kernel_dims = [2, 3, 4], 
  labels = target_columns
)

kim_model.to(device)

kim_cnn(
  (embedding): Embedding(256, 768)
  (convs): ModuleList(
    (0): Conv2d(1, 3, kernel_size=(2, 768), stride=(1, 1))
    (1): Conv2d(1, 3, kernel_size=(3, 768), stride=(1, 1))
    (2): Conv2d(1, 3, kernel_size=(4, 768), stride=(1, 1))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=9, out_features=6, bias=True)
  (act): Sigmoid()
)

Initializing hyperparameters through fine-tuning, as well as a validation loss function.

In [None]:
#Various hyperparameters and defining optimizer / loss function
lr_rate = 9.625e-6
num_epochs = 30
optimizer = torch.optim.Adam(kim_model.parameters(), lr = lr_rate)
# optimizer = torch.optim.SGD(kim_model.parameters(), lr = lr_rate)
criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()

def val(model):
  val_loss = 0
  batch_no = 0
  #Evaluate the model
  # kim_model.eval()
  with torch.no_grad():
    for data_batch, label_batch in zip(x_validation, y_validation):
      batch_no += 1
      for data, labels in zip(data_batch, label_batch):
        data, labels = Variable(data, requires_grad = False), Variable(labels, requires_grad = False) 
        pred = model(data[None, ...].float())
        # print("pred: ", pred)
        # print("label: ", labels, '\n')
        loss = criterion(pred, labels[None, ...].float())
        val_loss += loss.item()
    val_loss /= batch_no
    # print("Epoch: %d, Validation loss: %.6f" % (num_epochs, val_loss))
    return val_loss

Model training begins, with training and validation loss shown side by side.

In [None]:
#Model training
print(f"BEGINNING TRAINING: lr = {lr_rate}, Batch Count = {len(x_train)}, Batch Size = {batch_size}, Epoch Count = {num_epochs}")
print("---------------------------------------------------------------------------------------")
train_losses = []
validation_losses = []
for i in range(0, num_epochs):
    batch_no = 0
    train_loss = 0
    kim_model.train(True)
    for data_batch, label_batch in zip(x_train, y_train):
      #---> I think we need to loop through each tensor in the data batch and the label batch?
      batch_no += 1
      for data, labels in zip(data_batch, label_batch):
        data, labels = Variable(data, requires_grad = False), Variable(labels, requires_grad = False) 
        pred = kim_model(data[None, ...].float())
        optimizer.zero_grad()
        loss = criterion(pred, labels[None, ...].float())
        loss.backward()      
        optimizer.step()     
        train_loss += loss.item()
    train_loss /= batch_no
    train_losses.append(train_loss)
    val_loss = val(kim_model)
    validation_losses.append(val_loss)
    print("Epoch: %d, Training loss: %.6f, Validation loss: %.6f" % (i + 1, train_loss, val_loss))


BEGINNING TRAINING: lr = 9.625e-06, Batch Count = 300, Batch Size = 25, Epoch Count = 30
---------------------------------------------------------------------------------------
Epoch: 1, Training loss: 8.273405, Validation loss: 5.451798
Epoch: 2, Training loss: 4.562329, Validation loss: 4.808934
Epoch: 3, Training loss: 4.096467, Validation loss: 4.188987
Epoch: 4, Training loss: 3.801979, Validation loss: 4.030648
Epoch: 5, Training loss: 3.548766, Validation loss: 3.991595
Epoch: 6, Training loss: 3.413799, Validation loss: 3.591760
Epoch: 7, Training loss: 3.191557, Validation loss: 3.531632
Epoch: 8, Training loss: 3.097927, Validation loss: 3.479942
Epoch: 9, Training loss: 3.015697, Validation loss: 3.322553
Epoch: 10, Training loss: 2.916432, Validation loss: 3.236511
Epoch: 11, Training loss: 2.844546, Validation loss: 3.243617
Epoch: 12, Training loss: 2.729842, Validation loss: 3.057810
Epoch: 13, Training loss: 2.627811, Validation loss: 2.898030
Epoch: 14, Training loss: 

Graph that shows the training and validation losses over each epoch.

In [None]:
# if you want to plot the lines by themself
# plot_graph(title = 'Training Loss Across Epochs', x_axis='Epoch', y_axis = 'Loss', data = train_losses, data_label = 'Training Loss')
# plot_graph(title = 'Validation Loss Across Epochs', x_axis='Epoch', y_axis = 'Loss', data = validation_losses, data_label = 'Training Loss')

# passing deep copy because for some reason it clears the array when we pass it in
val_losses = copy.deepcopy(validation_losses)
plot_graph(mode = 2, title = 'Training Loss and Validation Loss Across 30 Epochs', x_axis='Epoch', y_axis = 'Loss',
           one_data = train_losses, one_label = 'Training Loss', two_data = val_losses, two_label = 'Validation Loss')

Evaluating the test set and displaying the loss after the final epoch.

In [None]:
print("TEST SET EVALUATION")
print("-------------------------")
test_loss = 0
batch_no = 0
# test_losses = []
#Evaluate the model
# kim_model.eval()
with torch.no_grad():
  for data_batch, label_batch in zip(x_test, y_test):
    batch_no += 1
    for data, labels in zip(data_batch, label_batch):
      data, labels = Variable(data, requires_grad = False), Variable(labels, requires_grad = False) 
      pred = kim_model(data[None, ...].float())
      # print("pred: ", pred)
      # print("label: ", labels, '\n')
      loss = criterion(pred, labels[None, ...].float())
      test_loss += loss.item()
  test_loss /= batch_no
  # test_losses.append(val_loss)
  print("After Epoch: %d, Test loss: %.6f" % (num_epochs, test_loss))

TEST SET EVALUATION
-------------------------
After Epoch: 30, Test loss: 1.862452


Evaluating the model accuracy based off of the test set.

In [None]:
#Testing the accuracy of the model
kim_model.eval()
roc_auc_preds = None
roc_auc_labels = None
labels_arr = []
preds = []
with torch.no_grad():
  correct = 0
  total = 0
  for data_batch, label_batch in zip(x_test, y_test):
    batch_no += 1
    for data, labels in zip(data_batch, label_batch):
      data, labels = Variable(data, requires_grad = False), Variable(labels, requires_grad = False) 
      output = kim_model(data[None, ...].float())
      _, predicted = torch.max(output.data, 1)

      #adding to array that is used for roc auc later
      preds.extend(output.cpu().numpy().tolist())
      roc_auc_preds = np.array(preds)

      total += labels.size(0)
      correct += (predicted == labels).sum().item()    
   
  print('Test Accuracy of the model: {} %'.format(100 * correct / total))

Test Accuracy of the model: 96.7 %


Performing a ROC AUC Score to see average accuracy per target label.

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_labels = df[9000:10500][target_columns].values

print("ROC-AUC Scores")

auc_scores = roc_auc_score(roc_auc_labels, roc_auc_preds, average=None)
df_scores = pd.DataFrame({"label": target_columns, "auc": auc_scores})
df_scores.sort_values('auc')[::-1]

ROC-AUC Scores


Unnamed: 0,label,auc
4,insult,0.973251
2,obscene,0.970923
5,identity_hate,0.960268
0,toxic,0.957884
3,threat,0.953939
1,severe_toxic,0.939353
