In [None]:
import sys
import numpy as np
import random as rn
import pandas as pd
import os

import codecs
import itertools
import seaborn as sns

import torch
from torch import nn

from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer,BertModel, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
from keras.preprocessing.sequence import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

import io
import matplotlib.pyplot as plt
import csv

from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
import os

from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory holding the pre-trained HateBERT files; read by `from_pretrained`
# for both the tokenizer and the classification model further down.
input_model = "models/hate_bert"
# Directory where the fine-tuned weights, config and vocabulary are written.
output_model = "./models/bert_test1"

In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device.



In [None]:
# Select the compute device: CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the accelerator in use. `get_device_name(0)` raises on a machine
# without CUDA, so show a plain "cpu" label there instead of failing the cell.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
# Seed the RNGs used in this notebook (Python `random`, NumPy, PyTorch CPU and
# the current CUDA device) to make runs more repeatable.
rn.seed(501)
np.random.seed(501)
torch.manual_seed(501)
torch.cuda.manual_seed(501)

# max length of a sentence (in tokens), fed into the network
MAX_LEN = 128

**Load Dataset**

We'll use the SemEval (OffensEval 2019 / OLID) dataset for single-sentence classification. It's a set of tweets labeled as offensive or not offensive. The data is as follows:

Column 1: the id of the sentence

Column 2: the tweet text	

Column 3: Label : Sub-task A - Offensive language identification

Column 4: Label : Sub-task B - Automatic categorization of offense types

Column 5: Label : Sub-task C - Offense target identification.


In [None]:
### Load training data: offenseval 2019

# NOTE(review): absolute, machine-specific path -- point this at your local copy of OLID.
DATASET_PATH = "/home/slavkoz/Datasets/Offensive language datasets/15_OLID/"
df_offenseval = pd.read_csv(DATASET_PATH + 'olid-training-v1.0.tsv', delimiter='\t')
# Tweet text and the sub-task A label column, as NumPy arrays.
X_offenseval = df_offenseval['tweet'].values
y_offenseval = df_offenseval['subtask_a'].values
      

In [None]:
### load test data: offenseval 2019 (for testing)

# Tab-separated level-A test file; only the tweet text column is used here.
df_offenseval_test = pd.read_csv(DATASET_PATH + "OLID-testset-levela.tsv", delimiter = '\t')
X_offenseval_test = df_offenseval_test['tweet'].values

In [None]:
# Gold labels for the level-A test set: a header-less CSV whose second column
# is taken as the label.
ddft_test_label = pd.read_csv(DATASET_PATH + 'OLID-labels-levela.csv', sep=',', header=None)
y_offenseval_test = ddft_test_label.iloc[:, 1].values

In [None]:
# We need to add special tokens at the beginning and end of each sentence for BERT to work properly:
# "[CLS]" is prepended and "[SEP]" appended to every training tweet.
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in X_offenseval]

In [None]:
# Same [CLS] ... [SEP] wrapping as for the training tweets, applied to the test tweets.
sentences_test = ["[CLS] " + sentence + " [SEP]" for sentence in X_offenseval_test]

**Inputs**

Next, import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.

In [None]:
### Tokenizer from HateBERT is used here  ### input_model: directory where tokenizer is present

tokenizer = BertTokenizer.from_pretrained(input_model, do_lower_case=True)

# Split every wrapped training sentence into tokens.
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
# Index 0 is the first sentence (the original printed index 1, i.e. the second one).
print (tokenized_texts[0])

In [None]:
# Tokenize the wrapped test sentences with the same tokenizer as the training data.
tokenized_texts_test = [tokenizer.tokenize(sent) for sent in sentences_test]
print ("Tokenize the first sentence:")
# Index 0 is the first sentence (the original printed index 1, i.e. the second one).
print (tokenized_texts_test[0])

BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:

*   input ids: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary

*   segment mask: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence

*   attention mask: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)

*   labels: a single value of 1 or 0. In our task 1 means "offensive" and 0 means "not offensive"

Although we can have variable-length input sentences, BERT requires our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length.

To "pad" our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it is the maximum sentence length.

If a sentence is longer than the maximum sentence length, then we simply truncate the end of the sequence, discarding anything that does not fit into our maximum sentence length.

We pad and truncate our sequences so that they all become of length MAX_LEN ("post" indicates that we want to pad and truncate at the end of the sequence, as opposed to the beginning). pad_sequences is a utility function that we're borrowing from Keras. It simply handles the truncating and padding of Python lists.

In [None]:
# Use the tokenizer to convert the tokens to their index numbers in the vocabulary
# (one list of ids per training sentence; lengths still vary at this point).
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [None]:
# Convert the test-set tokens to vocabulary ids, mirroring the training data.
input_ids_test = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_test]

In [None]:
# Pad our input tokens: every sequence is padded / truncated at the end ("post") to MAX_LEN ids.
# NOTE: this rebinds `input_ids` from a list of lists to the padded result of pad_sequences.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Pad / truncate the test sequences to MAX_LEN with the same settings as the training data.
input_ids_test = pad_sequences(input_ids_test, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Create attention masks: for every padded training sequence, 1.0 marks a
# real token (id > 0) and 0.0 marks a padding position (id 0).
attention_masks = [[float(token_id > 0) for token_id in padded] for padded in input_ids]

In [None]:
# Attention masks for the test set: 1.0 for every real token (id > 0),
# 0.0 for every padding position (id 0).
attention_masks_test = [[float(token_id > 0) for token_id in padded] for padded in input_ids_test]

In [None]:
def encode_label_bin(y, predicted_label):
    """Binarise a sequence of labels.

    Args:
        y: iterable of labels.
        predicted_label: the label value that should map to 1.

    Returns:
        A list with 1 where the entry equals `predicted_label` and 0 elsewhere.
    """
    encoded = []
    for label in y:
        encoded.append(1 if label == predicted_label else 0)
    return encoded

In [None]:
# Indexing Labels: training labels become 1 for 'OFF' and 0 for anything else.
y = encode_label_bin(y_offenseval, 'OFF')

In [None]:
# Test labels encoded the same way: 1 for 'OFF', 0 for anything else.
y_test = encode_label_bin(y_offenseval_test, 'OFF')

In [None]:
# Use train_test_split to split our data into train and validation sets for training.
# Inputs, attention masks and labels are split in ONE call so all three are
# partitioned with the same indices by construction (the original split the masks
# in a second call, passing `input_ids` as a dummy target and relying on a
# repeated random_state to stay aligned).
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, y, random_state=2018, test_size=0.1)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model.
# Each name is rebound from the split output to a tensor built from it.

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Test-set tensors: token ids, binary labels and attention masks, all built as LongTensor.
# NOTE(review): the train/validation masks are built with torch.tensor from lists of
# floats, whereas the test masks are cast to LongTensor here -- confirm the dtype
# difference is intended.
test_inputs = torch.LongTensor(input_ids_test)
test_labels = torch.LongTensor(y_test)
test_masks = torch.LongTensor(attention_masks_test)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Wrap the tensors in torch DataLoaders so the loops below can iterate over
# mini-batches of `batch_size` examples. (The tensors themselves are already
# fully in memory; the loader only batches and orders them.)

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are served in their stored order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Test loader: sequential order, so predictions line up index-for-index with `y_test`.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

**Train Model**

Now that our input data is properly formatted, it's time to fine tune the BERT model.

For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until the entire model, end-to-end, is well-suited for our task. Thankfully, the huggingface pytorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accommodate their specific NLP task.

We'll load BertForSequenceClassification. This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. As we feed input data, the entire pre-trained BERT model and the additional untrained classification layer is trained on our specific task.

**Structure of Fine-Tuning Model** 

As we showed beforehand, the first token of every sequence is the special classification token ([CLS]). Unlike the hidden state vector corresponding to a normal word token, the hidden state corresponding to this special token is designated by the authors of BERT as an aggregate representation of the whole sentence used for classification tasks. As such, when we feed in an input sentence to our model during training, the output is the length 768 hidden state vector corresponding to this token. The additional layer that we've added on top consists of untrained linear neurons of size [hidden_state, number_of_labels], so [768,2], meaning that the output of BERT plus our classification layer is a vector of two numbers representing the "score" for "offensive/non-offensive" that are then fed into cross-entropy loss.


In [None]:
# GPU memory currently allocated by tensors on `device`, shown in units of 10^6 bytes.
f"{torch.cuda.memory_allocated(device)/1000000}M"

In [None]:
# Memory release
# Drop leftover references (if any) so their memory can be reclaimed, then empty
# PyTorch's CUDA cache and display the allocated memory again.
# NOTE: `y` is intentionally no longer reset here -- in this notebook it holds the
# encoded training labels, and setting it to None broke re-running the
# train/validation split cell.
x, pooled = None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
 # Empty the CUDA cache once more and display the allocated memory
 # (same steps as the cell above, without resetting any variables).
 torch.cuda.empty_cache()
 str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
### The HateBERT model is loaded here  ### input_model: directory where the pre-trained model is present

# BERT encoder plus a classification head with two output labels.
model = BertForSequenceClassification.from_pretrained(input_model,num_labels=2)
# Move the model to the device selected earlier instead of calling `.cuda()`
# unconditionally, so the notebook also runs on a CPU-only machine.
model.to(device)

Now that we have our model loaded, we need to collect its named parameters and group them, so that weight decay can be configured separately for the weights and for the bias / normalization parameters.

For the purposes of fine-tuning, the authors recommend the following hyperparameter ranges:

*   Batch size: 16, 32
*   Learning rate (Adam): 5e-5, 3e-5, 2e-5
*   Number of epochs: 2, 3, 4

In [None]:
# Split the model parameters into two groups: one that receives weight decay
# and one (biases and LayerNorm parameters) that does not.
param_optimizer = list(model.named_parameters())
# LayerNorm parameters are exposed as `LayerNorm.weight` / `LayerNorm.bias` by
# pytorch_pretrained_bert; the legacy TensorFlow-style names `gamma` / `beta`
# are kept so older checkpoints naming them that way are still matched.
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias', 'gamma', 'beta']
# BertAdam reads the per-group key `weight_decay`. With the previous key
# (`weight_decay_rate`) the setting was ignored and the optimizer default was
# applied to every parameter, including the no-decay group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# Build the optimizer: BertAdam over the parameter groups defined above, with
# learning rate 2e-5 and a warmup proportion of 0.1.
# NOTE(review): no `t_total` is passed; in pytorch_pretrained_bert the warmup
# schedule appears to be applied only when `t_total` is set -- confirm whether
# warmup is actually in effect here.
optimizer = BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)

Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a training phase and a validation phase. At each pass we need to:

**Training loop:**

- Tell the model to compute gradients by setting the model in train mode
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Clear out the gradients calculated in the previous pass. In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out
- Forward pass (feed input data through the network)
- Backward pass (backpropagation)
- Tell the network to update parameters with optimizer.step()
- Track variables for monitoring progress

**Evaluation loop:**

- Tell the model not to compute gradients by setting the model in evaluation mode
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Forward pass (feed input data through the network)
- Compute accuracy on our validation data and track variables for monitoring progress

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the gold label.

    Args:
        preds: 2-D NumPy array of per-class scores, one row per example.
        labels: NumPy array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold_classes = labels.flatten()
    matches = predicted_classes == gold_classes
    return np.sum(matches) / len(gold_classes)

In [None]:
# Time-consuming code here: fine-tunes the model for `epochs` passes over the
# training data and evaluates on the validation set after every epoch.

# Per-batch training loss, kept for plotting
train_loss_set = []

# Validation predictions (Y) and gold labels (Z) of the MOST RECENT epoch.
# They are read by the metric cells below, so they are reset at the start of
# every validation pass; accumulating them across epochs would mix the
# predictions of early, under-trained epochs into the reported metrics.
Y = []
Z = []
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---- Training ----

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; with `labels` given, the model call returns the loss
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # ---- Validation ----

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Tracking variables
  eval_accuracy = 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Start this epoch's predictions / labels from scratch
  Y, Z = [], []

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass; without `labels`, the model call returns the logits
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    Y.extend(pred_flat)
    Z.extend(labels_flat)

    # Weight each batch accuracy by its size so a smaller final batch does not
    # count as much as a full one; the total is divided by the example count.
    batch_examples = len(labels_flat)
    eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
    nb_eval_examples += batch_examples
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_examples))

In [None]:
from sklearn.metrics import f1_score

# scikit-learn metrics take (y_true, y_pred): Z holds the gold validation labels
# and Y the model predictions collected in the training cell.
print("F1 micro: %.2f%%" % (f1_score(Z, Y, average='micro')*100))
print("F1 macro: %.2f%%" % (f1_score(Z, Y, average='macro')*100))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# scikit-learn metrics take (y_true, y_pred). Z = gold validation labels,
# Y = model predictions. With the arguments the other way round, precision and
# recall are swapped and the report / confusion matrix come out transposed.
print("Accuracy: %.2f%%" % (accuracy_score(Z, Y)*100))
print("F1: %.2f%%" % (f1_score(Z, Y, average='macro')*100))
print("Precission: %.2f%%" % (precision_score(Z, Y)*100))
print("Recall: %.2f%%" % (recall_score(Z, Y)*100))
print(classification_report(Z, Y))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(Z, Y))

In [None]:
### Testing the model on offenseval test set
# Runs the fine-tuned model over the test loader and collects, per example, the
# predicted class (`hatebert_predicted`) and the logit of class index 0 (`all_logits`).
model.eval()
hatebert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        # Pass the mask by keyword: the second positional parameter of the
        # model's forward is `token_type_ids`, so `model(token_ids, masks)` fed
        # the attention mask in as segment ids and applied no attention mask at
        # all -- unlike the training / validation calls above.
        logits = model(token_ids, token_type_ids=None, attention_mask=masks)
        # (The per-batch cross-entropy loss computed here before was never used
        # and has been removed.)
        numpy_logits = logits.cpu().detach().numpy()
        hatebert_predicted += list(np.argmax(numpy_logits, 1))
        all_logits += list(numpy_logits[:, 0])

In [None]:
# scikit-learn expects (y_true, y_pred): gold test labels first, predictions second.
print("F1 macro: %.2f%%" % (f1_score(y_test, hatebert_predicted, average='macro')*100))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# scikit-learn metrics take (y_true, y_pred): `y_test` holds the gold test labels,
# `hatebert_predicted` the model predictions. With the arguments the other way
# round, precision and recall are swapped and the report / confusion matrix
# come out transposed.
print("Accuracy: %.2f%%" % (accuracy_score(y_test, hatebert_predicted)*100))
print("F1: %.2f%%" % (f1_score(y_test, hatebert_predicted, average='macro')*100))
print("Precission: %.2f%%" % (precision_score(y_test, hatebert_predicted)*100))
print("Recall: %.2f%%" % (recall_score(y_test, hatebert_predicted)*100))
print(classification_report(y_test, hatebert_predicted))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, hatebert_predicted))

In [None]:


# Step 1: Persist the fine-tuned model, its configuration and the vocabulary

# Make sure the output directory exists (parents included; no error if already there)
Path(output_model).mkdir(parents=True, exist_ok=True)

# Using the library's predefined file names lets `from_pretrained` load this directory later
output_model_file = os.path.join(output_model, WEIGHTS_NAME)
output_config_file = os.path.join(output_model, CONFIG_NAME)

# A model wrapped in (Distributed)DataParallel exposes the real model as `.module`;
# save that inner model when the attribute exists, otherwise the model itself
model_to_save = getattr(model, 'module', model)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_model)



In [None]:
# Step 2: Re-load the saved model and vocabulary from the output directory

tokenizer = BertTokenizer.from_pretrained(output_model, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(output_model,num_labels=2)
# Move to the device selected earlier rather than calling `.cuda()` unconditionally,
# so reloading also works on a CPU-only machine.
model.to(device)