# This program builds and trains a Transformer model for detecting hate speech in Twitter posts. 

This model was created using datasets we were able to find online at Kaggle. Unfortunately, those datasets are too small to train an entire transformer from scratch. Instead, this model uses transfer learning: a pretrained BERT model followed by trainable transformer and dense layers.

CITATIONS

BERT Developed by:
- Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova: "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", 2018. [https://arxiv.org/abs/1810.04805]

BERT model available from TensorFlow Hub at *https://tfhub.dev/google/collections/bert/1*

Datasets:
- Kaggle dataset uploaded by "Rahul Agarwal" at *https://www.kaggle.com/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv*

- Our ML Team's custom hand-labelled dataset using the Twitter API


Imports

In [None]:
import os
import numpy as np
import pandas as pd
import re

# Tensorflow
import tensorflow as tf
!pip install -q tensorflow-text
!pip install --upgrade tensorflow_hub
import tensorflow_text as tf_text
import tensorflow_hub as hub

# visualizations
import matplotlib.pyplot as plt
import matplotlib as mpl



In [None]:
# Mount Google Drive (Colab only) and make the project folder the working
# directory, so the relative dataset and checkpoint paths used below resolve there.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab_Notebooks/models/ReRight')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Parameters

In [None]:
# Location (relative to the working directory) that model weights are loaded
# from and that the ModelCheckpoint callback writes to.
CHECKPOINT_PATH = 'HateSpeechDetector/checkpoint'
os.getcwd()  # displayed to confirm the working directory

'/content/drive/MyDrive/Colab_Notebooks/models/ReRight'

## Dataset

In [None]:
# Load the labelled tweets; the rest of the notebook uses the 'text' and 'label' columns.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look like
# index columns written out by earlier to_csv calls - confirm, and drop them if unused.
df_tweets = pd.read_csv('datasets/hatespeech.csv')
df_tweets.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,It should be legal for a husband to rape his wife,1
1,1,dalits are lowlife scum,1
2,2,dalits are lowlives,1
3,3,It was a better world when women dare not ques...,1
4,4,I don't work this hard so that those immigrant...,1


Clean up text

In [None]:
def standardize_df(dataframe):
    """Return a copy of `dataframe` with a normalised 'text' column.

    Each text is lowercased and every character that is not an ASCII letter
    or digit is replaced by a single space. Runs of spaces are not collapsed,
    so "a, b" becomes "a  b". The input frame is left untouched.

    Missing texts (NaN / None) become empty strings instead of raising
    ``AttributeError``; other non-string values are converted with ``str``.

    Args:
        dataframe: pandas DataFrame with a 'text' column.

    Returns:
        A new DataFrame with the same rows and columns, 'text' cleaned.
    """
    df = dataframe.copy()

    # compiled once instead of once per row
    non_alnum = re.compile('([^a-zA-Z0-9])')

    def clean(value):
        # Empty CSV cells arrive as float NaN, which has no .lower()
        if pd.isna(value):
            return ''
        # lowercase, then blank out irregular chars
        return non_alnum.sub(' ', str(value).lower())

    df['text'] = df['text'].map(clean)

    return df

In [None]:
# Replace the raw frame with its cleaned version (standardize_df works on a copy).
df_tweets = standardize_df(df_tweets)

In [None]:
# Disabled code: the helpers below are wrapped in a string literal, so they are
# never defined or executed. Kept for reference only.
"""
# tools for splitting by sentence length
def split_on_wordnum(sentence, wordnum=5):
    # split
    sentence = sentence.split()
    first_half = sentence[:wordnum]
    second_half = sentence[wordnum:]
    
    # rejoin
    first_half = ' '.join(first_half)
    second_half = ' '.join(second_half)
    return first_half, second_half

def split_df_sentences(df, wordnum=5):
    split_array = df['text'].map(lambda x: split_on_wordnum(x, wordnum)).to_list()
    sentence_halves_df = pd.DataFrame.from_records(split_array, columns=['first_half', 'second_half'])
    df2 = pd.concat([df,  sentence_halves_df], axis='columns' )
    return df2
"""

def min_word_count(df, short_limit=50, long_limit=100):
    """Split `df` into subsets by the word count of its 'text' column.

    Words are whitespace-separated tokens (``str.split()``).

    Args:
        df: pandas DataFrame with a string 'text' column.
        short_limit: rows with fewer words than this go in the first subset,
            rows with at least this many go in the second. Defaults to 50.
        long_limit: rows with at least this many words go in the third
            subset. Defaults to 100.

    Returns:
        Tuple ``(under_short, at_least_short, at_least_long)`` of DataFrames.
        The subsets overlap: with the defaults, the third is contained in
        the second rather than being a separate bucket.
    """
    # Count once and reuse the result for all three masks.
    word_counts = df['text'].map(lambda x: len(x.split()))

    df1 = df[word_counts < short_limit]
    df2 = df[word_counts >= short_limit]
    df3 = df[word_counts >= long_limit]

    return df1, df2, df3

In [None]:
# Length-based subsets of the full cleaned dataset.
# NOTE(review): df1/df2/df3 are reassigned in the "TF Dataset conversion" cell
# before anything reads them, so this call looks redundant - confirm.
df1, df2, df3 = min_word_count(df_tweets)

Test / train split

In [None]:
# Shuffle once with a fixed seed, then carve off 10% test / 10% validation / 80% train.
# The seed matters beyond reproducibility: this notebook resumes from saved weights,
# so a different shuffle each session would move rows the model has already been
# trained on into the validation/test sets.
SPLIT_SEED = 42
split = int(.1*len(df_tweets))
# sort_index() first so that re-running this cell on the already-shuffled frame
# produces the same split again instead of a permutation of a permutation.
df_tweets = df_tweets.sort_index().sample(frac=1, random_state=SPLIT_SEED)

df_test = df_tweets[:split]
df_valid = df_tweets[split: 2*split]
df_train = df_tweets[2*split:]

Class weights

In [None]:
# compute class weightings 
# (used balance data during training)
# Per-class loss weights used to balance the data during training:
#   weight_c = 1 / (n_classes * proportion_c)
# so a perfectly balanced training set gives every class a weight of 1.0.
classes = df_train['label'].unique()
num_samples = len(df_train)
num_classes = len(classes)  # derived from the data instead of assuming 2

CLASS_WEIGHTS = {}
print('Class Balances:' )
for i in classes:
    prop = float((df_train['label'] == i).sum()) / num_samples
    # Plain int keys and float values: the class_weight argument of fit() maps
    # integer class indices to floats, so no tensors / numpy scalars are needed.
    CLASS_WEIGHTS[int(i)] = 1. / (num_classes * prop)
    print(f'Class {i}: {100*prop:.1f}%, weighting = {CLASS_WEIGHTS[int(i)]:.3f}')

Class Balances:
Class 1: 33.4%, weighting = 1.497
Class 0: 66.6%, weighting = 0.751


TF Dataset conversion

In [None]:
def convert_to_dataset(df):
    """Turn a DataFrame into a tf.data.Dataset of (text, label) pairs.

    Each column is sliced as a one-column frame (double brackets), so every
    element of the resulting dataset is a pair of length-1 tensors.

    Args:
        df: pandas DataFrame with 'text' and 'label' columns.

    Returns:
        A zipped tf.data.Dataset yielding ``(text, label)``.
    """
    features, labels = (
        tf.data.Dataset.from_tensor_slices(df[[column]])
        for column in ('text', 'label')
    )
    return tf.data.Dataset.zip((features, labels))

# train / validation / test datasets from the corresponding splits
dataset_train, dataset_valid, dataset_test = map(
    convert_to_dataset, (df_train, df_valid, df_test))

# evaluation sets bucketed by word count: first over every row ...
ds_under50_all, ds_over50_all, ds_over100_all = map(
    convert_to_dataset, min_word_count(df_tweets))

# ... then over the first 2*split rows only, i.e. the test + validation rows
df1, df2, df3 = min_word_count(df_tweets[:2*split])
ds_under50_validation, ds_over50_validation, ds_over100_validation = map(
    convert_to_dataset, (df1, df2, df3))

In [None]:
# Number of held-out (test + validation) texts with at least 50 words.
len(ds_over50_validation)


579

Define Model

In [None]:
def hatespeech_detector(num_heads=1, key_dim=128):
    """Assemble the hate-speech classifier as a Keras functional model.

    Raw strings are run through the TF Hub BERT preprocessor, encoded by a
    non-trainable BERT encoder, passed through one self-attention layer over
    the encoder's sequence output, flattened, and mapped to a single sigmoid
    unit.

    Args:
        num_heads: `num_heads` for the MultiHeadAttention layer.
        key_dim: `key_dim` for the MultiHeadAttention layer.

    Returns:
        A tf.keras.Model named 'hatespeech_detector' that takes a batch of
        strings and returns one value in [0, 1] per string.
    """
    # Layers are instantiated in the same order as before (preprocessor, encoder,
    # attention) so any auto-generated layer names stay unchanged.
    bert_preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
                                  trainable=False)
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

    # scalar string input: one raw sentence per example
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string)

    # frozen transfer model: per-token embeddings
    token_embeddings = bert_encoder(bert_preprocessor(raw_text))["sequence_output"]

    # trainable head: self-attention -> flatten -> sigmoid
    attended = self_attention(query=token_embeddings,
                              value=token_embeddings,
                              key=token_embeddings)
    flattened = tf.keras.layers.Reshape([-1])(attended)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(flattened)

    return tf.keras.Model([raw_text], [probability], name='hatespeech_detector')

Build

In [None]:
# Instantiate the model with a single attention head of key dimension 128.
hatespeech_detector_model = hatespeech_detector(num_heads=1, key_dim=128)

In [None]:
# Print the architecture, then push one sentence through the model as a smoke test.
hatespeech_detector_model.summary()
print()
hatespeech_detector_model(tf.constant(['test sentence']))

Model: "hatespeech_detector"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_word_ids': ( 0           input_1[0][0]                    
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'encoder_outputs':  109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
________________________________________________________________________________

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.5028259]], dtype=float32)>

Load weights

In [None]:
# Restore previously saved weights from CHECKPOINT_PATH before any further training.
# NOTE(review): presumably fails if nothing has been saved at this path yet (first
# run) - confirm, and skip this cell when training from scratch.
hatespeech_detector_model.load_weights(CHECKPOINT_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fdeefda4bd0>

Compile

In [None]:
# Metrics tracked during training: accuracy plus area under the ROC and
# precision-recall curves.
training_metrics = [
    'binary_accuracy',
    tf.keras.metrics.AUC(num_thresholds=200, curve='ROC', name='ROC'),
    tf.keras.metrics.AUC(num_thresholds=200, curve='PR', name='Precision-Recall'),
]

hatespeech_detector_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.0001),
    # can experiment with label smoothing values
    loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0),
    metrics=training_metrics,
    steps_per_execution=4,
)
                                  

Callbacks

In [None]:
# Stop once val_loss has failed to improve for 8 consecutive epochs, and roll the
# model back to the best epoch's weights; otherwise it would be left with the
# weights from 8 epochs past its best.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=8,
                                                  restore_best_weights=True)

# checkpoint to save progress during training; only overwritten when val_loss
# improves, so a later, worse epoch cannot replace a better saved model
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                                      monitor='val_loss',
                                                      save_best_only=True)

Train

In [None]:
EPOCHS = 1
STEPS_PER_EPOCH = 10
BATCH_SIZE = 512
VALIDATION_STEPS = 5

# Shuffle individual examples *before* batching. Shuffling after batch() only
# reorders whole batches, so each batch would always hold the same examples.
# repeat() keeps the stream from running dry: with steps_per_epoch set, fit()
# keeps drawing from the same iterator across epochs.
train_batches = (dataset_train
                 .shuffle(len(df_train), reshuffle_each_iteration=True)
                 .batch(BATCH_SIZE, drop_remainder=True)
                 .repeat())

# Validation is deliberately not shuffled: with validation_steps set, every epoch
# is then scored on the same batches, which keeps val_loss comparable across
# epochs for early stopping and checkpointing.
valid_batches = dataset_valid.batch(BATCH_SIZE, drop_remainder=True)

hist = hatespeech_detector_model.fit(x=train_batches,
                                     validation_data=valid_batches,
                                     epochs=EPOCHS,
                                     steps_per_epoch=STEPS_PER_EPOCH,
                                     class_weight=CLASS_WEIGHTS,
                                     callbacks=[early_stopping, model_checkpoint],
                                     validation_steps=VALIDATION_STEPS
                                     )

"""
hatespeech_detector_model.saved_history = hist
"""

Metrics

In [None]:
""" 
This code is from the TF tutorial at 
https://www.tensorflow.org/tutorials/structured_data/imbalanced_data, 
with only minor modifications 
"""

# Default figure size for plots created after this point, and the matplotlib
# colour cycle that plot_metrics draws its line colour from.
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

def plot_metrics(history):
  """Plot the training curve (and validation curve, if recorded) of each metric.

  One subplot is drawn per training metric found in ``history.history``, laid
  out two per row. Training metrics are the keys without a 'val_' prefix; the
  matching 'val_<metric>' series is added as a dashed line when it exists.

  Args:
    history: object with ``epoch`` (list of epoch indices) and ``history``
      (dict of metric name -> list of per-epoch values), as returned by
      ``Model.fit``.
  """
  # Derive the metric list from the history itself rather than slicing a fixed
  # number of keys, so it stays correct when metrics are added or removed.
  metrics = [key for key in history.history if not key.startswith('val_')]
  if not metrics:
    return

  n_cols = 2
  n_rows = (len(metrics) + n_cols - 1) // n_cols  # ceiling division
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(n_rows, n_cols, n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    val_key = 'val_'+metric
    # Validation series is absent when fit() ran without validation data.
    if val_key in history.history:
      plt.plot(history.epoch, history.history[val_key],
               color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)

    plt.legend()

In [None]:
# Plot the curves recorded by the fit() call above.
plot_metrics(history=hist)

Evaluation experiments

In [None]:
# Held-out (test + validation) texts with fewer than 50 words.
hatespeech_detector_model.evaluate(ds_under50_validation.batch(128))

In [None]:
# Held-out (test + validation) texts with at least 50 words.
hatespeech_detector_model.evaluate(ds_over50_validation.batch(128))

In [None]:
# Held-out (test + validation) texts with at least 100 words (a subset of the 50+ group).
hatespeech_detector_model.evaluate(ds_over100_validation.batch(128))