The goal of this kernel is to train a simple transformer from only pytorch to classify hate speech in comments
to understand how the transformer works, I would recommend this video it walks through the Attention paper and explains it well

https://www.youtube.com/watch?v=U0s0f995w14&t=2522s

I will be using  torchtext it's really good for preparing the data, it has a good documentation also 
those are links about an example using torchtext and a tutorial

https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

as this will be a classification task, we will just need the encoder part of the transformer, so I used the transformer encoder layer from pytorch

In [1]:
import random
import os

SEED = 32
random.seed(SEED)

import numpy as np 
import pandas as pd
import spacy

from sklearn.metrics import accuracy_score
from sklearn.metrics import  f1_score

from torch import nn
import torch
from torchtext import data
from torch.nn  import functional as F
import torch.optim as  optim 
import dill as dill

if torch.cuda.is_available():  
  dev = "cuda:0" 
  print("gpu up")
else:  
  dev = "cpu"  
device = torch.device(dev)



gpu up


In [2]:
# add dataset
# https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv


In [4]:
#breaking data into train and test
from sklearn.model_selection import train_test_split

labeled_dataframe = pd.read_csv('/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv')
labeled_dataframe.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
#split 
from pathlib import Path
traindata, test = train_test_split(labeled_dataframe, test_size=0.2)

In [6]:
"""

those are the libraries I use for processing text

"""

import nltk
nltk.download("punkt")

import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

tokenizer = Tokenizer(nlp.vocab)

from nltk import word_tokenize,sent_tokenize
from nltk.stem  import PorterStemmer


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
stops = stopwords.words("english")


def removepunc(my_str): # function to remove punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
snowstem = SnowballStemmer("english")
portstem = PorterStemmer()


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
#traindata = toxic_train_df #pd.read_csv("/kaggle/input/hate-speech-detection/toxic_train.csv")
#test = toxic_test_df #pd.read_csv("/kaggle/input/hate-speech-detection/toxic_test.csv")
traindata.drop("Unnamed: 0",axis=1,inplace=True)
test.drop("Unnamed: 0",axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
"""
this function is the tokenizer we are using, it does basic processing also  like ,
Lowercase the text
removing punctuation, stop words and numbers,
it also removes extra spaces and unwanted characters (I use regex for that)


before using the tokenizer I was testing it on the train dataframe manually  
"""

def myTokenizer(x):
 return  [snowstem.stem(word.text)for word in 
          tokenizer(removepunc(re.sub(r"\s+\s+"," ",re.sub(r"[^A-Za-z0-9()!?@\'\`\"\r+\r+\n+\n+\b+]"," ",x.lower()))).strip()) 
          if (word.text not in stops and not hasNumbers(word.text)) ]

# myTokenizer = data.get_tokenizer("basic_english")


In [9]:
"""
here I'm using the torchtext fields and dataset classes they can ease the work to get
the dataset ready for the pytorch model

the class DataFrameDataset is the easiest way I found to turn a dataframe into a torchtext dataset

this cell will take sometime to finish
"""

TEXT = data.Field(tokenize=myTokenizer,batch_first=True,fix_length=140)
LABEL = data.LabelField(dtype=torch.float ,batch_first=True)


class DataFrameDataset(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('comment_text', text_field), ('toxic', label_field)]
        examples = []
        for i, row in df.iterrows():
            if row['class'] == 2:
                label = 0
            else:
                label = 1
                
            text = row['tweet']
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)
  

torchdataset = DataFrameDataset(traindata, TEXT,LABEL)
torchtest = DataFrameDataset(test, TEXT,LABEL)

In [10]:
train_data, valid_data = torchdataset.split(split_ratio=0.8, random_state = random.seed(SEED))

In [11]:
"""
this cell build the vocab which means it get all the used words and if also ignores any word 
that only appeared less than 3 times
"""
TEXT.build_vocab(train_data,min_freq=3)  
LABEL.build_vocab(train_data)


In [12]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

print(len(TEXT.vocab))


Size of TEXT vocabulary: 4900
Size of LABEL vocabulary: 2
[('bitch', 7336), ('rt', 4843), ('hoe', 2793), ('co', 1901), ('http', 1834), ('like', 1817), ('pussi', 1427), ('fuck', 1411), ('im', 1334), ('nigga', 1274)]
4900


In [13]:
#set batch size
BATCH_SIZE = 128

"""
we are using batches for validation and test set because of memory usage we can't pass the whole set at once 
"""


train_iterator,valid_iterator,test_iterator= data.BucketIterator.splits(
    (train_data,valid_data,torchtest), 
    batch_size = BATCH_SIZE,
    device = device,
    sort =False,
shuffle=False)


In [14]:

"""
one major point here is that I encoded the embeddings in a different way 
I made an embedding layer for the position then I concatenated position embeddings with the word embeddings 
just thought it could be a usefull way to encode the positions 

had to reshape the output of the transformer layer to get the prediction
"""
class TextTransformer(nn.Module):
  def __init__(self):
    super(TextTransformer,self).__init__()
    self.wordEmbeddings = nn.Embedding(len(TEXT.vocab),140)
    self.positionEmbeddings = nn.Embedding(140,20)
    self.transformerLayer = nn.TransformerEncoderLayer(160,8) 
    self.linear1 = nn.Linear(160,  64)
    self.linear2 = nn.Linear(64,  1)
    self.linear3 = nn.Linear(140,  16)
    self.linear4 = nn.Linear(16,  1)
  def forward(self,x):
    positions = (torch.arange(0,140).reshape(1,140) + torch.zeros(x.shape[0],140)).to(device) 
    # broadcasting the tensor of positions 
    sentence = torch.cat((self.wordEmbeddings(x.long()),self.positionEmbeddings(positions.long())),axis=2)
    attended = self.transformerLayer(sentence)
    linear1 = F.relu(self.linear1(attended))
    linear2 = F.relu(self.linear2(linear1))
    linear2 = linear2.view(-1,140) # reshaping the layer as the transformer outputs a 2d tensor (or 3d considering the batch size)
    linear3 = F.relu(self.linear3(linear2))
    out = torch.sigmoid(self.linear4(linear3))
    return out

myTransformer = TextTransformer()
myTransformer.to(device)

    


TextTransformer(
  (wordEmbeddings): Embedding(4900, 140)
  (positionEmbeddings): Embedding(140, 20)
  (transformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=160, out_features=160, bias=True)
    )
    (linear1): Linear(in_features=160, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=160, bias=True)
    (norm1): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear1): Linear(in_features=160, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (linear3): Linear(in_features=140, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=1, bias=True)
)

In [15]:
def calculateMetrics(ypred,ytrue):
  acc  = accuracy_score(ytrue,ypred)
  f1  = f1_score(ytrue,ypred)
  f1_average  = f1_score(ytrue,ypred,average="macro")
  return " f1 score: "+str(round(f1,3))+" f1 average: "+str(round(f1_average,3))+" accuracy: "+str(round(acc,3))
  

In [16]:
"""
using adagrad because it assign bigger updates to less frequently updated weights 
(like words that are not used many times)

"""

optimizer = optim.Adagrad(myTransformer.parameters(),lr = 0.001)

for i in range(100):
  trainpreds = torch.tensor([])
  traintrues = torch.tensor([])
  for  batch in train_iterator:
    X = batch.comment_text
    y = batch.toxic
    myTransformer.zero_grad()
    pred = myTransformer(X).squeeze()
    trainpreds = torch.cat((trainpreds,pred.cpu().detach()))
    traintrues = torch.cat((traintrues,y.cpu().detach()))
    err = F.binary_cross_entropy(pred,y)
    err.backward()
    optimizer.step()
  err = F.binary_cross_entropy(trainpreds,traintrues)
  print("train BCE loss: ",err.item(),calculateMetrics(torch.round(trainpreds).numpy(),traintrues.numpy()))
 

  valpreds = torch.tensor([])
  valtrues = torch.tensor([])
  for batch in valid_iterator:
    X = batch.comment_text
    y = batch.toxic
    valtrues = torch.cat((valtrues,y.cpu().detach()))
    pred = myTransformer(X).squeeze().cpu().detach()
    # print(valtrues.shape)
    valpreds = torch.cat((valpreds,pred))
  err = F.binary_cross_entropy(valpreds,valtrues)
  print("validation BCE loss: ",err.item(),calculateMetrics(torch.round(valpreds).numpy(),valtrues.numpy()))
  

train BCE loss:  0.6324504017829895  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.6270715594291687  f1 score: 0.0 f1 average: 0.455 accuracy: 0.836
train BCE loss:  0.6260674595832825  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.6223059892654419  f1 score: 0.0 f1 average: 0.455 accuracy: 0.836
train BCE loss:  0.6219716668128967  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.618550181388855  f1 score: 0.0 f1 average: 0.455 accuracy: 0.836
train BCE loss:  0.6185860633850098  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.6153443455696106  f1 score: 0.0 f1 average: 0.455 accuracy: 0.836
train BCE loss:  0.6156609654426575  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.6125164031982422  f1 score: 0.0 f1 average: 0.455 accuracy: 0.836
train BCE loss:  0.6130160093307495  f1 score: 0.0 f1 average: 0.453 accuracy: 0.83
validation BCE loss:  0.6099120378494263  f1 sc

def saveVocab(vocab, file):
    path = Path(file).parent.absolute().mkdir(parents=True, exist_ok=True)
    print("Saving vocab file " + vocab.__str__() + " --> " + file)
    torch.save(vocab, file)so the final scores on validation are  

validation BCE loss:  0.137 f1 score: 0.706 f1 average: 0.84 accuracy: 0.952

In [17]:
"""
now getting the results on the test set
"""

testpreds = torch.tensor([])
testtrues = torch.tensor([])
for batch in test_iterator:
    X = batch.comment_text
    y = batch.toxic
    testtrues = torch.cat((testtrues,y.cpu().detach()))
    pred = myTransformer(X).squeeze().cpu().detach()
    testpreds = torch.cat((testpreds,pred))
err = F.binary_cross_entropy(testpreds,testtrues)
print("test BCE loss: ",err.item(),calculateMetrics(torch.round(testpreds).numpy(),testtrues.numpy()))
  

test BCE loss:  0.3518851399421692  f1 score: 0.579 f1 average: 0.755 accuracy: 0.881


In [18]:
test["predicted"] = torch.round(testpreds).numpy()


"""
this shows that the model understands the language well 

"""

test[test.predicted==1].iloc[32:37]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,predicted
16729,3,0,3,0,1,RT @O3_Millz: Black pussy remind me of roast b...,1.0
12862,3,0,1,2,2,Mississippi state fans are the best to just mo...,1.0
10474,3,1,0,2,2,I hate that wee boy wae the squinty eyes in th...,1.0
11467,3,1,2,0,1,Idgaf if it's 3pm in the afternoon bitch make ...,1.0
23295,3,0,2,1,1,You drink light beer? Hahahahahahahahahahaha w...,1.0


In [19]:
x = TEXT.process(["!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."])
x = x.to(device)
pred = myTransformer(x).squeeze().cpu().detach()
print(torch.round(pred))

x = TEXT.process(["!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;"])
x = x.to(device)
pred = myTransformer(x).squeeze().cpu().detach()
print(torch.round(pred))

tensor(1.)
tensor(1.)


In [20]:
def saveVocab(vocab, file):
    path = Path(file).parent.absolute().mkdir(parents=True, exist_ok=True)
    print("Saving vocab file " + vocab.__str__() + " --> " + file)
    torch.save(vocab, file, pickle_module=dill)
    
# def saveVocab(field, file):
#     path = Path(file).parent.absolute().mkdir(parents=True, exist_ok=True)
#     with open(file, 'w+', encoding='utf-8') as f:     
#         for token, index in field.vocab.stoi.items():
#             print (token, len(token),index)
#             f.write(f'{index}\t{token}\n')

In [21]:
def loadVocab(file):
    if os.path.isfile(file):
        print("Reading vocab file: " + file)
        return torch.load(file, pickle_module=dill)
    else:
        print("Error reading file: " + filee + ". file do not exist.")

# def loadVocab(file):
#     if os.path.isfile(file):
#         vocab_dict = dict()
#         field = data.Field(tokenize=myTokenizer,batch_first=True,fix_length=140)
        
#         with open(file, 'r', encoding='utf-8') as f:
#             for line in f:
#                 print(line)
#                 index, token = line.split('\t')
#                 vocab_dict[token] = int(index)
#         field.vocab = vocab_dict
#         return field
#     else:
#         print("Error reading file: " + file + ". file do not exist.")

In [22]:
def saveModel(model, file):
    path = Path(file).parent.absolute().mkdir(parents=True, exist_ok=True)
    print("Saving vocab file " + model.__str__() + " --> " + file)
    torch.save(model.state_dict(), file, pickle_module=dill)

In [23]:
def loadModel(file):
    if os.path.isfile(file):
        print("Reading model file: " + file)
        model = TextTransformer()
        model.load_state_dict(torch.load(file, pickle_module=dill))
        return model
    else:
        print("Error reading file: " + file + ". file do not exist.")

In [24]:
saveVocab(TEXT, "/kaggle/working/vocab/TEXT_obj.pth")

Saving vocab file <torchtext.data.field.Field object at 0x7fb0334472d0> --> /kaggle/working/vocab/TEXT_obj.pth


In [25]:
NEW_TEXT = loadVocab("/kaggle/working/vocab/TEXT_obj.pth")

Reading vocab file: /kaggle/working/vocab/TEXT_obj.pth


In [26]:
saveModel(myTransformer, "/kaggle/working/model/textTransformer_states.pth")

Saving vocab file TextTransformer(
  (wordEmbeddings): Embedding(4900, 140)
  (positionEmbeddings): Embedding(140, 20)
  (transformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=160, out_features=160, bias=True)
    )
    (linear1): Linear(in_features=160, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=160, bias=True)
    (norm1): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear1): Linear(in_features=160, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (linear3): Linear(in_features=140, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=1, bias=True)
) --> /kaggle/working/model/textTransformer_states.

In [27]:
newTransformer = loadModel("/kaggle/working/model/textTransformer_states.pth")

Reading model file: /kaggle/working/model/textTransformer_states.pth


In [28]:
def inference(model, vocab, inString, device):
    model.eval() #switching to evaluation mode.
    model.to(device)
    x = TEXT.process([inString])
    x = x.to(device)
    pred = myTransformer(x).squeeze().cpu().detach()
    return torch.round(pred).numpy()
    

In [29]:
print(inference(newTransformer, NEW_TEXT, "!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...", device))
print(inference(newTransformer, NEW_TEXT, "!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;", device))

0.0
1.0
