In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

fullset = pd.read_csv('hatespeech-bruehl-momenzada.csv')


In [6]:
fullset['class'].value_counts()

offensive    19712
hate         17033
nothing       4163
Name: class, dtype: int64

## Cleaning

Unfortunately, this dataset has no emojis.

We do not remove stopwords (such as i, and, myself, etc. which are included in the ntlk stopwords list), since this can distort the context of a text

In [7]:
def clean_series(corpus:pd.Series) -> pd.Series:
    #get rid of further symbols 
    corpus = corpus.replace({'[»„‘’“”…]': ' '}, regex=True)
    
    #get rid of digits
    corpus = corpus.replace({'\w*\d\w*': 'Nummer'}, regex=True)

    # get rid of urls
    corpus = corpus.replace({r"https?://\S+|www\.\S+": ' '}, regex=True)

    #get not identified unicode
    corpus = corpus.replace('[\u0080-\uffff]w{1-3}', " ", regex=True)
    
    #delete /t and /n
    corpus = corpus.replace('/t', " ", regex=True)
    corpus = corpus.replace('/n', " ", regex=True)

    # replaces all stringw which are unicodes (\u2009 \a0x) and also removes bashes
    corpus = corpus.replace({r"[^\x00-\x7F\w{1,3}]+": ' '}, regex=True)
    
    # remove @usernames
    corpus = corpus.replace({r"(#[\d\w\.]+)": ' '}, regex=True)

    return corpus

fullset['text'] = clean_series(fullset['text'])

## Define Labels as numbers

In [8]:
#define labels
lb_make = LabelEncoder()
fullset["label_id"] = lb_make.fit_transform(fullset["class"])

In [9]:
# The maximum number of most frequent words to be used.
MAX_NB_WORDS = 59000
# Max number of words in each row
MAX_SEQUENCE_LENGTH = 150
#feature vector length
HIDDEN_DIM = 200

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(fullset['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 59667 unique tokens.


In [10]:
import pickle as pkl
with open('tokenizer.pkl','wb') as f:
    pkl.dump(tokenizer, f)
    

In [6]:
# define X and Y
X = tokenizer.texts_to_sequences(fullset['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(fullset['class']).values
print('Shape of label tensor:', Y.shape)

# a really messy way to save labels
onehot_labels = pd.get_dummies(fullset['class'])
labels = onehot_labels.drop_duplicates(subset=onehot_labels.columns).reset_index(drop=True)

Shape of data tensor: (40908, 150)
Shape of label tensor: (40908, 3)


In [22]:
with open('offensive_hate.pkl','wb') as f:
    pkl.dump(X, f)

## Define train and test

In [7]:
#split data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)

## Define model and saving path

In [8]:
#save model
from __future__ import absolute_import, division, print_function
import os

checkpoint_path = "model_pretrain/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                  save_weights_only=True,
                                                 verbose=1)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,SpatialDropout1D, LSTM,Conv1D,MaxPooling1D
from tensorflow.keras import layers

def hate_offensive_model():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, HIDDEN_DIM, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model 

## Train model

In [11]:
model = hate_offensive_model()

epochs = 5
batch_size = 128
model.fit(X_train, Y_train,epochs=epochs, batch_size=batch_size,validation_split=0.1, callbacks=[cp_callback])

Epoch 1/5

Epoch 00001: saving model to model_pretrain\cp.ckpt
Epoch 2/5

Epoch 00002: saving model to model_pretrain\cp.ckpt
Epoch 3/5

Epoch 00003: saving model to model_pretrain\cp.ckpt
Epoch 4/5

Epoch 00004: saving model to model_pretrain\cp.ckpt
Epoch 5/5

Epoch 00005: saving model to model_pretrain\cp.ckpt


<keras.callbacks.History at 0x237cd9665e0>