In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# loading the datasets
youtube_, davidson = pd.read_csv('final.csv'), pd.read_csv('modified_davidson.csv')
print(youtube.head(), davidson.head())
# in case of davidson and youtube, [a, b, c] is abusive, hate, clean

NameError: name 'youtube' is not defined

In [16]:
# creating the text dataset for training
youtube = youtube_[['text', 'label']]
print(youtube.groupby('label').count(), davidson.groupby('label').count())

           text
label          
[0, 0, 1]  5298
[0, 1, 0]   244
[1, 0, 0]   456
[1, 1, 0]   824             text
label           
[0, 0, 1]   4162
[0, 1, 0]   1430
[1, 0, 0]  19189


As seen, there are 4 categories, viz. Abusive, Hate, abusive + hate, and clean.

| Category | Samples |
| :---:    | :---: |
| Abusive |  19189 + 456 | 
| Hate | 1430 + 244  |
| Abusive + Hate | 0 + 824 |
| Clean | 4162 + 5298 |

In [46]:
SEED_PERCENT = 0.2
test_sample = youtube.groupby('label').apply(lambda x: x.sample(frac=SEED_PERCENT))
test_sample.reset_index(drop=True, inplace=True)
dataset = pd.merge(davidson, test_sample, on=['text', 'label'], how='outer')
dataset.to_csv('comb_data.csv', index=False)
dataset.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
"[0, 0, 1]",5222
"[0, 1, 0]",1479
"[1, 0, 0]",19280
"[1, 1, 0]",165


## Training the model

Now that the dataset is prepared, we can proceed to training and compiling the model.

In [32]:
dataset = pd.read_csv('comb_data_cleaned.csv')
dataset.head()

Unnamed: 0,text,label
0,woman shouldnt complain clean hous amp man alw...,"[0, 0, 1]"
1,boy dat coldtyga dwn bad cuffin dat hoe st place,"[1, 0, 0]"
2,dawgyou ever fuck bitch start cri confus shit,"[1, 0, 0]"
3,look like tranni,"[1, 0, 0]"
4,shit hear might true might faker bitch told ya,"[1, 0, 0]"


In [44]:
import numpy as np
embedding_ = np.load('embeddings_latest.npy')
embedding_.shape

(19210, 256)

In [45]:
from keras.models import Sequential, load_model, save_model
from keras.layers import Embedding, Dropout, Bidirectional, LSTM, Dense
INPUT_SIZE = 512
def get_model(embedding, vocab_size=INPUT_SIZE):
    model = Sequential()
    model.add(Embedding(*embedding.shape, weights=[embedding], input_length=vocab_size, name='Embedding_Layer'))
    model.add(Dropout(0.2, name="Dropout"))
    model.add(Bidirectional(LSTM(vocab_size, dropout=0.1, recurrent_dropout=0.25, name="Bi-LSTM_Layer_1")))

    model.add(Dense(64, activation='relu', name="Dense_64"))
    model.add(Dense(32, activation='relu', name="Dense_32"))
    model.add(Dense(3, activation='sigmoid',name='Dense_3'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy', 'accuracy'])
    print(model.summary())
    return model


model = get_model(embedding_, INPUT_SIZE)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, 512, 256)          4917760   
_________________________________________________________________
Dropout (Dropout)            (None, 512, 256)          0         
_________________________________________________________________
bidirectional_9 (Bidirection (None, 1024)              3149824   
_________________________________________________________________
Dense_64 (Dense)             (None, 64)                65600     
_________________________________________________________________
Dense_32 (Dense)             (None, 32)                2080      
_________________________________________________________________
Dense_3 (Dense)              (None, 3)                 99        
Total params: 8,135,363
Trainable params: 8,135,363
Non-trainable params: 0
_________________________________________________________________


In [47]:
# model training
import pickle
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from ast import literal_eval
dataset = pd.read_csv('training.csv')
_X, _y = dataset['text'].astype('str'), dataset['label']
tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
X = tokenizer.texts_to_matrix(_X)
y = np.array(list(map(lambda label: literal_eval(label), _y)))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69, test_size=0.0)
X_train.shape, X_test.shape

((4979, 512), (0, 512))

In [None]:
# train the model
from keras_tqdm import TQDMNotebookCallback
import datetime
history = model.fit(X_train, y_train, batch_size=16, epochs=4, validation_split=0.2, verbose=0, callbacks=[TQDMNotebookCallback()])
model_name = f"model_{datetime.datetime.now()}.h5"
model.save(model_name)

HBox(children=(IntProgress(value=0, description='Training', max=4, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=3983, style=ProgressStyle(description_width='in…

In [None]:
fig = plt.figure(figsize=(32, 16))
fig.add_subplot(1, 2, 1)
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.legend(['train', 'validation'], loc='upper left')
plt.title("Training Accuracy")
plt.xlabel("Epoch"), plt.ylabel("Accuracy")
fig.add_subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss Decay')
plt.ylabel('Loss'), plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper right')

## Model Evaluation and testing

Now that we've trained the model, it's time to test it against some real world data.

In [None]:
from keras.utils import plot_model
test_df = pd.read_csv('final.csv')[['text', 'label']]
model = load_model(model_name)  # change this to the corresponding model file
# plot_model(model, to_file='model.png'
print(test_df.head())
print(model.summary())

In [None]:
_X_test, y_test = test_df.text, np.array([literal_eval(_) for _ in test_df.label.values.tolist()])
X_test = tokenizer.texts_to_matrix(_X_test)
X_test.shape

In [None]:
y_hat = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
from keras.utils import to_categorical
y_hat_ = to_categorical([np.argmax(_) for _ in y_hat], num_classes=3).astype('int')
print(classification_report(y_test, y_hat_))