In [53]:
# Importing some important liberaries for the model deployment

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import csv
import os
import sys

In [100]:
# Download the dataset from kaggle and the word vector "Glove" from the official site for the data preprocessing and Embedding

# Here I am mentioning some basic configuration :

MAX_SEQUENCE_LENGTH = 100   # max length of the comments allowed since it is social media comments
MAX_VOCAB_SIZE = 20000      # Max unique words list since its proven that an average native english speaker knows around 20000 words
EMBEDDING_DIM = 100         # size of the each word vector
VALIDATION_SPLIT = 0.2      # 20% data is for validation
BATCH_SIZE = 20           # Data will be snet in batches of 128
EPOCHS = 20                 # and model will go through 10 epochs.
                            # NOTE!! This is a hyperparameter and can be given any number and we can use early stop as well on specific metrics


In [55]:
# Load the pre-trained word vector
# Here we are basically loading a text file and creating a dictionary with key value pair
print("Loading the word vectors........")
word2vec = {}
with open (os.path.join('/content/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
  for line in f:
    values=line.split()
    word=values[0]
    vec = np.asarray(values[1:],dtype= 'float32')
    word2vec[word] = vec

print(' Found %s word vectors' % len(word2vec))

Loading the word vectors........
 Found 400000 word vectors


In [105]:
# prepare the text sample as their values
data = pd.read_csv("/content/you_tube_data.csv")

In [74]:
type(data)

In [78]:
sentences = data['Text'].fillna("DUMMY_VALUE").values # Filling in the Comment column with "DUMMY_VALUE" if there is NAN
possible_lables = ['IsToxic','IsAbusive','IsThreat','IsProvocative','IsObscene','IsHatespeech','IsRacist','IsNationalist','IsSexist','IsHomophobic','IsReligiousHate','IsRadicalism']
targets = data[possible_lables].values  #

In [83]:
targets=np.asarray(targets).astype(np.float32)
type(targets)

numpy.ndarray

In [84]:
# printing the comment with the maximum words
for s in sentences:
  if len(s) == max(len(s) for s in sentences):
    print(f"length is {len(s)} and the sentence is below: \n " , s)


length is 4421 and the sentence is below: 
  THE UGLY TRUTH:

If you talk like a THUG, act like a THUG, walk like a THUG, and live like a THUG... Then you will die like a THUG!

"Live by the sword die by the sword." - "Don't hate the player, hate the game!"

This was not an innocent little child just walking down the street who got shot in the back. If we cannot blame the parents then we must look at the individual who in this case was an adult. 

What kind of music did he listen to?
What kind of videos did he view?
Who were his role models?
How did his environment affect him?
Who were the enablers?
Who did he associate with?
But most importantly, when will he be accountable for his own actions? Unfortunately, on this day, he chose the wrong pathway to his fate. 

You cannot rob a store and call it shoplifting. Heck, you might as well call it borrowing without permission. 

This was not his first crime and there were probably many unrecorded (not caught) negative actions by Mike Brown 

In [106]:
# Here the Tokenizer will handle 2 things, It will makes the tokens out of documents and converts those tokens into integers

tokenizer=Tokenizer(num_words=MAX_VOCAB_SIZE) # initializing our tokenizer
tokenizer.fit_on_texts(sentences)
sequences=tokenizer.texts_to_sequences(sentences)
sequences

[[26,
  82,
  16,
  45,
  32,
  120,
  4,
  787,
  101,
  3,
  20,
  108,
  11,
  163,
  42,
  47,
  60,
  13,
  690,
  42,
  223,
  788,
  1,
  253,
  16,
  9,
  7,
  239,
  21,
  2,
  2152,
  353,
  145,
  11,
  789,
  3,
  120,
  434,
  145,
  39,
  137,
  164,
  306,
  68,
  2153,
  5,
  476,
  1445,
  3,
  269,
  935,
  354,
  3,
  1446,
  21,
  1,
  84,
  9,
  11,
  69,
  6,
  2154,
  2155,
  3,
  2156,
  43,
  3,
  399,
  2,
  23,
  537,
  3,
  56,
  19,
  270,
  537,
  19,
  32,
  2157,
  61,
  3,
  61,
  21,
  19,
  14,
  210,
  50,
  2,
  22,
  4,
  1127,
  936,
  21,
  1447,
  2,
  1,
  790,
  17,
  1448,
  937,
  1,
  332,
  93,
  3,
  938,
  307,
  108,
  435,
  50,
  2,
  23,
  4,
  1128,
  21,
  65,
  1129,
  19,
  3,
  100,
  7,
  2158,
  7,
  296,
  96,
  9,
  229,
  108,
  11,
  42,
  400,
  3,
  2,
  2159,
  1,
  538,
  369,
  27,
  52,
  137,
  192,
  21,
  41,
  1,
  121,
  477,
  56,
  70,
  28,
  436,
  691,
  54,
  2160,
  21,
  10,
  308,
  202,
  45,
  32,
  1

In [86]:
# To know which integers corresponds to which word we are using word_index method

word2indx=tokenizer.word_index

In [87]:
# Pad the sequence so that we get N x T Matrix since keras works with constant size arrays

pad_data=pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)
print(f"shape of the pad_data tensor : {pad_data.shape}") # where N is the number of sequences and T is maxseq length post padding


shape of the pad_data tensor : (1000, 100)


In [88]:
# Prepare Embedding Matrix
print("Filling pre-trained embedding........")
new_words=min(MAX_VOCAB_SIZE,len(word2indx)+1) # Computing the actual number of word in our neural network vocab
embedding_matrix=np.zeros((new_words, EMBEDDING_DIM)) # Creating a zero matrix with the size of our new vectors and predefined matrix dimension
for word,i in word2indx.items():
  if i < MAX_VOCAB_SIZE:
    embedded_vec=word2vec.get(word)
    if embedded_vec is not None:
      embedding_matrix[i]=embedded_vec


embedding_matrix

Filling pre-trained embedding........


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.18970001,  0.050024  ,  0.19084001, ..., -0.39804   ,
         0.47646999, -0.15983   ],
       ...,
       [-0.29462999, -0.089243  , -0.19471   , ..., -0.28557   ,
        -0.74519002,  0.047697  ],
       [-0.1075    ,  0.53479999,  1.12619996, ..., -0.51876003,
        -0.22879   ,  0.48199001],
       [-0.29655001, -0.46729001,  0.97241002, ..., -0.24033   ,
        -0.035678  ,  0.27926001]])

In [89]:
# Load pre-trained word embedding into Embedding layer :

embedding_layer=Embedding(
    new_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable= False
)

In [107]:
# Train a 1D convolution with global maxpooling
input=Input(shape=(MAX_SEQUENCE_LENGTH))
x=embedding_layer(input)
x = Conv1D(20,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x = Conv1D(20,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x = Conv1D(20,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
x=Dense(20,activation='relu')(x)
output=Dense(len(possible_lables),activation='sigmoid')(x)

model=Model(input,output)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)


In [108]:
print("Training the Model.....")
r = model.fit(
    pad_data,
    targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT
)

Training the Model.....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [97]:
p=model.predict(pad_data)
aucs = []
for j in range(6):
  auc=roc_auc_score(targets[:,j],p[:,j])
  aucs.append(auc)

print(np.mean(aucs))

0.7109212860856083
