In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import math

import string

import re
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

import json
import collections
import operator
import random
import seaborn as sns
from sklearn.model_selection import train_test_split

import tensorflow as tf

import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
train_file = pd.read_csv('clean.csv')

In [3]:
train_file.columns, train_file.shape

(Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate', 'label'],
       dtype='object'),
 (159513, 8))

In [4]:
max_label_length = 6
new_label = []
for each in train_file['label']:
    temp = str(each)
    if len(temp) != max_label_length:
        zeros = max_label_length - len(str(each))
        res = temp.rjust(zeros + len(temp), '0')
        new_label.append(res)
    else:
        new_label.append(temp)    

In [5]:
train_file['label'] = new_label

In [6]:
train_file = train_file.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis = 1)

In [7]:
all_toxic = train_file[(train_file.label != '000000')].reset_index(drop = True)
non_toxic = train_file[(train_file.label == '000000')].reset_index(drop = True)

In [8]:
all_toxic.shape, non_toxic.shape

((16225, 2), (143288, 2))

In [9]:
del train_file

In [10]:
non_toxic_labels = [0] * non_toxic.shape[0]
toxic_labels = [1] * all_toxic.shape[0]

In [11]:
non_toxic['label'] = non_toxic_labels
all_toxic['label'] = toxic_labels

In [12]:
non_toxic.head(4), all_toxic.head(4)

(                                        comment_text  label
 0  explanation edits made username hardcore metal...      0
 1  aww matches background colour seemingly stuck ...      0
 2  hey man really trying edit war guy constantly ...      0
 3  make real suggestions improvement wondered sec...      0,
                                         comment_text  label
 0                        cocksucker piss around work      1
 1  hey talk exclusive group wp talibans good dest...      1
 2            bye look come think comming back tosser      1
 3  gay antisemmitian archangel white tiger meow g...      1)

In [13]:
whole_frame = pd.concat([non_toxic, all_toxic], ignore_index=True)

In [14]:
train, non_train = train_test_split(whole_frame, test_size=0.1, random_state=42, stratify = whole_frame['label'], shuffle = True)
val, test = train_test_split(non_train, test_size=0.1, random_state=42, stratify = non_train['label'], shuffle = True)
train.shape, val.shape, test.shape

((143561, 2), (14356, 2), (1596, 2))

In [15]:
del non_toxic
del all_toxic

x_train = train['comment_text'].reset_index(drop = True)
y_train = train['label'].reset_index(drop = True)

x_val = val['comment_text'].reset_index(drop = True)
y_val = val['label'].reset_index(drop = True)

x_test = test['comment_text'].reset_index(drop = True)
y_test = test['label'].reset_index(drop = True)


del train
del val
del test

#### keras tokenizer helps to form vocab and word index from the given data at the character level, for OOV keep UNK (unknown character)

In [16]:
tk = tf.keras.preprocessing.text.Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(x_train)

In [17]:
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 'i': 4,
 'a': 5,
 't': 6,
 'n': 7,
 's': 8,
 'r': 9,
 'o': 10,
 'l': 11,
 'c': 12,
 'd': 13,
 'p': 14,
 'u': 15,
 'g': 16,
 'm': 17,
 'h': 18,
 'y': 19,
 'k': 20,
 'w': 21,
 'b': 22,
 'f': 23,
 'v': 24,
 'x': 25,
 'j': 26,
 'q': 27,
 'z': 28}

In [18]:
vocab_size = len(tk.word_index)
vocab_size

28

#### Transform text according to vocab i.e., replace every character with index in vocab

In [19]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape

((143561,), (143561,), (14356,), (14356,), (1596,), (1596,))

In [20]:
x_train = tk.texts_to_sequences(x_train)
x_val = tk.texts_to_sequences(x_val)
x_test = tk.texts_to_sequences(x_test)

In [21]:
x_train[0]

[14,
 11,
 3,
 5,
 8,
 3,
 2,
 9,
 3,
 17,
 10,
 24,
 3,
 2,
 21,
 5,
 9,
 7,
 4,
 7,
 16,
 8,
 2,
 15,
 8,
 3,
 9,
 2,
 6,
 5,
 11,
 20,
 2,
 14,
 5,
 16,
 3,
 8,
 2,
 12,
 10,
 7,
 8,
 4,
 13,
 3,
 9,
 3,
 13,
 2,
 24,
 5,
 7,
 13,
 5,
 11,
 4,
 8,
 17,
 2,
 22,
 10,
 19,
 12,
 9,
 5,
 8,
 18,
 23,
 5,
 7]

In [22]:
len(x_train), y_train.shape, len(x_val), y_val.shape, len(x_test), y_test.shape

(143561, (143561,), 14356, (14356,), 1596, (1596,))

#### pad sequences

In [23]:
# Padding
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=5000, padding='post')
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=5000, padding='post')
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=5000, padding='post')

# Convert to numpy array
x_train = np.array(x_train, dtype='float32')
x_val = np.array(x_val, dtype='float32')
x_test = np.array(x_test, dtype='float32')

In [24]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape

((143561, 5000), (143561,), (14356, 5000), (14356,), (1596, 5000), (1596,))

#### embedding matrix
#### vocab_size is 28, so 28 will be columns (features)
#### Here using one hot encoding, we can prepare embedding weights for keras embedding layer.

In [25]:
embedding_weights = []
#embedding_weights.append(np.zeros(vocab_size))

for char, i in tk.word_index.items():
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [26]:
embedding_weights.shape

(28, 28)

In [27]:
input_size = 5000
vocab_size = len(tk.word_index)
embedding_size = 28

In [28]:
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                           embedding_size,
                                           input_length = input_size,
                                           weights = [embedding_weights])


#### If we do not want to look for making embedding weights, simply leave it on the keras embedding layer 
#### specify the dimension of embedding and layer will itself produce (vocab_size, embedding_size ) matrix 

In [None]:
#use upper block or this block
embed_size = 100
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                           output_dim = embed_size,
                                           input_length = input_size,
                                           )

In [39]:
inputs = tf.keras.Input(shape=(input_size,), name='input', dtype='int64')  
x = embedding_layer(inputs)

x = tf.keras.layers.Conv1D(32, 7, activation = 'relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.BatchNormalization()(x)

x = tf.keras.layers.Conv1D(64, 5, activation = 'relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.BatchNormalization()(x)

x = tf.keras.layers.Conv1D(128, 5, activation = 'relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.BatchNormalization()(x)

x = tf.keras.layers.Conv1D(256, 3, activation = 'relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.BatchNormalization()(x)

x = tf.keras.layers.Conv1D(256, 3, activation = 'relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.BatchNormalization()(x)

x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(1024)(x)
x = tf.keras.layers.Dense(256)(x)
out = tf.keras.layers.Dense(1)(x)


model = tf.keras.Model(inputs = inputs, outputs = out)
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 5000)]            0         
                                                                 
 embedding (Embedding)       (None, 5000, 28)          784       
                                                                 
 conv1d_23 (Conv1D)          (None, 4994, 32)          6304      
                                                                 
 max_pooling1d_23 (MaxPoolin  (None, 1664, 32)         0         
 g1D)                                                            
                                                                 
 batch_normalization_23 (Bat  (None, 1664, 32)         128       
 chNormalization)                                                
                                                                 
 conv1d_24 (Conv1D)          (None, 1660, 64)          1030

In [44]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          batch_size=128,
          epochs=5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x33531d460>

In [45]:
y_predicted = model.predict(x_test)



In [46]:
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.8, 1, 0)
y_predicted


print(accuracy_score(y_test, y_predicted))
print(precision_score(y_test, y_predicted, average='weighted', zero_division = 1))
print(recall_score(y_test, y_predicted, average='weighted', zero_division = 1))
print(f1_score(y_test, y_predicted, average='micro', zero_division = 1))
print(confusion_matrix(y_test, y_predicted))

0.8947368421052632
0.820089818374652
0.8947368421052632
0.8947368421052632
[[1427    7]
 [ 161    1]]
