# Import the Dataset

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, LSTM
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.optimizers import Adam
from keras.models import model_from_json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import re
import gc
import pickle

In [None]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [None]:
! pip install kaggle



In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sid200026","key":"c1398985906d76ebf510dbc0e65e5ceb"}'}

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading test.csv.zip to /content
 21% 5.00M/23.4M [00:00<00:01, 17.7MB/s]
100% 23.4M/23.4M [00:00<00:00, 67.5MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 93.3MB/s]
Downloading train.csv.zip to /content
 61% 16.0M/26.3M [00:00<00:00, 33.7MB/s]
100% 26.3M/26.3M [00:00<00:00, 75.7MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 211MB/s]


In [None]:
! mkdir dataset

In [None]:
! unzip test.csv.zip -d dataset

Archive:  test.csv.zip
  inflating: dataset/test.csv        


In [None]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
  inflating: dataset/train.csv       


In [None]:
! unzip test_labels.csv.zip -d dataset

Archive:  test_labels.csv.zip
  inflating: dataset/test_labels.csv  


# Download GloVe Word Embeddings

In [None]:
! wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2020-10-03 12:39:42--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-10-03 12:39:43--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-10-03 12:39:43--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [None]:
! unzip glove.840B.300d.zip 

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


# Data Fetching

In [None]:
train = pd.read_csv('dataset/train.csv', dtype={'comment_text':'string'})
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
train = train.drop(columns='id')
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
test = pd.read_csv('dataset/test.csv', dtype={'comment_text':'string'})
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,"== From RfC == The title is fine as it is, ..."
2,00013b17ad220c46,""" == Sources == * Zawe Ashton on Lapland..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
test_labels = pd.read_csv('dataset/test_labels.csv')
test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [None]:
test_labels = test_labels[test_labels['toxic'] != -1] 
test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,0,0,0,0,0,0
7,000247e83dcc1211,0,0,0,0,0,0
11,0002f87b16116a7f,0,0,0,0,0,0
13,0003e1cccfd5a40a,0,0,0,0,0,0
14,00059ace3e3e9a53,0,0,0,0,0,0


In [None]:
test_labels.shape

(63978, 7)

In [None]:
test = pd.merge(test, test_labels, how='inner', on ='id')
test = test.drop(columns=['id'])
test.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Thank you for understanding. I think very high...,0,0,0,0,0,0
1,:Dear god this site is horrible.,0,0,0,0,0,0
2,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
3,""" It says it right there that it IS a type....",0,0,0,0,0,0
4,""" == Before adding a new product to the lis...",0,0,0,0,0,0


In [None]:
test.shape

(63978, 7)

In [None]:
train = pd.concat([train,test], ignore_index=True)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
train.shape

(223549, 7)

# Preprocessing

In [None]:
X = train['comment_text'].values
Y = train.iloc[:,1:].values

In [None]:
print(X.shape)

(223549,)


In [None]:
print(Y.shape)
Y

(223549, 6)


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
X_train, y_train = X,Y

In [None]:
tokenizer = Tokenizer()

In [None]:
tokenizer.fit_on_texts(X_train)

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [None]:
len(X_train_seq)

223549

In [None]:
print(len(tokenizer.word_index))

300257


In [None]:
X_train_seq = pad_sequences(X_train_seq, maxlen=250)

In [None]:
X_train_seq.shape

(223549, 250)

# Pre-Trained Embedding

In [None]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

300258

In [None]:
embeddings_index = dict()
glove = open('glove.840B.300d.txt')

In [None]:
for line in glove:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
print("Found %s word vectors." % len(embeddings_index))

Found 2195884 word vectors.


In [None]:
glove.close()

In [None]:
embedding_matrix = np.zeros((vocab_size, 300))
miss = 0

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      if embedding_vector.shape[0] != 0:
        embedding_matrix[i] = embedding_vector
      else:
        miss+=1

print(miss)

11


In [None]:
embedding_matrix.shape

(300258, 300)

In [None]:
model = Sequential()

In [None]:
model.add(Embedding(input_dim=vocab_size, output_dim = 300, input_length = 250, weights=[embedding_matrix], trainable = False))

In [None]:
model.add(LSTM(units=150,return_sequences=True, dropout=0.1))

In [None]:
model.add(GlobalMaxPool1D())

In [None]:
model.add(Dense(units = 64, activation='relu'))

In [None]:
model.add(Dense(units = 16, activation='relu'))

In [None]:
model.add(Dense(units = 6, activation='sigmoid'))

In [None]:
model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])

In [None]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          90077400  
_________________________________________________________________
lstm (LSTM)                  (None, 250, 150)          270600    
_________________________________________________________________
global_max_pooling1d (Global (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                9664      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
Total params: 90,358,806
Trainable params: 281,406
Non-trainable params: 90,077,400
______________________________________

In [None]:
history = model.fit(np.array(X_train_seq), np.array(y_train), batch_size=256, epochs=10)

In [None]:
model_json = model.to_json()

In [None]:
with open('ToxicBot_GloVeEmbedding.json', 'w') as json_file:
  json_file.write(model_json)

In [None]:
model.save_weights("ToxicBot_Weights.h5")

In [None]:
json_file = open('ToxicBot_GloVeEmbedding.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

In [None]:
with open('ToxicBot_Tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('ToxicBot_Tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
loaded_model.load_weights("ToxicBot_Weights.h5")

In [None]:
loaded_model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])