# Import the Dataset

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, LSTM
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.optimizers import Adam
from keras.models import model_from_json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import re
import gc
import pickle

In [None]:
# Report how many GPUs TensorFlow can see, then turn on device-placement logging.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [None]:
! pip install kaggle



In [None]:
# Colab-only: prompt for a local file (kaggle.json is expected) and save it
# into the working directory.
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: the returned dict maps file names to the raw file bytes, so
# echoing it writes the Kaggle API key into the saved notebook output.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sid200026","key":"<REDACTED>"}'}

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition's data archives with the Kaggle CLI.
! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 45.9MB/s]
Downloading train.csv.zip to /content
 34% 9.00M/26.3M [00:01<00:02, 8.50MB/s]
100% 26.3M/26.3M [00:01<00:00, 18.8MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:01<00:02, 6.39MB/s]
100% 23.4M/23.4M [00:01<00:00, 14.3MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 48.4MB/s]


In [None]:
! mkdir dataset

In [None]:
! unzip test.csv.zip -d dataset

Archive:  test.csv.zip
  inflating: dataset/test.csv        


In [None]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
  inflating: dataset/train.csv       


# Download GloVe Word Embeddings

In [None]:
! wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2020-10-02 11:53:25--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-10-02 11:53:26--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-10-02 11:53:26--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [None]:
! unzip glove.840B.300d.zip 

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


# Data Fetching

In [None]:
# Load the training split; comment_text is read with pandas' "string" dtype.
train = pd.read_csv(
    'dataset/train.csv',
    dtype={'comment_text': 'string'},
)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# The id column carries no signal for training, so remove it.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Load the test split, keep its first column (the ids) for the submission
# file, and continue with a frame that no longer has the id column.
test_raw = pd.read_csv(
    'dataset/test.csv',
    dtype={'comment_text': 'string'},
)
ids = test_raw.iloc[:, 0]
test = test_raw.drop(columns='id')
test.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,"== From RfC == The title is fine as it is, ..."
2,""" == Sources == * Zawe Ashton on Lapland..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [None]:
# Peek at the first few saved test ids.
ids.head()

0    00001cee341fdb12
1    0000247867823ef7
2    00013b17ad220c46
3    00017563c3f7919a
4    00017695ad8997eb
Name: id, dtype: object

# Preprocessing

In [None]:
# Model input: the raw comment strings.
X = train['comment_text'].values

# Targets: the six label columns, selected by name. Positional slicing
# (iloc[:, 1:]) only yields the labels if the id column was dropped first;
# naming the columns makes this cell independent of that earlier step.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
Y = train[LABEL_COLUMNS].values

In [None]:
# Sanity check: number of comments.
print(X.shape)

(159571,)


In [None]:
# Sanity check: one row per comment, one column per label.
print(Y.shape)
Y

(159571, 6)


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
# Hold out 20% of the labelled data for validation.
# A fixed random_state makes the split (and everything derived from it:
# tokenizer vocabulary, embedding matrix, metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Keras Tokenizer with default settings (no vocabulary-size cap is passed).
tokenizer = Tokenizer()

In [None]:
# Build the word index from the training split only.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert both splits to integer sequences using the fitted tokenizer.
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

In [None]:
# Number of tokenized training comments.
len(X_train_seq)

127656

In [None]:
# Number of distinct words in the tokenizer's index.
print(len(tokenizer.word_index))

183360


In [None]:
# Number of held-out comments.
len(X_test)

31915

In [None]:
# Bring every sequence in both splits to a fixed length of 250 tokens.
X_train_seq, X_test_seq = [
    pad_sequences(sequences, maxlen=250)
    for sequences in (X_train_seq, X_test_seq)
]

In [None]:
# Shape check for the padded held-out split.
X_test_seq.shape

(31915, 250)

In [None]:
# Shape check for the padded training split.
X_train_seq.shape

(127656, 250)

# Pre-Trained Embedding

In [None]:
# One more than the number of indexed words; used below as the row count of
# the embedding matrix and as the Embedding layer's input_dim.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

183361

In [None]:
# word -> vector lookup, filled by the parsing loop in the next cell.
embeddings_index = dict()
# Open with an explicit encoding so decoding does not depend on the platform's
# default locale. The handle stays open here; it is consumed by the parsing
# loop and closed in a later cell.
glove = open('glove.840B.300d.txt', encoding='utf-8')

In [None]:
# Parse every GloVe line into a word -> float32 vector entry.
#
# Split from the right: the last 300 space-separated fields are the vector and
# whatever precedes them is the token. Splitting on the first whitespace
# (the previous approach) breaks for tokens that themselves contain spaces,
# leaving them with empty vectors, and relied on the deprecated text mode of
# np.fromstring.
EMBEDDING_DIM = 300
for line in glove:
    fields = line.rstrip("\n").rsplit(" ", EMBEDDING_DIM)
    if len(fields) != EMBEDDING_DIM + 1:
        continue  # malformed line: not a token followed by exactly 300 values
    word = fields[0]
    coefs = np.asarray(fields[1:], dtype="float32")
    embeddings_index[word] = coefs

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Report how many word vectors were loaded.
print("Found %s word vectors." % len(embeddings_index))

Found 2195884 word vectors.


In [None]:
# Release the GloVe file handle opened earlier.
glove.close()

In [None]:
# Build the (vocab_size, 300) weight matrix for the Embedding layer: row i
# holds the GloVe vector of the word whose tokenizer index is i. Words with no
# usable vector keep an all-zero row.
EMBEDDING_DIM = 300

# The parsed GloVe vectors are float32, so float64 storage would only double
# the memory used by this large matrix.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM), dtype="float32")
miss = 0  # words found in GloVe whose parsed vector is unusable

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # word not present in GloVe
    # Require the exact dimension: a truncated vector would otherwise raise a
    # broadcasting error on assignment instead of being counted as a miss.
    if embedding_vector.shape[0] == EMBEDDING_DIM:
        embedding_matrix[i] = embedding_vector
    else:
        miss += 1

print(miss)

11


In [None]:
# Shape check: one row per vocabulary index, 300 columns.
embedding_matrix.shape

(183361, 300)

In [None]:
# Start an empty Sequential model; layers are appended in the cells below.
model = Sequential()

In [None]:
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False); it takes sequences of length 250.
model.add(Embedding(input_dim=vocab_size, output_dim = 300, input_length = 250, weights=[embedding_matrix], trainable = False))

In [None]:
# LSTM with 150 units; return_sequences=True keeps the output for every
# timestep so the pooling layer added next can reduce over them.
model.add(LSTM(units=150,return_sequences=True, dropout=0.1))

In [None]:
# Reduce the per-timestep LSTM outputs to a single vector per comment.
model.add(GlobalMaxPool1D())

In [None]:
# First fully connected layer (64 units, ReLU).
model.add(Dense(units = 64, activation='relu'))

In [None]:
# Second fully connected layer (16 units, ReLU).
model.add(Dense(units = 16, activation='relu'))

In [None]:
# Output layer: six sigmoid units, matching the six label columns of Y.
model.add(Dense(units = 6, activation='sigmoid'))

In [None]:
# Binary cross-entropy over the six sigmoid outputs, optimised with Adam.
# multi_label=True makes the AUC metric compute one ROC AUC per label and
# average them, instead of pooling all six labels into a single curve.
model.compile(
    loss=BinaryCrossentropy(),
    optimizer=Adam(),
    metrics=[AUC(multi_label=True)],
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          55008300  
_________________________________________________________________
lstm (LSTM)                  (None, 250, 150)          270600    
_________________________________________________________________
global_max_pooling1d (Global (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                9664      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
Total params: 55,289,706
Trainable params: 281,406
Non-trainable params: 55,008,300
______________________________________

In [None]:
# Train for 10 epochs, validating on the held-out split after each epoch.
# The padded sequences and label arrays are already NumPy arrays, so they are
# passed directly rather than through np.array(), which would copy them.
history = model.fit(
    X_train_seq,
    y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_test_seq, y_test),
)

In [None]:
# Serialise the model architecture to a JSON string (weights are saved separately below).
model_json = model.to_json()

In [None]:
# Persist the architecture JSON to disk.
with open('glove_embedding.json', 'w') as json_file:
  json_file.write(model_json)

In [None]:
# Persist the trained weights to an HDF5 file.
model.save_weights("weights.h5")

In [None]:
# Rebuild the architecture from the saved JSON. The context manager closes the
# file even if reading fails, unlike a manual open()/close() pair.
with open('glove_embedding.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

In [None]:
# Persist the fitted tokenizer so the same word index can be reused at inference time.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the tokenizer from disk, replacing the in-memory one.
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# safe because the file was written by this notebook. Do not point it at
# untrusted input.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Restore the trained weights into the rebuilt architecture.
loaded_model.load_weights("weights.h5")

In [None]:
# Compile the reloaded model (binary cross-entropy loss, Adam optimizer, AUC metric).
loaded_model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])

# Kaggle Submission

In [None]:
# Reminder of what the competition test frame looks like.
test.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,"== From RfC == The title is fine as it is, ..."
2,""" == Sources == * Zawe Ashton on Lapland..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [None]:
# Raw comment strings of the competition test set.
test_X = test['comment_text'].values
# Show only a short sample: echoing the full array floods the notebook with
# tens of thousands of comments.
test_X[:5]

<StringArray>
[                                                                                                                                                                                                         "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
                                                                                                                                                                                                                                                                                                                                                                                                                             

In [None]:
# Encode the test comments with the reloaded tokenizer.
test_X_seq = tokenizer.texts_to_sequences(test_X)

In [None]:
# Use the same fixed length (250) that the training sequences were padded to.
test_X_seq = pad_sequences(test_X_seq, maxlen=250)

In [None]:
# Predict the six label probabilities for every test comment.
prediction = loaded_model.predict(test_X_seq)
prediction

In [None]:
# Shape check: one row per test comment, one column per label.
prediction.shape

(153164, 6)

In [None]:
# Start from an empty frame; the id and label columns are attached in later cells.
result = pd.DataFrame(data=None)
result.head()

In [None]:
# Attach the saved test ids as the first column of the submission frame.
result["id"] = ids
result.head()

Unnamed: 0,id
0,00001cee341fdb12
1,0000247867823ef7
2,00013b17ad220c46
3,00017563c3f7919a
4,00017695ad8997eb


In [None]:
# Copy each prediction column into the submission frame under its label name;
# the order follows the column order of the prediction array.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for column_index, label in enumerate(label_names):
    result[label] = prediction[:, column_index]
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999805,0.6745511,0.987478,0.593773,0.945423,0.5131797
1,0000247867823ef7,7.2e-05,5.205939e-07,1.1e-05,2e-06,2e-06,2.965975e-07
2,00013b17ad220c46,0.000221,2.882138e-06,8.1e-05,7e-06,8e-06,5.883628e-07
3,00017563c3f7919a,0.000549,5.272218e-06,0.000263,5.6e-05,5.4e-05,2.552747e-06
4,00017695ad8997eb,0.004073,3.747078e-05,0.000571,8.7e-05,9.2e-05,8.639281e-06


In [None]:
# Write the submission file without the DataFrame index column.
result.to_csv('submission.csv', index=False)

In [None]:
# Upload the submission file to the competition with a descriptive message.
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "Using GloVe Word Embeddings and LSTM instead of CUDNNLSTM"

100% 13.8M/13.8M [00:11<00:00, 1.30MB/s]
Successfully submitted to Toxic Comment Classification Challenge