# Import the dataset

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.optimizers import Adam
from keras.models import model_from_json
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import re
import gc
import pickle

In [None]:
# Report how many GPUs TensorFlow can see, then turn on device-placement logging.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [None]:
! pip install kaggle



In [None]:
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns {filename: file bytes}, and echoing that
# dict writes the contents of kaggle.json (the API key) into the notebook output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition's zipped CSV files with the Kaggle CLI.
! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading train.csv.zip to /content
 65% 17.0M/26.3M [00:01<00:01, 6.03MB/s]
100% 26.3M/26.3M [00:01<00:00, 19.5MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 45.2MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 98.4MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:01<00:02, 7.42MB/s]
100% 23.4M/23.4M [00:01<00:00, 16.7MB/s]


In [None]:
! mkdir dataset

In [None]:
! unzip test.csv.zip -d dataset

Archive:  test.csv.zip
  inflating: dataset/test.csv        


In [None]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
  inflating: dataset/train.csv       


# Data Fetching

In [None]:
# Load the training CSV, reading comment_text with pandas' 'string' dtype.
train = pd.read_csv('dataset/train.csv', dtype={'comment_text':'string'})
# Preview the first rows.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Remove the id column, which is not used as a feature or a label.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Load the competition test set, reading the comments with the 'string' dtype.
test = pd.read_csv('dataset/test.csv', dtype={'comment_text': 'string'})

# Set the first column (the ids) aside, then remove the id column from the
# frame so only the comment text remains.
ids = test[test.columns[0]]
test = test.drop(columns=['id'])
test.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,"== From RfC == The title is fine as it is, ..."
2,""" == Sources == * Zawe Ashton on Lapland..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [None]:
# Preview the ids that were set aside from the test frame.
ids.head()

0    00001cee341fdb12
1    0000247867823ef7
2    00013b17ad220c46
3    00017563c3f7919a
4    00017695ad8997eb
Name: id, dtype: object

# Data Preprocessing

In [None]:
# Features: the raw comment text.
X = train['comment_text'].values
# Labels: the six toxicity columns, selected by name. The positional slice
# iloc[:, 1:] only yields the labels if the id column has already been
# dropped; naming the columns removes that dependency on cell execution order.
Y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [None]:
# Sanity check: number of comments.
print(X.shape)

(159571,)


In [None]:
# Sanity check: label matrix shape, followed by its contents.
print(Y.shape)
Y

(159571, 6)


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
# Hold out 20% of the labelled data. random_state is fixed so the split is
# the same after every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Keras tokenizer constructed with its default arguments.
tokenizer = Tokenizer()

In [None]:
# Fit on the training split only; the held-out split is not passed here.
tokenizer.fit_on_texts(X_train)

In [None]:
# Encode both splits with the fitted tokenizer (training split first).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Number of encoded training comments.
len(X_train_seq)

127656

In [None]:
# Number of distinct entries in the tokenizer's word index.
print(len(tokenizer.word_index))

183094


In [None]:
# Number of held-out comments.
len(X_test)

31915

In [None]:
# Bring every encoded comment in both splits to a length of 250.
X_train_seq, X_test_seq = (
    pad_sequences(sequences, maxlen=250)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# Shape of the padded held-out split.
X_test_seq.shape

(31915, 250)

In [None]:
# Shape of the padded training split.
X_train_seq.shape

(127656, 250)

# Custom Embedding

In [None]:
# Size of the embedding's input vocabulary.
# NOTE(review): the +1 presumably leaves room for index 0 (padding), assuming
# Tokenizer word indices start at 1 — confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

183095

In [None]:
# Start an empty Sequential model; layers are added in the following cells.
model = Sequential()

In [None]:
# Trainable 300-dimensional embedding over the tokenizer vocabulary;
# input_length matches the maxlen=250 used when padding the sequences.
model.add(Embedding(input_dim=vocab_size, output_dim = 300, input_length = 250, trainable = True))

In [None]:
# LSTM with 150 units; return_sequences=True keeps one output per time step
# so the following pooling layer has a sequence to pool over.
model.add(CuDNNLSTM(units=150,return_sequences=True))

In [None]:
# Collapse the per-step LSTM outputs into a single vector per comment.
model.add(GlobalMaxPool1D())

In [None]:
# First fully connected hidden layer.
model.add(Dense(units = 64, activation='relu'))

In [None]:
# Second fully connected hidden layer.
model.add(Dense(units = 16, activation='relu'))

In [None]:
# Output layer: six sigmoid units, matching the six label columns in Y.
model.add(Dense(units = 6, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss, Adam with default settings, AUC as the metric.
model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          54928500  
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 250, 150)          271200    
_________________________________________________________________
global_max_pooling1d (Global (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                9664      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
Total params: 55,210,506
Trainable params: 55,210,506
Non-trainable params: 0
____________________________________________

In [None]:
# Confirm the padded training data is a NumPy array.
type(X_train_seq)

numpy.ndarray

In [None]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
# The inputs are passed directly: the padded sequences and label arrays are
# already NumPy arrays, so wrapping them in np.array() only made extra copies.
history = model.fit(
    X_train_seq,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test_seq, y_test),
)

In [None]:
# Serialise the model to JSON; it is written to disk in the next cell.
model_json = model.to_json()

In [None]:
# Persist the JSON description of the model.
with open('custom_embedding.json', 'w') as json_file:
  json_file.write(model_json)

In [None]:
# Save the trained weights alongside the JSON description.
model.save_weights("weights.h5")

In [None]:
# Rebuild the model from the saved JSON. The context manager closes the file
# even if read() raises, unlike a manual open()/close() pair.
with open('custom_embedding.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

In [None]:
# Persist the fitted tokenizer so the same word index can be reused later.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the tokenizer written by the previous cell.
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# tokenizer.pickle that was produced by this notebook or another trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Restore the trained weights into the model rebuilt from JSON.
loaded_model.load_weights("weights.h5")

In [None]:
# Compile with the same loss, optimizer and metric as the original model.
loaded_model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])

# Kaggle Submission

In [None]:
# Preview the competition test comments.
test.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,"== From RfC == The title is fine as it is, ..."
2,""" == Sources == * Zawe Ashton on Lapland..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [None]:
# Raw comment text of the competition test set.
test_X = test['comment_text'].values
# Show only a few entries: echoing the whole array dumps very long comment
# strings into the notebook output.
test_X[:5]

<StringArray>
[                                                                                                                                                                                                         "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",
                                                                                                                                                                                                                                                                                                                                                                                                                             

In [None]:
# Encode the test comments with the tokenizer reloaded from tokenizer.pickle.
test_X_seq = tokenizer.texts_to_sequences(test_X)

Executing op DeleteIterator in device /job:localhost/replica:0/task:0/device:CPU:0


In [None]:
# Same maxlen=250 as used for the training data.
test_X_seq = pad_sequences(test_X_seq, maxlen=250)

In [None]:
# Score every test comment with the reloaded model, then display the result.
prediction = loaded_model.predict(test_X_seq)
prediction

In [None]:
# One row per test comment, one column per label.
prediction.shape

(153164, 6)

In [None]:
# Start from an empty frame; the submission columns are added in later cells.
result = pd.DataFrame()
result.head()

In [None]:
# First submission column: the ids set aside when the test CSV was loaded.
result["id"] = ids
result.head()

Unnamed: 0,id
0,00001cee341fdb12
1,0000247867823ef7
2,00013b17ad220c46
3,00017563c3f7919a
4,00017695ad8997eb


In [None]:
# Attach one probability column per label; column i of `prediction` goes to
# the i-th name in this list.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
for position, label in enumerate(label_columns):
    result[label] = prediction[:, position]
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.9983998,0.4484064,0.9979224,0.2651219,0.9951018,0.9011816
1,0000247867823ef7,1.900233e-09,4.384968e-10,2.825302e-09,2.858031e-11,1.202494e-09,1.274504e-12
2,00013b17ad220c46,3.518985e-08,3.854106e-07,1.022069e-06,1.6551e-09,1.094796e-08,2.372304e-10
3,00017563c3f7919a,5.832004e-08,2.464014e-09,1.021635e-08,4.35907e-09,1.228756e-09,7.232331e-11
4,00017695ad8997eb,3.253878e-08,3.323933e-08,1.626306e-07,1.533089e-10,6.984859e-11,8.71624e-11


In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
result.to_csv('submission.csv', index=False)

In [None]:
# Upload submission.csv to the competition with the Kaggle CLI.
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "Using Custom Word Embeddings"

100% 14.1M/14.1M [00:09<00:00, 1.59MB/s]
Successfully submitted to Toxic Comment Classification Challenge