# Import the dataset

In [48]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, GlobalMaxPool1D, LSTM, Input
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

import gc
import pickle

In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [4]:
!pip install kaggle



In [5]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sid200026","key":"c1398985906d76ebf510dbc0e65e5ceb"}'}

In [6]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [7]:
! chmod 600 ~/.kaggle/kaggle.json

In [8]:
! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification

Downloading train.csv.zip to /content
 99% 273M/276M [00:10<00:00, 30.0MB/s]
100% 276M/276M [00:10<00:00, 27.8MB/s]
Downloading all_data.csv.zip to /content
 92% 300M/326M [00:13<00:01, 15.5MB/s]
100% 326M/326M [00:13<00:00, 24.6MB/s]
Downloading toxicity_individual_annotations.csv.zip to /content
 88% 57.0M/64.7M [00:03<00:00, 13.0MB/s]
100% 64.7M/64.7M [00:03<00:00, 20.7MB/s]
Downloading test_private_expanded.csv.zip to /content
 32% 5.00M/15.8M [00:01<00:02, 3.80MB/s]
100% 15.8M/15.8M [00:01<00:00, 11.3MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/224k [00:00<?, ?B/s]
100% 224k/224k [00:00<00:00, 72.1MB/s]
Downloading test_public_expanded.csv.zip to /content
 57% 9.00M/15.9M [00:01<00:00, 8.02MB/s]
100% 15.9M/15.9M [00:01<00:00, 11.0MB/s]
Downloading identity_individual_annotations.csv.zip to /content
 41% 5.00M/12.3M [00:01<00:02, 3.71MB/s]
100% 12.3M/12.3M [00:01<00:00, 8.77MB/s]
Downloading test.csv.zip to /content
 41% 5.00M/12.1M [00:01<00:01, 3.91MB/s]
100%

In [9]:
! mkdir dataset

In [10]:
! unzip test.csv.zip -d dataset

Archive:  test.csv.zip
  inflating: dataset/test.csv        


In [11]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
  inflating: dataset/train.csv       


# Data Fetching

In [12]:
gc.collect()

15

In [13]:
train = pd.read_csv('dataset/train.csv',dtype={'comment_text':'string'})
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [14]:
train = train.drop(columns='id')
train.head()

Unnamed: 0,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [15]:
train.isna().sum()

target                                       0
comment_text                                 0
severe_toxicity                              0
obscene                                      0
identity_attack                              0
insult                                       0
threat                                       0
asian                                  1399744
atheist                                1399744
bisexual                               1399744
black                                  1399744
buddhist                               1399744
christian                              1399744
female                                 1399744
heterosexual                           1399744
hindu                                  1399744
homosexual_gay_or_lesbian              1399744
intellectual_or_learning_disability    1399744
jewish                                 1399744
latino                                 1399744
male                                   1399744
muslim       

In [16]:
train.shape

(1804874, 44)

In [17]:
train = train.drop(columns=['toxicity_annotator_count', 'identity_annotator_count', 'disagree', 'likes', 'sad', 'wow', 'funny', 'rating'])
train = train.drop(columns=['article_id', 'parent_id', 'publication_id', 'created_date'])
train.head()

Unnamed: 0,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,sexual_explicit
0,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0
1,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0
2,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0
3,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0
4,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
identity_columns = train.iloc[:,7:31].columns
identity_columns

Index(['asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian',
       'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
       'other_religion', 'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white'],
      dtype='object')

In [19]:
train[identity_columns] = train[identity_columns].fillna(0)
train.head()

Unnamed: 0,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,sexual_explicit
0,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
output_columns = ['target', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian',
       'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
       'other_religion', 'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white']

In [21]:
OUTPUT = train.loc[:,output_columns]
OUTPUT.head()

Unnamed: 0,target,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.893617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Preprocessing

In [22]:
X = train['comment_text'].values
X

<StringArray>
[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",
                                                  

In [23]:
y = OUTPUT.values
y

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.62121212, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [24]:
del train
gc.collect()

24

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_test.shape

(360975,)

In [27]:
X_train.shape

(1443899,)

In [28]:
! wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2020-10-04 19:03:26--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-10-04 19:03:26--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-10-04 19:03:27--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [29]:
! unzip glove.840B.300d.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [31]:
tokenizer = Tokenizer()

In [32]:
tokenizer.fit_on_texts(X_train)

In [33]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [34]:
len(X_train_seq)

1443899

In [35]:
print(len(tokenizer.word_index))

353109


In [36]:
X_train_seq = pad_sequences(X_train_seq, maxlen=250)
X_test_seq = pad_sequences(X_test_seq, maxlen=250)

In [37]:
X_test_seq.shape

(360975, 250)

# Word Embedding

In [38]:
gc.collect()

0

In [39]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

353110

In [40]:
embeddings_index = dict()
glove = open('glove.840B.300d.txt')

In [41]:
for line in glove:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
print("Found %s word vectors." % len(embeddings_index))

Found 2195884 word vectors.


In [43]:
glove.close()

In [45]:
embedding_matrix = np.zeros((vocab_size, 300))
miss = 0

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      if embedding_vector.shape[0] != 0:
        embedding_matrix[i] = embedding_vector
      else:
        miss+=1

print(miss)

11


In [46]:
embedding_matrix.shape

(353110, 300)

In [49]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
with open('embedding_matrix.pickle', 'wb') as handle:
    pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [51]:
del embeddings_index
del X
del y
del tokenizer
del glove

gc.collect()

392

# Model Training

In [53]:
y_target = y_train[:,0]
y_target.shape

(1443899,)

In [54]:
y_test_target = y_test[:,0]
y_test_target.shape

(360975,)

In [58]:
aux_output = y_train[:,1:]
aux_output.shape

(1443899, 24)

In [56]:
X_train.shape

(1443899,)

In [None]:
model = Sequential()

In [None]:
model.add(Embedding(input_dim=vocab_size, output_dim = 300, input_length = 250, weights=[embedding_matrix], trainable = False))

In [None]:
model.add(LSTM(units=150,return_sequences=True, dropout=0.1))

In [63]:
model.add(GlobalMaxPool1D())

In [None]:
model.add(Dense(units = 32, activation='relu'))

In [None]:
model.add(Dense(units = 1, activation='sigmoid'))

In [None]:
model.compile(loss=BinaryCrossentropy(),optimizer=Adam(),metrics=[AUC()])

In [None]:
print(model.summary())

In [68]:
gc.collect()

58

In [None]:
history = model.fit(np.array(X_train_seq), np.array(y_target), batch_size=512, epochs=5, validation_data=(np.array(X_test_seq),np.array(y_test_target)))

In [70]:
model_json = model.to_json()

In [71]:
with open('glove_embedding.json', 'w') as json_file:
  json_file.write(model_json)

In [72]:
model.save_weights("weights.h5")

Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /j

In [73]:
del embedding_matrix

gc.collect()

1525

# Kaggle Submission

In [75]:
test = pd.read_csv('dataset/test.csv')
test.head()

Unnamed: 0,id,comment_text
0,7097320,[ Integrity means that you pay your debts.]\n\...
1,7097321,This is malfeasance by the Administrator and t...
2,7097322,@Rmiller101 - Spoken like a true elitist. But ...
3,7097323,"Paul: Thank you for your kind words. I do, in..."
4,7097324,Sorry you missed high school. Eisenhower sent ...


In [76]:
test.shape

(97320, 2)

In [77]:
comments = test['comment_text'].values
comments.shape

(97320,)

In [78]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [79]:
comment_seq = tokenizer.texts_to_sequences(comments)

In [80]:
comment_pad_seq = pad_sequences(comment_seq, maxlen=250)

In [None]:
prediction = model.predict(comment_pad_seq)

In [82]:
prediction.shape

(97320, 1)

In [83]:
ids = test.iloc[:,0]
ids

0        7097320
1        7097321
2        7097322
3        7097323
4        7097324
          ...   
97315    7194635
97316    7194636
97317    7194637
97318    7194638
97319    7194639
Name: id, Length: 97320, dtype: int64

In [85]:
result = pd.DataFrame()
result['id'] = ids
result['prediction'] = prediction

In [86]:
result.head()

Unnamed: 0,id,prediction
0,7097320,0.020808
1,7097321,0.046746
2,7097322,0.165737
3,7097323,0.042358
4,7097324,0.012673


In [87]:
result.to_csv('submission.csv', index=False)