In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
import warnings
# Install a global 'ignore' filter with no category or module restriction, so
# every warning raised through the `warnings` module is hidden from here on.
# NOTE(review): this is a blanket suppression — confirm nothing important is being masked.
warnings.filterwarnings('ignore')

In [4]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback


Using TensorFlow backend.


In [5]:
# Root folder that holds the competition CSV files and receives submission.csv.
# Set the JIGSAW_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
FOLDER = os.environ.get(
    'JIGSAW_DATA_DIR',
    '/python/datasource/kaggle/jigsaw-toxic-comment-classification-challenge',
)

In [6]:
# Read the sample submission from FOLDER. The resulting frame is reused further
# down as the template whose six label columns are overwritten with predictions.
path = os.path.join(FOLDER, 'sample_submission.csv')
submission = pd.read_csv(path)

# Show the first rows as the cell output.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [7]:
# Read the test-set CSV from FOLDER; its "comment_text" column is what the
# model is later asked to score.
path = os.path.join(FOLDER, 'test.csv')
test = pd.read_csv(path)
# Show the first rows as the cell output.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [8]:
# Read the test-label CSV from FOLDER and preview it.
# NOTE(review): `test_labels` is not referenced again in the visible part of
# the notebook — confirm whether this cell is still needed.
path = os.path.join(FOLDER, 'test_labels.csv')
test_labels = pd.read_csv(path)
test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [9]:
# Read the training CSV from FOLDER; it supplies both the comment text and the
# six label columns used to fit the model.
path = os.path.join(FOLDER, 'train.csv')
train = pd.read_csv(path)
# Show the first rows as the cell output.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
# Training comments as a NumPy array; missing comments are replaced by the
# literal placeholder string "fillna".
X_train = train["comment_text"].fillna("fillna").values
# Label matrix with one column per label, listed in the same order as the
# label columns later written to the submission frame.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
# Test comments, with the same placeholder for missing values.
X_test = test["comment_text"].fillna("fillna").values

In [11]:
# Text-pipeline hyperparameters shared by the tokenizer, padding and model:
#   max_features - vocabulary cap passed to the tokenizer and embedding layer
#   maxlen       - length every token sequence is padded/truncated to
#   embed_size   - width of each embedding vector
max_features, maxlen, embed_size = 30000, 100, 300

In [12]:
# Fit one tokenizer on the train and test comments together; num_words caps
# the vocabulary that texts_to_sequences will emit at max_features.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
# NOTE(review): X_train / X_test are overwritten in place here (raw strings ->
# lists of word ids). Re-running this cell without first re-running the cell
# that builds them from the data frames would feed id lists to fit_on_texts.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# Bring every sequence to length maxlen; the padded arrays are stored under the
# lowercase names x_train / x_test, which are what the model cells consume.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [13]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, vector)``.

    Args:
        word: The token the vector belongs to.
        *arr: The remaining fields of the line; each must be convertible to a
            float (numeric strings are accepted).

    Returns:
        A 2-tuple of ``word`` and a 1-D ``float32`` NumPy array built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [14]:
# Pre-trained word vectors live next to the competition CSVs, so derive the
# path from FOLDER (as every other path in this notebook does) instead of
# repeating the folder literal.
EMBEDDING_FILE = os.path.join(FOLDER, 'crawl-300d-2M.vec')

In [15]:
# Parse the embedding file into {word: float32 vector}.
# - The file is opened in a `with` block so the handle is closed once parsing
#   finishes (the bare open() inside a generator never closed it).
# - Lines with at most two fields are skipped: that covers blank lines and the
#   "<vocab_size> <dim>" header at the top of fastText .vec files, which would
#   otherwise be stored as a one-element "vector" under a numeric key.
# - As with dict(...), a word that appears twice keeps its last vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ')
        if len(fields) <= 2:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

In [16]:
# Build the initial weights for the embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i; words without a
# pre-trained vector (and the unused row 0) stay all-zero.
word_index = tokenizer.word_index
# Tokenizer indices start at 1, so the rows in use are 0..len(word_index).
# The "+ 1" keeps the last word in range when the vocabulary is smaller than
# max_features (previously that case indexed one past the end of the matrix).
nb_words = min(max_features, len(word_index) + 1)
# Always allocate max_features rows: the model declares
# Embedding(max_features, embed_size, weights=[embedding_matrix]), so the
# weight shape has to be (max_features, embed_size) whatever the vocabulary size.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [17]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: A 2-item ``(X_val, y_val)`` pair. It is unpacked
            directly, so the empty default must be replaced by a real pair.
        interval: The score is computed on epochs whose zero-based index is a
            multiple of this value (``1`` means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super() so that Callback.__init__
        # actually runs; super(Callback, self) starts the lookup *after*
        # Callback in the MRO and therefore skipped the base initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        Args:
            epoch: Zero-based epoch index supplied by Keras; printed as ``epoch + 1``.
            logs: Metrics dict supplied by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [18]:
def get_model():
    """Build and compile the bidirectional-GRU multi-label classifier.

    The network reads ``maxlen`` token ids, looks them up in an embedding
    initialised from ``embedding_matrix``, applies spatial dropout, runs a
    bidirectional GRU that returns the full sequence, pools that sequence with
    both average and max pooling, and maps the concatenated pools to six
    sigmoid outputs.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    tokens = Input(shape=(maxlen, ))

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    encoded = Bidirectional(GRU(80, return_sequences=True))(embedded)

    # Summarise the GRU output sequence two ways and join the summaries.
    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(6, activation="sigmoid")(pooled)

    network = Model(inputs=tokens, outputs=probabilities)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [24]:
# (Leftover interactive help lookup `pd.DataFrame?` removed: it only works at
# an IPython prompt, is not valid Python, and contributes nothing to the analysis.)

In [None]:
# Number of leading training rows to keep when building the reduced
# train_x / train_y subset in the following cell.
train_len = 10000


In [27]:
# Reduced training subset: take the first train_len rows of both the padded
# inputs and the labels so the two arrays stay row-aligned (the inputs were
# previously cut at a hard-coded 100 while the labels used train_len).
train_x = x_train[0:train_len]
train_y = y_train[0:train_len]

100

In [19]:
# Build a fresh model and set the training schedule.
model = get_model()
batch_size = 32
epochs = 2

# Hold out 5% of the padded training rows for validation; random_state pins
# the split so it is the same on every run.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# Print ROC-AUC on the held-out rows after every epoch (interval=1).
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

# Score the test comments and write the six predicted columns into the
# submission frame, then save it as submission.csv inside FOLDER.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
path = os.path.join(FOLDER, 'submission.csv')
submission.to_csv(path, index=False)

W1007 09:03:14.541217  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1007 09:03:18.533427  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1007 09:03:20.089231  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1007 09:03:21.749037  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1007 09:03:21.749037  3376 deprecation_wrapper.py:119] 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 1575s - loss: 0.0499 - acc: 0.9819 - val_loss: 0.0465 - val_acc: 0.9823

 ROC-AUC - epoch: 1 - score: 0.986975 

Epoch 2/2
 - 1561s - loss: 0.0379 - acc: 0.9853 - val_loss: 0.0456 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.987062 



In [19]:
# NOTE(review): this cell is a line-for-line repeat of the preceding training
# cell (same execution count and same recorded output). Running the notebook
# top to bottom trains a second model from scratch and overwrites `model`,
# `hist`, `y_pred` and submission.csv — confirm whether one copy can be dropped.
model = get_model()
batch_size = 32
epochs = 2

# Hold out 5% of the padded training rows for validation with a pinned split.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# Print ROC-AUC on the held-out rows after every epoch (interval=1).
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

# Score the test comments, fill the six label columns and save submission.csv.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
path = os.path.join(FOLDER, 'submission.csv')
submission.to_csv(path, index=False)

W1007 09:03:14.541217  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1007 09:03:18.533427  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1007 09:03:20.089231  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1007 09:03:21.749037  3376 deprecation_wrapper.py:119] From D:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1007 09:03:21.749037  3376 deprecation_wrapper.py:119] 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 1575s - loss: 0.0499 - acc: 0.9819 - val_loss: 0.0465 - val_acc: 0.9823

 ROC-AUC - epoch: 1 - score: 0.986975 

Epoch 2/2
 - 1561s - loss: 0.0379 - acc: 0.9853 - val_loss: 0.0456 - val_acc: 0.9825

 ROC-AUC - epoch: 2 - score: 0.987062 



In [20]:
# Copy the predictions into the six label columns and save the frame as
# submission.csv inside FOLDER (without the index column).
# NOTE(review): the training cell above already performs these same three
# steps with the same `y_pred`; this cell only repeats the write.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
path = os.path.join(FOLDER, 'submission.csv')
submission.to_csv(path, index=False)

In [21]:
# Sanity check: preview the first rows of the filled-in submission frame.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997472,0.445425,0.981227,0.09856,0.972344,0.444091
1,0000247867823ef7,0.00089,1.1e-05,0.000258,3e-06,9.3e-05,2.7e-05
2,00013b17ad220c46,0.001296,6.7e-05,0.000777,4.1e-05,0.00041,7.2e-05
3,00017563c3f7919a,0.001474,2.4e-05,0.0007,3.2e-05,0.00037,6e-06
4,00017695ad8997eb,0.007197,0.000178,0.002687,5.6e-05,0.0007,8e-05


In [22]:
# Display the raw prediction array returned by model.predict (one row per test
# comment, one column per label, as assigned to the six submission columns).
y_pred

array([[9.9747193e-01, 4.4542503e-01, 9.8122662e-01, 9.8560065e-02,
        9.7234356e-01, 4.4409132e-01],
       [8.9016557e-04, 1.0550022e-05, 2.5773048e-04, 3.1590462e-06,
        9.3340874e-05, 2.6911497e-05],
       [1.2955666e-03, 6.7085028e-05, 7.7745318e-04, 4.1455030e-05,
        4.0972233e-04, 7.1734190e-05],
       ...,
       [2.4020970e-03, 2.5779009e-05, 7.6711178e-04, 5.3942204e-06,
        4.3523312e-04, 4.1335821e-05],
       [2.4250746e-03, 3.3259392e-05, 3.4856796e-04, 1.9103289e-05,
        4.0900707e-04, 1.9114316e-03],
       [9.7665966e-01, 8.1960559e-03, 7.4142146e-01, 1.3203025e-03,
        4.4450170e-01, 1.7091930e-03]], dtype=float32)