In [72]:
import pandas as pd
import numpy  as np
from itertools import chain
from nltk.tokenize import wordpunct_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from crossvalidation import multilabel_label_combinations

import keras.backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, InputLayer, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, Dense
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# Load the labelled training comments and the unlabelled test comments.
dftrain, dftest = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))

In [15]:
# Normalise the comment column to plain Python strings for the tokenizers used
# later. Missing comments are read by pandas as NaN; fill them with an empty
# string first so they do not become the literal word "nan" when stringified.
dftrain['comment_text'] = dftrain['comment_text'].fillna('').apply(str)
dftest['comment_text'] = dftest['comment_text'].fillna('').apply(str)

In [3]:
# Preview the training frame: id, comment_text and the six binary label columns.
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
# Preview the test frame: id and comment_text only (no label columns).
dftest.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [6]:
# Dump every train and test comment into a single fastText input file, each
# comment prefixed with the same dummy label.
with open('fasttext-train.txt', 'w', encoding='utf-8') as target:
    target.writelines(
        "__label__0__\t{0}\n".format(comment)
        for comment in chain(dftrain['comment_text'], dftest['comment_text'])
    )

In [8]:
# Train unsupervised skip-gram vectors on fasttext-train.txt; the model is
# written under the fasttext-model prefix (fasttext-model.bin is read later).
# NOTE(review): no -dim is passed, so the vector width is fastText's default;
# later cells hard-code 100 — confirm the two agree for the installed version.
!fasttext skipgram -input fasttext-train.txt -output fasttext-model


Read 1M words
Read 2M words
Read 3M words
Read 4M words
Read 5M words
Read 6M words
Read 7M words
Read 8M words
Read 9M words
Read 10M words
Read 11M words
Read 12M words
Read 13M words
Read 14M words
Read 15M words
Read 16M words
Read 17M words
Read 18M words
Read 19M words
Read 20M words
Read 21M words
Read 22M words
Read 23M words
Read 24M words
Read 24M words
Number of words:  119980
Number of labels: 1

Progress: 0.0%  words/sec/thread: 21  lr: 0.050000  loss: 4.143619  eta: -596523h-14m 
Progress: 0.0%  words/sec/thread: 43  lr: 0.050000  loss: 4.152472  eta: 132h27m 
Progress: 0.0%  words/sec/thread: 69  lr: 0.050000  loss: 4.156240  eta: 65h19m 
Progress: 0.0%  words/sec/thread: 92  lr: 0.050000  loss: 4.157403  eta: 41h0m 
Progress: 0.0%  words/sec/thread: 118  lr: 0.050000  loss: 4.158525  eta: 30h42m 
Progress: 0.0%  words/sec/thread: 145  lr: 0.050000  loss: 4.158800  eta: 24h0m 
Progress: 0.0%  words/sec/thread: 178  lr: 0.050000  loss: 4.159240  eta: 20h3m 
Progress: 0.0

In [17]:
# Collect the distinct wordpunct tokens of the train and test comments.
# chain.from_iterable consumes the per-comment token lists lazily, whereas
# unpacking the map with `*` first builds one positional argument per comment.
train_wordset = set(chain.from_iterable(map(wordpunct_tokenize, dftrain['comment_text'])))
test_wordset = set(chain.from_iterable(map(wordpunct_tokenize, dftest['comment_text'])))
wordset = train_wordset | test_wordset

# Write one token per line as input for `fasttext print-word-vectors`.
# Sorted so the file is identical from run to run (set iteration order of
# strings is not stable across interpreter sessions).
with open('fasttext-words.txt', 'w', encoding='utf-8') as target:
    target.writelines("{0}\n".format(word) for word in sorted(wordset))

In [None]:
# Ask the trained model for a vector for every token in fasttext-words.txt and
# store the result in fasttext-word-vectors.txt. The next cell parses each
# output line as the word followed by space-separated float components.
!fasttext print-word-vectors fasttext-model.bin < fasttext-words.txt > fasttext-word-vectors.txt

In [23]:
# Parse the fastText output into a dense matrix (`embedding`, one row per word)
# and a word -> row lookup (`word2index`). Rows are assigned in file order.
embedding = np.zeros([len(wordset), 100])
word2index = {}
with open('fasttext-word-vectors.txt', 'r', encoding='utf-8') as src:
    for row in src:
        fields = row.strip().split(' ')
        word = fields[0]
        if not word:
            # Blank line: there is no word and no vector to store.
            continue
        # `np.float` was only an alias for the builtin `float` and was removed
        # in NumPy 1.24; the builtin yields the same float64 dtype.
        vector = np.fromiter(map(float, fields[1:]), dtype=float)
        # setdefault keeps the first row for a repeated word instead of handing
        # out an index that the next new word would then overwrite.
        idx = word2index.setdefault(word, len(word2index))
        embedding[idx, :] = vector

In [25]:
MAX_NB_WORDS = 30000

# Fit the Keras tokenizer on train and test comments together so both share
# one vocabulary; num_words caps the indices emitted by texts_to_sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(chain(dftrain['comment_text'], dftest['comment_text'])))

In [41]:
MAXLEN = 100

# Encode the training comments as integer sequences and pad/truncate each one
# to exactly MAXLEN entries.
sequences = tokenizer.texts_to_sequences(dftrain['comment_text'])
X = pad_sequences(sequences, maxlen=MAXLEN)

In [28]:
# Build the initial weights for the Keras Embedding layer: row `index` holds
# the fastText vector of the tokenizer word with that index. Row 0 is never
# assigned here and stays zero, as do rows of words without a vector.
# NOTE: this cell needs `embedding` to still be the NumPy matrix; a later cell
# rebinds that name to the Keras layer.
embedding_matrix = np.zeros([len(wordset) + 1, 100])

# The Keras Tokenizer lowercases text by default, while word2index keeps the
# original casing of the wordpunct tokens. Without a fallback, a word that only
# ever appears capitalised in the corpus would be left with a zero vector.
lower2index = {}
for known_word, known_row in word2index.items():
    lower2index.setdefault(known_word.lower(), known_row)

for word, index in tokenizer.word_index.items():
    # Prefer the exact match; otherwise reuse the first differently-cased variant.
    source_row = word2index.get(word, lower2index.get(word))
    if source_row is None:
        continue
    embedding_matrix[index, :] = embedding[source_row, :]

In [39]:
# Multi-label target matrix: one column per toxicity label.
y = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])

# Collapse each row of y into a single class id so the split can be stratified.
# NOTE(review): multilabel_label_combinations comes from the project-local
# `crossvalidation` module; presumably it returns the label rows to group by —
# confirm its contract (including the meaning of the argument 2) there.
y_combinations = multilabel_label_combinations(y, 2)
y_converted = np.zeros([len(y)])
for combo_id, combo in enumerate(y_combinations):
    # Every sample whose full label row equals this combination gets its id.
    y_converted[np.all(y == combo, axis=1)] = combo_id

In [40]:
# Hold out 10% of the training rows for validation, stratified on the collapsed
# label-combination id so the label mix is similar in both parts.
# `np.int` was only an alias for the builtin `int` and was removed in NumPy 1.24.
idx = np.arange(len(dftrain), dtype=int)
# A fixed random_state makes the split identical on every run.
train_idx, test_idx, _, _ = train_test_split(
    idx, y_converted,
    stratify=y_converted,
    test_size=0.1,
    random_state=42,
)

In [42]:
# Materialise the training and validation arrays from the index split.
train_X, train_y = X[train_idx], y[train_idx]
val_X, val_y = X[test_idx], y[test_idx]

In [45]:
# Frozen embedding layer initialised with the fastText vectors.
# Note: this rebinds `embedding`, which previously held the raw NumPy matrix.
embedding = Embedding(
    input_dim=len(wordset) + 1,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=MAXLEN,
    trainable=False,
)

In [73]:
# Bidirectional-LSTM classifier: pretrained embeddings -> BiLSTM over the whole
# sequence -> max over time -> small dense head with six sigmoid outputs.
model = Sequential()
model.add(InputLayer(input_shape=(MAXLEN,), dtype='int32'))
model.add(embedding)
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.1))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))
# One independent sigmoid per label, paired with binary cross-entropy.
model.add(Dense(6, activation='sigmoid'))

model.compile(optimizer='nadam', loss='binary_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          42534000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          60400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [76]:
# Train for up to 5 epochs. The checkpoint keeps only the best weights seen so
# far in model.h5; early stopping ends training after 2 epochs without
# improvement.
fit_callbacks = [
    ModelCheckpoint('model.h5', save_best_only=True),
    EarlyStopping(patience=2),
]
model.fit(
    train_X,
    train_y,
    validation_data=(val_X, val_y),
    epochs=5,
    batch_size=32,
    callbacks=fit_callbacks,
    verbose=1,
)

Train on 86265 samples, validate on 9586 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
  512/86265 [..............................] - ETA: 23:04 - loss: 0.0439

KeyboardInterrupt: 

In [77]:
# Restore the weights from model.h5, the file the ModelCheckpoint callback
# (save_best_only=True) writes during fit, so an interrupted or later-epoch
# state of the model is replaced by the best checkpoint.
model.load_weights('model.h5')

In [78]:
# Encode and pad the test comments exactly like the training comments.
# Note: this rebinds `sequences` and `X`, which held the training data.
sequences = tokenizer.texts_to_sequences(dftest['comment_text'])
X = pad_sequences(sequences, maxlen=MAXLEN)

In [80]:
# Score the test set; X holds the padded test sequences at this point.
# The model ends in Dense(6, sigmoid), so each row has six values.
prediction = model.predict(X, verbose=True)



In [81]:
# Load the sample submission to reuse its id column and layout as a template.
submission = pd.read_csv('input/sample_submission.csv')

In [82]:
# Overwrite the label columns with the predictions. The column order here is
# the same order used to build the training target matrix y, so output column
# k of the model lines up with label k.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# input/test.csv — confirm.
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = prediction

In [83]:
# Sanity-check the first rows of the filled-in submission.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.01527,8.847006e-05,0.001641,0.0002240585,0.00174,0.00047
1,6102620,0.000208,2.160608e-07,3e-05,3.814144e-07,1.7e-05,8e-06
2,14563293,0.000254,4.207182e-07,3.8e-05,8.167399e-07,1.8e-05,6e-06
3,21086297,0.004583,3.357466e-05,0.000566,7.805817e-05,0.000582,0.000248
4,22982444,0.006814,3.882337e-05,0.000813,9.414572e-05,0.000817,0.000253


In [84]:
# Write the submission file without the DataFrame index column.
submission.to_csv('output.csv', index=False)