In [1]:
from keras.layers import Dense, Embedding, Input, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Reshape
from keras.layers import Bidirectional, Dropout, CuDNNGRU, GRU
from keras.models import Model
from keras.optimizers import RMSprop
import pandas as pd
from toxic.nltk_utils import tokenize_sentences
from toxic.embedding_utils import read_embedding_list, clear_embedding_list, convert_tokens_to_ids
from toxic.nltk_utils import clean

import numpy as np

Using TensorFlow backend.


In [2]:
# Special vocabulary tokens used throughout the notebook.
# UNKNOWN_WORD is registered in both the word dictionary and the embedding
# dictionary (backed by an all-zeros vector).
UNKNOWN_WORD = "_UNK_"
# END_WORD is registered in the embedding dictionary, backed by an all -1 vector.
# NOTE(review): presumably used as the padding/terminator token by
# convert_tokens_to_ids -- confirm in toxic.embedding_utils.
END_WORD = "_END_"
# NAN_WORD replaces missing values in the comment_text column.
NAN_WORD = "_NAN_"

# Label columns of the training data; their order defines the order of y_train's columns.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Exponent applied to the fold-averaged predictions in the final ensembling step.
PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4

In [3]:
# Input locations: the train/test CSV files and the pre-trained word vectors.
embedding_path = "crawl-300d-2M.vec"
test_file_path = "test.csv"
train_file_path = "train.csv"


def _comment_texts(frame):
    """Return the comment_text column of `frame` as an array, with missing values replaced by NAN_WORD."""
    return frame["comment_text"].fillna(NAN_WORD).values


print("Loading data...")
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path))

# Text cleaning with toxic.nltk_utils.clean is currently disabled; the raw
# comment text is used as-is.

list_sentences_train = _comment_texts(train_data)
list_sentences_test = _comment_texts(test_data)

# Label matrix: one column per entry of CLASSES, in that order.
y_train = train_data[CLASSES].values

Loading data...


In [4]:
# Mark comments that carry none of the labels as "clean".
# The label columns are selected by name (CLASSES) rather than by position, so
# the count stays correct if the CSV column order changes and stays consistent
# with how y_train is built.
rowsums = train_data[CLASSES].sum(axis=1)
train_data['clean'] = (rowsums == 0)

# Summary counts: all comments, comments without any label, and the total
# number of labels assigned across the training set.
print("Total comments = ", len(train_data))
print("Total clean comments = ", train_data['clean'].sum())
print("Total tags =", rowsums.sum())


Total comments =  159571
Total clean comments =  143346
Total tags = 35098


In [5]:
# Tokenize the training comments, starting from an empty word dictionary.
# tokenize_sentences returns the tokenized sentences together with a word dictionary.
print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

# Tokenize the test comments, passing in the dictionary returned for the train set.
# NOTE(review): presumably the helper extends words_dict with words first seen
# in the test set -- confirm in toxic.nltk_utils.
print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

  0%|          | 107/159571 [00:00<02:29, 1063.21it/s]

Tokenizing sentences in train set...


100%|██████████| 159571/159571 [01:56<00:00, 1366.49it/s]
  0%|          | 143/153164 [00:00<01:47, 1428.95it/s]

Tokenizing sentences in test set...


100%|██████████| 153164/153164 [01:42<00:00, 1488.13it/s]


In [6]:
print(embedding_path)
# Reserve an id for the unknown-word token. setdefault keeps this cell
# idempotent: re-running it no longer moves the token to a fresh id (which
# would leave the previous id without any word attached to it).
words_dict.setdefault(UNKNOWN_WORD, len(words_dict))

# Read the pre-trained vectors; the helper returns the list of vectors and a
# dictionary keyed by word.
print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(file_path=embedding_path)
# Dimensionality of the vectors, taken from the first entry.
embedding_size = len(embedding_list[0])

crawl-300d-2M.vec
Loading embeddings...


100%|██████████| 1999999/1999999 [03:22<00:00, 9888.98it/s] 


In [7]:
import numpy as np
print("Preparing data...")
# Filter the loaded embeddings against the corpus vocabulary.
# NOTE(review): presumably keeps only words present in words_dict -- confirm
# in toxic.embedding_utils.clear_embedding_list.
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

# Append the two special tokens at the end of the table: the unknown-word token
# gets an all-zeros vector and the end token an all -1 vector. Each token's id
# is the dictionary size at the moment it is added.
for special_word, fill_value in ((UNKNOWN_WORD, 0.), (END_WORD, -1.)):
    embedding_word_dict[special_word] = len(embedding_word_dict)
    embedding_list.append([fill_value] * embedding_size)

# Final table as a 2-D array (rows: words, columns: embedding dimensions).
embedding_matrix = np.array(embedding_list)
print(embedding_matrix.shape)

Preparing data...
(170075, 300)


In [8]:
# Sanity check: show the (rows, columns) shape of the embedding matrix again.
print(embedding_matrix.shape)

(170075, 300)


In [9]:
# embedding_matrix = np.load("embedding_matrix.npy")

In [10]:
# from keras.backend import manual_variable_initialization
# manual_variable_initialization(True)



In [11]:
# Variables for the model
sequence_length = 500              # number of token ids per input sequence
sentences_length = sequence_length  # kept as an alias; previously a duplicated literal
result_path = "toxic_results"
batch_size = 256
recurrent_units = 64               # units per GRU direction
dropout_rate = 0.3
dense_size = 32
fold_count = 10                    # number of per-fold weight files


# Model Architecture
# The architecture must match the one the saved weight files were produced
# with, so only the layer arguments' spelling is touched here, not the layers.
input_layer = Input(shape=(sequence_length,))

# Frozen embedding layer initialised from the pre-trained embedding matrix.
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)(input_layer)

# Two stacked bidirectional GRUs with dropout in between; both return the full
# sequence so it can be pooled over time below.
# NOTE(review): reset_after=True with a sigmoid recurrent activation presumably
# keeps the layers weight-compatible with CuDNNGRU checkpoints -- confirm.
x = Bidirectional(GRU(recurrent_units, reset_after=True, recurrent_activation='sigmoid', return_sequences=True))(embedding_layer)
x = Dropout(dropout_rate)(x)
x = Bidirectional(GRU(recurrent_units, reset_after=True, recurrent_activation='sigmoid', return_sequences=True))(x)

# Summarise the sequence with max and average pooling, then concatenate both.
x_max = GlobalMaxPool1D()(x)
x_avg = GlobalAveragePooling1D()(x)
x = concatenate([x_max, x_avg])

# One independent sigmoid output per label; the width follows CLASSES instead
# of a hard-coded 6 so the two cannot drift apart.
output_layer = Dense(len(CLASSES), activation="sigmoid")(x)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), metrics=['accuracy'])



In [12]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 300)     51022500    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 500, 128)     140544      embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 500, 128)     0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
bidirectio

In [13]:
#model.save_weights("model{0}_weights.h5".format(1))

In [14]:
#model_0_weights = np.load("toxic_results/model0_weights.npy")

In [15]:
# model.load_weights("model0_weights.h5")

In [16]:
# model.get_weights()[0].shape

In [34]:
# Wrap the single comment to score in a list, the input shape tokenize_sentences is called with.
# NOTE(review): comment_text is not defined in any cell visible here; it must
# be assigned before this cell runs on a fresh kernel.
list_texts_to_predict = [comment_text]

In [35]:
# Tokenize the text(s) to score, passing in the existing word dictionary.
# NOTE: this rebinds tokenized_sentences_test, replacing the tokenized test set
# produced earlier in the notebook.
tokenized_sentences_test, words_dict = tokenize_sentences(list_texts_to_predict, words_dict)

100%|██████████| 1/1 [00:00<00:00, 2880.70it/s]


In [36]:
# Invert the vocabulary (word -> id) into an id -> word lookup.
id_to_word = {word_id: word for word, word_id in words_dict.items()}

# Turn the tokenized input into embedding-table ids via the project helper,
# passing sequence_length as the target length.
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test, id_to_word, embedding_word_dict, sequence_length)

# Model input as an array.
X_test = np.array(test_list_of_token_ids)

In [37]:
# Score the input once per fold: load that fold's weight file into the shared
# model, predict, and collect the per-fold outputs for ensembling.
# The number of folds comes from fold_count instead of a hard-coded 10, and the
# file name is built once so the printed and loaded names cannot diverge.
test_predicts_list = []
for fold_index in range(fold_count):
    weights_file = "model{0}_weights.h5".format(fold_index)
    print(weights_file)
    model.load_weights(weights_file)
    test_predicts = model.predict(X_test, batch_size=1)
    print(test_predicts)
    test_predicts_list.append(test_predicts)

model0_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  4.10551578e-02   9.42347324e-05   1.00320845e-03   3.91074136e-05
    3.98289226e-03   2.10573824e-04]]
model1_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  1.36008877e-02   1.17192816e-04   7.66469224e-04   9.32302646e-05
    8.31047015e-04   1.80955089e-04]]
model2_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  3.18567678e-02   7.89816040e-05   5.09556092e-04   1.11715170e-04
    5.41921193e-03   1.85749421e-04]]
model3_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  1.92193948e-02   9.65333602e-05   6.85599749e-04   6.28872222e-05
    1.41573511e-03   2.23680225e-04]]
model4_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  2.73008998e-02   8.54976897e-05   1.68198301e-03   2.88858919e-05
    3.35144438e-03   6.40737591e-04]]
model5_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[  1.52591150e-02   3.25870169e-05   2.95840029e-04   2.21366508e-06
    8.16736545e-04   8.80024018e-05]]
model6_weights.h5
(2, -1)
(2, -1)
(2, -1)
(2, -1)
[[

In [38]:
# Show the raw per-fold prediction arrays collected by the prediction loop.
print(test_predicts_list)

[array([[  4.10551578e-02,   9.42347324e-05,   1.00320845e-03,
          3.91074136e-05,   3.98289226e-03,   2.10573824e-04]], dtype=float32), array([[  1.36008877e-02,   1.17192816e-04,   7.66469224e-04,
          9.32302646e-05,   8.31047015e-04,   1.80955089e-04]], dtype=float32), array([[  3.18567678e-02,   7.89816040e-05,   5.09556092e-04,
          1.11715170e-04,   5.41921193e-03,   1.85749421e-04]], dtype=float32), array([[  1.92193948e-02,   9.65333602e-05,   6.85599749e-04,
          6.28872222e-05,   1.41573511e-03,   2.23680225e-04]], dtype=float32), array([[  2.73008998e-02,   8.54976897e-05,   1.68198301e-03,
          2.88858919e-05,   3.35144438e-03,   6.40737591e-04]], dtype=float32), array([[  1.52591150e-02,   3.25870169e-05,   2.95840029e-04,
          2.21366508e-06,   8.16736545e-04,   8.80024018e-05]], dtype=float32), array([[  4.20477204e-02,   9.99281256e-05,   2.26802984e-03,
          1.09087523e-04,   5.08441078e-03,   2.12198123e-04]], dtype=float32), array

In [39]:
# Combine the folds with a geometric mean: multiply the per-fold predictions
# element-wise into an accumulator of ones, then take the n-th root, where n is
# the number of folds.
ensemble_size = len(test_predicts_list)
test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    np.multiply(test_predicts, fold_predict, out=test_predicts)

np.power(test_predicts, 1. / ensemble_size, out=test_predicts)
print(test_predicts)

# Raise the averaged probabilities to the fixed normalisation exponent.
print(PROBABILITIES_NORMALIZE_COEFFICIENT)
np.power(test_predicts, PROBABILITIES_NORMALIZE_COEFFICIENT, out=test_predicts)
print(test_predicts)


[[  2.18401975e-02   8.00465756e-05   6.74920022e-04   3.34065822e-05
    1.85801107e-03   2.07941145e-04]]
1.4
[[  4.73106795e-03   1.83941558e-06   3.63875853e-05   5.41208817e-07
    1.50199199e-04   7.00029920e-06]]


In [40]:
# Show the final ensembled predictions (one value per entry of CLASSES).
print(test_predicts)

[[  4.73106795e-03   1.83941558e-06   3.63875853e-05   5.41208817e-07
    1.50199199e-04   7.00029920e-06]]
