In [1]:
from keras.layers import Dense, Embedding, Input, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Reshape
from keras.layers import Bidirectional, Dropout, CuDNNGRU, GRU
from keras.models import Model
from keras.optimizers import RMSprop
import pandas as pd
from toxic.nltk_utils import tokenize_sentences
from toxic.embedding_utils import read_embedding_list, clear_embedding_list, convert_tokens_to_ids

Using TensorFlow backend.


In [2]:
UNKNOWN_WORD = "_UNK_"
END_WORD = "_END_"
NAN_WORD = "_NAN_"

CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4

In [3]:
train_file_path = "train.csv"
test_file_path = "test.csv"
embedding_path = "crawl-300d-2M.vec"
print("Loading data...")
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
y_train = train_data[CLASSES].values

Loading data...


In [4]:
# #marking comments without any tags as "clean"
# rowsums=train_data.iloc[:,2:8].sum(axis=1)
# train_data['clean']=(rowsums==0)
# #count number of clean entries
# train_data['clean'].sum()
# print("Total comments = ",len(train_data))
# print("Total clean comments = ",train_data['clean'].sum())
# print("Total tags =",rowsums.sum())


In [5]:
print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

  0%|          | 114/159571 [00:00<02:20, 1133.96it/s]

Tokenizing sentences in train set...


100%|██████████| 159571/159571 [01:54<00:00, 1395.28it/s]
  0%|          | 143/153164 [00:00<01:47, 1427.19it/s]

Tokenizing sentences in test set...


100%|██████████| 153164/153164 [01:41<00:00, 1508.80it/s]


In [6]:
print(embedding_path)
words_dict[UNKNOWN_WORD] = len(words_dict)

print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(file_path=embedding_path)
embedding_size = len(embedding_list[0])

crawl-300d-2M.vec
Loading embeddings...


100%|██████████| 1999999/1999999 [03:21<00:00, 9902.63it/s] 


In [7]:
import numpy as np
print("Preparing data...")
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
embedding_list.append([0.] * embedding_size)
embedding_word_dict[END_WORD] = len(embedding_word_dict)
embedding_list.append([-1.] * embedding_size)

embedding_matrix = np.array(embedding_list)

Preparing data...


In [49]:
# Variables for the model
sequence_length = 500
result_path = "toxic_results"
batch_size = 256
sentences_length = 500
recurrent_units=64
dropout_rate = 0.3
dense_size=32
fold_count=10


# Model Architecture
input_layer = Input(shape=(sequence_length,))
embedding_layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                            weights=[embedding_matrix], trainable=False)(input_layer)
x = Bidirectional(GRU(recurrent_units, reset_after=True, recurrent_activation='sigmoid', return_sequences=True, implementation=2))(embedding_layer)
x = Dropout(dropout_rate)(x)
x = Bidirectional(GRU(recurrent_units, reset_after=True,  recurrent_activation='sigmoid', return_sequences=True))(x)
x_max = GlobalMaxPool1D()(x)
x_avg = GlobalAveragePooling1D()(x)
x = concatenate([x_max, x_avg])
output_layer = Dense(6, activation="sigmoid")(x)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), metrics=['accuracy'])



# input_layer = Input(shape=(sequence_length,))
# embedding_layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
#                             weights=[embedding_matrix], trainable=False)(input_layer)
# x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)
# x = Dropout(dropout_rate)(x)
# x = Bidirectional(GRU(recurrent_units, return_sequences=False))(x)
# x = Dense(dense_size, activation="relu")(x)
# output_layer = Dense(6, activation="sigmoid")(x)
# model = Model(inputs=input_layer, outputs=output_layer)
# model.compile(loss='binary_crossentropy',
#               optimizer=RMSprop(clipvalue=1, clipnorm=1),
#               metrics=['accuracy'])


In [50]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 500, 300)     51022500    input_7[0][0]                    
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional (None, 500, 128)     140544      embedding_7[0][0]                
__________________________________________________________________________________________________
dropout_7 (Dropout)             (None, 500, 128)     0           bidirectional_10[0][0]           
__________________________________________________________________________________________________
bidirectio

In [61]:
model.save_weights("model{0}_weights.h5".format(1))

In [62]:
model_0_weights = np.load("toxic_results/model0_weights.npy")

In [63]:
model.get_weights()[3].shape

(2, 192)

In [64]:
model_0_weights[3].shape

(384,)

In [65]:
model.set_weights(model_0_weights)

ValueError: Shapes must be equal rank, but are 2 and 1 for 'Assign_8' (op: 'Assign') with input shapes: [2,192], [384].