In [28]:
from keras.layers import Dense, Embedding, Input, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Reshape
from keras.layers import Bidirectional, Dropout, CuDNNGRU, GRU
from keras.models import Model
from keras.optimizers import RMSprop
import pandas as pd
from toxic.nltk_utils import tokenize_sentences
from toxic.embedding_utils import read_embedding_list, clear_embedding_list, convert_tokens_to_ids
from toxic.nltk_utils import clean

In [29]:
UNKNOWN_WORD = "_UNK_"
END_WORD = "_END_"
NAN_WORD = "_NAN_"

CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4

In [30]:
train_file_path = "train.csv"
test_file_path = "test.csv"
embedding_path = "crawl-300d-2M.vec"
print("Loading data...")
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

train_data['comment_text'] = train_data.apply(lambda x: clean(x.comment_text), axis=1)
train_data['comment_text'] = train_data.apply(lambda x: clean(x.comment_text), axis=1)

list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
y_train = train_data[CLASSES].values

Loading data...


In [4]:
# #marking comments without any tags as "clean"
# rowsums=train_data.iloc[:,2:8].sum(axis=1)
# train_data['clean']=(rowsums==0)
# #count number of clean entries
# train_data['clean'].sum()
# print("Total comments = ",len(train_data))
# print("Total clean comments = ",train_data['clean'].sum())
# print("Total tags =",rowsums.sum())


In [31]:
print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

  0%|          | 392/159571 [00:00<00:40, 3909.99it/s]

Tokenizing sentences in train set...


100%|██████████| 159571/159571 [00:43<00:00, 3636.74it/s]
  0%|          | 136/153164 [00:00<01:52, 1357.85it/s]

Tokenizing sentences in test set...


100%|██████████| 153164/153164 [01:51<00:00, 1374.15it/s]


In [32]:
print(embedding_path)
words_dict[UNKNOWN_WORD] = len(words_dict)

print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(file_path=embedding_path)
embedding_size = len(embedding_list[0])

crawl-300d-2M.vec
Loading embeddings...


100%|██████████| 1999999/1999999 [03:47<00:00, 8775.32it/s]


In [33]:
import numpy as np
print("Preparing data...")
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
embedding_list.append([0.] * embedding_size)
embedding_word_dict[END_WORD] = len(embedding_word_dict)
embedding_list.append([-1.] * embedding_size)

embedding_matrix = np.array(embedding_list)
print(embedding_matrix.shape)

Preparing data...
(166561, 300)


In [41]:
# Variables for the model
sequence_length = 500
result_path = "toxic_results"
batch_size = 256
sentences_length = 500
recurrent_units=64
dropout_rate = 0.3
dense_size=32
fold_count=10


# Model Architecture
input_layer = Input(shape=(sequence_length,))
embedding_layer = Embedding(embedding_matrix_bkp.shape[0], embedding_matrix_bkp.shape[1],
                            weights=[embedding_matrix_bkp], trainable=False)(input_layer)
x = Bidirectional(GRU(recurrent_units, reset_after=True, recurrent_activation='sigmoid', return_sequences=True))(embedding_layer)
x = Dropout(dropout_rate)(x)
x = Bidirectional(GRU(recurrent_units, reset_after=True,  recurrent_activation='sigmoid', return_sequences=True))(x)
x_max = GlobalMaxPool1D()(x)
x_avg = GlobalAveragePooling1D()(x)
x = concatenate([x_max, x_avg])
output_layer = Dense(6, activation="sigmoid")(x)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), metrics=['accuracy'])

#     input_layer = Input(shape=(sequence_length,))
#     embedding_layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
#                                 weights=[embedding_matrix], trainable=False)(input_layer)
#     x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(embedding_layer)
#     x = Dropout(dropout_rate)(x)
#     x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(x)
#     x_max = GlobalMaxPool1D()(x)
#     x_avg = GlobalAveragePooling1D()(x)
#     x = concatenate([x_max, x_avg])
#     # x = Dense(dense_size, activation="relu")(x)
#     output_layer = Dense(6, activation="sigmoid")(x)
#     model = Model(inputs=input_layer, outputs=output_layer)
#     model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), metrics=['accuracy'])


In [35]:
print(embedding_matrix.shape)

(166561, 300)


In [39]:
embedding_matrix_bkp = embedding_matrix[0:153945,]

In [40]:
print(embedding_matrix_bkp.shape)

(153945, 300)


In [21]:
#embedding_matrix = embedding_matrix_bkp

In [24]:
#print(embedding_matrix.shape)

(153945, 300)


In [42]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 500, 300)     46183500    input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 500, 128)     140544      embedding_3[0][0]                
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 500, 128)     0           bidirectional_5[0][0]            
__________________________________________________________________________________________________
bidirectio

In [11]:
#model.save_weights("model{0}_weights.h5".format(1))

In [12]:
#model_0_weights = np.load("toxic_results/model0_weights.npy")

In [58]:
model.load_weights("toxic_results/model0_weights.h5")

(2, -1)
(2, -1)
(2, -1)
(2, -1)


In [59]:
model.get_weights()[0].shape

(153945, 300)

In [65]:
print(train_data['comment_text'][3])

morei cannot make real suggestions improvement wonder section statistics later subsection type accidents think reference may need tidy exact format ie date format etc later noone else first preferences format style reference want please let knowthere appear backlog article review guess may delay reviewer turn list relevant form eg wikipediagoodarticlenominationstransport


In [68]:
id_to_word = dict((id, word) for word, id in words_dict.items())
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test,
    id_to_word,
    embedding_word_dict,
    sequence_length)
X_test = np.array(test_list_of_token_ids)

test_predicts = model.predict(X_test, batch_size=256)

InvalidArgumentError: indices[0,20] = 166559 is not in [0, 153945)
	 [[Node: embedding_3/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_3/embeddings/read, embedding_3/Cast)]]

Caused by op 'embedding_3/Gather', defined at:
  File "/Users/sdivakarla/anaconda/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/sdivakarla/anaconda/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-41-81d8e7da67b2>", line 15, in <module>
    weights=[embedding_matrix_bkp], trainable=False)(input_layer)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/Keras-2.1.4-py3.5.egg/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/Keras-2.1.4-py3.5.egg/keras/layers/embeddings.py", line 138, in call
    out = K.gather(self.embeddings, inputs)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/Keras-2.1.4-py3.5.egg/keras/backend/tensorflow_backend.py", line 1211, in gather
    return tf.gather(reference, indices)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 2409, in gather
    validate_indices=validate_indices, name=name)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1219, in gather
    validate_indices=validate_indices, name=name)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/sdivakarla/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[0,20] = 166559 is not in [0, 153945)
	 [[Node: embedding_3/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_3/embeddings/read, embedding_3/Cast)]]


In [None]:
print(test_predicts)