In [None]:
import numpy as np 
import pandas as pd
import pickle

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Vocabulary cap handed to the tokenizer and used as the embedding input size.
max_features = 20000
# Fixed number of token ids per comment after padding/truncation.
maxlen = 100

# Load the train and test splits from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/train.csv",
        "/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/test.csv",
    )
)

In [None]:
# Target columns: one binary label per class, stacked into a 2-D array.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment strings for both splits; missing comments are replaced with the
# placeholder string "CVxTz" so every row is a string.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("CVxTz").values for frame in (train, test)
)

In [None]:
# Fit the vocabulary on the training comments only; the test comments are
# encoded with that same fitted tokenizer.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Convert each comment into a list of integer token ids.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Pad/truncate every id list to exactly `maxlen` entries.
X_t, X_te = (
    sequence.pad_sequences(tokenized, maxlen=maxlen)
    for tokenized in (list_tokenized_train, list_tokenized_test)
)

In [None]:
def get_model(embed_size=128, lstm_units=50, dense_units=50, dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Layer stack: integer input of length ``maxlen`` -> embedding over
    ``max_features`` token ids -> bidirectional LSTM returning the full
    sequence -> global max pooling -> dropout -> ReLU dense layer ->
    dropout -> sigmoid output layer.

    The sequence length and vocabulary size are read from the module-level
    ``maxlen`` and ``max_features``. Every argument defaults to the value that
    was previously hard-coded, so ``get_model()`` builds the same network as
    before.

    Args:
        embed_size: Width of each token embedding vector.
        lstm_units: Units per LSTM direction.
        dense_units: Units in the hidden dense layer.
        dropout_rate: Rate used by both dropout layers.
        num_classes: Number of sigmoid output units (one per label).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps one output per step so the pooling layer
    # below has a sequence to reduce.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): each output unit is scored independently.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Instantiate the compiled network and set the training schedule.
model = get_model()
batch_size, epochs = 32, 2

# Checkpoint callback: only overwrite the weights file when the monitored
# validation loss reaches a new minimum.
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Early stopping watches the same metric as the checkpoint.
# NOTE(review): patience=20 exceeds the configured number of epochs (2), so
# this callback presumably never fires in the current setup -- confirm intent.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

# Train with the last 10% of the training rows held out for validation.
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Restore the weights written by the checkpoint callback.
model.load_weights(file_path)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05113, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.05113 to 0.04837, saving model to weights_base.best.hdf5


In [None]:
# Score the padded test comments with the trained model.
y_test = model.predict(X_te)

# Fill the label columns of the sample submission with the predictions and
# write the result next to the notebook.
sample_submission = pd.read_csv(
    "/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/sample_submission.csv"
)
sample_submission[list_classes] = y_test
sample_submission.to_csv(
    "baseline.csv",
    index=False,
)

In [None]:
# Persist the fitted tokenizer so it can be reloaded later with pickle.
# The `with` block closes the file handle, which guarantees the pickle is
# fully flushed to disk (the bare open() call previously left it unclosed).
# NOTE(review): the file name is spelled "tokernizer.pickle"; it is kept
# unchanged because anything that loads this artifact would use the same path.
with open('/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/tokernizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Save the trained model into the `model/` directory on the mounted Drive.
model.save(
    '/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/model/'
)



INFO:tensorflow:Assets written to: /content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/model/assets
