In [14]:
import tensorflow as tf
from tensorflow.keras import layers

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
import datetime
import pickle


In [15]:
# Ask TensorFlow whether a GPU is available, using the least restrictive query
# (not limited to CUDA devices, no minimum compute capability).
# NOTE(review): tf.test.is_gpu_available is marked deprecated in TF 2.x in favour of
# tf.config.list_physical_devices('GPU') — confirm against the installed TF version
# before switching.
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

True

In [16]:
# Show the name of the GPU device TensorFlow reports for this session.
tf.test.gpu_device_name()

'/device:GPU:0'

In [17]:
# --- Tunable constants -------------------------------------------------------
# Every comment is padded/truncated to this many tokens before entering the model.
PADDING_LENGTH = 100
# Width of one word vector; presumably has to match the "50d" GloVe file below.
embed_size = 50
# Pre-trained GloVe word vectors (plain text, one word per line).
FEATURE_PATH = 'features/glove.6B.50d.txt'

# --- Input data --------------------------------------------------------------
# Pre-cleaned training comments together with their label columns.
train_set = pd.read_csv("dataset/rnn_cleaned_train.csv")

In [18]:
def create_embedding_matrix(word_index, filename, embed_dim=50):
    """Build an embedding weight matrix from a GloVe-style text file.

    Each line of ``filename`` is expected to hold a word followed by
    ``embed_dim`` space-separated floats. Row ``i`` of the returned matrix
    is the vector of the word whose index in ``word_index`` is ``i``; words
    that do not appear in the file keep an all-zero row, as does row 0
    (no word in ``word_index`` maps to it when indices start at 1).

    Args:
        word_index: Mapping of ``str`` word -> positive integer index.
        filename: Path to the UTF-8 encoded embedding file.
        embed_dim: Number of components per word vector (default 50).

    Returns:
        ``numpy.ndarray`` of shape ``(len(word_index) + 1, embed_dim)``.

    Raises:
        ValueError: If a non-empty line does not carry exactly
            ``embed_dim`` vector components.
    """
    embeddings_index = {}
    # Read as text, not bytes: word_index is keyed by str, and bytes keys
    # read from a binary file would never match, leaving the matrix all zeros.
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Split on plain spaces only so tokens containing other
            # whitespace-like characters stay intact.
            parts = line.rstrip().split(' ')
            if parts == ['']:
                continue  # blank line
            word, components = parts[0], parts[1:]
            if len(components) != embed_dim:
                raise ValueError(
                    "%s line %d: expected %d vector components, found %d"
                    % (filename, line_no, embed_dim, len(components))
                )
            embeddings_index[word] = np.asarray(components, dtype='float32')

    # One row per vocabulary index, plus row 0.
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [19]:
# Raw comment text: the model input.
train_text = train_set.comment_text

# Label matrix: every column from the third one onward, cast to int.
# NOTE(review): assumes the first two columns are non-label columns (e.g. id and
# comment_text) — confirm against the CSV header.
train_labels = np.asarray(train_set.iloc[:, 2:], dtype=int)

# Fixed seed so the train/held-out split (and everything derived from it) is
# the same on every Restart & Run All.
SPLIT_SEED = 42
x_train_clean, x_test_clean, y_train, y_test = train_test_split(
    train_text,
    train_labels,
    test_size=.2,
    shuffle=True,
    random_state=SPLIT_SEED,
)

x_train_clean = x_train_clean.tolist()
x_test_clean = x_test_clean.tolist()

# Tokenize the comment_text.
# The vocabulary is learned from the training split only, so the held-out
# split is encoded exactly like unseen text would be at inference time
# (no vocabulary leakage from the evaluation data).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_clean)

# Integer-encode each split and pad/truncate it to PADDING_LENGTH tokens.
x_train_seq = tokenizer.texts_to_sequences(x_train_clean)
x_train_pad = pad_sequences(x_train_seq, maxlen=PADDING_LENGTH)

x_test_seq = tokenizer.texts_to_sequences(x_test_clean)
x_test_pad = pad_sequences(x_test_seq, maxlen=PADDING_LENGTH)

word_index = tokenizer.word_index

# +1 so the embedding table also has a row for index 0.
max_features = len(word_index)+1

In [20]:
# Look up a pre-trained vector for every word in the tokenizer vocabulary.
embedding_matrix = create_embedding_matrix(
    word_index=word_index,
    filename=FEATURE_PATH,
)

In [21]:
# Persist the fitted tokenizer next to the model so the same word -> index
# mapping can be reloaded later.
with open('models/RNN/rnn_tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [22]:
# Reset Keras' global state before (re)building the model below.
tf.keras.backend.clear_session()

In [23]:
# Architecture: embedding -> LSTM -> max-pool over time -> dense head with one
# sigmoid output per label column.

# Input length follows PADDING_LENGTH so it always matches what pad_sequences
# produced, instead of repeating the literal value here.
inp = layers.Input(shape=(PADDING_LENGTH, ))
# Embedding table initialised from the pre-trained matrix.
x = layers.Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
# return_sequences=True keeps one output per time step for the pooling layer.
x = layers.LSTM(60, return_sequences=True, name='lstm_layer')(x)
x = layers.GlobalMaxPool1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(50, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One independent sigmoid per label; the width is taken from the label matrix
# so the head cannot drift out of sync with the data.
x = layers.Dense(y_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[tf.keras.metrics.AUC(), 
                       tf.keras.metrics.Recall(), 
                       tf.keras.metrics.Precision()])

In [24]:
# Sanity check: (number of training comments, PADDING_LENGTH).
x_train_pad.shape

(127656, 100)

In [25]:
# Train on the padded training split, evaluating on the held-out split after
# every epoch.
model.fit(
    x_train_pad,
    y_train,
    validation_data=(x_test_pad, y_test),
    epochs=10,
    batch_size=128,
    shuffle=True,
)
# Save the entire model as a SavedModel.
model.save('models/RNN/my_rnn_model')

Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# NOTE(review): this repeats the model.save call that already ends the training
# cell, and the cell's execution count (13) is lower than the cells before it,
# so it looks like a leftover from an earlier session — consider removing it.
model.save('models/RNN/my_rnn_model')