In [1]:
import tensorboard

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten, Input, Lambda, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, BertConfig
import numpy as np
import string
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('../Data/improvedds_v3.csv')

In [4]:
model_ans_col = 'Model_Answer'
ans_col = 'Answer'
label_col = 'Category'

In [5]:
def preprocess_data_for_model_training(df, model_ans_col='Model_Answer', ans_col='Answer'):
    """
    Preprocess data
    """
    print("Preprocessing data...")
    df[model_ans_col] = data[model_ans_col].apply(lambda x: str(x).lower())
    df[ans_col] = data[ans_col].apply(lambda x: str(x).lower())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.strip())
    df[ans_col] = df[ans_col].apply(lambda x: x.strip())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df[ans_col] = df[ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

    #remove br tags
    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.replace('<br>', ' '))
    df[ans_col] = df[ans_col].apply(lambda x: x.replace('<br>', ' '))
    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.replace('<br/>', ' '))
    df[ans_col] = df[ans_col].apply(lambda x: x.replace('<br/>', ' '))

    #remove multiple spaces
    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.replace('  ', ' '))
    df[ans_col] = df[ans_col].apply(lambda x: x.replace('  ', ' '))

    #remove newlines
    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.replace('\n', ' '))
    df[ans_col] = df[ans_col].apply(lambda x: x.replace('\n', ' '))

    print("Data preprocessing is done")
    return df


In [6]:
#shuffle data
data = data.sample(frac=1).reset_index(drop=True)

In [7]:
# split data for deep learning model training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data[[model_ans_col, ans_col, 'LengthRatio', 'Cosine_Similarity']], data[label_col], test_size=0.2, random_state=156)

In [8]:
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [9]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [10]:
embedding_model = tf.keras.Model(text_input, sequence_output)

In [11]:
# Define Input Layers
input_model_answer = Input(shape=(1,), dtype=tf.string)
input_answer = Input(shape=(1,), dtype=tf.string)
#input_text_features = Input(shape=(2,) , dtype=tf.float32)

In [12]:
embedding_model_answer = embedding_model(input_model_answer)
embedding_answer = embedding_model(input_answer)

In [13]:
# get the differnce of the embedding vectors
diff_model_ans = Lambda(
        lambda x: x[0] - x[1])([embedding_model_answer, embedding_answer])

In [14]:
# apply the LSTM layer
lstm_model_answer = Bidirectional(
    LSTM(100, return_sequences=True))(diff_model_ans)

lstm_answer = Bidirectional(
    LSTM(100, return_sequences=True))(embedding_answer)


In [15]:
# apply dropout
dropout_model_answer = Dropout(0.25)(lstm_model_answer)
dropout_answer = Dropout(0.25)(lstm_answer)

In [16]:
# apply the Dense layer
#dense_model_answer = Dense(10, activation='relu')(dropout_model_answer)
#dense_answer = Dense(10, activation='relu')(dropout_answer)

In [17]:
#feature_layer = Dense(60, activation='relu')(input_text_features)

In [18]:
# concatenate the outputs
#concatenate_model_answer = concatenate([dense_model_answer, dense_answer])
concatenate_model_answer = concatenate([dropout_model_answer, dropout_answer])

In [19]:
# apply Dense layer
dense_model1 = Dense(40, activation='relu')(concatenate_model_answer)

# apply dropout
dropout_model1 = Dropout(0.2)(dense_model1)

In [20]:
# reduce the dimensionality
#dense_model2 = Dense(20, activation='relu')(dropout_model1)

In [21]:
# include Flatten layer
flatten_model_answer = Flatten()(dense_model1)

In [22]:
#concat_layer = concatenate([flatten_model_answer, feature_layer])

In [23]:
#final_layer = Dense(5, activation='relu')(concat_layer)

In [24]:
# apply Dense layer - output layer
#output_model_answer = Dense(3, activation='softmax')(final_layer)
output_model_answer = Dense(3, activation='softmax')(flatten_model_answer)

In [25]:
# create the model
#model = Model(inputs=[input_model_answer, input_answer, input_text_features], outputs=output_model_answer)
model = Model(inputs=[input_model_answer, input_answer], outputs=output_model_answer)

In [26]:
# compile the model
model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001), metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [27]:
from tensorflow.keras.callbacks import TensorBoard
import time

# logs about the training process
# accuracy and loss are plotted in the TensorBoard
log_dir = "logs/fit/" + str(int(time.time()))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [28]:
# define class weights according to the number of samples in each class
num_samples_class_0 = len(X_train[y_train[:, 0] == 1.0])
num_samples_class_1 = len(X_train[y_train[:, 1] == 1.0])
num_samples_class_2 = len(X_train[y_train[:, 2] == 1.0])


class_weights = {0: num_samples_class_0 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2),
                    1: num_samples_class_1 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2),                                                     
                    2: num_samples_class_2 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2)}     

In [29]:
# fit the model
#history = model.fit([X_train[model_ans_col], X_train[ans_col], X_train[['LengthRatio', 'Cosine_Similarity']]], y_train, epochs=50, batch_size=32,
#              validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), tensorboard_callback])
history = model.fit([X_train[model_ans_col], X_train[ans_col]], y_train, epochs=50, batch_size=32,
              validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5), tensorboard_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


In [30]:
# evaluate the model
# scores = model.evaluate(
#         [X_train[model_ans_col], X_train[ans_col], X_train[['LengthRati', 'Cosine_Similarity']]], y_train, verbose=0)
scores = model.evaluate(
        [X_train[model_ans_col], X_train[ans_col]], y_train, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss: %.2f" % scores[0])


Accuracy: 97.88%
Loss: 0.06


In [31]:
print("Test Accuracy: %.2f%%" % (model.evaluate(
        [X_test[model_ans_col], X_test[ans_col]], y_test, verbose=1)[1]*100))

Test Accuracy: 96.62%


In [32]:
model_ans_p = "When the array is not sorted"
ans_p = "When the array is ordered in ascending order"

def get_length_ration(model_ans, ans):
    return len(ans)/len(model_ans)

cosine_similarity_p = 0.912
length_ration_p = get_length_ration(model_ans_p, ans_p)

testdf = pd.DataFrame(columns=['Model_Answer', 'Answer', 'LengthRatio', 'Cosine_Similarity'])
testdf.loc[0] = [model_ans_p, ans_p, length_ration_p, cosine_similarity_p]

prediction = model.predict([testdf[model_ans_col].iloc[0:1], testdf[ans_col].iloc[0:1]])
print("Category:", np.argmax(prediction))

Category: 2


In [33]:
prediction

array([[1.1867922e-01, 2.3843113e-05, 8.8129693e-01]], dtype=float32)

In [34]:
# save the model
import datetime
isSave = 'y'

if isSave == 'y':
    model.save('./Models/model' + str(datetime.datetime.now().timestamp().__round__()) + '.h5')
    print("Model saved")

Model saved
