In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten, Input, Lambda, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, BertConfig
import numpy as np
import string
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('../Data/new_ds.csv')

In [3]:
model_ans_col = 'Model_Answer'
ans_col = 'Answer'
label_col = 'Category'

In [4]:
def preprocess_data_for_model_training(df, model_ans_col='Model_Answer', ans_col='Answer'):
    """
    Preprocess data
    """
    print("Preprocessing data...")
    df[model_ans_col] = data[model_ans_col].apply(lambda x: str(x).lower())
    df[ans_col] = data[ans_col].apply(lambda x: str(x).lower())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.strip())
    df[ans_col] = df[ans_col].apply(lambda x: x.strip())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df[ans_col] = df[ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

    print("Data preprocessing is done")
    return df


In [5]:
# split data for deep learning model training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data[[model_ans_col, ans_col, 'LengthRatio', 'Cosine_Similarity']], data[label_col], test_size=0.2, random_state=256)

# convert to one-hot encoding
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)


In [6]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [7]:
embedding_model = tf.keras.Model(text_input, sequence_output)

In [8]:
# Define Input Layers
input_model_answer = Input(shape=(1,), dtype=tf.string)
input_answer = Input(shape=(1,), dtype=tf.string)
input_text_features = Input(shape=(1,) , dtype=tf.float32)

In [9]:
embedding_model_answer = embedding_model(input_model_answer)
embedding_answer = embedding_model(input_answer)

In [10]:
# get the differnce of the embedding vectors
diff_model_ans = Lambda(
        lambda x: x[0] - x[1])([embedding_model_answer, embedding_answer])

In [11]:
# apply the LSTM layer
lstm_model_answer = Bidirectional(
    LSTM(100, return_sequences=True))(diff_model_ans)

lstm_answer = Bidirectional(
    LSTM(100, return_sequences=True))(embedding_answer)


In [12]:
# apply dropout
dropout_model_answer = Dropout(0.2)(lstm_model_answer)
dropout_answer = Dropout(0.2)(lstm_answer)

In [13]:
# apply the Dense layer
dense_model_answer = Dense(100, activation='relu')(dropout_model_answer)
dense_answer = Dense(100, activation='relu')(dropout_answer)

In [14]:
feature_layer = Dense(100, activation='relu')(input_text_features)

In [15]:
# concatenate the outputs
concatenate_model_answer = concatenate([dense_model_answer, dense_answer])

In [16]:
# apply Dense layer
dense_model1 = Dense(200, activation='relu')(concatenate_model_answer)

# apply dropout
dropout_model1 = Dropout(0.2)(dense_model1)

In [17]:
# reduce the dimensionality
dense_model2 = Dense(150, activation='relu')(dropout_model1)

In [18]:
# include Flatten layer
flatten_model_answer = Flatten()(dense_model2)

In [19]:
concat_layer = concatenate([flatten_model_answer, feature_layer])

In [20]:
final_layer = Dense(50, activation='relu')(concat_layer)

In [21]:
# apply Dense layer - output layer
output_model_answer = Dense(3, activation='softmax')(final_layer)

In [22]:
# create the model
model = Model(inputs=[input_model_answer, input_answer, input_text_features], outputs=output_model_answer)

In [23]:
# compile the model
model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001), metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [24]:
# define class weights according to the number of samples in each class
num_samples_class_0 = len(X_train[y_train[:, 0] == 1])
num_samples_class_1 = len(X_train[y_train[:, 1] == 1])
num_samples_class_2 = len(X_train[y_train[:, 2] == 1])


class_weights = {0: num_samples_class_0 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2),
                    1: num_samples_class_1 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2),                                                     
                    2: num_samples_class_2 / (num_samples_class_0 + num_samples_class_1 + num_samples_class_2)}     

In [25]:
# fit the model
model.fit([X_train[model_ans_col], X_train[ans_col], X_train['LengthRatio']], y_train, epochs=100, batch_size=32,
              validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<keras.callbacks.History at 0x1f4a3cb9490>

In [26]:
# evaluate the model
scores = model.evaluate(
        [X_train[model_ans_col], X_train[ans_col], X_train[['LengthRatio']]], y_train, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss: %.2f" % scores[0])


Accuracy: 99.73%
Loss: 0.01


In [27]:
print("Test Accuracy: %.2f%%" % (model.evaluate(
        [X_test[model_ans_col], X_test[ans_col], np.asarray(X_test[['LengthRatio']]).astype(np.float32)], y_test, verbose=1)[1]*100))

Test Accuracy: 99.46%


In [28]:
model.input

[<KerasTensor: shape=(None, 1) dtype=string (created by layer 'input_2')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'input_3')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_4')>]

In [29]:
np.asarray(X_test[['LengthRatio', 'Cosine_Similarity']].iloc[0:1]).astype(np.float32).shape

(1, 2)

In [30]:
model_ans_p = "python3 change_ch10_files.py"
ans_p = "OOP is very important in programming"

cosine_similarity_p = 0.912
length_ration_p = get_length_ration(model_ans_p, ans_p)

testdf = pd.DataFrame(columns=['Model_Answer', 'Answer', 'LengthRatio', 'Cosine_Similarity'])
testdf.loc[0] = [model_ans_p, ans_p, length_ration_p, cosine_similarity_p]

prediction = model.predict([testdf[model_ans_col].iloc[0:1], testdf[ans_col].iloc[0:1], np.asarray(testdf[['LengthRatio']].iloc[0:1]).astype(np.float32)])
print("Category:", np.argmax(prediction))

NameError: name 'get_length_ration' is not defined

In [None]:
def get_length_ration(model_ans, ans):
    return len(ans)/len(model_ans)

In [33]:
y_pred = model.predict([X_test[model_ans_col], X_test[ans_col], np.asarray(X_test[['LengthRatio']]).astype(np.float32)])

In [32]:
#import libraries for Root Mean Squared Error
from sklearn.metrics import mean_squared_error
from math import sqrt

#root mean square error of the model
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

#calculate the root mean square error of the model
print("Root Mean Squared Error:", rmse(y_test, model.predict(
        [X_test[model_ans_col], X_test[ans_col], np.asarray(X_test[['LengthRatio']]).astype(np.float32)])))

Root Mean Squared Error: 0.0565322772807057


In [38]:
# y_pred to categorical
y_predd = np.argmax(y_pred, axis=1)
print(y_predd  )

[1 2 1 ... 2 2 2]


In [37]:
y_test

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [35]:
# Pearsons correlation coefficient
from scipy.stats import pearsonr


def get_pearsonr(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]


#calculate the Pearson correlation coefficient of the model
print("Pearson's Correlation Coefficient:", get_pearsonr(y_test, model.predict(
        [X_test[model_ans_col], X_test[ans_col], np.asarray(X_test[['LengthRatio']]).astype(np.float32)])))


ValueError: shapes (7564,3) and (7564,3) not aligned: 3 (dim 1) != 7564 (dim 0)

In [31]:
# save the model
import datetime
isSave = 'y'

if isSave == 'y':
    model.save('./Models/model' + str(datetime.datetime.now().timestamp().__round__()) + '.h5')
    print("Model saved")

Model saved
