In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten, Input, Lambda, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, BertConfig
import numpy as np
import string
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('../Data/merged.csv')

In [3]:
model_ans_col = 'Model_Answer'
ans_col = 'Answer'
label_col = 'Category'

In [4]:
def preprocess_data_for_model_training(df, model_ans_col='Model_Answer', ans_col='Answer'):
    """
    Preprocess data
    """
    print("Preprocessing data...")
    df[model_ans_col] = data[model_ans_col].apply(lambda x: str(x).lower())
    df[ans_col] = data[ans_col].apply(lambda x: str(x).lower())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.strip())
    df[ans_col] = df[ans_col].apply(lambda x: x.strip())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df[ans_col] = df[ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

    print("Data preprocessing is done")
    return df


In [5]:
# split data for deep learning model training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data[[model_ans_col, ans_col, 'LengthRatio', 'Cosine_Similarity']], data[label_col], test_size=0.2, random_state=176)

# convert to one-hot encoding
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)


In [6]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [7]:
embedding_model = tf.keras.Model(text_input, sequence_output)

In [131]:
# Define Input Layers
input_model_answer = Input(shape=(1,), dtype=tf.string)
input_answer = Input(shape=(1,), dtype=tf.string)
input_text_features = Input(shape=(1,) , dtype=tf.float32)

In [132]:
embedding_model_answer = embedding_model(input_model_answer)
embedding_answer = embedding_model(input_answer)

In [133]:
# get the differnce of the embedding vectors
diff_model_ans = Lambda(
        lambda x: x[0] - x[1])([embedding_model_answer, embedding_answer])

In [134]:
# apply the LSTM layer
lstm_model_answer = Bidirectional(
    LSTM(100, return_sequences=True))(diff_model_ans)

lstm_answer = Bidirectional(
    LSTM(100, return_sequences=True))(embedding_answer)


In [135]:
# apply dropout
dropout_model_answer = Dropout(0.2)(lstm_model_answer)
dropout_answer = Dropout(0.2)(lstm_answer)

In [136]:
# apply the Dense layer
dense_model_answer = Dense(100, activation='relu')(dropout_model_answer)
dense_answer = Dense(100, activation='relu')(dropout_answer)

In [137]:
feature_layer = Dense(100, activation='relu')(input_text_features)

In [138]:
# concatenate the outputs
concatenate_model_answer = concatenate([dense_model_answer, dense_answer])

In [139]:
# apply Dense layer
dense_model1 = Dense(500, activation='relu')(concatenate_model_answer)

# apply dropout
dropout_model1 = Dropout(0.2)(dense_model1)

In [140]:
# reduce the dimensionality
dense_model2 = Dense(250, activation='relu')(dropout_model1)

In [141]:
# include Flatten layer
flatten_model_answer = Flatten()(dense_model2)

In [142]:
concat_layer = concatenate([flatten_model_answer, feature_layer])

In [143]:
final_layer = Dense(50, activation='relu')(concat_layer)

In [144]:
# apply Dense layer - output layer
output_model_answer = Dense(3, activation='softmax')(final_layer)

In [145]:
# create the model
model = Model(inputs=[input_model_answer, input_answer, input_text_features], outputs=output_model_answer)

In [146]:
# compile the model
model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001), metrics=['accuracy'])

In [147]:
# fit the model
model.fit([X_train[model_ans_col], X_train[ans_col], X_train['LengthRatio']], y_train, epochs=100, batch_size=32,
              validation_split=0.3, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


<keras.callbacks.History at 0x11bca2bbdc0>

In [149]:
# evaluate the model
scores = model.evaluate(
        [X_train[model_ans_col], X_train[ans_col], X_train[['LengthRatio']]], y_train, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss: %.2f" % scores[0])


Accuracy: 90.53%
Loss: 0.34


In [150]:
print("Test Accuracy: %.2f%%" % (model.evaluate(
        [X_test[model_ans_col], X_test[ans_col], np.asarray(X_test[['LengthRatio']]).astype(np.float32)], y_test, verbose=1)[1]*100))

Test Accuracy: 75.05%


In [90]:
model.input

[<KerasTensor: shape=(None, 1) dtype=string (created by layer 'input_5')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'input_6')>,
 <KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'input_7')>]

In [112]:
np.asarray(X_test[['LengthRatio', 'Cosine_Similarity']].iloc[0:1]).astype(np.float32).shape

(1, 2)

In [181]:
model_ans_p = "Narendra Modi is the prime minister of India"
ans_p = "OOP is very important in programming"

cosine_similarity_p = 0.912
length_ration_p = get_length_ration(model_ans_p, ans_p)

testdf = pd.DataFrame(columns=['Model_Answer', 'Answer', 'LengthRatio', 'Cosine_Similarity'])
testdf.loc[0] = [model_ans_p, ans_p, length_ration_p, cosine_similarity_p]

prediction = model.predict([testdf[model_ans_col].iloc[0:1], testdf[ans_col].iloc[0:1], np.asarray(testdf[['LengthRatio']].iloc[0:1]).astype(np.float32)])
print("Category:", np.argmax(prediction))

Category: 1


In [171]:
def get_length_ration(model_ans, ans):
    return len(ans)/len(model_ans)

In [172]:
import re

def preprocess_text_for_inferencing(model_ans, ans):
    model_ans = str(model_ans).lower()
    ans = str(ans).lower()

    #remove punctuation
    model_ans = re.sub('[^a-zA-Z0-9]', ' ', model_ans)
    ans = re.sub('[^a-zA-Z0-9]', ' ', ans)

    #strip whitespace
    model_ans = model_ans.strip()
    ans = ans.strip()

    return model_ans, ans

In [173]:
datetime.datetime.now().timestamp().__round__()

NameError: name 'datetime' is not defined

In [None]:
# save the model
import datetime
isSave = 'y'

if isSave == 'y':
    model.save('./Models/model' + str(datetime.datetime.now().timestamp().__round__()) + '.h5')
    print("Model saved")