In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten, Input, Lambda, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import string
import pandas as pd

In [2]:
data = pd.read_csv('../Data/merged.csv')

In [3]:
model_ans_col = 'Model_Answer'
ans_col = 'Answer'
label_col = 'Category'

In [4]:
def preprocess_data_for_model_training(df, model_ans_col='Model_Answer', ans_col='Answer'):
    """
    Preprocess data
    """
    print("Preprocessing data...")
    df[model_ans_col] = data[model_ans_col].apply(lambda x: str(x).lower())
    df[ans_col] = data[ans_col].apply(lambda x: str(x).lower())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.strip())
    df[ans_col] = df[ans_col].apply(lambda x: x.strip())

    df[model_ans_col] = df[model_ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df[ans_col] = df[ans_col].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

    print("Data preprocessing is done")
    return df


In [5]:
data = preprocess_data_for_model_training(data, model_ans_col, ans_col)

Preprocessing data...
Data preprocessing is done


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2442 entries, 0 to 2441
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Question_ID             2442 non-null   float64
 1   Question                2442 non-null   object 
 2   Model_Answer            2442 non-null   object 
 3   Answer                  2442 non-null   object 
 4   Score                   2442 non-null   float64
 5   LengthRatio             2442 non-null   float64
 6   Cosine_Similarity       2442 non-null   float64
 7   Category                2442 non-null   int64  
 8   Answer_Embedding        2442 non-null   object 
 9   Model_Answer_Embedding  2442 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 190.9+ KB


In [7]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(data[model_ans_col])
tokenizer.fit_on_texts(data[ans_col])
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))


Found 2972 unique tokens.


In [8]:
# split data for deep learning model training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data[[model_ans_col, ans_col, 'LengthRatio', 'Cosine_Similarity']], data[label_col], test_size=0.2, random_state=176)

# convert to one-hot encoding
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)


In [9]:
model_ans_seq = tokenizer.texts_to_sequences(
    X_train[model_ans_col])
ans_seq = tokenizer.texts_to_sequences(X_train[ans_col])

model_ans_pad_seq = pad_sequences(model_ans_seq, maxlen=100)
ans_pad_seq = pad_sequences(ans_seq, maxlen=100)

In [10]:
embeddings_index = {}
f = open('../Data/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [11]:
# create the embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [12]:
# create the embedding layer
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            100,
                            weights=[embedding_matrix],
                            trainable=True)


In [13]:
# create the model

# Define Input Layers
input_model_answer = Input(shape=(100,))
input_answer = Input(shape=(100,))
input_text_features = Input(shape=(2,))

In [14]:
# Shared Embedding Layer
embedding_model_answer = embedding_layer(input_model_answer)
embedding_answer = embedding_layer(input_answer)

In [15]:
# get the differnce of the embedding vectors
diff_model_ans = Lambda(
        lambda x: x[0] - x[1])([embedding_model_answer, embedding_answer])

In [16]:
# apply the LSTM layer
lstm_model_answer = Bidirectional(
    LSTM(1000, return_sequences=True))(diff_model_ans)

lstm_answer = Bidirectional(
    LSTM(1000, return_sequences=True))(embedding_answer)


In [17]:
# apply dropout
dropout_model_answer = Dropout(0.2)(lstm_model_answer)
dropout_answer = Dropout(0.2)(lstm_answer)

In [18]:
# apply the Dense layer
dense_model_answer = Dense(1000, activation='relu')(dropout_model_answer)
dense_answer = Dense(1000, activation='relu')(dropout_answer)

In [19]:
feature_layer = Dense(100, activation='relu')(input_text_features)

In [20]:
# concatenate the outputs
concatenate_model_answer = concatenate([dense_model_answer, dense_answer])

In [21]:
# apply Dense layer
dense_model1 = Dense(500, activation='relu')(concatenate_model_answer)

# apply dropout
dropout_model1 = Dropout(0.2)(dense_model1)

In [22]:
# reduce the dimensionality
dense_model2 = Dense(250, activation='relu')(dropout_model1)

In [23]:
# include Flatten layer
flatten_model_answer = Flatten()(dense_model2)

In [24]:
concat_layer = concatenate([flatten_model_answer, feature_layer])

In [25]:
final_layer = Dense(50, activation='relu')(concat_layer)

In [26]:
# apply Dense layer - output layer
output_model_answer = Dense(3, activation='softmax')(final_layer)

In [27]:
# create the model
model = Model(inputs=[input_model_answer, input_answer, input_text_features], outputs=output_model_answer)

In [28]:
# compile the model
model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001), metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [29]:
# fit the model
model.fit([model_ans_pad_seq, ans_pad_seq,X_train[['LengthRatio', 'Cosine_Similarity']]], y_train, epochs=100, batch_size=32,
              validation_split=0.3, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x1ca1a564fd0>

In [30]:
# evaluate the model
scores = model.evaluate(
        [model_ans_pad_seq, ans_pad_seq, X_train[['LengthRatio', 'Cosine_Similarity']]], y_train, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss: %.2f" % scores[0])


Accuracy: 84.38%
Loss: 0.42


In [31]:
model_ans_seq_test = tokenizer.texts_to_sequences(
    X_test[model_ans_col])
ans_seq_test = tokenizer.texts_to_sequences(X_test[ans_col])

model_ans_pad_seq_test = pad_sequences(model_ans_seq_test, maxlen=100)
ans_pad_seq_test = pad_sequences(ans_seq_test, maxlen=100)

In [32]:
print("Test Accuracy: %.2f%%" % (model.evaluate(
        [model_ans_pad_seq_test, ans_pad_seq_test, X_test[['LengthRatio', 'Cosine_Similarity']]], y_test, verbose=0)[1]*100))

Test Accuracy: 70.96%


In [33]:
import re

def preprocess_text_for_inferencing(model_ans, ans):
    model_ans = str(model_ans).lower()
    ans = str(ans).lower()

    #remove punctuation
    model_ans = re.sub('[^a-zA-Z0-9]', ' ', model_ans)
    ans = re.sub('[^a-zA-Z0-9]', ' ', ans)

    #strip whitespace
    model_ans = model_ans.strip()
    ans = ans.strip()

    #Tokenize
    tokenizer_p = Tokenizer()
    tokenizer_p.fit_on_texts([model_ans, ans])

    model_ans = tokenizer_p.texts_to_sequences([model_ans])
    ans = tokenizer_p.texts_to_sequences([ans])

    #Pad
    model_ans = pad_sequences(model_ans, maxlen=100)
    ans = pad_sequences(ans, maxlen=100)

    return model_ans, ans


In [34]:
def get_length_ration(model_ans, ans):
    return len(ans)/len(model_ans)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

from sentence_transformers import SentenceTransformer
def process_bert_similarity(base_answer, sample_answer):
    model = SentenceTransformer('bert-base-nli-mean-tokens')

    tokenize_base_answer = sent_tokenize(base_answer)
    base_answer_embedding = model.encode(tokenize_base_answer)
    base_answer_embedding_mean = np.mean(np.array(base_answer_embedding), axis=0)

    tokenize_sample_answer = sent_tokenize(sample_answer)
    sample_answer_embedding = model.encode(tokenize_sample_answer)
    sample_answer_embedding_mean = np.mean(np.array(sample_answer_embedding), axis=0)

    cosine_similarity_score = cosine_similarity([base_answer_embedding_mean], [sample_answer_embedding_mean]).flatten()

    return cosine_similarity_score[0]

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
model_ans_p = "A primary key is a special relational database table column (or combination of columns) designated to uniquely identify each table record."
ans_p = "A primary key is a key that is used to identify the column names correctly"

cosine_similarity_p = process_bert_similarity(model_ans_p, ans_p)
length_ration_p = get_length_ration(model_ans_p, ans_p)
model_ans_p, ans_p = preprocess_text_for_inferencing(model_ans_p, ans_p)

prediction = model.predict([model_ans_p, ans_p, np.array([[length_ration_p, cosine_similarity_p]])])
print("Category:", np.argmax(prediction))

Category: 1


In [37]:
cosine_similarity_p

0.78207314

In [41]:
datetime.datetime.now().timestamp().__round__()

1651170305

In [44]:
# save the model
import datetime
isSave = 'y'

if isSave == 'y':
    model.save('./Models/model' + str(datetime.datetime.now().timestamp().__round__()) + '.h5')
    print("Model saved")

Model saved
