In [1]:
import math
import pandas as pd
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
import sklearn.metrics
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from tensorflow.keras import initializers

In [2]:
# Load the final scores produced by all three pipelines/modules.
# Later cells read the keyword_score, similarity_score, ner_score and
# 'Actual Score (0-10)' columns from this frame.
df = pd.read_csv('FinalScoringDataset.csv')

In [3]:
# Label recorded for each of the three modules; the training cell later
# adds the trainingError / testingError / totalError entries to this dict.
errors = dict(
    keywordModule="YAKE",
    similarityModule="SimCSE",
    nerModule="Camembert",
)

# Relaxed scoring: every score is constrained to one decimal place
def roundVal(X):
    """Return the values of ``X`` as a plain list, each rounded to one decimal.

    ``X`` must expose ``to_list()`` (it is called with DataFrame columns in
    this notebook). The input object itself is not modified.
    """
    return [round(score, 1) for score in X.to_list()]

In [4]:
# Constrain each module score column to one decimal place (written back into df)
for score_column in ("keyword_score", "similarity_score", "ner_score"):
    df[score_column] = roundVal(df[score_column])

# Model inputs are the three rounded module scores; the regression target
# is the 'Actual Score (0-10)' column
X = df[["keyword_score", "similarity_score", "ner_score"]]
y = df['Actual Score (0-10)']

In [5]:
# Train a small fully connected regressor on the calculated scores.
# Inputs are the scores from the three pipelines (keyword, similarity, NER);
# the target is the final score column the data is trained against.

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split itself reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14,
                          train_size=0.8, shuffle=True)
model = Sequential()
model.add(Dense(units=8, input_dim=3, kernel_initializer='normal', activation='relu'))
model.add(Dense(units=16, kernel_initializer='normal', activation='relu'))
model.add(Dense(units=32, kernel_initializer=initializers.RandomNormal(stddev=0.01), bias_initializer=initializers.Zeros(), activation='tanh'))
model.add(Dense(units=64, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.summary()
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train ,batch_size = 6, epochs = 300, verbose=0)


def _scaledRMSE(fittedModel, features, targets):
    """Return the scaled RMSE of ``fittedModel`` on ``features`` vs ``targets``.

    Predictions are rounded to one decimal place, targets and predictions are
    divided by 10, and the resulting RMSE is divided by 10 once more.
    """
    # NOTE(review): scaling both inputs and the RMSE by 10 yields raw RMSE / 100,
    # which does not match the "0-1" range described below for 0-10 scores
    # — confirm the double scaling is intended.
    predictions = np.round(fittedModel.predict(features), 1)
    return math.sqrt(sklearn.metrics.mean_squared_error(targets/10, predictions/10))/10


# testingError, trainingError, totalError for DNN
# These scores are converted to normalize the error
# error ranges between 0-1 (may depend on various trial runs)
testRMSE = _scaledRMSE(model, X_test, y_test)
# Training error is measured on the training split only; it was previously
# computed on the full dataset and therefore always equalled totalRMSE.
trainRMSE = _scaledRMSE(model, X_train, y_train)
totalRMSE = _scaledRMSE(model, X, y)
errors["trainingError"] = trainRMSE
errors["testingError"] = testRMSE
errors["totalError"] = totalRMSE

# Keep an in-memory serialized copy of the fitted model, then reset the
# global Keras state.
trainedMLmodel = pickle.dumps(model)
tf.keras.backend.clear_session()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 32        
                                                                 
 dense_1 (Dense)             (None, 16)                144       
                                                                 
 dense_2 (Dense)             (None, 32)                544       
                                                                 
 dense_3 (Dense)             (None, 64)                2112      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,897
Trainable params: 2,897
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Assets written to: ram://89a70498-238c