In [1]:
from math import sqrt
import numpy as np
import scipy
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.isotonic import IsotonicRegression

In [6]:
import warnings
# Suppress every warning category for the rest of the session so the
# evaluation output below stays compact.
# NOTE(review): this also hides numerical RuntimeWarnings and library
# deprecation warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Evaluation Metrics

In [2]:
# Metric functions: root-mean-squared error (RMSE) and Pearson correlation
def RMSE(actual, pred):
    """Return the root-mean-squared error between ``actual`` and ``pred``.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error`` and its square root is returned as a Python float.
    """
    squared_error = mean_squared_error(actual, pred)
    return sqrt(squared_error)

def Pearson(actual,pred):
    """Return the Pearson correlation coefficient between two sequences.

    Parameters
    ----------
    actual, pred : array-like of numbers
        Paired observations; both are flattened to 1-D and must contain the
        same, non-zero number of elements.

    Returns
    -------
    float
        Correlation in [-1, 1]. ``nan`` is returned when either input has
        zero variance, because the coefficient is undefined in that case.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length. Previously a length
        mismatch was silently truncated for the covariance only, which
        produced a meaningless value.
    """
    a = np.asarray(actual, dtype=float).ravel()
    p = np.asarray(pred, dtype=float).ravel()
    if a.size != p.size:
        raise ValueError(
            "actual and pred must have the same length, got %d and %d" % (a.size, p.size)
        )
    if a.size == 0:
        raise ValueError("actual and pred must not be empty")

    a_centered = a - a.mean()
    p_centered = p - p.mean()
    # The 1/n factors of the covariance and the two (population) standard
    # deviations cancel, so plain sums of products are sufficient.
    denominator = np.sqrt(np.sum(a_centered ** 2) * np.sum(p_centered ** 2))
    if denominator == 0:
        # Constant input: correlation is undefined (explicit instead of 0/0).
        return float('nan')
    return float(np.sum(a_centered * p_centered) / denominator)

## Regression Models

In [3]:
def linear_reg(xTrain,yTrain,xTest):
    """Fit an ordinary least-squares model on one feature and predict ``xTest``.

    Each argument must provide ``to_numpy()`` (e.g. a pandas Series). The
    values are reshaped into a single column and passed through
    ``check_nan`` (NaN -> 0) before fitting.

    Returns the array produced by ``LinearRegression.predict``; because the
    target is fitted as a column, the predictions come back as a column too.
    """
    train_features, train_targets, test_features = (
        check_nan(series.to_numpy().reshape(-1, 1))
        for series in (xTrain, yTrain, xTest)
    )
    return LinearRegression().fit(train_features, train_targets).predict(test_features)

def isotonic_reg(xTrain,yTrain,xTest):
    """Fit an isotonic regression on one feature and predict ``xTest``.

    Each argument must provide ``to_numpy()`` (e.g. a pandas Series). NaN
    entries are replaced with 0 on private copies, matching what
    ``linear_reg`` and ``ridge_reg`` do via ``check_nan``. The function
    therefore no longer depends on another regression helper having
    zero-filled the shared data beforehand, and the caller's data is never
    modified.

    Returns the 1-D array produced by ``IsotonicRegression.predict``.
    NOTE(review): with scikit-learn's default ``out_of_bounds='nan'`` the
    result can contain NaN for ``xTest`` values outside the training range;
    callers are expected to handle that.
    """
    def _zero_filled(series):
        # np.where builds a new array, so read-only or shared buffers are safe.
        values = np.asarray(series.to_numpy(), dtype=float)
        return np.where(np.isnan(values), 0.0, values)

    train_x = _zero_filled(xTrain)
    train_y = _zero_filled(yTrain)
    test_x = _zero_filled(xTest)

    model = IsotonicRegression()
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)
    return y_pred

def ridge_reg(xTrain, yTrain, xTest):
    """Fit a ridge regression (default settings) on one feature and predict ``xTest``.

    Each argument must provide ``to_numpy()`` (e.g. a pandas Series). The
    values are reshaped into a single column and passed through
    ``check_nan`` (NaN -> 0) before fitting.

    Returns the array produced by ``Ridge.predict``.
    """
    train_features, train_targets, test_features = (
        check_nan(series.to_numpy().reshape(-1, 1))
        for series in (xTrain, yTrain, xTest)
    )
    return Ridge().fit(train_features, train_targets).predict(test_features)

## Evaluation
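For each embedding model, its normalized similarity score (`normalized_<model>_score`) is used as the single input feature to train three regression models (isotonic, linear and ridge) that predict the `score_avg` column. Each configuration is evaluated over repeated random train/test splits (about 70% / 30%) and the RMSE and Pearson correlation are averaged over the runs.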

In [4]:
def avg(l):
    """Return the arithmetic mean of the items in ``l``.

    Raises ``ZeroDivisionError`` when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def check_nan(arr):
    """Replace every NaN in ``arr`` with 0 and return the resulting array.

    Writable numpy arrays are modified in place and returned as the same
    object, as before. Inputs that cannot be written to are handled without
    raising: a read-only array (for example what ``Series.to_numpy()``
    returns when pandas Copy-on-Write is active) is copied only when it
    actually contains NaN, and other array-likes such as lists are first
    converted with ``np.asarray``.
    """
    arr = np.asarray(arr)
    nan_mask = np.isnan(arr)
    if not arr.flags.writeable:
        if not nan_mask.any():
            # Nothing to replace; hand the read-only array back untouched.
            return arr
        # Assigning into a read-only buffer raises ValueError, so work on a copy.
        arr = arr.copy()
    arr[nan_mask] = 0
    return arr

In [8]:
def _round_to_half(values):
    """Round each value to a multiple of 0.5 (Python ``round`` tie rule) and zero-fill NaNs."""
    return check_nan(np.asarray([round(v * 2) / 2 for v in values]))


def evaluate_models(mdl, df, train_frac=0.7, rng=None):
    """Evaluate isotonic, linear and ridge regression on one random split.

    Parameters
    ----------
    mdl : str
        Embedding model name; the feature column read from ``df`` is
        ``'normalized_' + mdl.lower() + '_score'``.
    df : pandas.DataFrame
        Must contain the feature column above and a ``'score_avg'`` column,
        which is used as the regression target.
    train_frac : float, default 0.7
        Probability with which each row is assigned to the training set
        (the remaining rows form the test set). Must be in (0, 1).
    rng : numpy.random.Generator, optional
        Source of randomness for the split. When ``None`` (default) numpy's
        global random state is used, as before; pass
        ``np.random.default_rng(seed)`` for reproducible splits.

    Returns
    -------
    tuple of float
        ``(rmse_isotonic, pearson_isotonic, rmse_linear, pearson_linear,
        rmse_ridge, pearson_ridge)`` computed on the test rows after the
        predictions were rounded to multiples of 0.5.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    draws = np.random.rand(len(df)) if rng is None else rng.random(len(df))
    mask = draws < train_frac
    dataTrain = df[mask]
    dataTest = df[~mask]

    mdl_score = 'normalized_' + mdl.lower() + '_score'
    xTrain = dataTrain[mdl_score]
    xTest = dataTest[mdl_score]
    yTrain = dataTrain['score_avg']
    yTest = dataTest['score_avg'].to_list()

    # The linear models return one column per target; flatten before turning
    # the values into Python floats (calling float() on 1-element arrays is
    # deprecated in recent NumPy releases).
    linear_yPred = np.ravel(linear_reg(xTrain,yTrain,xTest)).tolist()
    ridge_yPred = np.ravel(ridge_reg(xTrain,yTrain,xTest)).tolist()
    # NaN predictions are mapped to 0 before rounding, since round() cannot
    # handle NaN.
    isotonic_yPred = list(np.nan_to_num(isotonic_reg(xTrain,yTrain,xTest), nan=0))

    y = check_nan(np.asarray(yTest))
    isotonic_pred = _round_to_half(isotonic_yPred)
    linear_pred = _round_to_half(linear_yPred)
    ridge_pred = _round_to_half(ridge_yPred)

    return (
        RMSE(y, isotonic_pred), Pearson(y, isotonic_pred),
        RMSE(y, linear_pred), Pearson(y, linear_pred),
        RMSE(y, ridge_pred), Pearson(y, ridge_pred),
    )

In [9]:
# Load the answers together with their precomputed similarity scores.
df = pd.read_csv('dataset/answers_with_similarity_score.csv')
print("Evaluation Results:")
models=["bert","elmo","gpt","gpt2","universal", "roberta","xlnet"]
for m in models:
    print(m.upper())
    rmse_iso, pc_iso = [], []
    rmse_lin, pc_lin = [], []
    rmse_rid, pc_rid = [], []
    # Same order as the tuple returned by evaluate_models.
    collectors = (rmse_iso, pc_iso, rmse_lin, pc_lin, rmse_rid, pc_rid)

    # Repeat the random train/test split 1000 times and keep every metric.
    for i in range(1000):
        for collector, value in zip(collectors, evaluate_models(m, df)):
            collector.append(value)

    # Report the metrics averaged over all runs, one line per regressor.
    report_rows = (
        ("Isotonic Regression \t ==> \t RMSE :", rmse_iso, pc_iso),
        ("Linear Regression \t ==> \t RMSE :", rmse_lin, pc_lin),
        ("Ridge Regression \t ==> \t RMSE :", rmse_rid, pc_rid),
    )
    for header, rmse_runs, pc_runs in report_rows:
        print(header, round(avg(rmse_runs), 3), " \t Pearson Correlation :", round(avg(pc_runs), 3))

Evaluation Results:
BERT
Isotonic Regression 	 ==> 	 RMSE : 1.057  	 Pearson Correlation : 0.304
Linear Regression 	 ==> 	 RMSE : 1.078  	 Pearson Correlation : 0.246
Ridge Regression 	 ==> 	 RMSE : 1.075  	 Pearson Correlation : 0.249
ELMO
Isotonic Regression 	 ==> 	 RMSE : 0.981  	 Pearson Correlation : 0.47
Linear Regression 	 ==> 	 RMSE : 0.993  	 Pearson Correlation : 0.443
Ridge Regression 	 ==> 	 RMSE : 0.994  	 Pearson Correlation : 0.442
GPT
Isotonic Regression 	 ==> 	 RMSE : 1.082  	 Pearson Correlation : 0.232
Linear Regression 	 ==> 	 RMSE : 1.089  	 Pearson Correlation : 0.195
Ridge Regression 	 ==> 	 RMSE : 1.09  	 Pearson Correlation : 0.189
GPT2
Isotonic Regression 	 ==> 	 RMSE : 1.066  	 Pearson Correlation : 0.292
Linear Regression 	 ==> 	 RMSE : 1.079  	 Pearson Correlation : 0.248
Ridge Regression 	 ==> 	 RMSE : 1.081  	 Pearson Correlation : 0.241
UNIVERSAL
Isotonic Regression 	 ==> 	 RMSE : 0.988  	 Pearson Correlation : 0.46
Linear Regression 	 ==> 	 RMSE : 0.979