In [1]:
# General libraries
import os
import warnings

import joblib
import pandas as pd

warnings.filterwarnings('ignore')

# Scikit Learn libraries
from sklearn.model_selection import train_test_split

# Utils functions
from utils.utils import compare_models

In [2]:
data_folder_path = '../data/'

data_path = data_folder_path + 'complex_processed_data.csv'
standardized_data_path = data_folder_path + 'complex_processed_standardized_data.csv'
standardized_poutliers_removed_data_path = data_folder_path + 'complex_processed_standardized_outliers_removed_data.csv'

df_solubility = pd.read_csv(standardized_data_path)

In [3]:
# Split dataset into X and Y for machine learning

df_sol_X = df_solubility.copy()
df_sol_X.drop(columns=['solubility'], axis=1, inplace=True)

df_sol_y = df_solubility[['solubility']]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
                        df_sol_X, df_sol_y, 
                        train_size = 0.8,
                        test_size = 0.2,
                        random_state = 10
                        )

In [5]:
def load_model(model):
    models_folder = '../models/'
    return joblib.load(models_folder + model + '_model.joblib')

In [6]:
# Compare LR with SVR

compare_models(
    reg1=load_model('lr'),
    reg2=load_model('svr'),
    X=x_test,
    y=y_test
)

Iteration  1 score difference = 0.315003
Iteration  2 score difference = -0.250495
Iteration  3 score difference = -0.607164
Iteration  4 score difference = -0.030802
Iteration  5 score difference = -0.281113
Iteration  6 score difference = -0.639541
Iteration  7 score difference = -0.187477
Iteration  8 score difference = 0.019428
Iteration  9 score difference = -0.661571
Iteration 10 score difference = -0.885296
mean_score_1 -0.07437663173168368, std 0.3272123203761014
mean_score_2 0.2465262921698143, std 0.1920332768785105
Ttest_relResult(statistic=-2.726472535872227, pvalue=0.023359013651871822)


In [7]:
# Compare LR with XGBoost

compare_models(
    reg1=load_model('lr'),
    reg2=load_model('xgboost'),
    X=x_test,
    y=y_test
)

Iteration  1 score difference = 0.300196
Iteration  2 score difference = -0.365482
Iteration  3 score difference = -0.419712
Iteration  4 score difference = 0.002041
Iteration  5 score difference = -0.180386
Iteration  6 score difference = -0.606326
Iteration  7 score difference = -0.100105
Iteration  8 score difference = -0.092002
Iteration  9 score difference = -0.556166
Iteration 10 score difference = -0.646214
mean_score_1 -0.07437663173168368, std 0.3272123203761014
mean_score_2 0.1920391250798394, std 0.19021803919470226
Ttest_relResult(statistic=-2.764827844340375, pvalue=0.02193826351132964)


In [8]:
# Compare SVR with XGBoost

compare_models(
    reg1=load_model('svr'),
    reg2=load_model('xgboost'),
    X=x_test,
    y=y_test
)

Iteration  1 score difference = -0.014807
Iteration  2 score difference = -0.114987
Iteration  3 score difference = 0.187452
Iteration  4 score difference = 0.032843
Iteration  5 score difference = 0.100727
Iteration  6 score difference = 0.033215
Iteration  7 score difference = 0.087372
Iteration  8 score difference = -0.111430
Iteration  9 score difference = 0.105405
Iteration 10 score difference = 0.239082
mean_score_1 0.2465262921698143, std 0.1920332768785105
mean_score_2 0.1920391250798394, std 0.19021803919470226
Ttest_relResult(statistic=1.4950630451980296, pvalue=0.16911027480240523)
