In [1]:
# General libraries
import pandas as pd
import joblib
import warnings
warnings.filterwarnings('ignore')

# Scikit Learn libraries
from sklearn.model_selection import train_test_split

# Utils functions
from utils.utils import compare_models

In [2]:
# Folder that holds the processed CSV variants (relative path).
data_folder_path = '../data/'

# Paths of the available dataset variants. Only the standardized one is read
# below; the other two are defined but not used in this cell.
# NOTE(review): "poutliers" in the third name looks like a typo for
# "outliers"; the name is kept unchanged in case other cells rely on it.
data_path = f'{data_folder_path}complex_processed_data.csv'
standardized_data_path = f'{data_folder_path}complex_processed_standardized_data.csv'
standardized_poutliers_removed_data_path = (
    f'{data_folder_path}complex_processed_standardized_outliers_removed_data.csv'
)

# Load the standardized dataset used for the rest of the notebook.
df_solubility = pd.read_csv(standardized_data_path)

In [3]:
# Separate the dataset into features (X) and target (y) for machine learning.

# Target: the 'solubility' column, kept as a one-column DataFrame.
df_sol_y = df_solubility[['solubility']]

# Features: every other column. drop() returns a new frame, so df_solubility
# itself is left unchanged.
df_sol_X = df_solubility.drop(columns=['solubility'])

In [4]:
# 80 % / 20 % train/test split; random_state is fixed so the split is the same
# on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

In [5]:
def load_model(model, models_folder='../models/'):
    """Load a persisted model from ``<models_folder>/<model>_model.joblib``.

    Parameters
    ----------
    model : str
        Short model identifier used as the file-name prefix (this notebook
        uses ``'lr'``, ``'svr'`` and ``'xgboost'``).
    models_folder : str, optional
        Directory that holds the ``*_model.joblib`` files. Defaults to
        ``'../models/'``, the location that used to be hard-coded here.

    Returns
    -------
    object
        Whatever ``joblib.load`` deserializes from the file.
    """
    import os  # local import keeps this cell self-contained

    # os.path.join only inserts a separator when one is missing, so the folder
    # may be given with or without a trailing slash.
    model_path = os.path.join(models_folder, model + '_model.joblib')

    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    return joblib.load(model_path)

In [6]:
# LR vs. SVR, evaluated on the held-out test split.
# Copies of the split are passed, presumably so compare_models cannot modify
# x_test / y_test in place.
compare_models(
    reg1=load_model('lr'), reg2=load_model('svr'),
    X=x_test.copy(), y=y_test.copy(),
)

Iteration  1 score difference = -0.382427
Iteration  2 score difference = -0.210558
Iteration  3 score difference = -0.386934
Iteration  4 score difference = -0.220979
Iteration  5 score difference = -0.162628
Iteration  6 score difference = -0.173434
Iteration  7 score difference = -0.221071
Iteration  8 score difference = -0.060403
Iteration  9 score difference = -0.161637
Iteration 10 score difference = -0.246155
mean_score_1 0.06740107824084776, std 0.09271151310129545
mean_score_2 0.2900237144436351, std 0.04076351287193381
Ttest_relResult(statistic=-7.067760218484142, pvalue=5.8704317814202804e-05)
P value menor ou igual a 0.05


In [7]:
# LR vs. XGBoost, evaluated on the held-out test split.
# Copies of the split are passed, presumably so compare_models cannot modify
# x_test / y_test in place.
compare_models(
    reg1=load_model('lr'), reg2=load_model('xgboost'),
    X=x_test.copy(), y=y_test.copy(),
)

Iteration  1 score difference = -0.366040
Iteration  2 score difference = -0.196164
Iteration  3 score difference = -0.350153
Iteration  4 score difference = -0.177931
Iteration  5 score difference = -0.184947
Iteration  6 score difference = -0.226295
Iteration  7 score difference = -0.124930
Iteration  8 score difference = -0.112214
Iteration  9 score difference = -0.100741
Iteration 10 score difference = -0.280679
mean_score_1 0.06740107824084776, std 0.09271151310129545
mean_score_2 0.27941039416367447, std 0.04376212379295462
Ttest_relResult(statistic=-7.120095454665582, pvalue=5.5439650247468435e-05)
P value menor ou igual a 0.05


In [8]:
# SVR vs. XGBoost, evaluated on the held-out test split.
# Copies of the split are passed, presumably so compare_models cannot modify
# x_test / y_test in place.
compare_models(
    reg1=load_model('svr'), reg2=load_model('xgboost'),
    X=x_test.copy(), y=y_test.copy(),
)

Iteration  1 score difference = 0.016387
Iteration  2 score difference = 0.014394
Iteration  3 score difference = 0.036781
Iteration  4 score difference = 0.043048
Iteration  5 score difference = -0.022318
Iteration  6 score difference = -0.052861
Iteration  7 score difference = 0.096141
Iteration  8 score difference = -0.051811
Iteration  9 score difference = 0.060896
Iteration 10 score difference = -0.034524
mean_score_1 0.2900237144436351, std 0.04076351287193381
mean_score_2 0.27941039416367447, std 0.04376212379295462
Ttest_relResult(statistic=0.6686446512348456, pvalue=0.5205014728218971)


In [9]:
# SVR vs. XGBoost again on the test split, this time with metric="adj_r2"
# (presumably adjusted R^2 -- confirm in utils.compare_models).
compare_models(
    reg1=load_model('svr'), reg2=load_model('xgboost'),
    X=x_test.copy(), y=y_test.copy(),
    metric="adj_r2",
)

Iteration  1 score difference = 0.034989
Iteration  2 score difference = 0.030733
Iteration  3 score difference = 0.078531
Iteration  4 score difference = 0.091913
Iteration  5 score difference = -0.047653
Iteration  6 score difference = -0.112865
Iteration  7 score difference = 0.205275
Iteration  8 score difference = -0.110623
Iteration  9 score difference = 0.130021
Iteration 10 score difference = -0.073714
mean_score_1 -0.5158953124041306, std 0.08703560856439922
mean_score_2 -0.5385561854343165, std 0.0934380480984707
Ttest_relResult(statistic=0.6686446512348417, pvalue=0.5205014728218995)
