In [1]:
# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Scipy libraries
from scipy import stats
import joblib
import warnings
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two

# Load Dataset

In [2]:
# Locations of the processed dataset variants, relative to this notebook.
folder_path = "../data/"

data_path = f"{folder_path}complex_processed_data.csv"
standardized_data_path = f"{folder_path}complex_processed_standardized_data.csv"
standardized_poutliers_removed_data_path = (
    f"{folder_path}complex_processed_standardized_outliers_removed_data.csv"
)

# Only the standardized variant is loaded in this cell.
df_solubility = pd.read_csv(standardized_data_path)

# Process Dataset

Process Dataset before the model creation.
The following actions were done:
* Split the independent variables (features) from the dependent one (`solubility`);
* Split Dataset for training and testing.

In [3]:
# Split dataset into X (features) and y (target) for machine learning.
# `DataFrame.drop` returns a new frame, so `df_solubility` is left untouched
# and neither an explicit copy nor an in-place mutation is needed.
df_sol_X = df_solubility.drop(columns=['solubility'])

# Double brackets keep the target as a single-column DataFrame.
df_sol_y = df_solubility[['solubility']]

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

# Support Vector Regression (SVR)

In [5]:
# The cross validation scheme to be used for train and test.
# `kfold` is a project helper imported from utils.utils; its return value is
# handed to GridSearchCV as the `cv` argument. Presumably a 10-fold splitter,
# given the "Fitting 10 folds" log of the grid search — confirm in utils.
folds = kfold()

In [6]:
# Baseline SVR with fixed, hand-set hyperparameters. The estimator is only
# instantiated here (no fitting or cross validation happens in this cell);
# it is used as `reg1` / `estimator1` in the statistical comparisons.
svr = SVR(
    kernel='rbf',
    C=1.0,
    gamma='auto',
    epsilon=0.1,
    degree=3,
)

## Grid Search

In [7]:
# Search space for the SVR hyperparameters; GridSearchCV tries the full
# Cartesian product of these values (4 * 5 * 4 * 5 * 4 = 1600 candidates).
hyper_params = {
    'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    'C': [1, 1.5, 5, 10, 100],
    'gamma': [1e-7, 1e-4, 'auto', 'scale'],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
    'degree': [1, 2, 3, 4],
}

# Exhaustive search scored by R2 under the shared CV scheme. With refit=True
# the best configuration is retrained on all the data passed to `fit`.
model_cv = GridSearchCV(
    SVR(),
    hyper_params,
    scoring='r2',
    cv=folds,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# The target is a single-column DataFrame, so flatten it to 1-D before fitting.
# `fit` returns the fitted search object itself, so `best_model` refers to the
# same object as `model_cv`.
best_model = model_cv.fit(x_train, y_train.to_numpy().ravel())

print(model_cv.best_params_)

Fitting 10 folds for each of 1600 candidates, totalling 16000 fits
{'C': 1.5, 'degree': 1, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf'}


In [8]:
# Build a fresh (unfitted) SVR configured with the hyperparameters that the
# grid search selected. Only the five tuned settings are forwarded.
svr_best = SVR(**{
    name: model_cv.best_params_[name]
    for name in ('kernel', 'C', 'gamma', 'epsilon', 'degree')
})

## Statistical hypothesis testing

Validate if the grid model is better than the base model

Null hypothesis and alternative hypothesis
* Ho = Best params R2 and Adj R2 <= base model R2 and Adj R2
* Ha = Best params R2 and Adj R2 > base model R2 and Adj R2

Errors:
* Type I Error: false positive, rejecting Ho although it is true
* Type II Error: false negative, failing to reject Ho although it is false

In [9]:
# 5x2cv comparison of the baseline (reg1) against the tuned model (reg2) on the
# full dataset, using the helper's default metric (R2 according to the section
# text — confirm in utils). The printed differences appear to be reg1 - reg2.
five_two(reg1=svr, reg2=svr_best, X=df_sol_X, y=df_sol_y)

Fold  1 score difference = -0.027373
Fold  2 score difference = -0.034255
Fold  1 score difference = 0.034320
Fold  2 score difference = 0.029165
Fold  1 score difference = -0.070850
Fold  2 score difference = -0.081545
Fold  1 score difference = -0.008678
Fold  2 score difference = -0.058249
Fold  1 score difference = 0.000059
Fold  2 score difference = 0.017027
Regression 1 mean score and stdev : 0.179571 + 0.030092
Regression 2 mean score and stdev : 0.199609 + 0.039461
Score difference mean + stdev : -0.020038 + 0.039196
t_value for the current test is -1.598158


In [10]:
# Same 5x2cv comparison of baseline (reg1) vs. tuned model (reg2), this time
# scored with the helper's 'adj_r2' metric.
five_two(reg1=svr, reg2=svr_best, X=df_sol_X, y=df_sol_y, metric='adj_r2')

Fold  1 score difference = -0.034649
Fold  2 score difference = -0.043304
Fold  1 score difference = 0.043443
Fold  2 score difference = 0.036869
Fold  1 score difference = -0.089684
Fold  2 score difference = -0.103085
Fold  1 score difference = -0.010984
Fold  2 score difference = -0.073636
Fold  1 score difference = 0.000075
Fold  2 score difference = 0.021524
Regression 1 mean score and stdev : -0.037847 + 0.038478
Regression 2 mean score and stdev : -0.012504 + 0.050316
Score difference mean + stdev : -0.025343 + 0.049574
t_value for the current test is -1.601135


In [11]:
from mlxtend.evaluate import paired_ttest_5x2cv

# Cross-check the custom helper with mlxtend's 5x2cv paired t-test
# (R2 scoring, fixed seed so the ten splits are reproducible).
t, p = paired_ttest_5x2cv(
    estimator1=svr,
    estimator2=svr_best,
    X=df_sol_X,
    y=df_sol_y,
    scoring='r2',
    random_seed=42,
)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

t statistic: -0.634
p value: 0.554


# Saving trained model

In [12]:
# Persist the tuned model that was actually trained on the training split.
# `svr_best` is never explicitly fitted on x_train in this notebook (any fitted
# state it carries would only be a by-product of the 5x2cv comparisons).
# GridSearchCV was created with refit=True, so `best_estimator_` is the best
# configuration already refitted on x_train — save that estimator instead.
filename = '../models/svr_model.joblib'
joblib.dump(model_cv.best_estimator_, filename)

['../models/svr_model.joblib']

# Conclusions

What was done:
* Split dataset in test 20% and train 80%;
* Create an SVR model with default parameters and another with grid search + cross validation;
* Compare the test scores (R2 and adjusted R2) from before and after the grid search with a paired t-test using 5 x 2-fold cross validation (5 repetitions of 2-fold CV);
* On grid search: C=1000 did not show better results so it was removed
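* It seems that the grid model is better than the base model with an 80% confidence level (the `five_two` t-values are about -1.6); note, however, that the mlxtend 5x2cv paired t-test above gives p = 0.554, which does not indicate a statistically significant difference.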