In [1]:
# Standard library
import warnings

# General libraries
import joblib
import numpy as np
import pandas as pd

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Scipy libraries
from scipy import stats

# Statistical testing (mlxtend)
from mlxtend.evaluate import paired_ttest_5x2cv

# Suppress library warnings for the remainder of the notebook
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two, read_datasets

# Load Dataset

In [2]:
# Load the pre-split features and targets through the project helper.
# File names are passed positionally in the order:
# x_train, x_test, y_train, y_test.
# NOTE(review): path resolution is defined inside read_datasets -- confirm there.
split_files = ['x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv']
x_train, x_test, y_train, y_test = read_datasets(*split_files)

# Support Vector Regression (SVR)

In [3]:
# Cross-validation splitter built by the project helper `kfold()`; it is passed
# to the grid search as `cv=folds`. The grid-search log reports 10 folds.
# NOTE(review): shuffling / seeding is decided inside utils.utils.kfold -- confirm
# there if reproducible splits are required.
folds = kfold()

In [4]:
# Baseline SVR used as the reference model (reg1 / estimator1) in the
# statistical comparison cells. This cell only constructs the estimator:
# no fitting and no cross-validation happen here.
# NOTE(review): the hyperparameters are set explicitly and gamma='auto' differs
# from scikit-learn's current default ('scale'); `degree` is only used by the
# 'poly' kernel, so it has no effect with kernel='rbf'. Confirm this is the
# intended baseline.
svr = SVR(kernel='rbf', C=1.0, gamma='auto', epsilon=0.1, degree=3)

## Grid Search

In [5]:
# Candidate values for each tuned SVR hyperparameter. GridSearchCV evaluates the
# full Cartesian product (4 * 5 * 4 * 5 * 4 = 1600 candidates).
# NOTE: `degree` only affects the 'poly' kernel and `gamma` is unused by the
# 'linear' kernel, so part of this grid repeats equivalent models.
hyper_params = dict(
    kernel=('linear', 'rbf', 'poly', 'sigmoid'),
    C=[1, 1.5, 5, 10, 100],
    gamma=[1e-7, 1e-4, 'auto', 'scale'],
    epsilon=[0.1, 0.2, 0.3, 0.4, 0.5],
    degree=[1, 2, 3, 4],
)

# Exhaustive search scored by R2 over the shared CV splitter, on all cores.
# refit=True retrains the best configuration on all data passed to fit().
model_cv = GridSearchCV(
    SVR(),
    hyper_params,
    scoring='r2',
    n_jobs=-1,
    refit=True,
    cv=folds,
    verbose=1,
    return_train_score=True,
)

# Run the search; the target is flattened to 1-D before fitting.
# fit() returns the search object, so `best_model` refers to `model_cv` itself.
best_model = model_cv.fit(X=x_train, y=np.ravel(y_train))

print(model_cv.best_params_)

Fitting 10 folds for each of 1600 candidates, totalling 16000 fits
{'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'poly'}


In [6]:
# Build a fresh, unfitted SVR configured with the winning hyperparameters from
# the grid search, so it can be compared against the baseline `svr`.
tuned_keys = ('kernel', 'C', 'gamma', 'epsilon', 'degree')
svr_best = SVR(**{key: model_cv.best_params_[key] for key in tuned_keys})

## Statistical hypothesis testing

Validate if the grid model is better than the base model

Null hypothesis and alternative hypothesis
* H0: Best params R2 and Adj R2 <= base model R2 and Adj R2
* Ha: Best params R2 and Adj R2 > base model R2 and Adj R2

Errors:
* Type I Error: false positive, rejecting H0 although it is true
* Type II Error: false negative, failing to reject H0 although it is false

In [7]:
# 5x2-fold paired comparison of the baseline SVR (reg1) against the tuned SVR
# (reg2) on the training split, using the project helper `five_two`.
# No `metric` is passed, so the helper's default is used
# (presumably R2 -- confirm in utils.utils.five_two).
# Per the printed means, each "score difference" is reg1 - reg2, so negative
# values mean the tuned model scored higher on that fold.
five_two(
    reg1=svr,
    reg2=svr_best,
    X=x_train,
    y=y_train
)

Fold  1 score difference = -0.158989
Fold  2 score difference = -0.239713
Fold  1 score difference = -0.055025
Fold  2 score difference = -0.222706
Fold  1 score difference = -0.356077
Fold  2 score difference = -0.115042
Fold  1 score difference = -0.195386
Fold  2 score difference = -0.155598
Fold  1 score difference = -0.257971
Fold  2 score difference = -0.155834
Regression 1 mean score and stdev : -0.029638 + 0.067381
Regression 2 mean score and stdev : 0.161596 + 0.047055
Score difference mean + stdev : -0.191234 + 0.079313
t_value for the current test is -1.553459


In [8]:
# Same 5x2-fold paired comparison (baseline reg1 vs tuned reg2), this time
# scored with metric='adj_r2'.
# The printed t_value equals the one from the R2 run, and every fold difference
# is the R2 difference times the same factor; presumably the adjusted score is
# a fixed linear rescaling of R2 inside the helper, which leaves the
# t statistic unchanged -- confirm in utils.utils.five_two.
five_two(
    reg1=svr,
    reg2=svr_best,
    X=x_train,
    y=y_train,
    metric='adj_r2'
)

Fold  1 score difference = -0.208511
Fold  2 score difference = -0.314378
Fold  1 score difference = -0.072164
Fold  2 score difference = -0.292074
Fold  1 score difference = -0.466986
Fold  2 score difference = -0.150875
Fold  1 score difference = -0.256244
Fold  2 score difference = -0.204063
Fold  1 score difference = -0.338323
Fold  2 score difference = -0.204372
Regression 1 mean score and stdev : -0.350345 + 0.088368
Regression 2 mean score and stdev : -0.099546 + 0.061712
Score difference mean + stdev : -0.250799 + 0.104017
t_value for the current test is -1.553459


In [9]:
# Cross-check the comparison with mlxtend's 5x2cv paired t-test
# (`paired_ttest_5x2cv` is imported in the notebook's import cell).
# Both estimators are scored with R2; random_seed fixes the CV splits so the
# reported statistic is reproducible.
t, p = paired_ttest_5x2cv(
    estimator1=svr,
    estimator2=svr_best,
    X=x_train,
    y=y_train,
    scoring='r2',
    random_seed=42,
)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

t statistic: -2.559
p value: 0.051


# Saving trained model

In [10]:
# Persist the tuned model.
# Up to this point `svr_best` has never been fitted on the full training set:
# it is created unfitted, and the comparison cells at most fit it on
# cross-validation half-splits. Fit it on all training data first so the saved
# artifact really is the trained model (target flattened to 1-D, as in the
# grid search).
filename = '../models/svr_model.joblib'
svr_best.fit(x_train, np.ravel(y_train))
joblib.dump(svr_best, filename)

['../models/svr_model.joblib']

# Conclusions

What was done:
* Create an SVR model with default parameters and another with grid search + cross validation;
* Compare the scores (r2 and adj r2) of the model before and after the grid search with a paired t-test based on 5 x 2-fold cross validation (5 repetitions of 2-fold cv);
* On grid search: C=1000 did not show better results so it was removed
* It seems that the grid model is better than the base model with an 80% confidence level (the mlxtend 5x2cv paired t-test gives p = 0.051)