In [1]:
# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Scipy libraries
from scipy import stats
import joblib
import warnings
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two, read_datasets

# Load Dataset

In [2]:
# CSV file names handed to the project helper read_datasets, in the same
# positional order as the four values it returns.
dataset_files = (
    'x_train.csv',
    'x_test.csv',
    'y_train.csv',
    'y_test.csv',
)
x_train, x_test, y_train, y_test = read_datasets(*dataset_files)

# Support Vector Regression (SVR)

In [3]:
# The cross validation scheme to be used for train and test.
# kfold() is a project helper (utils.utils); its result is passed as `cv` to
# GridSearchCV in the grid-search cell, whose log reports 10 folds per candidate.
folds = kfold()

## Grid Search

In [4]:
# Hyperparameter search space for SVR. GridSearchCV evaluates the full
# cartesian product: 4 kernels x 5 C x 4 gamma x 5 epsilon x 4 degree = 1600.
# NOTE(review): per the scikit-learn SVR docs, `degree` only affects the
# 'poly' kernel and `gamma` is not used by 'linear', so many candidates are
# effectively duplicates; per-kernel grids would reduce the number of fits.
hyper_params = dict(
    kernel=('linear', 'rbf', 'poly', 'sigmoid'),
    C=[1, 1.5, 5, 10, 100],
    gamma=[1e-7, 1e-4, 'auto', 'scale'],
    epsilon=[0.1, 0.2, 0.3, 0.4, 0.5],
    degree=[1, 2, 3, 4],
)


# Exhaustive search scored by R^2 over the shared CV folds.
model_cv = GridSearchCV(
    SVR(),
    hyper_params,
    scoring='r2',
    cv=folds,
    n_jobs=-1,                # run the fits on all available processors
    refit=True,               # refit the best candidate after the search
    return_train_score=True,  # keep per-split train scores in cv_results_
    verbose=1,
)


# fit() returns the fitted search object itself, so `best_model` and
# `model_cv` refer to the same GridSearchCV instance.
y_train_flat = np.ravel(y_train)  # flatten the target to 1-D for SVR
best_model = model_cv.fit(x_train, y_train_flat)

print(model_cv.best_params_)

Fitting 10 folds for each of 1600 candidates, totalling 16000 fits
{'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'poly'}


In [5]:
# Take the best estimator found by the grid search. The search was built with
# refit=True, so this is the estimator already refit by GridSearchCV rather
# than a newly created model.

svr_best = best_model.best_estimator_

In [6]:
# Get the results for each split

def get_best_model_cv_split_results(best_model, n_splits=10, set_type='train'):
    """Collect the per-split scores of the best grid-search candidate.

    Args:
        best_model: Fitted search object exposing ``best_index_`` and
            ``cv_results_`` (e.g. a fitted ``GridSearchCV``).
        n_splits: Number of CV splits to read, starting at split 0.
        set_type: Score family to read (``'train'`` or ``'test'``); it fills
            the ``split{i}_{set_type}_score`` key looked up in ``cv_results_``.

    Returns:
        list: One score per split, in split order.

    Raises:
        KeyError: If ``cv_results_`` has no entry for a requested split.
    """
    winner = best_model.best_index_
    score_keys = (
        'split{}_{}_score'.format(split_no, set_type)
        for split_no in range(n_splits)
    )
    # Each cv_results_ entry holds one value per candidate; pick the winner's.
    return [best_model.cv_results_[key][winner] for key in score_keys]

# Per-split scores of the best candidate: the training portion and the
# held-out (validation) portion of each of the 10 CV splits.
train_split_scores = get_best_model_cv_split_results(best_model, 10, 'train')
validation_split_scores = get_best_model_cv_split_results(best_model, 10, 'test')

print("Train Results: {}".format(train_split_scores))
print("Validation Results: {}".format(validation_split_scores))

Train Results: [0.2449845184409153, 0.24671253738174037, 0.22688614479296676, 0.2396351851711167, 0.25832170284743305, 0.2981504783462676, 0.25189229386532197, 0.25460737558499014, 0.2480827991202349, 0.23813494819473513]
Validation Results: [0.2625022850749761, 0.10467198420613932, 0.39127596504957196, 0.2715635146099973, 0.22672019392391485, -0.31933591059322075, 0.20390555701599544, 0.04469261682705328, 0.1256176235187545, 0.29059931309576503]


In [7]:
# Mean R^2 of the best candidate across the CV splits, for the training and
# the validation portions.
# TODO: also get the standard deviation across splits.

n_cv_splits = 10
train_scores = get_best_model_cv_split_results(best_model, n_cv_splits, 'train')
validation_scores = get_best_model_cv_split_results(best_model, n_cv_splits, 'test')

train_mean = sum(train_scores) / n_cv_splits
test_mean = sum(validation_scores) / n_cv_splits

print("Train mean: {}".format(train_mean))
print("Validation mean: {}".format(test_mean))

Train mean: 0.25074079837457225
Validation mean: 0.1602213142728947


In [8]:
# Score the refit best estimator on the held-out test set.
test_predictions = svr_best.predict(x_test)
r2 = r2_score(y_test, test_predictions)
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.1072


# Saving trained model

In [9]:
# Persist the tuned estimator with joblib; the value returned by dump is the
# cell's displayed output.
# NOTE(review): assumes the ../models directory already exists — confirm.
filename = '../models/svr_model.joblib'
joblib.dump(value=svr_best, filename=filename)

['../models/svr_model.joblib']

# Conclusions

What was done:
* Created an SVR model with default parameters and another tuned with grid search + cross-validation;
* Compared the test scores (R² and adjusted R²) from before and after the grid search with a t-test over 5 × 2-fold cross-validation (5 repetitions of 2-fold CV);
* On grid search: C=1000 did not show better results, so it was removed from the grid;
* It seems that the grid-search model is better than the base model at an 80% confidence level.