# Dataset Analysis 5: Repeated Holdout

In [1]:
import pandas as pd
import numpy as np
from datasets import *
from training import *

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.neighbors import LocalOutlierFactor

**Normal generated NA, max censoring, Standard scaler**

In [2]:
# Load the 2016 dogs dataset with the 'normal' NA policy and 'max' censoring
# policy, handing the loader a StandardScaler instance.
# NOTE(review): the scaler is given to the loader, so it is presumably fitted
# on the whole dataset before any train/test split — confirm in the loader.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [3]:
# Model selection with scikit-learn's GridSearchCV on a single hold-out split:
# 7/8 of the samples are used for the cross-validated search, 1/8 is kept
# aside to score the refitted best model.
print("SVR with grid search model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# Fixed seed: without random_state the hold-out split (and therefore the test
# score printed below) changes on every run, which makes the comparison
# between preprocessing variants in this notebook unrepeatable.
# Outputs recorded before the seed was introduced will differ from a re-run.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, test_size=1/8, random_state=42
)

# cv=6 on a regressor means an unshuffled 6-fold split, so the search itself
# is deterministic once the hold-out split is fixed.
svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
svreg.fit(X_Train, y_Train)

# score() uses the best estimator refitted on the whole training part; for SVR
# with the default scoring this is R^2, so negative values mean "worse than
# predicting the mean".
result = (svreg.best_params_, svreg.score(X_Test, y_Test))

print("Best parameters set found on development set:")
print(result[0])
print("Test score: %f" % result[1])

SVR with grid search model selection

Best parameters set found on development set:
{'C': 1, 'coef0': 10, 'degree': 2, 'epsilon': 10, 'gamma': 1, 'kernel': 'poly'}
Test score: -0.254380


Repeated Holdout

In [4]:
# Model selection with the project's repeated-holdout grid search, using the
# same hyper-parameter grid as the GridSearchCV experiment.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# NOTE(review): the meaning of the positional 10 and 15 is defined by
# SVR_gridsearch_holdout (imported from the project) and is not visible in
# this notebook — TODO confirm and consider passing them by keyword.
result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15, runs=5)

# The helper's result is indexed as (best parameters, test score).
best_params = result[0]
test_score = result[1]

print("Best parameters set found on development set:")
print(best_params)
print("Test score: %f" % test_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 0.5, 'coef0': 100, 'degree': 3, 'epsilon': 10, 'gamma': 0.1, 'kernel': 'poly'}
Test score: -0.549179


**Normal generated NA, drop censoring, Standard Scaler**

In [5]:
# Reload the dataset, this time with the 'drop' censoring policy (still the
# 'normal' NA policy and a StandardScaler instance handed to the loader).
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='drop',
    scaler=StandardScaler(),
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(118, 16)

Sklearn GridSearchCV

In [6]:
# Model selection with scikit-learn's GridSearchCV on a single hold-out split:
# 7/8 of the samples are used for the cross-validated search, 1/8 is kept
# aside to score the refitted best model.
print("SVR with grid search model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# Fixed seed: without random_state the hold-out split (and therefore the test
# score printed below) changes on every run, which makes the comparison
# between preprocessing variants in this notebook unrepeatable.
# Outputs recorded before the seed was introduced will differ from a re-run.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, test_size=1/8, random_state=42
)

# cv=6 on a regressor means an unshuffled 6-fold split, so the search itself
# is deterministic once the hold-out split is fixed.
svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
svreg.fit(X_Train, y_Train)

# score() uses the best estimator refitted on the whole training part; for SVR
# with the default scoring this is R^2, so negative values mean "worse than
# predicting the mean".
result = (svreg.best_params_, svreg.score(X_Test, y_Test))

print("Best parameters set found on development set:")
print(result[0])
print("Test score: %f" % result[1])

SVR with grid search model selection

Best parameters set found on development set:
{'C': 4, 'coef0': 100, 'degree': 3, 'epsilon': 0.0001, 'gamma': 0.0001, 'kernel': 'poly'}
Test score: -0.113340


Repeated Holdout

In [7]:
# Model selection with the project's repeated-holdout grid search, using the
# same hyper-parameter grid as the GridSearchCV experiment.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# NOTE(review): the meaning of the positional 10 and 15 is defined by
# SVR_gridsearch_holdout (imported from the project) and is not visible in
# this notebook — TODO confirm and consider passing them by keyword.
result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15, runs=5)

# The helper's result is indexed as (best parameters, test score).
best_params = result[0]
test_score = result[1]

print("Best parameters set found on development set:")
print(best_params)
print("Test score: %f" % test_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 0.5, 'coef0': 10, 'degree': 2, 'epsilon': 0.0001, 'gamma': 1, 'kernel': 'poly'}
Test score: -1.005486


**Normal generated NA, max censoring, QuantileTransformer scaler**

In [9]:
# Reload the dataset ('normal' NA policy, 'max' censoring policy), swapping
# the scaler for a QuantileTransformer instance with default settings.
# NOTE(review): QuantileTransformer defaults to n_quantiles=1000, which is
# more than the 161 rows in the recorded shape; scikit-learn then clips it to
# the number of samples and warns. Confirm how many rows the loader fits on
# and consider setting n_quantiles explicitly.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=QuantileTransformer(),
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [10]:
# Model selection with scikit-learn's GridSearchCV on a single hold-out split:
# 7/8 of the samples are used for the cross-validated search, 1/8 is kept
# aside to score the refitted best model.
print("SVR with grid search model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# Fixed seed: without random_state the hold-out split (and therefore the test
# score printed below) changes on every run, which makes the comparison
# between preprocessing variants in this notebook unrepeatable.
# Outputs recorded before the seed was introduced will differ from a re-run.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, test_size=1/8, random_state=42
)

# cv=6 on a regressor means an unshuffled 6-fold split, so the search itself
# is deterministic once the hold-out split is fixed.
svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
svreg.fit(X_Train, y_Train)

# score() uses the best estimator refitted on the whole training part; for SVR
# with the default scoring this is R^2, so negative values mean "worse than
# predicting the mean".
result = (svreg.best_params_, svreg.score(X_Test, y_Test))

print("Best parameters set found on development set:")
print(result[0])
print("Test score: %f" % result[1])

SVR with grid search model selection

Best parameters set found on development set:
{'C': 2, 'coef0': 1, 'degree': 3, 'epsilon': 0.0001, 'gamma': 1, 'kernel': 'poly'}
Test score: -0.038165


**Normal generated NA, max censoring, outlier elimination, Standard scaler**

In [11]:
# Reload the dataset ('normal' NA policy, 'max' censoring policy, StandardScaler
# instance) and additionally pass a LocalOutlierFactor instance as the
# loader's outlier detector.
# In the recorded run this leaves 144 rows, versus 161 without a detector.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
    outlier_detector=LocalOutlierFactor(),
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(144, 16)

Sklearn GridSearchCV

In [12]:
# Model selection with scikit-learn's GridSearchCV on a single hold-out split:
# 7/8 of the samples are used for the cross-validated search, 1/8 is kept
# aside to score the refitted best model.
print("SVR with grid search model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the search.
c_grid = [0.5, 1, 2, 4, 8, 16]
epsilon_grid = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_grid = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_grid, 'epsilon': epsilon_grid, 'kernel': ['linear']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid, 'kernel': ['rbf']},
    {'C': c_grid, 'epsilon': epsilon_grid, 'gamma': gamma_grid,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# Fixed seed: without random_state the hold-out split (and therefore the test
# score printed below) changes on every run, which makes the comparison
# between preprocessing variants in this notebook unrepeatable.
# Outputs recorded before the seed was introduced will differ from a re-run.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, test_size=1/8, random_state=42
)

# cv=6 on a regressor means an unshuffled 6-fold split, so the search itself
# is deterministic once the hold-out split is fixed.
svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
svreg.fit(X_Train, y_Train)

# score() uses the best estimator refitted on the whole training part; for SVR
# with the default scoring this is R^2, so negative values mean "worse than
# predicting the mean".
result = (svreg.best_params_, svreg.score(X_Test, y_Test))

print("Best parameters set found on development set:")
print(result[0])
print("Test score: %f" % result[1])

SVR with grid search model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 10, 'degree': 2, 'epsilon': 0.0001, 'gamma': 0.1, 'kernel': 'poly'}
Test score: -0.237374


**Normal generated NA, max censoring, Standard scaler (scaling only the training set)**

In [13]:
# Reload the dataset with scaler=None ('normal' NA policy, 'max' censoring
# policy); a scaler is instead passed to the holdout search that consumes it.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=None,
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(161, 16)

In [14]:
# Repeated-holdout model selection restricted to the polynomial kernel, with
# the scaler handed to the search helper rather than to the dataset loader.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Single, reduced grid: polynomial kernel only.
param_grid = [
    {
        'C': [1, 5, 10],
        'epsilon': [0.01, 0.1, 1, 10, 50],
        'gamma': [0.01, 0.001, 0.0001],
        'degree': [2, 3],
        'coef0': [0, 1, 10, 100],
        'kernel': ['poly'],
    },
]

# NOTE(review): the scaler is passed as the StandardScaler class (not an
# instance), and the positional 10 and 15 are interpreted by the helper; both
# conventions are defined in the project module and are not visible here —
# TODO confirm.
result = SVR_gridsearch_holdout(
    X, y, svm.SVR, param_grid, 10, 15,
    runs=5,
    scaler=StandardScaler,
)

# The helper's result is indexed as (best parameters, test score).
best_params = result[0]
test_score = result[1]

print("Best parameters set found on development set:")
print(best_params)
print("Test score: %f" % test_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 10, 'coef0': 10, 'degree': 3, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'poly'}
Test score: -0.813652


**Normal generated NA, max censoring, outlier elimination, Standard scaler (outlier elimination and scaling done only on the training set)**

In [15]:
# Reload the dataset with only the NA and censoring policies specified; the
# scaler and outlier detector are left at the loader's defaults here and are
# passed to the holdout search instead.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
)

# Displayed as the cell output: (n_samples, n_features).
dogs.data.shape

(161, 16)

In [16]:
# Repeated-holdout model selection restricted to the polynomial kernel, with
# both the scaler and the outlier detector handed to the search helper rather
# than to the dataset loader.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Single, reduced grid: polynomial kernel only.
param_grid = [
    {
        'C': [1, 5, 10],
        'epsilon': [0.01, 0.1, 1, 10, 50],
        'gamma': [0.01, 0.001, 0.0001],
        'degree': [2, 3],
        'coef0': [0, 1, 10, 100],
        'kernel': ['poly'],
    },
]

# NOTE(review): the scaler is passed as a class while the outlier detector is
# passed as an instance, and the positional 10 and 15 are interpreted by the
# helper; these conventions live in the project module and are not visible
# here — TODO confirm.
result = SVR_gridsearch_holdout(
    X, y, svm.SVR, param_grid, 10, 15,
    runs=5,
    scaler=StandardScaler,
    outlier_detector=LocalOutlierFactor(),
)

# The helper's result is indexed as (best parameters, test score).
best_params = result[0]
test_score = result[1]

print("Best parameters set found on development set:")
print(best_params)
print("Test score: %f" % test_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 10, 'coef0': 100, 'degree': 3, 'epsilon': 1, 'gamma': 0.001, 'kernel': 'poly'}
Test score: -1.722171
