# Dataset Analysis 5: Repeated Holdout

In [2]:
import pandas as pd
import numpy as np
from datasets import *
from training import *

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.neighbors import LocalOutlierFactor

**Normal generated NA, max censoring, Standard scaler**

In [2]:
# Load the dogs dataset through the project loader with the settings named in
# this section's heading: NApolicy='normal', censoringPolicy='max' and a
# StandardScaler instance as the scaler.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [3]:
# Model selection with GridSearchCV, repeated on several random train/test
# splits.  The parameters of the repetition with the highest held-out score
# are kept and then re-evaluated with random_split_tests.
#
# NOTE(review): the "Test score" printed below is the maximum over the
# repetitions, so it is an optimistic figure; compare it with the mean score
# printed at the end of the cell.
print("SVR with grid search model selection\n")

# Seed NumPy's global RNG so the random splits drawn in this cell are the same
# on every run, independently of which cells ran before it.
# Assumes train_test_split is scikit-learn's (it reaches this notebook through
# a star import) and therefore draws from the global RNG when no random_state
# is given -- TODO confirm.
# NOTE(review): random_split_tests is defined elsewhere; confirm it also draws
# from NumPy's global RNG, otherwise the final mean still varies between runs.
np.random.seed(0)

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(5):
    # Hold out 1/8 of the samples; the grid search only sees the rest.
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    # Pair the selected parameters with their score on the held-out split.
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # Strict '>' keeps the earliest repetition on ties.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the selected parameters with the project's random_split_tests
# helper (the trailing 25 is presumably the number of random splits -- TODO
# confirm against the helper).
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 2, 'coef0': 10, 'degree': 2, 'epsilon': 10, 'gamma': 0.1, 'kernel': 'poly'}
Test score: -0.048444
Mean score of model on random test splits: -0.114932


Repeated Holdout

In [4]:
# Model selection with the project's SVR_gridsearch_holdout helper, run
# several times; the run with the highest returned score is kept and its
# parameters are re-evaluated with random_split_tests.
#
# NOTE(review): no RNG seed is set in this cell, so results may differ between
# runs unless the helpers seed themselves -- TODO confirm.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(10):
    # The meaning of the positional 10 and 15 is defined by the helper, which
    # is not visible here; index 1 of its result is used as the score.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15)
    is_improvement = curr_result[1] > best_result[1]
    if is_improvement:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the winning parameters with random_split_tests.
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 0.5, 'coef0': 0, 'degree': 3, 'epsilon': 0.0001, 'gamma': 1, 'kernel': 'poly'}
Test score: 0.234359
Mean score of model on random test splits: -1.402006


**Normal generated NA, drop censoring, Standard scaler**

In [5]:
# Load the dogs dataset through the project loader with the settings named in
# this section's heading: NApolicy='normal', censoringPolicy='drop' and a
# StandardScaler instance as the scaler.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='drop',
    scaler=StandardScaler(),
)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(118, 16)

Sklearn GridSearchCV

In [8]:
# Model selection with GridSearchCV, repeated on several random train/test
# splits.  The parameters of the repetition with the highest held-out score
# are kept and then re-evaluated with random_split_tests.
#
# NOTE(review): the "Test score" printed below is the maximum over the
# repetitions, so it is an optimistic figure; compare it with the mean score
# printed at the end of the cell.
print("SVR with grid search model selection\n")

# Seed NumPy's global RNG so the random splits drawn in this cell are the same
# on every run, independently of which cells ran before it.
# Assumes train_test_split is scikit-learn's (it reaches this notebook through
# a star import) and therefore draws from the global RNG when no random_state
# is given -- TODO confirm.
# NOTE(review): random_split_tests is defined elsewhere; confirm it also draws
# from NumPy's global RNG, otherwise the final mean still varies between runs.
np.random.seed(0)

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(10):
    # Hold out 1/8 of the samples; the grid search only sees the rest.
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    # Pair the selected parameters with their score on the held-out split.
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # Strict '>' keeps the earliest repetition on ties.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the selected parameters with the project's random_split_tests
# helper (the trailing 25 is presumably the number of random splits -- TODO
# confirm against the helper).
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 0, 'degree': 3, 'epsilon': 5, 'gamma': 0.1, 'kernel': 'poly'}
Test score: 0.006891
Mean score of model on random test splits: -0.075051


Repeated Holdout

In [6]:
# Model selection with the project's SVR_gridsearch_holdout helper, run
# several times; the run with the highest returned score is kept and its
# parameters are re-evaluated with random_split_tests.
#
# NOTE(review): no RNG seed is set in this cell, so results may differ between
# runs unless the helpers seed themselves -- TODO confirm.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(10):
    # The meaning of the positional 10 and 15 is defined by the helper, which
    # is not visible here; index 1 of its result is used as the score.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15)
    is_improvement = curr_result[1] > best_result[1]
    if is_improvement:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the winning parameters with random_split_tests.
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 10, 'degree': 1, 'epsilon': 0.0001, 'gamma': 1, 'kernel': 'poly'}
Test score: 0.250883
Mean score of model on random test splits: -0.017156


**Normal generated NA, max censoring, QuantileTransformer scaler**

In [55]:
# Load the dogs dataset through the project loader with the settings named in
# this section's heading: NApolicy='normal', censoringPolicy='max' and a
# QuantileTransformer instance as the scaler.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=QuantileTransformer(),
)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [56]:
# Model selection with GridSearchCV, repeated on several random train/test
# splits.  The parameters of the repetition with the highest held-out score
# are kept and then re-evaluated with random_split_tests.
#
# NOTE(review): the "Test score" printed below is the maximum over the
# repetitions, so it is an optimistic figure; compare it with the mean score
# printed at the end of the cell.
print("SVR with grid search model selection\n")

# Seed NumPy's global RNG so the random splits drawn in this cell are the same
# on every run, independently of which cells ran before it.
# Assumes train_test_split is scikit-learn's (it reaches this notebook through
# a star import) and therefore draws from the global RNG when no random_state
# is given -- TODO confirm.
# NOTE(review): random_split_tests is defined elsewhere; confirm it also draws
# from NumPy's global RNG, otherwise the final mean still varies between runs.
np.random.seed(0)

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(10):
    # Hold out 1/8 of the samples; the grid search only sees the rest.
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    # Pair the selected parameters with their score on the held-out split.
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # Strict '>' keeps the earliest repetition on ties.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the selected parameters with the project's random_split_tests
# helper (the trailing 25 is presumably the number of random splits -- TODO
# confirm against the helper).
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 0, 'degree': 2, 'epsilon': 10, 'gamma': 1, 'kernel': 'poly'}
Test score: 0.140830
Mean score of model on random test splits: -0.201410


**Normal generated NA, max censoring, outlier elimination, Standard scaler**

In [2]:
# Load the dogs dataset through the project loader with the settings named in
# this section's heading: NApolicy='normal', censoringPolicy='max', a
# StandardScaler instance as the scaler and a LocalOutlierFactor instance as
# the outlier detector.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
    outlier_detector=LocalOutlierFactor(),
)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(144, 16)

Sklearn GridSearchCV

In [3]:
# Model selection with GridSearchCV, repeated on several random train/test
# splits.  The parameters of the repetition with the highest held-out score
# are kept and then re-evaluated with random_split_tests.
#
# NOTE(review): the "Test score" printed below is the maximum over the
# repetitions, so it is an optimistic figure; compare it with the mean score
# printed at the end of the cell.
print("SVR with grid search model selection\n")

# Seed NumPy's global RNG so the random splits drawn in this cell are the same
# on every run, independently of which cells ran before it.
# Assumes train_test_split is scikit-learn's (it reaches this notebook through
# a star import) and therefore draws from the global RNG when no random_state
# is given -- TODO confirm.
# NOTE(review): random_split_tests is defined elsewhere; confirm it also draws
# from NumPy's global RNG, otherwise the final mean still varies between runs.
np.random.seed(0)

X, y = dogs.data, dogs.target

# Value lists shared by every kernel family in the grid.
C_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': C_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(10):
    # Hold out 1/8 of the samples; the grid search only sees the rest.
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    # Pair the selected parameters with their score on the held-out split.
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # Strict '>' keeps the earliest repetition on ties.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the selected parameters with the project's random_split_tests
# helper (the trailing 25 is presumably the number of random splits -- TODO
# confirm against the helper).
mean_score = random_split_tests(X, y, svm.SVR, best_result[0], 25)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 4, 'coef0': 100, 'degree': 3, 'epsilon': 10, 'gamma': 0.001, 'kernel': 'poly'}
Test score: 0.058900
Mean score of model on random test splits: -0.010849


**Normal generated NA, max censoring, Standard scaler (scaling only on trainset)**

In [2]:
# Load the dogs dataset through the project loader with NApolicy='normal',
# censoringPolicy='max' and scaler=None; in this section the scaler is passed
# to the model-selection helper in the next cell instead.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=None,
)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(161, 16)

In [3]:
# Model selection with the project's SVR_gridsearch_holdout helper, run
# several times with the StandardScaler class handed to the helper; the run
# with the highest returned score is kept and its parameters are re-evaluated
# with random_split_tests.
#
# NOTE(review): no RNG seed is set in this cell, so results may differ between
# runs unless the helpers seed themselves -- TODO confirm.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Single polynomial-kernel grid.
param_grid = [
    {
        'C': [1, 5, 10],
        'epsilon': [0.01, 0.1, 1, 10, 50],
        'gamma': [0.01, 0.001, 0.0001],
        'degree': [2, 3],
        'coef0': [0, 1, 10, 100],
        'kernel': ['poly'],
    },
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(5):
    # The meaning of the positional 10 and 15 is defined by the helper, which
    # is not visible here; index 1 of its result is used as the score.
    # The scaler is passed as a class, not an instance.
    curr_result = SVR_gridsearch_holdout(
        X, y, svm.SVR, param_grid, 10, 15,
        scaler=StandardScaler,
    )
    is_improvement = curr_result[1] > best_result[1]
    if is_improvement:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the winning parameters with random_split_tests, using the same
# scaler argument as the search.
mean_score = random_split_tests(
    X, y, svm.SVR, best_result[0], 25,
    scaler=StandardScaler,
)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 1, 'coef0': 0, 'degree': 2, 'epsilon': 10, 'gamma': 0.01, 'kernel': 'poly'}
Test score: -0.131758
Mean score of model on random test splits: -0.110786


**Normal generated NA, max censoring, outlier elimination, Standard scaler (doing outlier elimination and scaling only on trainset)**

In [7]:
# Load the dogs dataset through the project loader with NApolicy='normal' and
# censoringPolicy='max'.
#
# scaler=None is passed explicitly, as in the earlier "scaling only trainset"
# section, so that this cell does not depend on the loader's default scaler:
# per this section's heading, scaling and outlier elimination are meant to
# happen on the trainset only, and both are handed to the model-selection
# helper in the next cell.
# NOTE(review): presumably scaler=None disables scaling at load time --
# confirm against load_skl_dogs_2016.
dogs = load_skl_dogs_2016(NApolicy='normal', censoringPolicy='max', scaler=None)

# Last expression of the cell: display the shape of the loaded data.
dogs.data.shape

(161, 16)

In [8]:
# Model selection with the project's SVR_gridsearch_holdout helper, run
# several times with the StandardScaler class and a LocalOutlierFactor
# instance handed to the helper; the run with the highest returned score is
# kept and its parameters are re-evaluated with random_split_tests.
#
# NOTE(review): no RNG seed is set in this cell, so results may differ between
# runs unless the helpers seed themselves -- TODO confirm.
print("SVR with repeated holdout model selection\n")

X, y = dogs.data, dogs.target

# Single polynomial-kernel grid.
param_grid = [
    {
        'C': [1, 5, 10],
        'epsilon': [0.01, 0.1, 1, 10, 50],
        'gamma': [0.01, 0.001, 0.0001],
        'degree': [2, 3],
        'coef0': [0, 1, 10, 100],
        'kernel': ['poly'],
    },
]

# (params, score) placeholder; any score greater than -inf replaces it.
best_result = (0, -np.inf)

for i in range(5):
    # The meaning of the positional 10 and 15 is defined by the helper, which
    # is not visible here; index 1 of its result is used as the score.
    # A fresh LocalOutlierFactor instance is created for every call; the
    # scaler is passed as a class, not an instance.
    curr_result = SVR_gridsearch_holdout(
        X, y, svm.SVR, param_grid, 10, 15,
        scaler=StandardScaler,
        outlier_detector=LocalOutlierFactor(),
    )
    is_improvement = curr_result[1] > best_result[1]
    if is_improvement:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-evaluate the winning parameters with random_split_tests, using the same
# scaler and outlier-detector arguments as the search.
mean_score = random_split_tests(
    X, y, svm.SVR, best_result[0], 25,
    scaler=StandardScaler,
    outlier_detector=LocalOutlierFactor(),
)
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 1, 'coef0': 100, 'degree': 2, 'epsilon': 1, 'gamma': 0.001, 'kernel': 'poly'}
Test score: 0.143360
Mean score of model on random test splits: -0.173518
