# Dataset Analysis 5: Repeated Holdout

In [1]:
import pandas as pd
import numpy as np
from datasets import *
from training import *

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.neighbors import LocalOutlierFactor

**Normal generated NA, max censoring, Standard scaler**

In [4]:
# Load the dogs dataset for this section: 'normal' NA policy, 'max' censoring
# policy, features passed through a StandardScaler instance.
# NOTE(review): the policy names are interpreted inside `load_skl_dogs_2016`
# (not visible here) - confirm their exact meaning there.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
)
# Last expression: show the shape of the loaded data array.
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [49]:
print("SVR with grid search model selection\n")

# Model selection by repeated random splits: on each split a 6-fold
# GridSearchCV is fitted on the training part and scored on the held-out part;
# the parameters from the best-scoring repetition are kept and then re-scored
# on a second series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids, written once so the
# grids cannot drift apart.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES, 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# NOTE(review): the other GridSearchCV sections of this notebook use 10 search
# repetitions; 5 is kept here to preserve the original cost of this cell.
N_SEARCH_REPEATS = 5
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# One seeded generator feeds every split, so re-running this cell draws the
# same sequence of splits instead of depending on leftover global RNG state.
rng = np.random.RandomState(0)

# (params, score) of the best repetition so far; -inf guarantees that the
# first real score replaces the placeholder.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8, random_state=rng)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # NOTE(review): the winner is chosen by its held-out score, so the
    # "Test score" printed below is the maximum over the repetitions and is
    # an optimistic estimate.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits.
# NOTE(review): these splits are drawn from the full (X, y), so they overlap
# with the data that was used to select the parameters.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 1, 'coef0': 100, 'degree': 3, 'epsilon': 10, 'gamma': 0.0001, 'kernel': 'poly'}
Test score: -0.016038
Mean score of model on random test splits: -0.116339


Repeated Holdout

In [5]:
print("SVR with repeated holdout model selection\n")

# Model selection through the project helper `SVR_gridsearch_holdout`, run
# several times; the best-scoring (params, score) pair is kept and the
# parameters are then re-scored on a series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

# Every grid lists the same keys; the linear and rbf grids give a single
# placeholder value to the SVR arguments those kernels do not use.
# NOTE(review): presumably the helper requires uniform keys - confirm in `training`.
param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': ['auto'],
     'degree': [0], 'coef0': [0], 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [0], 'coef0': [0], 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# Seeded generator for the evaluation splits below.
# NOTE(review): the splits drawn inside SVR_gridsearch_holdout are not
# controlled from here; seed them in the helper if full reproducibility is needed.
rng = np.random.RandomState(0)

# (params, score) of the best run so far; -inf lets the first real score win.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    # The helper's result is used as a (params, score) pair. The trailing
    # 10 and 15 are positional arguments defined by the helper.
    # NOTE(review): their meaning is not visible in this notebook - confirm.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15)
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits of the full data.
# NOTE(review): these splits overlap with the data used during selection.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 16, 'coef0': 100, 'degree': 2, 'epsilon': 0.0001, 'gamma': 0.1, 'kernel': 'poly'}
Test score: 0.031020
-0.09069422544979347
Mean score of model on random test splits: -0.090694


**Normal generated NA, drop censoring, Standard scaler**

In [2]:
# Reload the dogs dataset with the 'drop' censoring policy ('normal' NA
# policy, StandardScaler instance as scaler).
# NOTE(review): the policy names are interpreted inside `load_skl_dogs_2016`
# (not visible here) - confirm their exact meaning there.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='drop',
    scaler=StandardScaler(),
)
# Last expression: show the shape of the loaded data array.
dogs.data.shape

(118, 16)

Sklearn GridSearchCV

In [4]:
print("SVR with grid search model selection\n")

# Model selection by repeated random splits: on each split a 6-fold
# GridSearchCV is fitted on the training part and scored on the held-out part;
# the parameters from the best-scoring repetition are kept and then re-scored
# on a second series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids, written once so the
# grids cannot drift apart.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES, 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# One seeded generator feeds every split, so re-running this cell draws the
# same sequence of splits instead of depending on leftover global RNG state.
rng = np.random.RandomState(0)

# (params, score) of the best repetition so far; -inf guarantees that the
# first real score replaces the placeholder.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8, random_state=rng)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # NOTE(review): the winner is chosen by its held-out score, so the
    # "Test score" printed below is the maximum over the repetitions and is
    # an optimistic estimate.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits.
# NOTE(review): these splits are drawn from the full (X, y), so they overlap
# with the data that was used to select the parameters.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 16, 'coef0': 0, 'degree': 3, 'epsilon': 0.0001, 'gamma': 0.1, 'kernel': 'poly'}
Test score: 0.066022
Mean score of model on random test splits: -0.170716


Repeated Holdout

In [3]:
print("SVR with repeated holdout model selection\n")

# Model selection through the project helper `SVR_gridsearch_holdout`, run
# several times; the best-scoring (params, score) pair is kept and the
# parameters are then re-scored on a series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

# Every grid lists the same keys; the linear and rbf grids give a single
# placeholder value to the SVR arguments those kernels do not use.
# NOTE(review): presumably the helper requires uniform keys - confirm in `training`.
param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': ['auto'],
     'degree': [0], 'coef0': [0], 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [0], 'coef0': [0], 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# Seeded generator for the evaluation splits below.
# NOTE(review): the splits drawn inside SVR_gridsearch_holdout are not
# controlled from here; seed them in the helper if full reproducibility is needed.
rng = np.random.RandomState(0)

# (params, score) of the best run so far; -inf lets the first real score win.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    # The helper's result is used as a (params, score) pair. The trailing
    # 10 and 15 are positional arguments defined by the helper.
    # NOTE(review): their meaning is not visible in this notebook - confirm.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15)
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits of the full data.
# NOTE(review): these splits overlap with the data used during selection.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with repeated holdout model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 100, 'degree': 3, 'epsilon': 0.0001, 'gamma': 0.0001, 'kernel': 'poly'}
Test score: 0.279897
-0.09315949440376499
Mean score of model on random test splits: -0.093159


**Normal generated NA, max censoring, QuantileTransformer scaler**

In [55]:
# Reload the dogs dataset with a QuantileTransformer instance as scaler
# ('normal' NA policy, 'max' censoring policy).
# NOTE(review): the policy names are interpreted inside `load_skl_dogs_2016`
# (not visible here) - confirm their exact meaning there.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=QuantileTransformer(),
)
# Last expression: show the shape of the loaded data array.
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [56]:
print("SVR with grid search model selection\n")

# Model selection by repeated random splits: on each split a 6-fold
# GridSearchCV is fitted on the training part and scored on the held-out part;
# the parameters from the best-scoring repetition are kept and then re-scored
# on a second series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids, written once so the
# grids cannot drift apart.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES, 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# One seeded generator feeds every split, so re-running this cell draws the
# same sequence of splits instead of depending on leftover global RNG state.
rng = np.random.RandomState(0)

# (params, score) of the best repetition so far; -inf guarantees that the
# first real score replaces the placeholder.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8, random_state=rng)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # NOTE(review): the winner is chosen by its held-out score, so the
    # "Test score" printed below is the maximum over the repetitions and is
    # an optimistic estimate.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits.
# NOTE(review): these splits are drawn from the full (X, y), so they overlap
# with the data that was used to select the parameters.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 8, 'coef0': 0, 'degree': 2, 'epsilon': 10, 'gamma': 1, 'kernel': 'poly'}
Test score: 0.140830
Mean score of model on random test splits: -0.201410


**Normal generated NA, max censoring, outlier elimination, Standard scaler**

In [2]:
# Reload the dogs dataset and additionally hand a LocalOutlierFactor instance
# to the loader as outlier detector ('normal' NA policy, 'max' censoring
# policy, StandardScaler instance as scaler).
# NOTE(review): how the loader applies the detector is defined in
# `load_skl_dogs_2016` (not visible here) - confirm there.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
    outlier_detector=LocalOutlierFactor(),
)
# Last expression: show the shape of the loaded data array.
dogs.data.shape

(144, 16)

Sklearn GridSearchCV

In [3]:
print("SVR with grid search model selection\n")

# Model selection by repeated random splits: on each split a 6-fold
# GridSearchCV is fitted on the training part and scored on the held-out part;
# the parameters from the best-scoring repetition are kept and then re-scored
# on a second series of random splits.
X, y = dogs.data, dogs.target

# Candidate values shared by the three kernel grids, written once so the
# grids cannot drift apart.
C_VALUES = [0.5, 1, 2, 4, 8, 16]
EPSILON_VALUES = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
GAMMA_VALUES = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'kernel': ['linear']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES, 'kernel': ['rbf']},
    {'C': C_VALUES, 'epsilon': EPSILON_VALUES, 'gamma': GAMMA_VALUES,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# One seeded generator feeds every split, so re-running this cell draws the
# same sequence of splits instead of depending on leftover global RNG state.
rng = np.random.RandomState(0)

# (params, score) of the best repetition so far; -inf guarantees that the
# first real score replaces the placeholder.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=1/8, random_state=rng)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # NOTE(review): the winner is chosen by its held-out score, so the
    # "Test score" printed below is the maximum over the repetitions and is
    # an optimistic estimate.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits.
# NOTE(review): these splits are drawn from the full (X, y), so they overlap
# with the data that was used to select the parameters.
best_params = best_result[0]  # loop-invariant, looked up once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=EVAL_TEST_SIZE, random_state=rng)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 4, 'coef0': 100, 'degree': 3, 'epsilon': 10, 'gamma': 0.001, 'kernel': 'poly'}
Test score: 0.058900
Mean score of model on random test splits: -0.010849


**Normal generated NA, max censoring, Standard scaler (scaler fitted on the training set only)**

In [2]:
# Reload the dogs dataset with scaler=None ('normal' NA policy, 'max'
# censoring policy); scaling is handled later by the code that consumes
# this data rather than by the loader.
# NOTE(review): the policy names are interpreted inside `load_skl_dogs_2016`
# (not visible here) - confirm their exact meaning there.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=None,
)
# Last expression: show the shape of the loaded data array.
dogs.data.shape

(161, 16)

In [None]:
print("SVR with repeated holdout model selection\n")

# Model selection through the project helper `SVR_gridsearch_holdout` on
# unscaled data, handing it the StandardScaler class; the selected parameters
# are then re-scored on random splits where the scalers are fitted on the
# training part only.
X, y = dogs.data, dogs.target
param_grid = [
    {'C': [1, 5, 10, 50, 100], 'epsilon': [0.1, 1, 10], 'gamma': [0.01, 0.001, 0.0001],
     'degree': [2, 3], 'coef0': [0, 1, 10, 50, 100], 'kernel': ['poly']},
]

N_SEARCH_REPEATS = 10
N_EVAL_REPEATS = 10
EVAL_TEST_SIZE = 25  # integer test_size: absolute number of held-out samples

# Seeded generator for the evaluation splits below.
# NOTE(review): the splits drawn inside SVR_gridsearch_holdout are not
# controlled from here; seed them in the helper if full reproducibility is needed.
rng = np.random.RandomState(0)

# (params, score) of the best run so far; -inf lets the first real score win.
best_result = (0, -np.inf)

for i in range(N_SEARCH_REPEATS):
    # The helper's result is used as a (params, score) pair. The scaler is
    # passed as a class, not an instance. The 10 and 15 are positional
    # arguments defined by the helper.
    # NOTE(review): their meaning is not visible in this notebook - confirm.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15, scaler=StandardScaler)
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# Re-score the selected parameters on fresh random splits of the full data.
# NOTE(review): these splits overlap with the data used during selection.
best_params = best_result[0]      # loop-invariant, looked up once
y_float = y.astype('float64')     # loop-invariant cast, done once
eval_scores = []
for i in range(N_EVAL_REPEATS):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y_float, test_size=EVAL_TEST_SIZE, random_state=rng)
    # Fresh scalers per split, fitted on the training part only so that no
    # statistic of the test part reaches the model.
    X_Scaler, y_Scaler = StandardScaler(), StandardScaler()
    X_Train = X_Scaler.fit_transform(X_Train)
    # StandardScaler works on 2-D input: reshape the target to one column,
    # then flatten it back for SVR.
    y_Train = y_Scaler.fit_transform(y_Train.reshape((-1, 1))).ravel()
    X_Test = X_Scaler.transform(X_Test)
    y_Test = y_Scaler.transform(y_Test.reshape((-1, 1))).ravel()
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    eval_scores.append(best_svr.score(X_Test, y_Test))
# Averaging the collected scores keeps the divisor tied to the repeat count.
mean_score = float(np.mean(eval_scores))
print("Mean score of model on random test splits: %f" % mean_score)