# Dataset Analysis 5: Repeated Holdout

In [1]:
import pandas as pd
import numpy as np
from datasets import *
from training import *

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

**Normal generated NA, max censoring**

In [2]:
# Load the dataset through load_skl_dogs_2016 with the 'normal' NA policy and
# the 'max' censoring policy, handing the loader a fresh StandardScaler.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='max',
    scaler=StandardScaler(),
)
# Last expression of the cell: show the shape of the loaded data.
dogs.data.shape

(161, 16)

Sklearn GridSearchCV

In [5]:
# Repeated-holdout model selection for an SVR with sklearn's GridSearchCV,
# followed by a re-evaluation of the selected hyper-parameters on new random
# splits of the same data.
print("SVR with grid search model selection\n")

# Seed NumPy's global RNG so that Restart & Run All reproduces the random
# splits drawn below.
# NOTE(review): this assumes train_test_split (it arrives through a star
# import) draws from NumPy's global RNG, as sklearn's implementation does
# when no random_state is passed -- confirm.
np.random.seed(0)

X, y = dogs.data, dogs.target

# Single source of truth for the number of repetitions: it is both the loop
# bound and the divisor of the running mean further down.
n_repeats = 10

# Hyper-parameter values shared by the per-kernel sub-grids.
c_values = [0.5, 1, 2, 4, 8, 16]
epsilon_values = [0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': c_values, 'epsilon': epsilon_values, 'kernel': ['linear']},
    {'C': c_values, 'epsilon': epsilon_values, 'gamma': gamma_values, 'kernel': ['rbf']},
    {'C': c_values, 'epsilon': epsilon_values, 'gamma': gamma_values,
     'degree': [1, 2, 3], 'coef0': [0, 1, 10, 100], 'kernel': ['poly']},
]

# (params, score) of the best repetition so far; -inf lets any real score win.
best_result = (0, -np.inf)

for i in range(n_repeats):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 1/8)
    svreg = GridSearchCV(svm.SVR(), param_grid, cv=6, n_jobs=8)
    svreg.fit(X_Train, y_Train)
    curr_result = (svreg.best_params_, svreg.score(X_Test, y_Test))
    # Keep the repetition with the highest held-out score.
    # NOTE(review): because the maximum over all repetitions is kept, the
    # "Test score" printed below is an optimistic figure, not an unbiased
    # estimate.
    if curr_result[1] > best_result[1]:
        best_result = curr_result

print("Best parameters set found on development set:")
print(best_result[0])
print("Test score: %f" % best_result[1])

# The selected parameters do not change inside the loop, so read them once.
best_params = best_result[0]

# Re-fit an SVR with the selected parameters on new random splits and average
# the held-out scores.
# NOTE(review): these splits are drawn from the same X, y that were used for
# the selection above, so this is not an evaluation on independent data.
mean_score = 0
for i in range(n_repeats):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=25)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    mean_score += best_svr.score(X_Test, y_Test)/n_repeats
print("Mean score of model on random test splits: %f" % mean_score)

SVR with grid search model selection

Best parameters set found on development set:
{'C': 2, 'coef0': 100, 'degree': 3, 'epsilon': 10, 'gamma': 0.0001, 'kernel': 'poly'}
Test score: 0.077740
Mean score of model on random test splits: -0.225159


Repeated Holdout

In [3]:
# Repeated holdout: run SVR_gridsearch_holdout ten times over the same grid
# and keep the run whose second element (the score) is highest.
X, y = dogs.data, dogs.target

# Candidate values reused by the sub-grids below.
c_values = (0.5, 1, 2, 4, 8, 16)
epsilon_values = (0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10)
gamma_values = (1, 0.1, 0.01, 0.001, 0.0001)

# All three sub-grids list the same six keys; the linear and rbf grids pin the
# parameters they do not vary to a single value.
param_grid = [
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=['auto'],
         degree=[0], coef0=[0], kernel=['linear']),
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=list(gamma_values),
         degree=[0], coef0=[0], kernel=['rbf']),
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=list(gamma_values),
         degree=[1, 2, 3], coef0=[0, 1, 10, 100], kernel=['poly']),
]

# Sentinel (params, score) pair: any real score beats -inf.
best_result = (0, -np.inf)

for trial in range(10):
    # NOTE(review): the trailing 20, 25 are positional arguments whose meaning
    # is defined by SVR_gridsearch_holdout, which is not visible here.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 20, 25)
    improved = curr_result[1] > best_result[1]
    best_result = curr_result if improved else best_result

print(best_result)

({'C': 1, 'coef0': 100, 'degree': 3, 'epsilon': 1, 'gamma': 0.001, 'kernel': 'poly'}, 0.17374976769518824)


In [4]:
# Re-fit an SVR with the selected parameters on ten random splits (25 test
# samples each) and average the held-out scores.
best_params = best_result[0]

split_scores = []
for i in range(10):
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=25)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    split_scores.append(best_svr.score(X_Test, y_Test))

# Accumulate score/10 in split order, starting from 0.
mean_score = 0
for score in split_scores:
    mean_score += score/10
print(mean_score)

-0.21681542458613356


**Normal generated NA, drop censoring**

In [8]:
# Reload the dataset through load_skl_dogs_2016, this time with the 'drop'
# censoring policy (NA policy still 'normal'), again with a fresh
# StandardScaler.
dogs = load_skl_dogs_2016(
    NApolicy='normal',
    censoringPolicy='drop',
    scaler=StandardScaler(),
)
# Last expression of the cell: show the shape of the loaded data.
dogs.data.shape

(118, 16)

In [9]:
# Repeated holdout on the currently loaded dataset: run SVR_gridsearch_holdout
# ten times over the same grid and keep the run whose second element (the
# score) is highest.
X, y = dogs.data, dogs.target

# Candidate values reused by the sub-grids below.
c_values = (0.5, 1, 2, 4, 8, 16)
epsilon_values = (0.0001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10)
gamma_values = (1, 0.1, 0.01, 0.001, 0.0001)

# All three sub-grids list the same six keys; the linear and rbf grids pin the
# parameters they do not vary to a single value.
param_grid = [
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=['auto'],
         degree=[0], coef0=[0], kernel=['linear']),
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=list(gamma_values),
         degree=[0], coef0=[0], kernel=['rbf']),
    dict(C=list(c_values), epsilon=list(epsilon_values), gamma=list(gamma_values),
         degree=[1, 2, 3], coef0=[0, 1, 10, 100], kernel=['poly']),
]

# Sentinel (params, score) pair: any real score beats -inf.
best_result = (0, -np.inf)

for trial in range(10):
    # NOTE(review): the trailing 10, 15 are positional arguments whose meaning
    # is defined by SVR_gridsearch_holdout, which is not visible here.
    curr_result = SVR_gridsearch_holdout(X, y, svm.SVR, param_grid, 10, 15)
    improved = curr_result[1] > best_result[1]
    best_result = curr_result if improved else best_result

print(best_result)

((8, 0, 3, 0.0001, 0.1, 'poly'), 0.01726788895510678)


In [10]:
# Re-fit an SVR with the selected parameters on ten random splits and average
# the held-out scores.
#
# best_result[0] may be either a dict of SVR keyword arguments or a positional
# tuple; both shapes appear in this notebook's recorded outputs. Normalise to
# a dict so the cell works with either. The tuple order below is the one the
# original positional indexing assumed.
svr_param_order = ('C', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel')

best_params = best_result[0]
if not isinstance(best_params, dict):
    if len(best_params) != len(svr_param_order):
        raise ValueError(
            "expected %d positional SVR parameters %r, got %r"
            % (len(svr_param_order), svr_param_order, best_params)
        )
    best_params = dict(zip(svr_param_order, best_params))

mean_score = 0
for i in range(10):
    # NOTE(review): test_size=25 matches the max-censoring evaluation, while
    # the search for this dataset was called with 10, 15 -- confirm whether 15
    # was intended here.
    X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=25)
    best_svr = svm.SVR(**best_params)
    best_svr.fit(X_Train, y_Train)
    mean_score += best_svr.score(X_Test, y_Test)/10
print(mean_score)

-0.12922320322938202
