# HoldOut Grid Search

###### importing libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from scipy.stats import pearsonr
import itertools
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

###### loading data

In [2]:
train = pd.read_excel('walkertrain.xlsx')
test = pd.read_excel('walkertest.xlsx')

In [3]:
x_tr = train[['x','y']]
y_tr = train[['v']]

x_te = test[['x','y']]
y_te = test['v']

###### Main grid search Code

In [4]:
# Define ranges for the chosen random forest hyperparameters 
hyperparams = {
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [3, 4],
}

lists = hyperparams.values()

# get all param combinations
param_combinations = list(itertools.product(*lists))
print(param_combinations)
print()
total_param_combinations = len(param_combinations)
print(total_param_combinations)

[(3, 3), (3, 4), (4, 3), (4, 4), (5, 3), (5, 4)]

6


In [5]:
def holdout_grid_search(model, X_train_hp, y_train_hp, X_val_hp, y_val_hp, hyperparams, fixed_hyperparams={}):
    '''
    Conduct hyperparameter grid search on hold out validation set. Use holdout validation.
    Hyperparameters are input as a dictionary mapping each hyperparameter name to the
    range of values they should iterate over. Use the cindex function as your evaluation
    function.

    Input:
        model: sklearn model
        X_train_hp (dataframe): dataframe for training set input variables
        y_train_hp (dataframe): dataframe for training set targets
        X_val_hp (dataframe): dataframe for validation set input variables
        y_val_hp (dataframe): dataframe for validation set targets
        hyperparams (dict): hyperparameter dictionary mapping hyperparameter
                            names to range of values for grid search
        fixed_hyperparams (dict): dictionary of fixed hyperparameters that
                                  are not included in the grid search

    Output:
        best_estimator (sklearn classifier): fitted sklearn classifier with best performance on
                                             validation set
        best_hyperparams (dict): hyperparameter dictionary mapping hyperparameter
                                 names to values in best_estimator
    '''
    best_estimator = None
    best_hyperparams = {}
    
    # hold best running score
    best_score = 0.0

    # get list of param values
    lists = hyperparams.values()
    
    # get all param combinations
    param_combinations = list(itertools.product(*lists))
    total_param_combinations = len(param_combinations)

    # iterate through param combinations
    for i, params in enumerate(param_combinations, 1):
        # fill param dict with params
        param_dict = {}
        for param_index, param_name in enumerate(hyperparams):
            param_dict[param_name] = params[param_index]
            
        # create estimator with specified params
        estimator = model(**param_dict, **fixed_hyperparams)

        # fit estimator
        estimator.fit(X_train_hp, y_train_hp)
        
        # get predictions on validation set
        preds = estimator.predict(X_val_hp)
        
        # compute cindex for predictions
        estimator_score = pearsonr(y_val_hp, preds)[0]

        print(f'[{i}/{total_param_combinations}] {param_dict}')
        print(f'Val pearsonr: {estimator_score}\n')

        # if new high score, update high score, best estimator
        # and best params 
        if estimator_score >= best_score:
                best_score = estimator_score
                best_estimator = estimator
                best_hyperparams = param_dict

    # add fixed hyperparamters to best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)
    
    return best_estimator, best_hyperparams

In [6]:
def random_forest_grid_search(X_train, y_train, X_val, y_val):

    # Define ranges for the chosen random forest hyperparameters 
    hyperparams = {

        # the maximum depth of trees in the forest (int)
        'max_depth': [None,5, 6, 7],
        'min_samples_leaf': [1, 2, 3],

    }

    
    fixed_hyperparams = {
        'random_state': 42,
        'n_estimators': 500,
    }
    
    rf = RandomForestRegressor

    best_rf, best_hyperparams = holdout_grid_search(rf, X_train, y_train,
                                                    X_val, y_val, hyperparams,
                                                    fixed_hyperparams)

    print(f"Best hyperparameters:\n{best_hyperparams}")

    
    y_train_best = best_rf.predict(X_train)
    print(f"Train pearsonr: {pearsonr(y_train, y_train_best)[0]}")

    y_val_best = best_rf.predict(X_val)
    print(f"Val pearsonr: {pearsonr(y_val, y_val_best)[0]}")
    
    # add fixed hyperparamters to best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)
    
    return best_rf, best_hyperparams

###### preparing data

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x_tr,y_tr,test_size=0.2)

In [8]:
x_train.shape

(376, 2)

In [9]:
###
dst_tr = distance.cdist(x_train,x_train, "euclidean")
xtrain = np.concatenate((x_train, dst_tr), axis = 1)

###
dst_te = distance.cdist(x_val,x_train, "euclidean")
xval = np.concatenate((x_val, dst_te), axis = 1)

In [10]:
xtrain.shape

(376, 378)

In [11]:
xval.shape

(94, 378)

In [12]:
y_train.shape

(376, 1)

In [13]:
y_val.shape

(94, 1)

###### Applying to data

In [14]:
best_rf, best_hyperparams = random_forest_grid_search(xtrain, y_train.values.ravel(), xval, y_val.values.ravel())

[1/12] {'max_depth': None, 'min_samples_leaf': 1}
Val pearsonr: 0.6839816114242607

[2/12] {'max_depth': None, 'min_samples_leaf': 2}
Val pearsonr: 0.6771305108482462

[3/12] {'max_depth': None, 'min_samples_leaf': 3}
Val pearsonr: 0.6708229743706622

[4/12] {'max_depth': 5, 'min_samples_leaf': 1}
Val pearsonr: 0.6652854071687462

[5/12] {'max_depth': 5, 'min_samples_leaf': 2}
Val pearsonr: 0.6639655366039244

[6/12] {'max_depth': 5, 'min_samples_leaf': 3}
Val pearsonr: 0.6584690091618948

[7/12] {'max_depth': 6, 'min_samples_leaf': 1}
Val pearsonr: 0.6730892718390785

[8/12] {'max_depth': 6, 'min_samples_leaf': 2}
Val pearsonr: 0.6704714779584129

[9/12] {'max_depth': 6, 'min_samples_leaf': 3}
Val pearsonr: 0.6639798858596782

[10/12] {'max_depth': 7, 'min_samples_leaf': 1}
Val pearsonr: 0.67947249599229

[11/12] {'max_depth': 7, 'min_samples_leaf': 2}
Val pearsonr: 0.6774154683616854

[12/12] {'max_depth': 7, 'min_samples_leaf': 3}
Val pearsonr: 0.6688098826069296

Best hyperparamete

###### test data final evaluation

In [19]:
###
dst_tr = distance.cdist(x_tr,x_tr, "euclidean")
xtrain = np.concatenate((x_tr, dst_tr), axis = 1)

###
dst_te = distance.cdist(x_te,x_tr, "euclidean")
xtest = np.concatenate((x_te, dst_te), axis = 1)

In [16]:
best_rf.fit(xtrain,y_tr)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [17]:
y_test_best = best_rf.predict(xtest)

In [18]:
print(f"Test pearsonr: {pearsonr(y_te.values, y_test_best)[0]}")

Test pearsonr: 0.794365874654315
