# Hyperparameter search

## Setup

In [194]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt


from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import StackingClassifier

from sklearn.base import BaseEstimator, TransformerMixin

from IPython.display import display

from scipy.stats import uniform

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [4]:
# Sourced from the UCI website
from ucimlrepo import fetch_ucirepo 
  
spambase = fetch_ucirepo(id=94) 
  
full_X = spambase.data.features
full_y = spambase.data.targets 

full = full_X.join(full_y).drop_duplicates()
train, test = train_test_split(full, test_size=0.20, stratify=full['Class'], random_state=123)

y = train.pop('Class').reset_index(drop=True)
X = train.reset_index(drop=True)

test_y = test.pop('Class').reset_index(drop=True)
test_X = test.reset_index(drop=True)

In [32]:
outliers = pd.Series(0, index=np.arange(len(X)))
for col in X.columns:
    q = X[X[col]>0][col].quantile(0.99)
    outliers |= X[col] > q

X2 = X[~outliers].copy().reset_index(drop=True)
y2 = y[~outliers].copy().reset_index(drop=True)

outliers.sum()

377

In [6]:
# Log transform capital columns
def capital_log_transform(X):
    X = X.copy()
    rest_cols = X.filter(like='capital').columns
    X[rest_cols] = np.log(X[rest_cols])
    return X
CapitalLogTransformer = FunctionTransformer(capital_log_transform)

# Convert percentages to [0,1] range
def freq_percent_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] /= 100
    return X
FreqPercentTransformer = FunctionTransformer(capital_log_transform)

# Sqrt transform freq cols
def freq_sqrt_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] = np.sqrt(X[freq_cols])
    return X
FreqSqrtTransformer = FunctionTransformer(freq_sqrt_transform)

# log transform freq cols
def freq_log_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] = np.log(X[freq_cols]+0.001)
    return X
FreqLogTransformer = FunctionTransformer(freq_log_transform)

# Mark zero freq values
def freq_zero_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    # Name them zero_i so later transform steps dont mess it up
    zero_mark_cols =  'zero_i' + pd.Index(np.arange(len(freq_cols))).astype(str)
    X[zero_mark_cols] = X[freq_cols] > 0
    return X
FreqZeroTransformer = FunctionTransformer(freq_zero_transform)

In [7]:
def wrap_model(scaler=StandardScaler, transformers=[], poly=False):
    def inner(model):
        return make_pipeline(*transformers,
                             *([scaler()] if scaler else []), 
                             *([PolynomialFeatures(2, interaction_only=True)] if poly else []), 
                             model)
    return inner

default = wrap_model()

def test_model(model, X, y, preprocess=default, metric='accuracy', splits=5):
    # If no preprocessing, just use model
    pipeline = model
    if preprocess:
        pipeline = preprocess(model)
        
    kf = KFold(n_splits=splits, shuffle=True, random_state=0)
    cv_results = cross_val_score(
        pipeline,
        X,
        y,
        cv=kf,
        scoring=metric,
        n_jobs=-1)
    
    return cv_results

def test_models(models, X, y, metric='accuracy', splits=5, preprocess=default):
    for model in models:
        result = test_model(model, X, y, metric=metric, preprocess=preprocess).mean()
        print(f'{model}: {result}')

def baseline_models():
    return [
        # Dummy
        DummyClassifier(strategy='uniform'),
        # Linear
        LogisticRegression(max_iter=500),
        # Ensambles
        RandomForestClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42),
        ExtraTreesClassifier(random_state=42),
        HistGradientBoostingClassifier(random_state=42),
        # SVM
        SVC(),
        # KNN
        KNeighborsClassifier(),
    ]

In [15]:
best_transformers = [FreqZeroTransformer, FreqSqrtTransformer, CapitalLogTransformer]

In [33]:
neg_low_corr_cols = [
    'word_freq_parts', 'word_freq_direct', 'word_freq_table', 'char_freq_;', 'char_freq_('
]
pos_low_corr_cols = [ 'word_freq_3d' ]

pca = wrap_model(transformers=[CapitalLogTransformer, FreqSqrtTransformer])(PCA(n_components=2)).fit(X2[neg_low_corr_cols])
pca_out = pd.DataFrame(pca.transform(X2[neg_low_corr_cols]))

X_pca = X2.copy()
X_pca = X_pca.drop(neg_low_corr_cols, axis=1)
X_pca = X_pca.drop(pos_low_corr_cols, axis=1)
X_pca['pca0'] = pca_out[0]
X_pca['pca1'] = pca_out[1]

In [9]:
def pipeline_adjust_params(model_name, params):
    return {model_name+'__'+key: val for key,val in params.items()}

In [83]:
# { param_pipeline_name: regular_name }, for renaming params back to short form 
def pipeline_param_rename(col_name):
    return col_name.split('__')[-1]

In [86]:
def process_search_result(cv_results, n_largest=5):
    res = pd.DataFrame(cv_results).nlargest(n_largest, columns='mean_test_score')
    return res.filter(like='param').rename(pipeline_param_rename, axis=1).join(res['mean_test_score'])

## ExtraTrees

In [76]:
params = {
    'bootstrap': [True, False],
    'max_depth': list(range(5,25,10))+[None],
    'n_estimators': list(range(100, 500, 50)),
    'max_features': [None, 0.5, 'sqrt', 'log2'],
    'min_samples_split': [2,3,5]
}

full_params = pipeline_adjust_params('extratreesclassifier', params)

rand_search = RandomizedSearchCV(
    estimator=wrap_model(transformers=best_transformers)(ExtraTreesClassifier(random_state=42)),
    param_distributions=full_params,
    n_iter=100,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(X_pca,y2)
rand_search.best_params_

Fitting 4 folds for each of 100 candidates, totalling 400 fits


{'extratreesclassifier__n_estimators': 450,
 'extratreesclassifier__min_samples_split': 5,
 'extratreesclassifier__max_features': 'log2',
 'extratreesclassifier__max_depth': 15,
 'extratreesclassifier__bootstrap': False}

In [89]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,n_estimators,min_samples_split,max_features,max_depth,bootstrap,params,mean_test_score
68,450,5,log2,15.0,False,"{'extratreesclassifier__n_estimators': 450, 'e...",0.947399
26,350,5,sqrt,15.0,False,"{'extratreesclassifier__n_estimators': 350, 'e...",0.946728
13,300,3,log2,15.0,True,"{'extratreesclassifier__n_estimators': 300, 'e...",0.945756
16,350,3,log2,15.0,True,"{'extratreesclassifier__n_estimators': 350, 'e...",0.945618
72,450,2,sqrt,15.0,True,"{'extratreesclassifier__n_estimators': 450, 'e...",0.944928
18,100,2,log2,,True,"{'extratreesclassifier__n_estimators': 100, 'e...",0.944679
4,300,2,log2,15.0,False,"{'extratreesclassifier__n_estimators': 300, 'e...",0.944174
99,150,2,log2,,True,"{'extratreesclassifier__n_estimators': 150, 'e...",0.944081
49,300,5,log2,15.0,True,"{'extratreesclassifier__n_estimators': 300, 'e...",0.94355
43,100,5,sqrt,15.0,True,"{'extratreesclassifier__n_estimators': 100, 'e...",0.943428


In [93]:
params = pipeline_adjust_params('extratreesclassifier', {
    'bootstrap': [True, False],
    'max_depth': [None, 15],
    'n_estimators': [100, 150, 300, 400, 450, 500],
    'max_features': ['log2', 'sqrt'],
    'min_samples_split': [3,5]
})

grid_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(ExtraTreesClassifier(random_state=42)),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_pca,y2)
grid_search.best_params_

Fitting 4 folds for each of 96 candidates, totalling 384 fits


{'extratreesclassifier__bootstrap': False,
 'extratreesclassifier__max_depth': 15,
 'extratreesclassifier__max_features': 'log2',
 'extratreesclassifier__min_samples_split': 5,
 'extratreesclassifier__n_estimators': 400}

In [97]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,n_estimators,min_samples_split,max_features,max_depth,bootstrap,params,mean_test_score
68,450,5,log2,15.0,False,"{'extratreesclassifier__n_estimators': 450, 'e...",0.947399
26,350,5,sqrt,15.0,False,"{'extratreesclassifier__n_estimators': 350, 'e...",0.946728
13,300,3,log2,15.0,True,"{'extratreesclassifier__n_estimators': 300, 'e...",0.945756
16,350,3,log2,15.0,True,"{'extratreesclassifier__n_estimators': 350, 'e...",0.945618
72,450,2,sqrt,15.0,True,"{'extratreesclassifier__n_estimators': 450, 'e...",0.944928
18,100,2,log2,,True,"{'extratreesclassifier__n_estimators': 100, 'e...",0.944679
4,300,2,log2,15.0,False,"{'extratreesclassifier__n_estimators': 300, 'e...",0.944174
99,150,2,log2,,True,"{'extratreesclassifier__n_estimators': 150, 'e...",0.944081
49,300,5,log2,15.0,True,"{'extratreesclassifier__n_estimators': 300, 'e...",0.94355
43,100,5,sqrt,15.0,True,"{'extratreesclassifier__n_estimators': 100, 'e...",0.943428


In [227]:
best_model = [
    ExtraTreesClassifier(random_state=42, n_estimators=200),
    ExtraTreesClassifier(random_state=42, n_estimators=450),
    ExtraTreesClassifier(random_state=42, n_estimators=300, min_samples_split=3, max_features='sqrt', max_depth=15, bootstrap=True),
    ExtraTreesClassifier(random_state=42, n_estimators=350, min_samples_split=5, max_features='sqrt', max_depth=15),
    ExtraTreesClassifier(random_state=42, n_estimators=400, min_samples_split=5, max_features='log2', max_depth=15),
    ExtraTreesClassifier(random_state=42, n_estimators=450, min_samples_split=5, max_features='log2', max_depth=15)
]
test_models(best_model, X_pca, y2, metric='precision', preprocess=wrap_model(transformers=[FreqSqrtTransformer, CapitalLogTransformer]))

ExtraTreesClassifier(n_estimators=200, random_state=42): 0.9470413638129525
ExtraTreesClassifier(n_estimators=450, random_state=42): 0.9470142198471816
ExtraTreesClassifier(bootstrap=True, max_depth=15, min_samples_split=3,
                     n_estimators=300, random_state=42): 0.949792882152769
ExtraTreesClassifier(max_depth=15, min_samples_split=5, n_estimators=350,
                     random_state=42): 0.9513629400129296
ExtraTreesClassifier(max_depth=15, max_features='log2', min_samples_split=5,
                     n_estimators=400, random_state=42): 0.9538546572019031
ExtraTreesClassifier(max_depth=15, max_features='log2', min_samples_split=5,
                     n_estimators=450, random_state=42): 0.9537605762522225


In [228]:
best_model = [
    ExtraTreesClassifier(random_state=42, n_estimators=200),
    ExtraTreesClassifier(random_state=42, n_estimators=450),
    ExtraTreesClassifier(random_state=42, n_estimators=300, min_samples_split=3, max_features='sqrt', max_depth=15, bootstrap=True),
    ExtraTreesClassifier(random_state=42, n_estimators=350, min_samples_split=5, max_features='sqrt', max_depth=15),
    ExtraTreesClassifier(random_state=42, n_estimators=400, min_samples_split=5, max_features='log2', max_depth=15),
    ExtraTreesClassifier(random_state=42, n_estimators=450, min_samples_split=5, max_features='log2', max_depth=15)
]
test_models(best_model, X_pca, y2, metric='accuracy', preprocess=wrap_model(transformers=[FreqSqrtTransformer, CapitalLogTransformer]))

ExtraTreesClassifier(n_estimators=200, random_state=42): 0.9525245531850745
ExtraTreesClassifier(n_estimators=450, random_state=42): 0.9531928911619699
ExtraTreesClassifier(bootstrap=True, max_depth=15, min_samples_split=3,
                     n_estimators=300, random_state=42): 0.9448339205252901
ExtraTreesClassifier(max_depth=15, min_samples_split=5, n_estimators=350,
                     random_state=42): 0.9445011473972788
ExtraTreesClassifier(max_depth=15, max_features='log2', min_samples_split=5,
                     n_estimators=400, random_state=42): 0.944167257580918
ExtraTreesClassifier(max_depth=15, max_features='log2', min_samples_split=5,
                     n_estimators=450, random_state=42): 0.9431639130993126


In [154]:
best_etc = ExtraTreesClassifier(max_depth=15, 
                                max_features='log2', 
                                min_samples_split=5,
                                n_estimators=400,
                                random_state=42)

## LogisticRegression

In [120]:
params = pipeline_adjust_params('logisticregression', {
    'C': np.arange(0.1,1.1,0.1),
})

rand_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(LogisticRegression(max_iter=500)),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(X_pca,y2)
rand_search.best_params_

Fitting 4 folds for each of 10 candidates, totalling 40 fits


{'logisticregression__C': 0.1}

In [121]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,C,params,mean_test_score
0,0.1,{'logisticregression__C': 0.1},0.938724
1,0.2,{'logisticregression__C': 0.2},0.93815
2,0.3,{'logisticregression__C': 0.30000000000000004},0.936496
3,0.4,{'logisticregression__C': 0.4},0.934823
4,0.5,{'logisticregression__C': 0.5},0.934121
5,0.6,{'logisticregression__C': 0.6},0.932525
6,0.7,{'logisticregression__C': 0.7000000000000001},0.932418
7,0.8,{'logisticregression__C': 0.8},0.930054
8,0.9,{'logisticregression__C': 0.9},0.929343
9,1.0,{'logisticregression__C': 1.0},0.928467


In [130]:
params = pipeline_adjust_params('logisticregression', {
    'solver': ['saga'],
    'penalty': ['l2', 'l1'],
    'C': np.arange(0.1,1.1,0.1),
})

rand_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(LogisticRegression(max_iter=1000, solver='saga')),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(X_pca,y2)
rand_search.best_params_

Fitting 4 folds for each of 20 candidates, totalling 80 fits


{'logisticregression__C': 0.1,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'saga'}

In [131]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,C,penalty,solver,params,mean_test_score
0,0.1,l2,saga,"{'logisticregression__C': 0.1, 'logisticregres...",0.938724
5,0.3,l1,saga,"{'logisticregression__C': 0.30000000000000004,...",0.938717
1,0.1,l1,saga,"{'logisticregression__C': 0.1, 'logisticregres...",0.938598
3,0.2,l1,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.938587
2,0.2,l2,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.93815
9,0.5,l1,saga,"{'logisticregression__C': 0.5, 'logisticregres...",0.938127
7,0.4,l1,saga,"{'logisticregression__C': 0.4, 'logisticregres...",0.937901
13,0.7,l1,saga,"{'logisticregression__C': 0.7000000000000001, ...",0.936625
11,0.6,l1,saga,"{'logisticregression__C': 0.6, 'logisticregres...",0.936586
4,0.3,l2,saga,"{'logisticregression__C': 0.30000000000000004,...",0.936496


In [134]:
params = pipeline_adjust_params('logisticregression', {
    'solver': ['saga'],
    'penalty': ['elasticnet'],
    'C': np.arange(0.1,1.1,0.1),
    'l1_ratio': np.arange(0.1,1,0.1),
})

grid_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(LogisticRegression(max_iter=2000, solver='saga')),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_pca,y2)
grid_search.best_params_

Fitting 4 folds for each of 90 candidates, totalling 360 fits


{'logisticregression__C': 0.2,
 'logisticregression__l1_ratio': 0.7000000000000001,
 'logisticregression__penalty': 'elasticnet',
 'logisticregression__solver': 'saga'}

In [135]:
process_search_result(grid_search.cv_results_, 10)

Unnamed: 0,C,l1_ratio,penalty,solver,params,mean_test_score
15,0.2,0.7,elasticnet,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.939523
26,0.3,0.9,elasticnet,saga,"{'logisticregression__C': 0.30000000000000004,...",0.938772
25,0.3,0.8,elasticnet,saga,"{'logisticregression__C': 0.30000000000000004,...",0.938708
14,0.2,0.6,elasticnet,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.93869
17,0.2,0.9,elasticnet,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.93869
12,0.2,0.4,elasticnet,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.938646
16,0.2,0.8,elasticnet,saga,"{'logisticregression__C': 0.2, 'logisticregres...",0.938566
5,0.1,0.6,elasticnet,saga,"{'logisticregression__C': 0.1, 'logisticregres...",0.93852
32,0.4,0.6,elasticnet,saga,"{'logisticregression__C': 0.4, 'logisticregres...",0.938175
22,0.3,0.5,elasticnet,saga,"{'logisticregression__C': 0.30000000000000004,...",0.938075


In [151]:
best_model = [
    LogisticRegression(),
    LogisticRegression(C=0.1),
    LogisticRegression(C=0.01),
    LogisticRegression(C=0.2, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.1, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.05, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.01, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
]
test_models(best_model, X_pca, y2, metric='precision', preprocess=wrap_model(transformers=best_transformers))

LogisticRegression(): 0.9323722845824115
LogisticRegression(C=0.1): 0.9351368292420525
LogisticRegression(C=0.01): 0.9384845196344299
LogisticRegression(C=0.2, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9354646373059843
LogisticRegression(C=0.1, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9340496580757858
LogisticRegression(C=0.05, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9365553293401394
LogisticRegression(C=0.01, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9422004575806426


In [150]:
best_model = [
    LogisticRegression(),
    LogisticRegression(C=0.1),
    LogisticRegression(C=0.01),
    LogisticRegression(C=0.2, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.1, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.05, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
    LogisticRegression(C=0.01, l1_ratio=0.7, solver='saga', penalty='elasticnet', max_iter=1000),
]
test_models(best_model, X_pca, y2, metric='accuracy', preprocess=wrap_model(transformers=best_transformers))

LogisticRegression(): 0.9431689381968834
LogisticRegression(C=0.1): 0.9465100697371873
LogisticRegression(C=0.01): 0.9431655881318364
LogisticRegression(C=0.2, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9458434067928152
LogisticRegression(C=0.1, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9438344844529066
LogisticRegression(C=0.05, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9418272371455213
LogisticRegression(C=0.01, l1_ratio=0.7, max_iter=1000, penalty='elasticnet',
                   solver='saga'): 0.9297910117754785


In [152]:
best_lr = LogisticRegression(C=0.01)

## XGBoost

In [161]:
params = pipeline_adjust_params('xgbclassifier', {
    'max_depth': list(range(5,25,10))+[None],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'eta': [0.1, 0.3, 0.5],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [0.6, 0.8, 1.0]
})

rand_search = RandomizedSearchCV(
    estimator=wrap_model(transformers=best_transformers)(
        XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr', n_jobs=-1)
    ),
    param_distributions=params,
    n_iter=100,
    scoring = 'precision',
    cv = 4,
    n_jobs=2,
    verbose=1
)
rand_search.fit(X_pca,y2)
rand_search.best_params_

Fitting 4 folds for each of 100 candidates, totalling 400 fits


{'xgbclassifier__subsample': 0.6,
 'xgbclassifier__min_child_weight': 1.0,
 'xgbclassifier__max_depth': 15,
 'xgbclassifier__gamma': 2,
 'xgbclassifier__eta': 0.1}

In [162]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,subsample,min_child_weight,max_depth,gamma,eta,params,mean_test_score
32,0.6,1.0,15.0,2.0,0.1,"{'xgbclassifier__subsample': 0.6, 'xgbclassifi...",0.942489
1,0.8,0.8,,5.0,0.3,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.94231
64,1.0,0.8,,2.0,0.5,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.939985
11,0.8,1.0,,2.0,0.1,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.93982
29,1.0,0.6,5.0,5.0,0.5,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.939602
54,0.8,0.8,,5.0,0.1,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.939092
31,1.0,0.8,5.0,2.0,0.5,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.93906
19,1.0,0.8,5.0,5.0,0.3,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.93901
77,0.8,1.0,15.0,5.0,0.1,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.938787
70,0.8,1.0,,1.5,0.5,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.938719


In [165]:
params = pipeline_adjust_params('xgbclassifier', {
    'max_depth': [None, 5, 15],
    'gamma': [2, 5],
    'eta': [0.1, 0.3, 0.5],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [0.6, 0.8, 1.0]
})

grid_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(
        XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr', n_jobs=-1)
    ),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=2,
    verbose=1
)
grid_search.fit(X_pca,y2)
grid_search.best_params_

Fitting 4 folds for each of 162 candidates, totalling 648 fits


{'xgbclassifier__eta': 0.1,
 'xgbclassifier__gamma': 2,
 'xgbclassifier__max_depth': 15,
 'xgbclassifier__min_child_weight': 1.0,
 'xgbclassifier__subsample': 0.6}

In [166]:
process_search_result(grid_search.cv_results_, 10)

Unnamed: 0,eta,gamma,max_depth,min_child_weight,subsample,params,mean_test_score
24,0.1,2,15.0,1.0,0.6,"{'xgbclassifier__eta': 0.1, 'xgbclassifier__ga...",0.942489
85,0.3,5,,0.8,0.8,"{'xgbclassifier__eta': 0.3, 'xgbclassifier__ga...",0.94231
119,0.5,2,5.0,0.6,1.0,"{'xgbclassifier__eta': 0.5, 'xgbclassifier__ga...",0.942158
112,0.5,2,,0.8,0.8,"{'xgbclassifier__eta': 0.5, 'xgbclassifier__ga...",0.942062
64,0.3,2,5.0,0.6,0.8,"{'xgbclassifier__eta': 0.3, 'xgbclassifier__ga...",0.941191
15,0.1,2,5.0,1.0,0.6,"{'xgbclassifier__eta': 0.1, 'xgbclassifier__ga...",0.940684
67,0.3,2,5.0,0.8,0.8,"{'xgbclassifier__eta': 0.3, 'xgbclassifier__ga...",0.940678
82,0.3,5,,0.6,0.8,"{'xgbclassifier__eta': 0.3, 'xgbclassifier__ga...",0.940639
16,0.1,2,5.0,1.0,0.8,"{'xgbclassifier__eta': 0.1, 'xgbclassifier__ga...",0.94061
21,0.1,2,15.0,0.8,0.6,"{'xgbclassifier__eta': 0.1, 'xgbclassifier__ga...",0.94017


In [168]:
best_models = [
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr'),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.1, gamma=2, max_depth=15, min_child_weight=1, subsample=0.6),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.3, gamma=5, max_depth=None, min_child_weight=0.8, subsample=0.8),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.5, gamma=2, max_depth=5, min_child_weight=0.6, subsample=1),
]
test_models(best_models, X_pca, y2, metric='precision', preprocess=wrap_model(transformers=best_transformers))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='aucpr', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9368234843109354
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_

In [169]:
best_models = [
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr'),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.1, gamma=2, max_depth=15, min_child_weight=1, subsample=0.6),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.3, gamma=5, max_depth=None, min_child_weight=0.8, subsample=0.8),
    XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr',
                  eta=0.5, gamma=2, max_depth=5, min_child_weight=0.6, subsample=1),
]
test_models(best_models, X_pca, y2, metric='accuracy', preprocess=wrap_model(transformers=best_transformers))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='aucpr', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9465072780163147
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_

In [170]:
best_xg = XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='aucpr', 
                        eta=0.1, gamma=2, max_depth=15, min_child_weight=1, subsample=0.6)

## SVC

In [177]:
params = pipeline_adjust_params('svc', {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf','linear', 'poly']
})

grid_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(SVC()),
    param_grid=params,
    scoring = 'accuracy',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_pca,y2)
grid_search.best_params_

Fitting 4 folds for each of 75 candidates, totalling 300 fits


{'svc__C': 1, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}

In [178]:
process_search_result(grid_search.cv_results_, 10)

Unnamed: 0,C,gamma,kernel,params,mean_test_score
21,1.0,0.01,rbf,"{'svc__C': 1, 'svc__gamma': 0.01, 'svc__kernel...",0.95219
54,100.0,0.001,rbf,"{'svc__C': 100, 'svc__gamma': 0.001, 'svc__ker...",0.948513
38,10.0,0.01,poly,"{'svc__C': 10, 'svc__gamma': 0.01, 'svc__kerne...",0.948513
36,10.0,0.01,rbf,"{'svc__C': 10, 'svc__gamma': 0.01, 'svc__kerne...",0.947844
39,10.0,0.001,rbf,"{'svc__C': 10, 'svc__gamma': 0.001, 'svc__kern...",0.945838
72,1000.0,0.0001,rbf,"{'svc__C': 1000, 'svc__gamma': 0.0001, 'svc__k...",0.944503
24,1.0,0.001,rbf,"{'svc__C': 1, 'svc__gamma': 0.001, 'svc__kerne...",0.942829
42,10.0,0.0001,rbf,"{'svc__C': 10, 'svc__gamma': 0.0001, 'svc__ker...",0.942495
57,100.0,0.0001,rbf,"{'svc__C': 100, 'svc__gamma': 0.0001, 'svc__ke...",0.94216
1,0.1,1.0,linear,"{'svc__C': 0.1, 'svc__gamma': 1, 'svc__kernel'...",0.941827


In [182]:
best_models = [
    SVC(),
    SVC(C=1, gamma=0.01, kernel='rbf'),
    SVC(C=100, gamma=0.001, kernel='rbf'),
    SVC(C=10, gamma=0.01, kernel='poly'),
    SVC(C=10, gamma=0.01, kernel='rbf'),
]
test_models(best_models, X_pca, y2, metric='precision', preprocess=wrap_model(transformers=best_transformers))

SVC(): 0.9421013879609674
SVC(C=1, gamma=0.01): 0.9421013879609674
SVC(C=100, gamma=0.001): 0.9422759218253587
SVC(C=10, gamma=0.01, kernel='poly'): 0.950912169504359
SVC(C=10, gamma=0.01): 0.943290598656698


In [183]:
best_models = [
    SVC(),
    SVC(C=1, gamma=0.01, kernel='rbf'),
    SVC(C=100, gamma=0.001, kernel='rbf'),
    SVC(C=10, gamma=0.01, kernel='poly'),
    SVC(C=10, gamma=0.01, kernel='rbf'),
]
test_models(best_models, X_pca, y2, metric='accuracy', preprocess=wrap_model(transformers=best_transformers))

SVC(): 0.9505223309752597
SVC(C=1, gamma=0.01): 0.9505223309752597
SVC(C=100, gamma=0.001): 0.9501878828147247
SVC(C=10, gamma=0.01, kernel='poly'): 0.9515240004243415
SVC(C=10, gamma=0.01): 0.9535306893875524


In [181]:
best_svc = SVC(C=10, gamma=0.01, kernel='poly')

## HistGradientBoosting

In [186]:
params = pipeline_adjust_params('histgradientboostingclassifier', {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': list(range(5,25,10))+[None],
    'l2_regularization': [0, 0.1, 0.5, 1],
    'min_samples_leaf': [10, 20, 30],
    'max_bins': [126, 255],
})

rand_search = RandomizedSearchCV(
    estimator=wrap_model(transformers=best_transformers)(HistGradientBoostingClassifier(random_state=42)),
    param_distributions=params,
    n_iter=100,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(X_pca,y2)
rand_search.best_params_

Fitting 4 folds for each of 100 candidates, totalling 400 fits


{'histgradientboostingclassifier__min_samples_leaf': 10,
 'histgradientboostingclassifier__max_depth': 5,
 'histgradientboostingclassifier__max_bins': 126,
 'histgradientboostingclassifier__learning_rate': 0.2,
 'histgradientboostingclassifier__l2_regularization': 0.1}

In [187]:
process_search_result(rand_search.cv_results_, 10)

Unnamed: 0,min_samples_leaf,max_depth,max_bins,learning_rate,l2_regularization,params,mean_test_score
89,10,5,126,0.2,0.1,{'histgradientboostingclassifier__min_samples_...,0.941855
64,20,5,255,0.2,1.0,{'histgradientboostingclassifier__min_samples_...,0.940352
23,20,5,255,0.05,1.0,{'histgradientboostingclassifier__min_samples_...,0.939561
31,20,5,255,0.2,0.5,{'histgradientboostingclassifier__min_samples_...,0.939045
20,10,5,255,0.2,0.0,{'histgradientboostingclassifier__min_samples_...,0.938842
90,10,5,126,0.2,0.0,{'histgradientboostingclassifier__min_samples_...,0.938657
3,20,5,126,0.1,0.5,{'histgradientboostingclassifier__min_samples_...,0.938279
34,20,5,255,0.05,0.1,{'histgradientboostingclassifier__min_samples_...,0.937936
16,20,15,255,0.2,0.0,{'histgradientboostingclassifier__min_samples_...,0.937879
81,20,5,126,0.2,0.0,{'histgradientboostingclassifier__min_samples_...,0.937822


In [188]:
params = pipeline_adjust_params('histgradientboostingclassifier', {
    'learning_rate': [0.05, 0.2, 0.3],
    'max_depth': [None, 5],
    'l2_regularization': [0, 0.1, 0.5, 1],
    'min_samples_leaf': [10, 20],
    'max_bins': [126, 255],
})

grid_search = GridSearchCV(
    estimator=wrap_model(transformers=best_transformers)(HistGradientBoostingClassifier(random_state=42)),
    param_grid=params,
    scoring = 'precision',
    cv = 4,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_pca,y2)
grid_search.best_params_

Fitting 4 folds for each of 96 candidates, totalling 384 fits


{'histgradientboostingclassifier__l2_regularization': 0.1,
 'histgradientboostingclassifier__learning_rate': 0.2,
 'histgradientboostingclassifier__max_bins': 126,
 'histgradientboostingclassifier__max_depth': 5,
 'histgradientboostingclassifier__min_samples_leaf': 10}

In [189]:
process_search_result(grid_search.cv_results_, 10)

Unnamed: 0,l2_regularization,learning_rate,max_bins,max_depth,min_samples_leaf,params,mean_test_score
34,0.1,0.2,126,5,10,{'histgradientboostingclassifier__l2_regulariz...,0.941855
94,1.0,0.3,255,5,10,{'histgradientboostingclassifier__l2_regulariz...,0.940949
87,1.0,0.2,255,5,20,{'histgradientboostingclassifier__l2_regulariz...,0.940352
42,0.1,0.3,126,5,10,{'histgradientboostingclassifier__l2_regulariz...,0.939842
7,0.0,0.05,255,5,20,{'histgradientboostingclassifier__l2_regulariz...,0.939687
35,0.1,0.2,126,5,20,{'histgradientboostingclassifier__l2_regulariz...,0.939568
79,1.0,0.05,255,5,20,{'histgradientboostingclassifier__l2_regulariz...,0.939561
63,0.5,0.2,255,5,20,{'histgradientboostingclassifier__l2_regulariz...,0.939045
2,0.0,0.05,126,5,10,{'histgradientboostingclassifier__l2_regulariz...,0.938888
14,0.0,0.2,255,5,10,{'histgradientboostingclassifier__l2_regulariz...,0.938842


In [211]:
best_models = [
    HistGradientBoostingClassifier(random_state=42),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=0.1, learning_rate=0.2, max_bins=126, max_depth=5, min_samples_leaf=10),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=1, learning_rate=0.3, max_bins=255, max_depth=5, min_samples_leaf=10),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=1, learning_rate=0.2, max_bins=255, max_depth=5, min_samples_leaf=20),
]
test_models(best_models, X_pca, y2, metric='precision', preprocess=wrap_model(transformers=best_transformers))

HistGradientBoostingClassifier(random_state=42): 0.9379575281041278
HistGradientBoostingClassifier(l2_regularization=0.1, learning_rate=0.2,
                               max_bins=126, max_depth=5, min_samples_leaf=10,
                               random_state=42): 0.9365707352023593
HistGradientBoostingClassifier(l2_regularization=1, learning_rate=0.3,
                               max_depth=5, min_samples_leaf=10,
                               random_state=42): 0.9394159922309246
HistGradientBoostingClassifier(l2_regularization=1, learning_rate=0.2,
                               max_depth=5, random_state=42): 0.9381395616602595


In [212]:
best_models = [
    HistGradientBoostingClassifier(random_state=42),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=0.1, learning_rate=0.2, max_bins=126, max_depth=5, min_samples_leaf=10),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=1, learning_rate=0.3, max_bins=255, max_depth=5, min_samples_leaf=10),
    HistGradientBoostingClassifier(random_state=42, l2_regularization=1, learning_rate=0.2, max_bins=255, max_depth=5, min_samples_leaf=20),
]
test_models(best_models, X_pca, y2, metric='accuracy', preprocess=wrap_model(transformers=best_transformers))

HistGradientBoostingClassifier(random_state=42): 0.9475139725629672
HistGradientBoostingClassifier(l2_regularization=0.1, learning_rate=0.2,
                               max_bins=126, max_depth=5, min_samples_leaf=10,
                               random_state=42): 0.9481795188189904
HistGradientBoostingClassifier(l2_regularization=1, learning_rate=0.3,
                               max_depth=5, min_samples_leaf=10,
                               random_state=42): 0.9475128558746183
HistGradientBoostingClassifier(l2_regularization=1, learning_rate=0.2,
                               max_depth=5, random_state=42): 0.9461728298557798


In [213]:
best_hgbc = HistGradientBoostingClassifier(l2_regularization=1, learning_rate=0.3,
                                           max_depth=5, min_samples_leaf=10, random_state=42)

## Results

In [214]:
display([
    best_lr,
    best_xg,
    best_etc,
    best_svc,
    best_hgbc
])

[LogisticRegression(C=0.01),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eta=0.1, eval_metric='aucpr',
               feature_types=None, gamma=2, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=15,
               max_leaves=None, min_child_weight=1, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...),
 ExtraTreesClassifier(max_depth=15, max_features='log2', min_samples_split=5,
                      n_estimators=400, random_state=42),
 SVC(C=10, gamma=0.01, kernel='poly'),
 HistGradientBoostingClassifier(l2_regula