# Imports

In [2]:
import sys
import pathlib
import joblib
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import itertools

from functools import partial
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

if pathlib.Path().parent.resolve().absolute().as_posix() not in sys.path:
    sys.path.append(pathlib.Path().parent.resolve().absolute().as_posix())

from pilot import Pilot, ensemble, Tree

from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning

OUTPUTPATH = pathlib.Path().absolute() / 'Output'

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

%load_ext autoreload
%autoreload 2

# Functions

In [3]:
def highlight(data, kind='max', color='black', background_color='lightgreen'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'color: {}; background-color: {}'.format(color, background_color)
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        if kind == 'max':
            highlight = data == data.max()
        elif kind == 'min':
            highlight = data == data.min()
        return [attr if v else '' for v in highlight]
    else:  # from .apply(axis=None)
        if kind == 'max':
            highlight = data == data.max().max()
        elif kind == 'min':
            highlight = data == data.min().min()
        return pd.DataFrame(np.where(highlight, attr, ''),
                            index=data.index, columns=data.columns)

# Load data

In [4]:
DATAPATH = pathlib.Path().absolute() / 'Data'
datasets = {
    'abalone': {'categorical': [0]}, # sex
    'airfoil': {'categorical': [-1]},
    'Bias_correction_ucl': {'categorical': [0]}, # station 
    'bodyfat_preprocessed': {'categorical': [-1]},
    'boston': {'categorical': [3]}, # Charles river dummy
    'communities': {'categorical': [-1]}, # 119 = LemasGangUnitDeploy
    'concrete': {'categorical': [-1]},
    'diabetes': {'categorical': [-1]},
    'electricity': {'categorical': [-1]},
    'energy': {'categorical': [5]}, # Orientation (X6)
    'ga_preprocessed': {'categorical': [6]}, # Research
    'housing': {'categorical': [-1]},
    'ozone_preprocessed': {'categorical': [10, 11]},
    'residential': {'categorical': [-1]},
    'ribo_preprocessed': {'categorical': [-1]},
    'SeoulBikeData': {'categorical': [-1]}, # 10 = seasons
    'skills': {'categorical': [-1]},
    'superconductor': {'categorical': [-1]},
    'Walmart_preprocessed': {'categorical': [0]},
    'wine': {'categorical': [-1]}
}

In [5]:
for d in datasets:
    df = pd.read_csv(DATAPATH / f'{d}.csv')
    X = df.drop(columns='target').values
    y = df['target'].values
    datasets[d]['data'] = X, y


# Grid Search

## Run grid

In [70]:
OUTPUTPATH = pathlib.Path().absolute() / 'Output'

filename = OUTPUTPATH / 'rf_gridsearch_final.pkl'

if filename.exists():
    results = joblib.load(filename)
else:
    results = {}

In [74]:
datasets.keys(), results.keys()

(dict_keys(['abalone', 'airfoil', 'Bias_correction_ucl', 'bodyfat_preprocessed', 'boston', 'communities', 'concrete', 'diabetes', 'electricity', 'energy', 'ga_preprocessed', 'housing', 'ozone_preprocessed', 'residential', 'ribo_preprocessed', 'SeoulBikeData', 'skills', 'superconductor', 'Walmart_preprocessed', 'wine']),
 dict_keys(['abalone', 'airfoil', 'bodyfat_preprocessed', 'boston', 'concrete', 'diabetes', 'electricity', 'energy', 'ga_preprocessed', 'ozone_preprocessed', 'residential', 'SeoulBikeData', 'skills', 'Walmart_preprocessed', 'wine']))

In [72]:
datasets_to_ignore = ['superconductor', 'Bias_correction_ucl', 'communities', 'housing', 'ribo_preprocessed']

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (d, data) in enumerate(datasets.items()):
    print(i, d)
    if (d in datasets_to_ignore) or (d in results.keys()):
        continue
    pilot = GridSearchCV(
        estimator=ensemble.RandomForestPilot(),
        param_grid={
            'n_estimators': [100],
            'max_depth': [10],
            'truncation_factor': [1, 1.5],
            'min_sample_split': [2],
            'min_sample_leaf':  [1],
            'n_features': [0.3, 0.7, 1.0],
            'rel_tolerance': [0.01],
            'df_settings': [{'con': i, 'lin': j} for i, j in itertools.product([1, 5], [2, 5])],
            'min_unique_values_regression': [2, 5]
            
        }, 
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=-1,
        verbose=10
    )
    tree = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'n_estimators': [100],
            'max_depth': [10],
            'min_samples_split': [2],
            'min_samples_leaf':  [1],
            'max_features': [0.3, 0.7, 1.0]
        },
        scoring='neg_mean_squared_error', 
        cv=cv,
        n_jobs=-1
    )

    X, y = data['data']
    
    pilot.fit(X, y, categorical_idx=np.array(data['categorical']))
    tree.fit(X, y)

    
    print(d, round(pilot.best_score_, 2), round(tree.best_score_, 2))
    
    results[d] = {'pilot': pilot.cv_results_, 'rf': tree.cv_results_}
    joblib.dump(results, filename)

## Inspect results

In [7]:
OUTPUTPATH = pathlib.Path().absolute() / 'Output'
results = joblib.load( OUTPUTPATH / 'rf_gridsearch_final.pkl')

In [75]:
normalized_results = pd.DataFrame([{
    'dataset': d, 
    'pilot_mse': -np.nanmax(r['pilot']['mean_test_score']), 
    'rf_mse': -np.nanmax(r['rf']['mean_test_score'])
    } for d, r in results.items()]).set_index('dataset').apply(lambda row: row / row.min(), axis=1)

In [76]:
normalized_results.style.apply(partial(highlight, kind='min'), axis=1).format('{:.3f}')

Unnamed: 0_level_0,pilot_mse,rf_mse
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
abalone,1.0,1.01
airfoil,1.0,1.673
bodyfat_preprocessed,1.0,1.265
boston,1.053,1.0
concrete,1.0,1.159
diabetes,1.0,1.076
electricity,1.0,1.385
energy,1.0,1.583
ga_preprocessed,1.0,1.066
ozone_preprocessed,1.005,1.0


In [77]:
df = pd.concat([
    pd.concat([
        pd.DataFrame(results[d]['pilot']).assign(dataset=d).reset_index(),
        pd.json_normalize(pd.DataFrame(results[d]['pilot'])['param_df_settings']).rename(columns=lambda c: 'param_df_' + c)
    ], axis=1).assign(best_rf=np.nanmax(results[d]['rf']['mean_test_score'])) for d in results
]).groupby(
    ['dataset'] + [c for c in results['abalone']['pilot'].keys() if c.startswith('param_') and c != 'param_df_settings'] + ['param_df_con', 'param_df_lin']
)[['mean_test_score', 'best_rf']].mean() * -1

df['mse_ratio'] = df['mean_test_score'] / df['best_rf']
# remove param combo's that were not tested for all datasets
df = df['mse_ratio'].unstack(level=0).loc[lambda df: (df.index.get_level_values('param_max_depth') == 10) & ~(df.index.get_level_values('param_df_con').astype(str) + df.index.get_level_values('param_df_lin').astype(str)).isin(['22', '25'])]

df['average'] = df.mean(axis=1)
df['count_better'] = (df.drop(columns='average') < 1).sum(axis=1)

In [78]:
df.sort_values(['count_better', 'average'], ascending=[False, True]).T

param_max_depth,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
param_min_sample_leaf,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
param_min_sample_split,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
param_min_unique_values_regression,2,2,2,2,5,5,5,5,2,2,2,2,2,2,2,5,5,2,5,2,5
param_n_estimators,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
param_n_features,1.0,0.7,0.7,1.0,0.7,0.7,1.0,1.0,1.0,1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3
param_rel_tolerance,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
param_truncation_factor,1.5,1.0,1.5,1.0,1.0,1.5,1.0,1.5,1.0,1.5,...,1.5,1.5,1.0,1.5,1.0,1.5,1.5,1.0,1.0,1.5
param_df_con,5,5,5,5,5,5,5,5,5,5,...,5,5,1,1,1,1,1,1,1,1
param_df_lin,5,5,5,5,5,5,5,5,2,2,...,2,5,2,2,2,2,5,5,5,5
dataset,Unnamed: 1_level_10,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10,Unnamed: 20_level_10,Unnamed: 21_level_10
SeoulBikeData,0.786038,0.80553,0.794413,0.779449,0.938859,0.941188,0.899355,0.901317,0.696924,0.725416,...,1.554089,1.196334,1.670895,1.644263,1.850976,1.764512,1.84516,1.920155,1.899127,1.908037
Walmart_preprocessed,0.894787,0.865841,0.865535,0.866862,0.842733,0.840777,0.825848,0.813531,0.845026,0.855812,...,5.108469,1.754169,5.559884,5.832017,6.170271,6.103731,5.910075,6.390852,6.043235,6.006049
abalone,0.994255,0.992397,0.989938,1.000906,0.991328,0.995651,0.996559,0.999882,1.027321,1.027229,...,1.191904,1.002727,1.196213,1.202286,1.196531,1.198258,1.174188,1.18852,1.191786,1.17612
airfoil,0.600123,0.666019,0.669633,0.597814,0.688618,0.681741,0.632782,0.63659,1.35484,1.32232,...,5.685847,2.766779,5.988846,6.109742,6.824868,6.892946,6.646197,6.85493,6.68289,6.813786
bodyfat_preprocessed,0.91876,0.85121,0.904338,0.88192,0.908027,0.883396,0.920544,0.91305,0.811191,0.808981,...,0.861663,0.911535,0.913511,0.920924,0.922751,0.937337,0.998819,0.99821,1.026355,1.005672
boston,1.217297,1.193807,1.139658,1.249513,1.129794,1.119003,1.223981,1.211959,1.14624,1.220173,...,1.194463,1.135619,1.378884,1.339649,1.32092,1.322092,1.458401,1.371805,1.465751,1.462903
concrete,0.90565,0.862629,0.870268,0.909576,0.879964,0.883046,0.885322,0.879763,0.889854,0.9019,...,1.607506,1.048852,2.021578,2.048293,1.977683,2.019419,2.321249,2.365847,2.279432,2.393897
diabetes,0.995255,0.998078,0.990657,0.977797,0.963075,0.982103,0.975362,1.000564,0.933314,0.939668,...,0.945458,0.963343,0.976305,0.97478,0.986637,0.985789,1.031564,1.016921,1.020581,1.019263
electricity,0.736316,0.749665,0.768069,0.721931,0.753269,0.758483,0.722801,0.736272,0.790994,0.808643,...,1.879259,1.016131,1.901882,1.949037,1.932734,1.972985,2.047285,2.008533,2.013246,2.050369
energy,0.631806,0.760404,0.74501,0.669385,1.112337,1.102683,1.142238,1.123226,1.102708,1.107177,...,1.396889,1.201179,1.85077,1.740273,2.466897,2.686153,2.111633,2.80877,2.027249,2.975774


In [82]:
results['airfoil']['pilot'].keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_df_settings', 'param_max_depth', 'param_min_sample_leaf', 'param_min_sample_split', 'param_min_unique_values_regression', 'param_n_estimators', 'param_n_features', 'param_rel_tolerance', 'param_truncation_factor', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [83]:
df = pd.concat([
    pd.concat([
        pd.DataFrame(results[d]['pilot']).assign(dataset=d).reset_index(),
        pd.json_normalize(pd.DataFrame(results[d]['pilot'])['param_df_settings']).rename(columns=lambda c: 'param_df_' + c)
    ], axis=1) for d in results
]).groupby(
    ['dataset'] + [c for c in results['abalone']['pilot'].keys() if c.startswith('param_') and c != 'param_df_settings'] + ['param_df_con', 'param_df_lin']
)['mean_fit_time'].mean().unstack(level=0).loc[lambda df: (df.index.get_level_values('param_max_depth') == 10) & ~(df.index.get_level_values('param_df_con').astype(str) + df.index.get_level_values('param_df_lin').astype(str)).isin(['22', '25'])]

df['average'] = df.mean(axis=1)


In [86]:
df.sort_values('average', ascending=False).T

param_max_depth,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
param_min_sample_leaf,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
param_min_sample_split,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
param_min_unique_values_regression,2,2,5,2,5,2,2,2,5,5,5,5,5,5,5,2,2,2,5,2,5
param_n_estimators,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
param_n_features,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3
param_rel_tolerance,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
param_truncation_factor,1.5,1.0,1.0,1.5,1.5,1.0,1.0,1.5,1.0,1.5,...,1.5,1.5,1.0,1.0,1.0,1.5,1.5,1.5,1.0,1.0
param_df_con,5,5,5,1,5,1,5,5,1,1,...,5,1,5,1,1,1,1,1,1,1
param_df_lin,2,2,2,2,2,2,5,5,2,2,...,2,2,2,2,2,2,5,5,5,5
dataset,Unnamed: 1_level_10,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10,Unnamed: 20_level_10,Unnamed: 21_level_10
SeoulBikeData,591.946307,600.689023,468.056606,398.750226,500.705917,364.488468,462.007738,439.233831,349.738237,348.292085,...,64.368673,46.094328,69.045716,39.161579,46.509819,46.921479,42.212653,37.583595,39.867051,38.197229
Walmart_preprocessed,274.096758,273.585674,250.491879,198.881071,246.920173,193.200664,300.64897,295.260073,185.38844,179.923122,...,35.989835,20.314955,36.829382,20.447835,23.672229,21.627594,21.484598,18.372691,20.36107,18.248985
abalone,81.058273,83.525842,89.024745,104.315097,86.032984,93.768481,138.748985,137.566539,97.951054,100.913928,...,14.617115,18.204015,14.979194,18.49883,14.986057,16.512768,12.412461,12.355069,11.781692,11.984651
airfoil,33.653573,33.991733,14.096375,13.567928,14.09544,12.974417,42.253367,42.626163,8.088487,8.134761,...,2.260261,0.642218,2.201348,0.62667,0.781796,0.716614,0.692998,0.666799,0.700361,0.743537
bodyfat_preprocessed,21.863316,21.289592,20.712917,7.541303,20.83505,7.504929,19.464761,19.708444,7.246135,7.158293,...,5.472055,1.731611,5.581708,1.737034,2.036692,1.839187,2.450657,2.401819,2.45007,2.467045
boston,44.004106,43.222365,53.011499,26.96627,52.051166,26.499841,28.140735,28.703705,27.64826,27.191794,...,8.444519,4.763264,9.103645,4.514353,5.370272,4.771813,3.982225,3.813773,3.836697,3.933758
concrete,81.303153,77.932901,44.345036,29.181865,44.161832,28.889335,46.264091,46.802825,19.70821,19.339251,...,7.856649,3.678142,7.308229,3.265283,3.888737,4.042578,3.147216,3.784473,3.049297,4.761941
diabetes,11.34695,11.440345,10.848298,8.516861,10.142951,8.539936,20.887059,20.837168,7.890005,8.013408,...,2.695494,1.753757,2.332751,1.682986,1.786106,1.783543,1.658981,1.589022,1.683433,1.551615
electricity,2217.543054,2112.142107,2000.759984,2253.560109,1903.95461,2251.678108,1552.858779,1559.345844,1934.502613,1808.199017,...,368.436779,453.248081,352.462371,445.824077,350.393224,347.990498,330.599803,326.349911,313.453965,311.544171
energy,7.784366,7.476793,20.655138,3.465992,21.138024,3.130007,47.878762,46.60065,6.301603,6.086632,...,4.846357,0.9401,4.665933,1.060393,1.608203,1.598118,0.729104,0.975467,0.749886,0.938533


In [98]:
df.stack().reset_index().groupby(['dataset', 'param_min_unique_values_regression'])[0].mean().unstack()

param_min_unique_values_regression,2,5
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
SeoulBikeData,268.402972,235.964836
Walmart_preprocessed,147.363678,136.18023
abalone,59.721499,60.384274
airfoil,13.060156,10.218217
average,136.917102,128.725745
bodyfat_preprocessed,9.713924,9.67342
boston,19.422069,20.638361
concrete,25.321328,19.848678
diabetes,7.861885,7.638603
electricity,1133.118934,1062.935695


# Grid search difficult datasets

## Run grid

In [8]:
OUTPUTPATH = pathlib.Path().absolute() / 'Output'

filename = OUTPUTPATH / 'rf_gridsearch_final_other.pkl'

if filename.exists():
    results = joblib.load(filename)
else:
    results = {}

In [9]:
datasets.keys(), results.keys()

(dict_keys(['abalone', 'airfoil', 'Bias_correction_ucl', 'bodyfat_preprocessed', 'boston', 'communities', 'concrete', 'diabetes', 'electricity', 'energy', 'ga_preprocessed', 'housing', 'ozone_preprocessed', 'residential', 'ribo_preprocessed', 'SeoulBikeData', 'skills', 'superconductor', 'Walmart_preprocessed', 'wine']),
 dict_keys([]))

In [10]:
datasets_to_handle = ['Bias_correction_ucl', 'communities', 'housing', 'ribo_preprocessed']

In [None]:
cv = KFold(n_splits=4, shuffle=True, random_state=42)
for i, (d, data) in enumerate(datasets.items()):
    print(i, d)
    if (d not in datasets_to_handle) or (d in results.keys()):
        continue
    pilot = GridSearchCV(
        estimator=ensemble.RandomForestPilot(),
        param_grid={
            'n_estimators': [100],
            'max_depth': [10],
            'truncation_factor': [1, 1.5],
            'min_sample_split': [2],
            'min_sample_leaf':  [1],
            'n_features': [0.3, 0.7, 1.0],
            'rel_tolerance': [0.01],
            'df_settings': [{'con': 1, 'lin': 2}, {'con': 5, 'lin': 5}, {'con': 5, 'lin': 2}],
            'min_unique_values_regression': [4, 5]
            
        }, 
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=-1,
        verbose=10
    )
    tree = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'n_estimators': [100],
            'max_depth': [10],
            'min_samples_split': [2],
            'min_samples_leaf':  [1],
            'max_features': [0.3, 0.7, 1.0]
        },
        scoring='neg_mean_squared_error', 
        cv=cv,
        n_jobs=-1
    )

    X, y = data['data']
    
    pilot.fit(X, y, categorical_idx=np.array(data['categorical']))
    tree.fit(X, y)

    
    print(d, round(pilot.best_score_, 2), round(tree.best_score_, 2))
    
    results[d] = {'pilot': pilot.cv_results_, 'rf': tree.cv_results_}
    joblib.dump(results, filename)

## Inspect results

In [None]:
OUTPUTPATH = pathlib.Path().absolute() / 'Output'
results = joblib.load( OUTPUTPATH / 'rf_gridsearch_final_other.pkl')

In [13]:
normalized_results = pd.DataFrame([{
    'dataset': d, 
    'pilot_mse': -np.nanmax(r['pilot']['mean_test_score']), 
    'rf_mse': -np.nanmax(r['rf']['mean_test_score'])
    } for d, r in results.items()]).set_index('dataset').apply(lambda row: row / row.min(), axis=1)

In [14]:
normalized_results.style.apply(partial(highlight, kind='min'), axis=1).format('{:.3f}')

Unnamed: 0_level_0,pilot_mse,rf_mse
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
Bias_correction_ucl,1.0,1.18
communities,1.0,1.053


In [16]:
df = pd.concat([
    pd.concat([
        pd.DataFrame(results[d]['pilot']).assign(dataset=d).reset_index(),
        pd.json_normalize(pd.DataFrame(results[d]['pilot'])['param_df_settings']).rename(columns=lambda c: 'param_df_' + c)
    ], axis=1).assign(best_rf=np.nanmax(results[d]['rf']['mean_test_score'])) for d in results
]).groupby(
    ['dataset'] + [c for c in list(results.values())[0]['pilot'].keys() if c.startswith('param_') and c != 'param_df_settings'] + ['param_df_con', 'param_df_lin']
)[['mean_test_score', 'best_rf']].mean() * -1

df['mse_ratio'] = df['mean_test_score'] / df['best_rf']
# remove param combo's that were not tested for all datasets
df = df['mse_ratio'].unstack(level=0).loc[lambda df: (df.index.get_level_values('param_max_depth') == 10) & ~(df.index.get_level_values('param_df_con').astype(str) + df.index.get_level_values('param_df_lin').astype(str)).isin(['22', '25'])]

df['average'] = df.mean(axis=1)
df['count_better'] = (df.drop(columns='average') < 1).sum(axis=1)

In [17]:
df.sort_values(['count_better', 'average'], ascending=[False, True]).T

param_max_depth,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
param_min_sample_leaf,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
param_min_sample_split,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
param_min_unique_values_regression,4,4,5,4,5,5,4,5,5,4,4,4,5,4,5,4,5,5,4,5,4
param_n_estimators,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
param_n_features,0.3,0.3,0.3,0.7,0.3,0.7,0.7,0.7,0.7,0.7,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
param_rel_tolerance,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
param_truncation_factor,1.0,1.5,1.0,1.5,1.5,1.5,1.0,1.0,1.5,1.5,...,1.0,1.5,1.0,1.0,1.5,1.5,1.0,1.5,1.5,1.0
param_df_con,5,5,5,5,5,5,5,5,1,1,...,5,5,1,5,1,1,1,5,5,5
param_df_lin,2,2,2,2,2,2,2,2,2,2,...,2,2,2,5,2,2,2,5,5,5
dataset,Unnamed: 1_level_10,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10,Unnamed: 20_level_10,Unnamed: 21_level_10
Bias_correction_ucl,0.847544,0.865949,0.862982,0.881355,0.878173,0.886513,0.887175,0.885362,0.924,0.920827,...,0.976669,0.981002,1.01043,1.008894,1.009043,1.013988,1.014808,1.012589,1.01521,1.017895
communities,0.956044,0.95165,0.960536,0.949789,0.955659,0.957671,0.957504,0.959755,0.951283,0.96214,...,0.962599,0.959158,0.955646,0.9638,0.963685,0.962385,0.963174,0.965589,0.963722,0.972384
average,0.901794,0.9088,0.911759,0.915572,0.916916,0.922092,0.92234,0.922558,0.937641,0.941484,...,0.969634,0.97008,0.983038,0.986347,0.986364,0.988187,0.988991,0.989089,0.989466,0.99514
count_better,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Check old results

In [60]:
results_old = joblib.load( OUTPUTPATH / 'rf_gridsearch_incl_df_settings.pkl')

In [81]:
ds = 'ozone_preprocessed'
best_idx = results_old[ds]['pilot']['mean_test_score'].argmax()
results_old[ds]['pilot']['params'][best_idx]

{'df_settings': {'con': 5, 'lin': 5},
 'max_depth': 10,
 'min_sample_leaf': 1,
 'min_sample_split': 2,
 'n_estimators': 20,
 'n_features': 0.5,
 'rel_tolerance': 0.01,
 'truncation_factor': 1.5}