In [None]:
# Enable autoreload so that edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install xgboost tqdm shap

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, TimeSeriesSplit, cross_validate
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy.special import logit, expit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error
import itertools
import shap
shap.initjs()
from datetime import datetime


# Show every column and up to 200 rows when a DataFrame is rendered.
# The fully-qualified option name is required: the short pattern 'max_columns'
# also matches 'styler.render.max_columns' in pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
# Register tqdm's pandas integration (progress bars for pandas apply-style calls).
tqdm.pandas()

# Import pickle file with features

In [None]:
# Load the pre-computed feature table from the bz2-compressed pickle.
# NOTE: unpickling executes arbitrary code - only load files from a trusted source.
features_pickle_path = './data/export_features_2016_2020.pkl.bz2'
df = pd.read_pickle(features_pickle_path)

In [None]:
# Print a summary of the loaded frame to check that the import worked.
df.info()

In [None]:
# Keep an independent (deep) copy of the full data set so it can be restored later.
df_orig = df.copy(deep=True)

In [None]:
# Restore the full data set. Take a copy instead of aliasing the backup:
# with `df = df_orig` every later in-place change to `df` (e.g. adding the
# y_clf1 / y_clf2 columns) would silently modify the backup as well.
df = df_orig.copy()

In [None]:
# Full feature list, assembled from its building blocks instead of being typed out.
# Order: distribution statistics for location / band / promoter, the three date
# parts, then the keyword flags for location and band.
_stat_suffixes = ['count', 'mean', 'std', 'min'] + ['{}%'.format(pct) for pct in range(5, 100, 5)] + ['max']

_location_keywords = [
    'kirche', 'hotel', 'theater', 'cafe', 'stadthalle', 'buergerhaus', 'club', 'gaststaette',
    'halle', 'festhalle', 'kurhaus', 'schloss', 'restaurant', 'kulturzentrum', 'festzelt',
    'musikschule', 'mehrzweckhalle', 'pub', 'bar', 'gasthaus', 'turnhalle', 'kulturhaus',
    'gymnasium', 'rathaus', 'gasthof', 'park', 'kabarett', 'schuetzenhalle', 'gemeindehalle',
    'gemeindehaus',
]

_band_keywords = [
    'musikverein', 'band', 'mv', 'duo', 'trio', 'musikkapelle', 'chor', 'blaskapelle',
    'orchester', 'stadtkapelle', 'gbr', 'jazz', 'kurorchester', 'amp', 'ensemble',
    'blasorchester', 'partyband', 'friends', 'blues', 'original', 'live', 'swing',
    'musikzug', 'solo', 'mgv', 'jugendkapelle', 'sound', 'harmonie', 'black', 'ev',
]

all_features = (
    ['{}_{}'.format(group, suffix)
     for group in ('location', 'band', 'promoter')
     for suffix in _stat_suffixes]
    + ['vg_datum_year', 'vg_datum_month', 'vg_datum_day_of_week']
    + ['location_{}'.format(keyword) for keyword in _location_keywords]
    + ['band_{}'.format(keyword) for keyword in _band_keywords]
)

# Explore Data

In [None]:
# Distribution of the amount within segment 3.
df.loc[df.amount_segment == 3, 'amount'].describe()

In [None]:
# Column-wise minimum per amount segment.
df.groupby('amount_segment').min()

In [None]:
# Column-wise maximum per amount segment.
df.groupby('amount_segment').max()

In [None]:
# Number of non-null values per column and amount segment.
df.groupby('amount_segment').count()

In [None]:
# How many segment-4 records have an amount above 150?
df.loc[df.amount.gt(150) & df.amount_segment.eq(4)].shape

# Get sample of the data

In [None]:
# Draw a 1% sample of the full data set, stratified by amount segment.
splitSample = StratifiedShuffleSplit(n_splits=1, test_size=0.01, random_state=42)

# Always sample from the untouched backup (df_orig). The original cell sampled
# from `df` and then overwrote `df`, so re-running it produced a 1% sample of a
# 1% sample. With n_splits=1 the splitter yields exactly one (train, test) pair.
_, sample_idx = next(splitSample.split(df_orig[all_features], df_orig.amount_segment))
df_sample = df_orig.iloc[sample_idx]

# Compare the segment counts of the full frame (red) with the sample (green).
plt.figure()
df_orig.amount_segment.astype(int).value_counts().sort_index().plot.bar(color='r')
df_sample.amount_segment.astype(int).value_counts().sort_index().plot.bar(color='g')

plt.title('Inkasso-Segment')
plt.legend(['Full DF', 'Sample DF'])
plt.show()

# From here on the notebook works on the sample only.
df = df_sample.copy()
df.info()

# Define Features

In [None]:
# The three entity-based feature groups that can be switched on or off per model.
feature_groups = ['location', 'band', 'promoter']

# Every non-empty subset of the groups, ordered by subset size (1, 2, 3).
feature_group_combinations = list(
    itertools.chain.from_iterable(
        itertools.combinations(feature_groups, size)
        for size in range(1, len(feature_groups) + 1)
    )
)


In [None]:
# Feature columns per group. Each entity group starts with the same 24
# distribution statistics; location and band additionally carry keyword flags.
_stat_suffixes = ['count', 'mean', 'std', 'min'] + ['{}%'.format(pct) for pct in range(5, 100, 5)] + ['max']

_location_keywords = [
    'kirche', 'hotel', 'theater', 'cafe', 'stadthalle', 'buergerhaus', 'club', 'gaststaette',
    'halle', 'festhalle', 'kurhaus', 'schloss', 'restaurant', 'kulturzentrum', 'festzelt',
    'musikschule', 'mehrzweckhalle', 'pub', 'bar', 'gasthaus', 'turnhalle', 'kulturhaus',
    'gymnasium', 'rathaus', 'gasthof', 'park', 'kabarett', 'schuetzenhalle', 'gemeindehalle',
    'gemeindehaus',
]

_band_keywords = [
    'musikverein', 'band', 'mv', 'duo', 'trio', 'musikkapelle', 'chor', 'blaskapelle',
    'orchester', 'stadtkapelle', 'gbr', 'jazz', 'kurorchester', 'amp', 'ensemble',
    'blasorchester', 'partyband', 'friends', 'blues', 'original', 'live', 'swing',
    'musikzug', 'solo', 'mgv', 'jugendkapelle', 'sound', 'harmonie', 'black', 'ev',
]


def _prefixed(prefix, suffixes):
    """Return the column names '<prefix>_<suffix>' in the order of `suffixes`."""
    return ['{}_{}'.format(prefix, suffix) for suffix in suffixes]


features = {
    'location': _prefixed('location', _stat_suffixes + _location_keywords),
    'band': _prefixed('band', _stat_suffixes + _band_keywords),
    'promoter': _prefixed('promoter', _stat_suffixes),
    'date': ['vg_datum_year', 'vg_datum_month', 'vg_datum_day_of_week'],
}

In [None]:
# Feature list per model: the date features first, followed by the columns of
# every group that is part of the combination (in the order of feature_groups).
# Model names are the group names joined by '_', e.g. 'location_band'.
model_features = {
    "_".join(combination): features['date'] + [
        column
        for group in feature_groups if group in combination
        for column in features[group]
    ]
    for combination in feature_group_combinations
}

# Define Models

In [None]:
# One independent, untrained estimator per feature-group combination and task.
# All classifiers share one parameter set and all regressors another one.
classifier_params = dict(n_estimators=1100, max_depth=9, use_label_encoder=False,
                         objective='binary:logistic', eval_metric='error',
                         tree_method='gpu_hist')
regressor_params = dict(n_estimators=700, max_depth=7, min_child_weight=5,
                        objective='reg:squarederror',
                        tree_method='gpu_hist')

model_names = ["_".join(combination) for combination in feature_group_combinations]

clf1_models = {name: xgb.XGBClassifier(**classifier_params) for name in model_names}
clf2_models = {name: xgb.XGBClassifier(**classifier_params) for name in model_names}
reg2_models = {name: xgb.XGBRegressor(**regressor_params) for name in model_names}
reg3_models = {name: xgb.XGBRegressor(**regressor_params) for name in model_names}
reg2_logit_models = {name: xgb.XGBRegressor(**regressor_params) for name in model_names}
reg3_logit_models = {name: xgb.XGBRegressor(**regressor_params) for name in model_names}

# Prepare DataFrame for Classification Model

We keep only features + 'amount' and 'amount_segment' columns.

Encode segments for Classifier 1:
* Variable y_clf1 
* Positive class (seg 4+) y_clf1=1
* Negative class (seg 2 or 3) y_clf1=0

Encode segments for Classifier 2: 
* Variable y_clf2
* Positive class (seg 3) y_clf2=1
* Negative class (seg 2) y_clf2=0
* Segments >3 are "encoded" as np.NaN. These NaN values will be dropped before training

In [None]:
def _encode_clf2_target(segment):
    """Map segment 3 -> 1, segment 2 -> 0 and every other segment -> NaN."""
    if segment == 3:
        return 1
    if segment == 2:
        return 0
    return np.nan


# Classifier 1 target: 1 for segment 4 and above, 0 for segments 2 and 3.
df['y_clf1'] = df.amount_segment.astype(int).gt(3).to_numpy().astype(int)
# Classifier 2 target: only defined for segments 2 and 3.
df['y_clf2'] = df.amount_segment.apply(_encode_clf2_target)

# Train and Test Classification Models

In order to have larger train and test sets, we use a cross-validation-like approach to verify the model performance.
1. We split the dataset into 5 train/test iterations with `TimeSeriesSplit(n_splits=5)`: each iteration trains on all rows that precede the test block, so the rows of the very first block are never part of a test set. Note that, unlike StratifiedKFold, this splitter does not keep the proportion of segments equal across folds.
2. In each fold we train and test both classifiers independently from each other
3. We save the predict_proba results from both classifiers
4. Train and test iteration numbers are also saved (in case we want to evaluate the results based on the train/test iteration)

In [None]:
# Walk-forward evaluation of every feature-group combination.
# For each combination we keep the rows with complete features, split them with
# TimeSeriesSplit and, per fold, fit both classifiers and the four segment
# regressors on the training rows and store their predictions for the test rows.
cv_model_eval = TimeSeriesSplit(n_splits=5)

target_cols = ['amount', 'amount_segment', 'y_clf1', 'y_clf2']
# Columns that are filled fold by fold; rows that never end up in a test fold keep NaN.
result_cols = ['y_pred_proba_clf1', 'y_pred_proba_clf2',
               'y_pred_reg2', 'y_pred_reg2_logit',
               'y_pred_reg3', 'y_pred_reg3_logit',
               'train_iter', 'test_iter']

df_dict = {}

for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Crossvalidate models for {}'.format(model_name))

    feature_cols = model_features[model_name]
    df_model = df[feature_cols + target_cols].dropna(subset=feature_cols).copy()
    for col in result_cols:
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
        df_model[col] = np.nan
    df_dict[model_name] = df_model

    # Positional column indices, needed because the folds are positional (iloc) indices.
    col_pos = {col: df_model.columns.get_loc(col) for col in result_cols}

    clf1 = clf1_models[model_name]
    clf2 = clf2_models[model_name]
    reg2 = reg2_models[model_name]
    reg2_logit = reg2_logit_models[model_name]
    reg3 = reg3_models[model_name]
    reg3_logit = reg3_logit_models[model_name]

    folds = cv_model_eval.split(df_model[feature_cols], df_model.amount_segment)
    for iter_nr, (train_idx, test_idx) in enumerate(tqdm(folds, total=cv_model_eval.n_splits)):
        print('Test iteration {}'.format(iter_nr))

        df_train = df_model.iloc[train_idx]
        df_test = df_model.iloc[test_idx]

        # The prediction set is the same for all models: every row of the test fold.
        X_test = df_test[feature_cols]

        # Classifier 1 uses all training rows.
        X_train_clf1 = df_train[feature_cols]
        y_train_clf1 = df_train.y_clf1
        y_test_clf1 = df_test.y_clf1

        # Classifier 2 only uses segments 2 and 3 (rows where y_clf2 is not NaN).
        df_train_clf2 = df_train.dropna(subset=['y_clf2'])
        df_test_clf2 = df_test.dropna(subset=['y_clf2'])
        X_train_clf2 = df_train_clf2[feature_cols]
        y_train_clf2 = df_train_clf2.y_clf2

        # Regression seg. 2: only segment 2 with 0.00001 <= amount <= 49.9999,
        # so that amount/50 stays strictly inside (0, 1) for the logit transform.
        df_train_reg2 = df_train[(df_train['amount_segment'] == 2) &
                                 (df_train['amount'] >= 0.00001) &
                                 (df_train['amount'] <= 49.9999)]
        X_train_reg2 = df_train_reg2[feature_cols]
        y_train_reg2 = df_train_reg2.amount
        y_train_reg2_logit = logit(y_train_reg2 / 50)

        df_test_reg2 = df_test[(df_test['amount_segment'] == 2) &
                               (df_test['amount'] >= 0.00001) &
                               (df_test['amount'] <= 49.9999)]
        X_test_reg2 = df_test_reg2[feature_cols]
        y_test_reg2 = df_test_reg2.amount
        y_test_reg2_logit = logit(y_test_reg2 / 50)

        # Regression seg. 3: only segment 3 with 50 < amount <= 99.9999,
        # so that (amount-50)/50 stays strictly inside (0, 1) for the logit transform.
        df_train_reg3 = df_train[(df_train['amount_segment'] == 3) &
                                 (df_train['amount'] > 50) &
                                 (df_train['amount'] <= 99.9999)]
        X_train_reg3 = df_train_reg3[feature_cols]
        y_train_reg3 = df_train_reg3.amount
        y_train_reg3_logit = logit((y_train_reg3 - 50) / 50)

        df_test_reg3 = df_test[(df_test['amount_segment'] == 3) &
                               (df_test['amount'] > 50) &
                               (df_test['amount'] <= 99.9999)]
        X_test_reg3 = df_test_reg3[feature_cols]
        y_test_reg3 = df_test_reg3.amount
        y_test_reg3_logit = logit((y_test_reg3 - 50) / 50)

        # Fit and test the models

        # Classifier 1
        clf1.fit(X_train_clf1, y_train_clf1)
        y_pred_proba_clf1 = clf1.predict_proba(X_test)[:, 1]
        print("CLF1 Train Score: {}".format(clf1.score(X_train_clf1, y_train_clf1)))
        print("CLF1 Test Score: {}".format(clf1.score(X_test, y_test_clf1)))

        # Classifier 2 (scored on segments 2/3 only, but predicts for the whole test fold)
        clf2.fit(X_train_clf2, y_train_clf2)
        y_pred_proba_clf2 = clf2.predict_proba(X_test)[:, 1]
        print("CLF2 Train Score: {}".format(clf2.score(X_train_clf2, y_train_clf2)))
        print("CLF2 Test Score: {}".format(clf2.score(df_test_clf2[feature_cols], df_test_clf2.y_clf2)))

        # Regression Segment 2
        reg2.fit(X_train_reg2, y_train_reg2)
        y_pred_reg2 = reg2.predict(X_test)
        print("REG2 Train Score: {}".format(reg2.score(X_train_reg2, y_train_reg2)))
        print("REG2 Test Score: {}".format(reg2.score(X_test_reg2, y_test_reg2)))

        # Regression Segment 2 with logit transformation (predictions mapped back to 0..50)
        reg2_logit.fit(X_train_reg2, y_train_reg2_logit)
        y_pred_reg2_logit = reg2_logit.predict(X_test)
        y_pred_reg2_logit_transf = expit(y_pred_reg2_logit) * 50
        print("REG2_Logit Train Score: {}".format(reg2_logit.score(X_train_reg2, y_train_reg2_logit)))
        print("REG2_Logit Test Score: {}".format(reg2_logit.score(X_test_reg2, y_test_reg2_logit)))

        # Regression Segment 3
        reg3.fit(X_train_reg3, y_train_reg3)
        y_pred_reg3 = reg3.predict(X_test)
        print("REG3 Train Score: {}".format(reg3.score(X_train_reg3, y_train_reg3)))
        print("REG3 Test Score: {}".format(reg3.score(X_test_reg3, y_test_reg3)))

        # Regression Segment 3 with logit transformation (predictions mapped back to 50..100)
        reg3_logit.fit(X_train_reg3, y_train_reg3_logit)
        y_pred_reg3_logit = reg3_logit.predict(X_test)
        y_pred_reg3_logit_transf = expit(y_pred_reg3_logit) * 50 + 50
        print("REG3_Logit Train Score: {}".format(reg3_logit.score(X_train_reg3, y_train_reg3_logit)))
        print("REG3_Logit Test Score: {}".format(reg3_logit.score(X_test_reg3, y_test_reg3_logit)))

        # Save the prediction results in separate columns. The probabilities are
        # rounded to 8 decimals once per fold; the original re-formatted the whole
        # column through strings after every fold.
        fold_results = {
            'y_pred_proba_clf1': np.round(y_pred_proba_clf1.astype(float), 8),
            'y_pred_proba_clf2': np.round(y_pred_proba_clf2.astype(float), 8),
            'y_pred_reg2': y_pred_reg2,
            'y_pred_reg2_logit': y_pred_reg2_logit_transf,
            'y_pred_reg3': y_pred_reg3,
            'y_pred_reg3_logit': y_pred_reg3_logit_transf,
            'test_iter': iter_nr,
        }
        for col, values in fold_results.items():
            df_model.iloc[test_idx, col_pos[col]] = values

        # A row is part of the training set in every fold after its own test fold,
        # so train_iter ends up holding the last iteration the row was trained on.
        df_model.iloc[train_idx, col_pos['train_iter']] = iter_nr

In [None]:
#Export dataframes with predictions for later
# Persist one prediction frame per model so that the expensive
# cross-validation does not have to be repeated.
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    export_path = './predictions_df/export_predictions_{}'.format(model_name)
    df_dict[model_name].to_pickle(export_path, protocol=4)

In [None]:
# Reload the exported prediction frames, keyed by model name.
df_dict = {
    model_name: pd.read_pickle('./predictions_df/export_predictions_{}'.format(model_name))
    for model_name in ("_".join(combination) for combination in feature_group_combinations)
}

In [None]:
# Inspect the frame stored under the 'location' model name.
df_dict['location']

# Threshold optimization

In [None]:
# Splitter with 5 splits, referenced by name in the threshold-optimisation loops
# (the misspelt name "treasholds" is kept because other cells use it).
cv_treasholds = TimeSeriesSplit(n_splits=5)

## Threshold optimization (clf1)

We optimize only the t_neg threshold, since for clf1 only the positive class is of importance. We want to prevent segments 4+ from being classified as segments 2 or 3, since this would mean "lost money".

In [None]:
# Candidate upper bounds for the false omission rate (FOR) of classifier 1:
# FOR = (# true positives among rows predicted negative) / (# rows predicted negative).
max_for_clf1_list = [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.09]
opt_t_neg_list_clf1 = {}
classifier_thresholds_clf1 = {}

for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Optimize threshold for CLF1 for {}'.format(model_name))

    feature_cols = model_features[model_name]

    # Work on a local frame. The original cell reset df_dict here, which threw away
    # the prediction columns (y_pred_proba_*, test_iter) that later cells read.
    df_model = df[feature_cols + ['amount', 'amount_segment', 'y_clf1', 'y_clf2']].dropna(subset=feature_cols)

    # Use all features, encode segment 4+ as positive class, otherwise negative
    X_train_clf1 = df_model[feature_cols]
    y_train_clf1 = df_model.y_clf1

    # Collect plain records and build the DataFrame once
    # (DataFrame.append was removed in pandas 2.0).
    opt_records = []

    for i in tqdm(range(5)):
        # we use all data for threshold optimization
        for train_idx, test_idx in tqdm(cv_treasholds.split(X_train_clf1, y_train_clf1), total=cv_treasholds.n_splits):

            clf1_models[model_name].fit(X_train_clf1.iloc[train_idx], y_train_clf1.iloc[train_idx])

            y_test_pred_proba_clf1 = clf1_models[model_name].predict_proba(X_train_clf1.iloc[test_idx])[:, 1]
            is_positive = y_train_clf1.iloc[test_idx].to_numpy() == 1

            for max_for_clf1 in max_for_clf1_list:
                # Reset per FOR limit so that no value leaks in from the previous limit
                # (the original initialised a misnamed variable once, outside this loop).
                opt_t_neg_clf1 = 0
                opt_for_clf1 = 0

                # Largest threshold (scanning upwards) before the FOR first exceeds the limit.
                for t_neg_clf1 in np.linspace(0, 1, 1001):
                    predicted_negative = y_test_pred_proba_clf1 <= t_neg_clf1
                    # 0/0 (nothing predicted negative yet) yields NaN, which compares
                    # False below and therefore never stops the scan.
                    with np.errstate(invalid='ignore', divide='ignore'):
                        for_clf1 = (predicted_negative & is_positive).sum() / predicted_negative.sum()
                    if for_clf1 > max_for_clf1:
                        break
                    opt_t_neg_clf1 = t_neg_clf1
                    opt_for_clf1 = for_clf1

                print(opt_t_neg_clf1, opt_for_clf1, max_for_clf1)
                opt_records.append({"max_for_clf1": max_for_clf1, "opt_t_neg_clf1": opt_t_neg_clf1})

    opt_t_neg_list_clf1[model_name] = pd.DataFrame(opt_records, columns=["max_for_clf1", "opt_t_neg_clf1"])
    opt_df = opt_t_neg_list_clf1[model_name]

    for max_for_clf1 in max_for_clf1_list:
        print('max_for_clf1 = {}'.format(max_for_clf1))
        display(opt_df[opt_df.max_for_clf1 == max_for_clf1].opt_t_neg_clf1.describe())

    # Final threshold per FOR limit: median over all repetitions and folds.
    classifier_thresholds_clf1[model_name] = pd.DataFrame(
        [{"max_for_clf1": max_for_clf1,
          "t_neg_clf1": np.median(opt_df[opt_df.max_for_clf1 == max_for_clf1].opt_t_neg_clf1)}
         for max_for_clf1 in max_for_clf1_list],
        columns=["max_for_clf1", "t_neg_clf1"])

    print("Classfifier Thresholds for {}".format(model_name))
    display(classifier_thresholds_clf1[model_name])

    #Save Thresholds
    classifier_thresholds_clf1[model_name].to_pickle(('./thresholds/export_thresholds_{}_clf1.pkl.bz2').format(model_name), protocol=4)



In [None]:
def create_conf_matrix_clf1(t_neg, model_name):
    """Return the classifier-1 confusion matrix of one model for a given threshold.

    Side effect: writes the thresholded prediction (1 if the stored probability is
    >= t_neg, else 0) into the 'y_pred_clf1' column of df_dict[model_name].

    Parameters:
        t_neg: probability threshold; probabilities >= t_neg are predicted positive (4+).
        model_name: key of the model's frame in df_dict.

    Returns:
        2x2 DataFrame of counts (rows: true class, columns: predicted class).
        Only rows that actually have a stored probability are counted.
    """
    df_model = df_dict[model_name]
    # NaN >= t_neg is False, so rows without a prediction get 0 in the column,
    # exactly as before.
    df_model['y_pred_clf1'] = (df_model['y_pred_proba_clf1'] >= t_neg).astype(int)

    # Rows that were never part of a test fold have no probability; counting them
    # as "predicted negative" would inflate the true/false negative cells.
    scored = df_model[df_model['y_pred_proba_clf1'].notna()]

    # labels=[0, 1] keeps the matrix 2x2 even if one class is missing.
    counts = confusion_matrix(scored.y_clf1, scored.y_pred_clf1, labels=[0, 1])
    df_confusion_clf1 = pd.DataFrame(counts,
                                     index=['true neg (2,3)', 'true pos (4+)'],
                                     columns=['pred. neg (2,3)', 'pred. pos (4+)'])

    return df_confusion_clf1


# Thresholds at which the confusion matrix is evaluated for the plots.
plot_thresholds = np.linspace(0, 1, 100)

for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot thresholds for {}'.format(model_name))

    correct_positive = []
    correct_negative = []
    false_positive = []
    false_negative = []

    for v in plot_thresholds:
        result = create_conf_matrix_clf1(v, model_name)
        # Normalise by the number of rows in the matrix so the four shares sum to 1.
        n_scored = max(result.to_numpy().sum(), 1)

        correct_positive.append(result.iloc[1, 1] / n_scored)
        correct_negative.append(result.iloc[0, 0] / n_scored)
        false_positive.append(result.iloc[0, 1] / n_scored)
        false_negative.append(result.iloc[1, 0] / n_scored)

    plt.figure(figsize=(20, 7))
    plt.stackplot(plot_thresholds, correct_positive, correct_negative, false_positive, false_negative,
                  labels=['Correct positive',
                          'Correct negative',
                          'False Positive (Extra work)',
                          'False Negative (Lost money)'])
    plt.title('CLF1 outcome shares by threshold ({})'.format(model_name))
    plt.xlabel('Threshold t_neg')
    plt.ylabel('Share of scored records')
    plt.legend(loc='lower right')
    plt.show()

## Threshold optimization (clf2)

Both segment 2 (negative class) and segment 3 (positive class) are equally important --> we need both t_neg and t_pos

In [None]:
#Define Thresholds (t_neg and t_pos) We want to minimize False Negative and False Positive
# FOR (false omission rate)  = share of true seg. 3 among rows predicted seg. 2 (proba <= t_neg)
# FDR (false discovery rate) = share of true seg. 2 among rows predicted seg. 3 (proba >= t_pos)

max_for_clf2_list = [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.09]
max_fdr_clf2_list = [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.09]

opt_t_neg_list_clf2 = {}
opt_t_pos_list_clf2 = {}

classifier_thresholds_clf2 = {}

for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Optimize threshold for CLF2 for {}'.format(model_name))

    feature_cols = model_features[model_name]

    #Use only data records from seg. 2 and 3. Use all features.
    df_seg23 = df_dict[model_name][~df_dict[model_name].y_clf2.isnull()]
    X_train_clf2 = df_seg23[feature_cols].dropna()
    #Encode segment 3 as 1 (positive class), seg. 2 as 0 (negative class)
    y_train_clf2 = df_seg23.dropna(subset=feature_cols).y_clf2

    # Collect plain records and build the DataFrames once
    # (DataFrame.append was removed in pandas 2.0).
    opt_t_neg_records = []
    opt_t_pos_records = []

    for i in tqdm(range(5)):
        for train_idx, test_idx in tqdm(cv_treasholds.split(X_train_clf2, y_train_clf2), total=cv_treasholds.n_splits):
            clf2_models[model_name].fit(X_train_clf2.iloc[train_idx], y_train_clf2.iloc[train_idx])

            y_test_pred_proba_clf2 = clf2_models[model_name].predict_proba(X_train_clf2.iloc[test_idx])[:, 1]
            y_test_clf2 = y_train_clf2.iloc[test_idx].to_numpy()
            is_seg3 = y_test_clf2 == 1
            is_seg2 = y_test_clf2 == 0

            for max_for_clf2 in max_for_clf2_list:
                opt_t_neg_clf2 = 0
                opt_for_clf2 = 0
                # Largest threshold (scanning upwards) before the FOR first exceeds the limit.
                for t_neg_clf2 in np.linspace(0, 1, 1001):
                    predicted_seg2 = y_test_pred_proba_clf2 <= t_neg_clf2
                    # 0/0 yields NaN, which compares False and does not stop the scan.
                    with np.errstate(invalid='ignore', divide='ignore'):
                        for_clf2 = (predicted_seg2 & is_seg3).sum() / predicted_seg2.sum()
                    if for_clf2 > max_for_clf2:
                        break
                    opt_t_neg_clf2 = t_neg_clf2
                    opt_for_clf2 = for_clf2

                print(opt_t_neg_clf2, opt_for_clf2, max_for_clf2)
                opt_t_neg_records.append({"max_for_clf2": max_for_clf2, "opt_t_neg_clf2": opt_t_neg_clf2})

            for max_fdr_clf2 in max_fdr_clf2_list:
                opt_t_pos_clf2 = 0
                opt_fdr_clf2 = 0
                # Smallest threshold (scanning upwards) at which the FDR meets the limit.
                # If the limit is never met the scan ends at 1.0 (NaN from 0/0 never
                # satisfies the comparison), i.e. the most conservative threshold.
                for t_pos_clf2 in np.linspace(0, 1, 1001):
                    predicted_seg3 = y_test_pred_proba_clf2 >= t_pos_clf2
                    with np.errstate(invalid='ignore', divide='ignore'):
                        fdr_clf2 = (predicted_seg3 & is_seg2).sum() / predicted_seg3.sum()
                    opt_t_pos_clf2 = t_pos_clf2
                    opt_fdr_clf2 = fdr_clf2
                    if fdr_clf2 <= max_fdr_clf2:
                        break
                print(opt_t_pos_clf2, opt_fdr_clf2, max_fdr_clf2)
                opt_t_pos_records.append({"max_fdr_clf2": max_fdr_clf2, "opt_t_pos_clf2": opt_t_pos_clf2})

    opt_t_neg_list_clf2[model_name] = pd.DataFrame(opt_t_neg_records, columns=["max_for_clf2", "opt_t_neg_clf2"])
    opt_t_pos_list_clf2[model_name] = pd.DataFrame(opt_t_pos_records, columns=["max_fdr_clf2", "opt_t_pos_clf2"])
    opt_neg_df = opt_t_neg_list_clf2[model_name]
    opt_pos_df = opt_t_pos_list_clf2[model_name]

    for max_for_clf2 in max_for_clf2_list:
        print('max_for_clf2 = {}'.format(max_for_clf2))
        display(opt_neg_df[opt_neg_df.max_for_clf2 == max_for_clf2].opt_t_neg_clf2.describe())

    for max_fdr_clf2 in max_fdr_clf2_list:
        print('max_fdr_clf2 = {}'.format(max_fdr_clf2))
        display(opt_pos_df[opt_pos_df.max_fdr_clf2 == max_fdr_clf2].opt_t_pos_clf2.describe())

    # Final thresholds: median over all repetitions and folds. The t_neg rows come
    # first, then the t_pos rows; columns not used by a row stay NaN.
    threshold_records = [
        {"max_for_clf2": max_for_clf2,
         "t_neg_clf2": np.median(opt_neg_df[opt_neg_df.max_for_clf2 == max_for_clf2].opt_t_neg_clf2)}
        for max_for_clf2 in max_for_clf2_list
    ] + [
        {"max_fdr_clf2": max_fdr_clf2,
         "t_pos_clf2": np.median(opt_pos_df[opt_pos_df.max_fdr_clf2 == max_fdr_clf2].opt_t_pos_clf2)}
        for max_fdr_clf2 in max_fdr_clf2_list
    ]
    classifier_thresholds_clf2[model_name] = pd.DataFrame(
        threshold_records,
        columns=["max_for_clf2", "t_neg_clf2", "max_fdr_clf2", "t_pos_clf2"])

    display(classifier_thresholds_clf2[model_name])

    #Save Thresholds
    classifier_thresholds_clf2[model_name].to_pickle(('./thresholds/export_thresholds_{}_clf2.pkl.bz2').format(model_name), protocol=4)


# Load Thresholds

In [None]:
# Read the stored threshold tables of both classifiers for every model.
threshold_path_template = './thresholds/export_thresholds_{}_{}.pkl.bz2'

classifier_thresholds_clf1 = {}
classifier_thresholds_clf2 = {}

for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)

    classifier_thresholds_clf1[model_name] = pd.read_pickle(threshold_path_template.format(model_name, 'clf1'))
    classifier_thresholds_clf2[model_name] = pd.read_pickle(threshold_path_template.format(model_name, 'clf2'))

# Create combined confusion matrix

In [None]:
# Target error rates referenced by the (commented-out) threshold lookups below.
# With the hard-coded thresholds they have no effect in this cell.
max_for_clf1 = 0.03
max_for_clf2 = 0.05 #False omission rate
max_fdr_clf2 = 0.05 #False discovery rate

# '2?' and '3?' are the CLF2 outcomes that fall between t_neg_clf2 and t_pos_clf2.
combined_labels = ['2', '2?', '3?', '3', '4+']
true_index = ['true ' + label for label in combined_labels]
pred_columns = ['pred ' + label for label in combined_labels]

for feature_group_combination in feature_group_combinations:
    model_name="_".join(feature_group_combination)
    print('Create combined confusion matrix for {}'.format(model_name))

    # Hard-coded thresholds; the trailing comments keep the lookup in the loaded
    # threshold tables that they replace.
    t_neg_clf1=0.32#classifier_thresholds_clf1[model_name][classifier_thresholds_clf1[model_name].max_for_clf1==max_for_clf1].t_neg_clf1.values[0]

    t_neg_clf2=0.467#classifier_thresholds_clf2[model_name][classifier_thresholds_clf2[model_name].max_for_clf2==max_for_clf2].t_neg_clf2.values[0]
    t_pos_clf2=0.843#classifier_thresholds_clf2[model_name][classifier_thresholds_clf2[model_name].max_fdr_clf2==max_fdr_clf2].t_pos_clf2.values[0]

    # Keep only rows that have a test_iter value. The explicit copy makes the
    # filtered frame independent of its parent, so the column assignments below
    # are not chained assignments on a slice (pandas SettingWithCopyWarning).
    df_model = df_dict[model_name]
    df_model = df_model[df_model.test_iter.notnull()].copy()
    df_dict[model_name] = df_model

    # True label: every segment other than 2 and 3 is pooled into '4+'.
    df_model['y'] = df_model.amount_segment.apply(
        lambda seg: '2' if seg == 2 else ('3' if seg == 3 else '4+'))

    # CLF1 separates '2 or 3' from '4+'.
    df_model['y_pred_clf1'] = df_model['y_pred_proba_clf1'].apply(
        lambda proba: '2 or 3' if proba <= t_neg_clf1 else '4+')

    # CLF2 separates 2 from 3; probabilities between the two thresholds get the
    # uncertain labels '2?' (<= 0.5) and '3?' (> 0.5).
    df_model['y_pred_clf2'] = df_model['y_pred_proba_clf2'].apply(
        lambda proba: '2' if proba <= t_neg_clf2 else
                      ('2?' if proba <= 0.5 else
                       ('3?' if proba <= t_pos_clf2 else '3')))

    # Combined prediction: CLF1's '4+' wins, otherwise take CLF2's label.
    # Vectorized replacement for the former row-wise apply(axis=1).
    df_model['y_pred'] = df_model['y_pred_clf2'].where(df_model['y_pred_clf1'] != '4+', '4+')

    # '2?' / '3?' never occur as true labels, so their rows are dropped before display.
    df_confusion_comb = pd.DataFrame(
        confusion_matrix(df_model.y, df_model.y_pred, labels=combined_labels),
        index=true_index, columns=pred_columns)
    print('Abs. Numbers')
    display(df_confusion_comb.drop(index=['true 2?', 'true 3?']))

    df_confusion_comb = pd.DataFrame(
        confusion_matrix(df_model.y, df_model.y_pred, labels=combined_labels, normalize="true"),
        index=true_index, columns=pred_columns)
    print('Rel. to true count per class in %')
    display((df_confusion_comb.drop(index=['true 2?', 'true 3?']) * 100).round(1))

    print('Classification Report')
    print(classification_report(df_model.y, df_model.y_pred, labels=combined_labels, zero_division=0))

In [None]:
# The three reported segments, in the order used for every table and report below.
segment_labels = ['2', '3', '4+']
true_index = ['true ' + label for label in segment_labels]
pred_columns = ['pred ' + label for label in segment_labels]

# The uncertain predictions '2?' / '3?' are folded into '2' / '3'.
segment_by_prediction = {'2': '2', '2?': '2', '3': '3', '3?': '3', '4+': '4+'}
# amount_segment values 4..12 are pooled into '4+'; values outside 2..12 map to NaN.
segment_by_amount_segment = {2: '2', 3: '3', **{seg: '4+' for seg in range(4, 13)}}

for feature_group_combination in feature_group_combinations:
    model_name="_".join(feature_group_combination)
    print('Make prediction for {}'.format(model_name))

    df_model = df_dict[model_name]
    df_model['predicted_segment'] = df_model.y_pred.map(segment_by_prediction)
    df_model['segment'] = df_model.amount_segment.map(segment_by_amount_segment)

    cm = pd.DataFrame(confusion_matrix(df_model.segment, df_model.predicted_segment, labels=segment_labels),
                      index=true_index,
                      columns=pred_columns)

    display(cm)

    cm_norm = pd.DataFrame(confusion_matrix(df_model.segment, df_model.predicted_segment, labels=segment_labels, normalize='true'),
                           index=true_index,
                           columns=pred_columns)

    display(cm_norm)

    # Pass `labels` explicitly so that the report rows are tied to the same
    # classes as the confusion matrices, and name the third class '4+' (it was
    # previously printed as '4').
    print(classification_report(df_model.segment, df_model.predicted_segment,
                                labels=segment_labels, target_names=segment_labels))

# Regression scores on the true segment (reg2 on true segment 2, reg3 on true segment 3)

In [None]:
# Sanity check: print (rows, columns) of every per-model frame.
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    frame_shape = df_dict[model_name].shape
    print(frame_shape)

In [None]:
# First rows of the 'location' model frame with amount_segment == 2 and test_iter > 0.
location_frame = df_dict['location']
location_frame[(location_frame.amount_segment == 2) & (location_frame.test_iter > 0)].head()

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Regression score for {}'.format(model_name))

    df_model = df_dict[model_name]

    # Per true segment: copy the rows and add signed / absolute discrepancies for
    # the plain and the logit regression.
    # NOTE(review): the *_logit predictions are subtracted from `amount` directly,
    # so they are presumably already on the amount scale -- confirm.
    residual_frames = {}
    for seg in (2, 3):
        seg_frame = df_model[df_model.amount_segment == seg].copy()
        for variant in ('', '_logit'):
            residual = seg_frame['amount'] - seg_frame['y_pred_reg{}{}'.format(seg, variant)]
            seg_frame['discr' + variant] = residual
            seg_frame['discr' + variant + '_abs'] = abs(residual)
        residual_frames[seg] = seg_frame
    df_reg2 = residual_frames[2]
    df_reg3 = residual_frames[3]

    # min / mean / max of every discrepancy column, one row per regression variant.
    score_suffixes = ('', '_abs', '_logit', '_logit_abs')
    scores = pd.DataFrame(columns=('min', 'mean', 'max'),
                          index=tuple('reg{}{}'.format(seg, suffix)
                                      for seg in (2, 3) for suffix in score_suffixes))
    for seg, seg_frame in residual_frames.items():
        for suffix in score_suffixes:
            values = seg_frame['discr' + suffix]
            scores.loc['reg{}{}'.format(seg, suffix)] = [np.min(values), np.mean(values), np.max(values)]

    display(scores)

    # Histograms of the signed discrepancies: plain first, then logit, per segment.
    for seg, seg_frame in residual_frames.items():
        for column, title_template in (('discr', 'Regression {} Discrepancy'),
                                       ('discr_logit', 'Regression {} Logit Discrepancy')):
            plt.hist(seg_frame[column], bins=100)
            plt.title(title_template.format(seg))
            plt.show()

# Regression Segment 2 and 3 (Review combined result)

## Calculate discrepancy

Keep regression results only for the matching predicted class (reg2 for predicted '2' and '2?', reg3 for predicted '3' and '3?'); the regression predictions of all other rows are set to NaN.

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Combined regression score for {}'.format(model_name))

    # Alias of the stored frame: every assignment below modifies df_dict[model_name] in place.
    df_model = df_dict[model_name]

    # Blank out a regression's output wherever the combined prediction points to
    # a different segment.
    df_model.loc[df_model['y_pred'].isin(['3','3?','4+']), ['y_pred_reg2','y_pred_reg2_logit']] = np.nan
    df_model.loc[df_model['y_pred'].isin(['2','2?','4+']), ['y_pred_reg3','y_pred_reg3_logit']] = np.nan

    # Signed and absolute discrepancy for each regression (plain and logit).
    for seg in ('2', '3'):
        for variant in ('', '_logit'):
            residual = df_model.amount - df_model['y_pred_reg' + seg + variant]
            df_model['discr_reg' + seg + variant] = residual
            df_model['discr_reg' + seg + variant + '_abs'] = abs(residual)

    # Two score tables: confident predictions only, then confident + uncertain ones.
    score_suffixes = ('', '_abs', '_logit', '_logit_abs')
    score_rows = tuple('reg' + seg + suffix for seg in ('2', '3') for suffix in score_suffixes)
    class_selections = (
        ("Only for classes 2 and 3", {'2': ['2'], '3': ['3']}),
        ("For classes 2, 2?,3? and 3", {'2': ['2', '2?'], '3': ['3', '3?']}),
    )
    for heading, classes_by_seg in class_selections:
        print(heading)

        scores = pd.DataFrame(columns=('min', 'mean', 'max'), index=score_rows)
        for seg, predicted_classes in classes_by_seg.items():
            selected = df_model[df_model.y_pred.isin(predicted_classes)]
            for suffix in score_suffixes:
                values = selected['discr_reg' + seg + suffix]
                scores.loc['reg' + seg + suffix] = [np.min(values), np.mean(values), np.max(values)]

        display(scores)

    # Histograms per segment, in this order: plain then logit discrepancy; for
    # each, all rows then only rows with absolute discrepancy <= 100; and within
    # that, confident class only then confident + uncertain class.
    for seg in ('2', '3'):
        class_choices = (
            ([seg], '"{}"'.format(seg)),
            ([seg, seg + '?'], '"{0}" and "{0}?"'.format(seg)),
        )
        for variant, title_prefix in (('', ''), ('_logit', '(Logit) ')):
            discr_column = 'discr_reg' + seg + variant
            for only_small in (False, True):
                if only_small:
                    title_tail = ' where absolute discr. <= 100'
                elif variant:
                    # The unfiltered logit titles end with a space in the original notebook.
                    title_tail = ' '
                else:
                    title_tail = ''
                for predicted_classes, class_text in class_choices:
                    row_mask = df_model.y_pred.isin(predicted_classes)
                    if only_small:
                        row_mask = row_mask & (df_model[discr_column + '_abs'] <= 100)
                    plt.hist(df_model[row_mask][discr_column], bins=100)
                    plt.title(title_prefix + 'Discrepancy for predicted seg. ' + class_text + title_tail)
                    plt.show()



# Feature Importance

## Feature importance

### CLF1

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for CLF1 for {}'.format(model_name))

    explainer_clf1 = shap.TreeExplainer(clf1_models[model_name])

    # CLF1 is explained on every row of the model frame.
    X_shap = df_dict[model_name][model_features[model_name]]
    shap_values_clf1 = explainer_clf1.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_clf1, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

### Feature importance CLF2

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for CLF2 for {}'.format(model_name))

    explainer_clf2 = shap.TreeExplainer(clf2_models[model_name])

    # CLF2 is explained only on rows that have a y_clf2 label.
    df_model = df_dict[model_name]
    X_shap = df_model[~df_model.y_clf2.isnull()][model_features[model_name]]
    shap_values_clf2 = explainer_clf2.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_clf2, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

### Feature Importance Reg2

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for REG2 for {}'.format(model_name))

    explainer_reg2 = shap.TreeExplainer(reg2_models[model_name])

    # REG2 is explained on the rows whose true segment is 2.
    df_model = df_dict[model_name]
    X_shap = df_model[df_model.amount_segment == 2][model_features[model_name]]
    shap_values_reg2 = explainer_reg2.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_reg2, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for REG2 LOGIT for {}'.format(model_name))

    explainer_reg2_logit = shap.TreeExplainer(reg2_logit_models[model_name])

    # The logit variant of REG2 is explained on the rows whose true segment is 2.
    df_model = df_dict[model_name]
    X_shap = df_model[df_model.amount_segment == 2][model_features[model_name]]
    shap_values_reg2_logit = explainer_reg2_logit.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_reg2_logit, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

### Feature Importance reg3

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for REG3 for {}'.format(model_name))

    explainer_reg3 = shap.TreeExplainer(reg3_models[model_name])

    # REG3 is explained on the rows whose true segment is 3.
    df_model = df_dict[model_name]
    X_shap = df_model[df_model.amount_segment == 3][model_features[model_name]]
    shap_values_reg3 = explainer_reg3.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_reg3, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

In [None]:
for feature_group_combination in feature_group_combinations:
    model_name = "_".join(feature_group_combination)
    print('Plot feature importance for REG3 Logit for {}'.format(model_name))

    explainer_reg3_logit = shap.TreeExplainer(reg3_logit_models[model_name])

    # The logit variant of REG3 is explained on the rows whose true segment is 3.
    df_model = df_dict[model_name]
    X_shap = df_model[df_model.amount_segment == 3][model_features[model_name]]
    shap_values_reg3_logit = explainer_reg3_logit.shap_values(X_shap)

    # Default summary plot first, then the bar variant.
    for extra_plot_args in ({}, {'plot_type': 'bar'}):
        shap.summary_plot(shap_values_reg3_logit, features=X_shap,
                          feature_names=X_shap.columns, **extra_plot_args)

# Cross-validation

In [None]:
# NOTE(review): TimeSeriesSplit splits by row position, so this presumably relies
# on the rows of `df` being in chronological order -- confirm.
cv_model_eval = TimeSeriesSplit(n_splits=5)

# Rebuilt from `df`: any per-model frames previously stored in df_dict are discarded.
df_dict = {}
cv_results_clf1 = {}
cv_results_clf2 = {}
cv_results_reg2 = {}
cv_results_reg2_logit = {}
cv_results_reg3 = {}
cv_results_reg3_logit = {}

# Settings shared by all classifier / all regressor cross-validations.
clf_cv_settings = dict(scoring=['accuracy', 'roc_auc'], cv=cv_model_eval, return_train_score=True)
reg_cv_settings = dict(scoring=['neg_mean_squared_error', 'r2'], cv=cv_model_eval, return_train_score=True)

for feature_group_combination in tqdm(feature_group_combinations):
    model_name = "_".join(feature_group_combination)
    print('Crossvalidate models for {}'.format(model_name))

    feature_columns = model_features[model_name]

    # Features plus targets; rows with a missing feature value are dropped.
    df_model = df[feature_columns + ['amount','amount_segment','y_clf1','y_clf2']].dropna(subset=feature_columns).copy()
    df_dict[model_name] = df_model

    # Not read anywhere in this cell; kept because the original cell assigns it.
    X_test = df_model[feature_columns]

    # CLF1 uses every row.
    X_train_clf1 = df_model[feature_columns]
    y_train_clf1 = df_model.y_clf1

    # CLF2 uses only rows that have a y_clf2 label (segments 2 and 3).
    df_train_clf2 = df_model.dropna(subset=['y_clf2'])
    X_train_clf2 = df_train_clf2[feature_columns]
    y_train_clf2 = df_train_clf2.y_clf2.astype(int)

    amount = df_model['amount']
    segment = df_model['amount_segment']

    # REG2: segment 2 with 0.00001 <= amount <= 49.9999, which keeps amount/50
    # strictly inside (0, 1) for the logit transform.
    df_train_reg2 = df_model[(segment == 2) & (amount >= 0.00001) & (amount <= 49.9999)]
    X_train_reg2 = df_train_reg2[feature_columns]
    y_train_reg2 = df_train_reg2.amount
    y_train_reg2_logit = (y_train_reg2 / 50).apply(logit)

    # REG3: segment 3 with 50 < amount <= 99.9999, which keeps (amount-50)/50
    # strictly inside (0, 1) for the logit transform.
    df_train_reg3 = df_model[(segment == 3) & (amount > 50) & (amount <= 99.9999)]
    X_train_reg3 = df_train_reg3[feature_columns]
    y_train_reg3 = df_train_reg3.amount
    y_train_reg3_logit = ((y_train_reg3 - 50) / 50).apply(logit)

    # Same evaluation order as before: classifiers, plain regressions, logit regressions.
    cv_results_clf1[model_name] = cross_validate(clf1_models[model_name], X_train_clf1, y_train_clf1, **clf_cv_settings)
    cv_results_clf2[model_name] = cross_validate(clf2_models[model_name], X_train_clf2, y_train_clf2, **clf_cv_settings)
    cv_results_reg2[model_name] = cross_validate(reg2_models[model_name], X_train_reg2, y_train_reg2, **reg_cv_settings)
    cv_results_reg3[model_name] = cross_validate(reg3_models[model_name], X_train_reg3, y_train_reg3, **reg_cv_settings)
    cv_results_reg2_logit[model_name] = cross_validate(reg2_logit_models[model_name], X_train_reg2, y_train_reg2_logit, **reg_cv_settings)
    cv_results_reg3_logit[model_name] = cross_validate(reg3_logit_models[model_name], X_train_reg3, y_train_reg3_logit, **reg_cv_settings)

In [None]:

# Dump the raw cross_validate() result dict of every estimator, model by model.
for feature_group_combination in tqdm(feature_group_combinations):
    model_name = "_".join(feature_group_combination)
    print('Cross-validation results for {}'.format(model_name))

    for results_by_model in (cv_results_clf1, cv_results_clf2,
                             cv_results_reg2, cv_results_reg2_logit,
                             cv_results_reg3, cv_results_reg3_logit):
        print(results_by_model[model_name])


In [None]:
columns_list_clf = ['Model', 'Feature_Combination',
                    'Mean_Train_Accuracy', 'Std_Train_Accuracy',
                    'Mean_Test_Accuracy', 'Std_Test_Accuracy',
                    'Mean_Train_AUC', 'Std_Train_AUC',
                    'Mean_Test_AUC', 'Std_Test_AUC']

# The NMSE columns hold scikit-learn's 'neg_mean_squared_error' score (negated MSE).
columns_list_reg = ['Model', 'Feature_Combination',
                    'Mean_Train_NMSE', 'Std_Train_NMSE',
                    'Mean_Test_NMSE', 'Std_Test_NMSE',
                    'Mean_Train_R2', 'Std_Train_R2',
                    'Mean_Test_R2', 'Std_Test_R2']


def _cv_summary_row(model_label, feature_combination, cv_result, metrics):
    """Summarise one cross_validate() result as a flat table row.

    Parameters
    ----------
    model_label : str
        Value for the 'Model' column (e.g. 'clf1', 'reg2_logit').
    feature_combination : str
        Value for the 'Feature_Combination' column.
    cv_result : dict
        Result of cross_validate(..., return_train_score=True); must contain
        'train_<metric>' and 'test_<metric>' for every metric.
    metrics : sequence of str
        Scorer names, in column order.

    Returns
    -------
    list
        [model_label, feature_combination] followed, per metric, by the mean and
        std of the train scores and then the mean and std of the test scores.
    """
    row = [model_label, feature_combination]
    for metric in metrics:
        for split in ('train', 'test'):
            fold_scores = cv_result['{}_{}'.format(split, metric)]
            row.extend([np.mean(fold_scores), np.std(fold_scores)])
    return row


# Collect plain rows and build each frame once. Growing a DataFrame with
# DataFrame.append inside the loop copies the frame on every call and no longer
# works on pandas >= 2.0, where DataFrame.append was removed.
clf_rows = []
reg_rows = []

for feature_group_combination in tqdm(feature_group_combinations):
    model_name = "_".join(feature_group_combination)

    for model_label, results_by_model in (('clf1', cv_results_clf1),
                                          ('clf2', cv_results_clf2)):
        clf_rows.append(_cv_summary_row(model_label, model_name, results_by_model[model_name],
                                        ('accuracy', 'roc_auc')))

    for model_label, results_by_model in (('reg2', cv_results_reg2),
                                          ('reg2_logit', cv_results_reg2_logit),
                                          ('reg3', cv_results_reg3),
                                          ('reg3_logit', cv_results_reg3_logit)):
        reg_rows.append(_cv_summary_row(model_label, model_name, results_by_model[model_name],
                                        ('neg_mean_squared_error', 'r2')))

cv_results_clf = pd.DataFrame(clf_rows, columns=columns_list_clf)
cv_results_reg = pd.DataFrame(reg_rows, columns=columns_list_reg)

display(cv_results_clf)
display(cv_results_reg)

# Grid Search

In [None]:
from pathlib import Path
from sklearn.model_selection import GridSearchCV
import pickle
# Per-model modelling frames, keyed by model name; filled in by the grid-search helpers.
df_dict = {}


def grid_search_model_clf1(feature_group_combination):
    """Grid-search XGBoost hyper-parameters for the ``y_clf1`` target.

    Parameters
    ----------
    feature_group_combination : sequence of str
        Names of the feature groups that make up the model. They are joined
        with ``'_'`` to obtain the model key used to index ``model_features``
        and ``df_dict``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object. ``refit=False`` is passed, so only
        ``cv_results_`` is meaningful; no best estimator is refitted.

    Side effects
    ------------
    Stores the frame used for the search in the module-level
    ``df_dict[model_name]`` and prints progress plus the raw ``cv_results_``.
    """
    print('Grid Search model for: ', " and ".join(feature_group_combination))

    # Derive the key from the argument. The previous code read a global
    # `model_name` left over from an earlier loop, so every combination was
    # searched on the same feature set (and a fresh kernel raised NameError).
    # NOTE(review): assumes `model_features` uses the same '_'.join(...) keys as
    # `gs_results` and the cache file names -- confirm against the cell that
    # builds `model_features`.
    model_name = '_'.join(feature_group_combination)
    feature_columns = model_features[model_name]

    # Keep only rows where every feature of this model is available.
    df_dict[model_name] = (
        df[feature_columns + ['amount', 'amount_segment', 'y_clf1', 'y_clf2']]
        .dropna(subset=feature_columns)
        .copy()
    )

    X = df_dict[model_name][feature_columns]
    y = df_dict[model_name].y_clf1

    scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

    parameters = {
        'max_depth': [6, 7, 8, 9],  # [default=6]
        'n_estimators': [500, 700, 900, 1100],  # [default=100]
        'tree_method': ['gpu_hist'],
        'use_label_encoder': [False],
        'objective': ['binary:logistic'],
        'eval_metric': ['error']
    }

    xgb_model = xgb.XGBClassifier()
    # TimeSeriesSplit cuts folds by row position.
    # NOTE(review): this presumes `df` is sorted chronologically -- confirm.
    cv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(xgb_model, parameters, n_jobs=4, cv=cv, scoring=scoring,
                               refit=False, verbose=2, return_train_score=True)
    grid_search.fit(X, y)

    print(grid_search.cv_results_)

    return grid_search


# Run (or reload from the pickle cache) the clf1 grid search for every feature
# group combination and keep a tidy per-combination summary frame in gs_results.
gs_results = {}

# cv_results_ entries copied verbatim into the summary frame, in display order.
cv_metric_columns = (
    'mean_test_AUC', 'std_test_AUC', 'mean_train_AUC', 'std_train_AUC', 'rank_test_AUC',
    'mean_test_Accuracy', 'std_test_Accuracy', 'rank_test_Accuracy',
    'mean_train_Accuracy', 'std_train_Accuracy',
)

for feature_group_combination in feature_group_combinations:
    combination_key = '_'.join(feature_group_combination)
    pkl_filename = Path('./grid_search/' + combination_key + "_grid_search_clf1.pkl")

    if pkl_filename.exists():
        print('Load Grid Search results for clf1 model ' + combination_key)

        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(pkl_filename, 'rb') as file:
            gs_result = pickle.load(file)
    else:
        print('Grid Search for clf1 model ' + combination_key)
        # Create the cache directory *before* the expensive search so a missing
        # './grid_search/' folder cannot make the final dump fail and waste the run.
        pkl_filename.parent.mkdir(parents=True, exist_ok=True)
        gs_result = grid_search_model_clf1(feature_group_combination)

        with open(pkl_filename, 'wb') as file:
            pickle.dump(gs_result, file)

    cv_summary = gs_result.cv_results_
    # One row per parameter combination, parameter values first.
    df_result = pd.DataFrame(cv_summary['params'])
    for metric_column in cv_metric_columns:
        df_result[metric_column] = cv_summary[metric_column]
    # Total mean time per candidate = fitting + scoring.
    df_result['mean_time'] = cv_summary['mean_fit_time'] + cv_summary['mean_score_time']

    gs_results[combination_key] = df_result


print(gs_results['location'])

In [None]:
# Summarise, per feature group combination, the best parameters by accuracy and
# the best parameters by AUC found in gs_results.
best_param_columns = ['max_depth_Acc', 'n_estimators_Acc', 'Acc_score',
                      'max_depth_AUC', 'n_estimators_AUC', 'AUC_score']
best_param_rows = {}

for feature_group_combination in feature_group_combinations:
    combination_key = '_'.join(feature_group_combination)
    result = gs_results[combination_key]

    # First candidate ranked 1 on accuracy (ties keep the first row).
    best_by_accuracy = result.loc[
        result.rank_test_Accuracy == 1,
        ['max_depth', 'n_estimators', 'mean_test_Accuracy']].values[0].tolist()
    # First candidate ranked 1 on AUC. This previously filtered on
    # rank_test_Accuracy, so the *_AUC columns merely repeated the
    # best-accuracy parameters and AUC_score was not the best AUC.
    best_by_auc = result.loc[
        result.rank_test_AUC == 1,
        ['max_depth', 'n_estimators', 'mean_test_AUC']].values[0].tolist()

    best_param_rows[combination_key] = best_by_accuracy + best_by_auc

# Build the frame once instead of enlarging an empty frame row by row.
best_param = pd.DataFrame.from_dict(best_param_rows, orient='index', columns=best_param_columns)

print(best_param)

In [None]:
def grid_search_model_clf2(feature_group_combination):
    """Grid-search XGBoost hyper-parameters for the ``y_clf2`` target.

    Parameters
    ----------
    feature_group_combination : sequence of str
        Names of the feature groups that make up the model. They are joined
        with ``'_'`` to obtain the model key used to index ``model_features``
        and ``df_dict``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object. ``refit=False`` is passed, so only
        ``cv_results_`` is meaningful; no best estimator is refitted.

    Side effects
    ------------
    Stores the frame used for the search in the module-level
    ``df_dict[model_name]`` and prints progress plus the raw ``cv_results_``.
    """
    print('Grid Search model for: ', " and ".join(feature_group_combination))

    # Derive the key from the argument. The previous code read a global
    # `model_name` left over from an earlier loop, so every combination was
    # searched on the same feature set (and a fresh kernel raised NameError).
    # NOTE(review): assumes `model_features` uses the same '_'.join(...) keys as
    # `gs_results` and the cache file names -- confirm against the cell that
    # builds `model_features`.
    model_name = '_'.join(feature_group_combination)
    feature_columns = model_features[model_name]

    # Keep only rows where every feature is available and the y_clf2 label is set.
    df_dict[model_name] = (
        df[feature_columns + ['amount', 'amount_segment', 'y_clf1', 'y_clf2']]
        .dropna(subset=feature_columns)
        .copy()
        .dropna(subset=['y_clf2'])
    )

    X = df_dict[model_name][feature_columns]
    y = df_dict[model_name].y_clf2

    scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

    parameters = {
        'max_depth': [6, 7, 8, 9],  # [default=6]
        'n_estimators': [500, 700, 900, 1100],  # [default=100]
        'tree_method': ['gpu_hist'],
        'use_label_encoder': [False],
        'objective': ['binary:logistic'],
        'eval_metric': ['error']
    }

    xgb_model = xgb.XGBClassifier()
    # TimeSeriesSplit cuts folds by row position.
    # NOTE(review): this presumes `df` is sorted chronologically -- confirm.
    cv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(xgb_model, parameters, n_jobs=4, cv=cv, scoring=scoring,
                               refit=False, verbose=2, return_train_score=True)
    grid_search.fit(X, y)

    print(grid_search.cv_results_)

    return grid_search


# Run (or reload from the pickle cache) the clf2 grid search for every feature
# group combination and keep a tidy per-combination summary frame in gs_results.
# Note that this rebinds gs_results, replacing whatever an earlier cell stored there.
gs_results = {}

# cv_results_ entries copied verbatim into the summary frame, in display order.
cv_metric_columns = (
    'mean_test_AUC', 'std_test_AUC', 'mean_train_AUC', 'std_train_AUC', 'rank_test_AUC',
    'mean_test_Accuracy', 'std_test_Accuracy', 'rank_test_Accuracy',
    'mean_train_Accuracy', 'std_train_Accuracy',
)

for feature_group_combination in feature_group_combinations:
    combination_key = '_'.join(feature_group_combination)
    pkl_filename = Path('./grid_search/' + combination_key + "_grid_search_clf2.pkl")

    if pkl_filename.exists():
        print('Load Grid Search results for clf2 model ' + combination_key)

        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(pkl_filename, 'rb') as file:
            gs_result = pickle.load(file)
    else:
        print('Grid Search for clf2 model ' + combination_key)
        # Create the cache directory *before* the expensive search so a missing
        # './grid_search/' folder cannot make the final dump fail and waste the run.
        pkl_filename.parent.mkdir(parents=True, exist_ok=True)
        gs_result = grid_search_model_clf2(feature_group_combination)

        with open(pkl_filename, 'wb') as file:
            pickle.dump(gs_result, file)

    cv_summary = gs_result.cv_results_
    # One row per parameter combination, parameter values first.
    df_result = pd.DataFrame(cv_summary['params'])
    for metric_column in cv_metric_columns:
        df_result[metric_column] = cv_summary[metric_column]
    # Total mean time per candidate = fitting + scoring.
    df_result['mean_time'] = cv_summary['mean_fit_time'] + cv_summary['mean_score_time']

    gs_results[combination_key] = df_result


print(gs_results['location'])

In [None]:
# Summarise, per feature group combination, the best parameters by accuracy and
# the best parameters by AUC found in gs_results.
best_param_columns = ['max_depth_Acc', 'n_estimators_Acc', 'Acc_score',
                      'max_depth_AUC', 'n_estimators_AUC', 'AUC_score']
best_param_rows = {}

for feature_group_combination in feature_group_combinations:
    combination_key = '_'.join(feature_group_combination)
    result = gs_results[combination_key]

    # First candidate ranked 1 on accuracy (ties keep the first row).
    best_by_accuracy = result.loc[
        result.rank_test_Accuracy == 1,
        ['max_depth', 'n_estimators', 'mean_test_Accuracy']].values[0].tolist()
    # First candidate ranked 1 on AUC. This previously filtered on
    # rank_test_Accuracy, so the *_AUC columns merely repeated the
    # best-accuracy parameters and AUC_score was not the best AUC.
    best_by_auc = result.loc[
        result.rank_test_AUC == 1,
        ['max_depth', 'n_estimators', 'mean_test_AUC']].values[0].tolist()

    best_param_rows[combination_key] = best_by_accuracy + best_by_auc

# Build the frame once instead of enlarging an empty frame row by row.
best_param = pd.DataFrame.from_dict(best_param_rows, orient='index', columns=best_param_columns)

print(best_param)