In [None]:
# Enable autoreload
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, TimeSeriesSplit
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy.special import logit, expit
from sklearn.metrics import confusion_matrix, classification_report
import shap
shap.initjs()

# Show every column and up to 200 rows when a frame is displayed.
# The fully qualified key is required: the bare 'max_columns' pattern matches
# more than one option in newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
tqdm.pandas()

# Import pickle file with features

In [None]:
# Load the feature table from a bz2-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('./data/export_features_2016_2018.pkl.bz2')

In [None]:
# Keep an independent copy of the full data set so it can be restored later.
df_orig = df.copy() # save all data for later

In [None]:
# Rebind df to the saved full data set. This is an alias, not a copy:
# any in-place change to df before it is rebound would also change df_orig.
df=df_orig # get all data back

In [None]:
# Suffixes of the summary-statistic columns: count/mean/std/min, the 5%..95%
# percentiles in steps of 5, and max.
_stat_names = (['count', 'mean', 'std', 'min']
               + ['{}%'.format(pct) for pct in range(5, 100, 5)]
               + ['max'])

# Calendar columns.
_date_features = ['vg_datum_year', 'vg_datum_month', 'vg_datum_day_of_week']

# Suffixes of the 'location_*' keyword columns.
_location_keywords = [
    'kirche', 'theater', 'hotel', 'cafe',
    'stadthalle', 'buergerhaus', 'club', 'gaststaette',
    'halle', 'schloss', 'festhalle', 'musikschule',
    'restaurant', 'kulturzentrum', 'kurhaus',
    'festzelt', 'mehrzweckhalle', 'pub',
    'gasthaus', 'bar', 'turnhalle', 'klinik',
    'gymnasium', 'kulturhaus', 'rathaus', 'gasthof',
    'park', 'schuetzenhalle', 'hochschule', 'gemeindehalle',
]

# Suffixes of the 'band_*' keyword columns.
_band_keywords = [
    'musikverein', 'band', 'mv', 'duo', 'trio', 'musikkapelle',
    'chor', 'blaskapelle', 'stadtkapelle', 'gbr', 'orchester',
    'jazz', 'blasorchester', 'original', 'partyband', 'kurorchester',
    'friends', 'ensemble', 'blues', 'ev', 'swing', 'live',
    'musikzug', 'solo', 'sound', 'jugendkapelle', 'alleinunterhalter',
    'musikanten', 'harmonie', 'spielmannszug',
]

# Final feature list, in the order: location/band/promoter statistics,
# calendar columns, location keywords, band keywords.
all_features = (
    ['{}_{}'.format(entity, stat)
     for entity in ('location', 'band', 'promoter')
     for stat in _stat_names]
    + _date_features
    + ['location_' + keyword for keyword in _location_keywords]
    + ['band_' + keyword for keyword in _band_keywords]
)

Drop segments 1 and 13, since data in those segments is not relevant for us

Drop data before 2014, since data before 2014 is noisy

In [None]:
# Keep the two target columns plus the model features, excluding segments 1 and 13.
kept_columns = ['amount', 'amount_segment'] + all_features
in_relevant_segment = ~df['amount_segment'].isin([1, 13])
df = df.loc[in_relevant_segment, kept_columns]

# Keep events from 2014 onwards and rows that have a segment.
df = df[df['vg_datum_year'] >= 2014]
df = df.dropna(subset=['amount_segment'])
df.info()

# Get sample of the data

In [None]:
# Draw a 1% sample that is stratified by segment.
splitSample = StratifiedShuffleSplit(n_splits=1, test_size=0.01, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair; the test part is the sample.
train_idx, test_idx = next(splitSample.split(df[all_features], df.amount_segment))
df_sample = df.iloc[test_idx]

# Compare the segment distribution of the full frame (red) and the sample (green).
plt.figure()
for frame, bar_color in ((df, 'r'), (df_sample, 'g')):
    frame.amount_segment.astype(int).value_counts().sort_index().plot.bar(color=bar_color)

plt.title('Inkasso-Segment')
plt.legend(['Full DF', 'Sample DF'])
plt.show()

# Continue with an independent copy of the sample only.
df = df_sample.copy()
df.info()

# Define Models

In [None]:
# Classification: both classifiers share the same hyper-parameters.
classifier_params = dict(n_estimators=500, max_depth=7, use_label_encoder=False,
                         objective='binary:logistic', eval_metric='error')
clf1 = xgb.XGBClassifier(**classifier_params)
clf2 = xgb.XGBClassifier(**classifier_params)

# Regression: one plain and one logit-target regressor each for segment 2 and segment 3,
# all four with the same hyper-parameters.
regressor_params = dict(n_estimators=700, max_depth=7, min_child_weight=5,
                        objective='reg:squarederror')
reg2 = xgb.XGBRegressor(**regressor_params)
reg2_logit = xgb.XGBRegressor(**regressor_params)

reg3 = xgb.XGBRegressor(**regressor_params)
reg3_logit = xgb.XGBRegressor(**regressor_params)

# Prepare DataFrame for Classification Model

We keep only features + 'amount' and 'amount_segment' columns.

Encode segments for Classifier 1:
* Variable y_clf1 
* Positive class (seg 4+) y_clf1=1
* Negative class (seg 2 or 3) y_clf1=0

Encode segments for Classifier 2: 
* Variable y_clf2
* Positive class (seg 3) y_clf2=1
* Negative class (seg 2) y_clf2=0
* Segments >3 are "encoded" as np.NaN. These NaN values will be dropped before training

In [None]:
# clf1 target: 1 for segment 4 and above, 0 for segments 2 and 3.
is_segment_4_plus = df['amount_segment'].astype(int) > 3
df.loc[:, 'y_clf1'] = is_segment_4_plus.values.astype(int)

# clf2 target: 1 for segment 3, 0 for segment 2, NaN for every other segment.
df.loc[:, 'y_clf2'] = df['amount_segment'].map(
    lambda seg: 1 if seg == 3 else (0 if seg == 2 else np.nan)
)

# Train and Test Classification Models

In order to have larger train and test sets, we use a cross-validation-like approach to verify the model performance.
1. We split the dataset into 5 folds with TimeSeriesSplit (the earlier StratifiedKFold variant, which kept the same proportion of each segment in every fold, is left commented out in the code). Each fold trains on the rows that precede its test block.
2. In each fold we train and test both classifiers independently from each other
3. We save the predict_proba results from both classifiers
4. Train and test iteration numbers are also saved (in case we want to evaluate the results based on the train/test iteration).

In [None]:
# Walk-forward evaluation: 5 TimeSeriesSplit folds over the rows of df in their current order.
cv_model_eval = TimeSeriesSplit(n_splits=5)

# Columns for the out-of-fold predictions and for the fold a row was last used in.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for new_column in ['y_pred_proba_clf1', 'y_pred_proba_clf2',
                   'y_pred_reg2', 'y_pred_reg2_logit',
                   'y_pred_reg3', 'y_pred_reg3_logit',
                   'train_iter', 'test_iter']:
    df[new_column] = np.nan


def _segment2_logit_rows(frame):
    """Return segment-2 rows with 0.00001 <= amount <= 49.9999, so that amount/50 is a valid logit input."""
    in_range = ((frame['amount_segment'] == 2) &
                (frame['amount'] >= 0.00001) &
                (frame['amount'] <= 49.9999))
    return frame[in_range]


def _segment3_logit_rows(frame):
    """Return segment-3 rows with 50 < amount <= 99.9999, so that (amount-50)/50 is a valid logit input."""
    in_range = ((frame['amount_segment'] == 3) &
                (frame['amount'] > 50) &
                (frame['amount'] <= 99.9999))
    return frame[in_range]


def _score_or_nan(model, X, y):
    """Return model.score(X, y), or NaN when the evaluation subset has no rows."""
    if len(X) == 0:
        return np.nan
    return model.score(X, y)


for iter_nr, (train_idx, test_idx) in enumerate(
        tqdm(cv_model_eval.split(df[all_features], df.amount_segment), total=cv_model_eval.n_splits)):

    df_train = df.iloc[train_idx]
    df_test = df.iloc[test_idx]

    # Test dataset is the same for all models
    X_test = df_test[all_features]
    y_test_clf1 = df_test.y_clf1
    y_test_clf2 = df_test.y_clf2

    # Classifier 1 is trained on every training row.
    X_train_clf1 = df_train[all_features]
    y_train_clf1 = df_train.y_clf1

    # Classifier 2 only sees segments 2 and 3 (rows where y_clf2 is not NaN).
    df_train_clf2 = df_train.dropna(subset=['y_clf2'])
    df_test_clf2 = df_test.dropna(subset=['y_clf2'])
    X_train_clf2 = df_train_clf2[all_features]
    y_train_clf2 = df_train_clf2.y_clf2

    # Segment-2 regressors: both the plain and the logit model train on the logit-safe rows.
    df_train_reg2 = _segment2_logit_rows(df_train)
    X_train_reg2 = df_train_reg2[all_features]
    y_train_reg2 = df_train_reg2.amount
    y_train_reg2_logit = logit(y_train_reg2 / 50)

    # Segment-3 regressors: same scheme, with the amount shifted from (50, 100) to (0, 50) first.
    df_train_reg3 = _segment3_logit_rows(df_train)
    X_train_reg3 = df_train_reg3[all_features]
    y_train_reg3 = df_train_reg3.amount
    y_train_reg3_logit = logit((y_train_reg3 - 50) / 50)

    # Fit and test the models.
    # .score() is accuracy for the classifiers and R^2 for the regressors (higher is better),
    # so the printed labels say so instead of "Error".

    # Classifier 1
    clf1.fit(X_train_clf1, y_train_clf1)
    y_pred_proba_clf1 = clf1.predict_proba(X_test)[:, 1]
    print("CLF1 Train Accuracy: {}".format(clf1.score(X_train_clf1, y_train_clf1)))
    print("CLF1 Test Accuracy: {}".format(_score_or_nan(clf1, X_test, y_test_clf1)))

    # Classifier 2 (predicts for every test row, scored on segments 2 and 3 only)
    clf2.fit(X_train_clf2, y_train_clf2)
    y_pred_proba_clf2 = clf2.predict_proba(X_test)[:, 1]
    print("CLF2 Train Accuracy: {}".format(clf2.score(X_train_clf2, y_train_clf2)))
    print("CLF2 Test Accuracy: {}".format(
        _score_or_nan(clf2, df_test_clf2[all_features], df_test_clf2.y_clf2)))

    # Regression Segment 2 (scored on all segment-2 test rows)
    df_test_seg2 = df_test[df_test.amount_segment == 2]
    reg2.fit(X_train_reg2, y_train_reg2)
    y_pred_reg2 = reg2.predict(X_test)
    print("REG2 Train R2: {}".format(reg2.score(X_train_reg2, y_train_reg2)))
    print("REG2 Test R2: {}".format(
        _score_or_nan(reg2, df_test_seg2[all_features], df_test_seg2.amount)))

    # Regression Segment 2 with logit transformation (scored on the logit scale)
    df_test_reg2 = _segment2_logit_rows(df_test)
    reg2_logit.fit(X_train_reg2, y_train_reg2_logit)
    y_pred_reg2_logit = reg2_logit.predict(X_test)
    # Back-transform as a plain array: .iloc assignment below must be purely positional,
    # a RangeIndex Series could be aligned on labels by some pandas versions.
    y_pred_reg2_logit_transf = expit(y_pred_reg2_logit) * 50
    print("REG2_Logit Train R2: {}".format(reg2_logit.score(X_train_reg2, y_train_reg2_logit)))
    print("REG2_Logit Test R2: {}".format(
        _score_or_nan(reg2_logit, df_test_reg2[all_features], logit(df_test_reg2.amount / 50))))

    # Regression Segment 3 (scored on all segment-3 test rows)
    df_test_seg3 = df_test[df_test.amount_segment == 3]
    reg3.fit(X_train_reg3, y_train_reg3)
    y_pred_reg3 = reg3.predict(X_test)
    print("REG3 Train R2: {}".format(reg3.score(X_train_reg3, y_train_reg3)))
    print("REG3 Test R2: {}".format(
        _score_or_nan(reg3, df_test_seg3[all_features], df_test_seg3.amount)))

    # Regression Segment 3 with logit transformation (scored on the logit scale)
    df_test_reg3 = _segment3_logit_rows(df_test)
    reg3_logit.fit(X_train_reg3, y_train_reg3_logit)
    y_pred_reg3_logit = reg3_logit.predict(X_test)
    y_pred_reg3_logit_transf = expit(y_pred_reg3_logit) * 50 + 50
    print("REG3_Logit Train R2: {}".format(reg3_logit.score(X_train_reg3, y_train_reg3_logit)))
    print("REG3_Logit Test R2: {}".format(
        _score_or_nan(reg3_logit, df_test_reg3[all_features], logit((df_test_reg3.amount - 50) / 50))))

    # Save the prediction results of this fold in their columns (positional assignment).
    fold_predictions = {
        "y_pred_proba_clf1": y_pred_proba_clf1,
        "y_pred_proba_clf2": y_pred_proba_clf2,
        "y_pred_reg2": y_pred_reg2,
        "y_pred_reg2_logit": y_pred_reg2_logit_transf,
        "y_pred_reg3": y_pred_reg3,
        "y_pred_reg3_logit": y_pred_reg3_logit_transf,
    }
    for column_name, values in fold_predictions.items():
        df.iloc[test_idx, df.columns.get_loc(column_name)] = values

    # Save train and test iteration number, in case we need it later.
    # A row can be in the training part of several folds; only the last fold number is kept.
    df.iloc[train_idx, df.columns.get_loc("train_iter")] = iter_nr
    df.iloc[test_idx, df.columns.get_loc("test_iter")] = iter_nr

Check if all records were used in train/test (was not the case with StratifiedShuffleSplit)

In [None]:
# Count the rows that were never part of a training set / a test set in any fold.
# NOTE(review): with TimeSeriesSplit the earliest rows are presumably never tested
# and the latest rows never trained on, so non-zero counts are expected - confirm.
df[['train_iter', 'test_iter']].isnull().sum()

Visualisation of train/test split for each fold

In [None]:
def plot_cv_indices(cv, X, y, ax, n_splits, lw=10, cmap=None):
    """Draw one row per CV split showing which samples are train and which are test.

    Parameters
    ----------
    cv : splitter object with a ``split(X=..., y=...)`` method yielding (train, test) index arrays.
    X : sized collection of samples; only ``len(X)`` and the call to ``cv.split`` use it.
    y : targets, passed through to ``cv.split``.
    ax : axes-like object providing ``scatter``, ``set`` and ``set_title``.
    n_splits : number of rows reserved on the y axis.
    lw : line width of the markers.
    cmap : colour map for the markers; defaults to ``plt.cm.coolwarm`` so the function
        no longer depends on a notebook-level ``cmap_cv`` variable being defined first.

    Returns
    -------
    The ``ax`` object that was passed in.
    """
    if cmap is None:
        cmap = plt.cm.coolwarm

    n_samples = len(X)

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # 1 = test, 0 = train, NaN = sample not used in this split
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # One horizontal band per split, centred at ii + 0.5
        ax.scatter(range(n_samples), [ii + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap,
                   vmin=-.2, vmax=1.2)

    # Formatting (inverted y axis: first split at the top)
    ax.set(yticks=np.arange(n_splits) + .5, yticklabels=list(range(n_splits)),
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits, -.2])
    ax.set_title(type(cv).__name__, fontsize=15)
    return ax

# Colour maps for the split visualisation.
cmap_data, cmap_cv = plt.cm.Paired, plt.cm.coolwarm

# Visualise the folds of the evaluation splitter on the feature frame.
fig, ax = plt.subplots()
plot_cv_indices(cv=cv_model_eval,
                X=df[all_features],
                y=df.amount_segment,
                ax=ax,
                n_splits=cv_model_eval.n_splits)

# Threshold optimization

In [None]:
# Round both out-of-fold probabilities to 8 decimals via a fixed-point string round trip.
for proba_column in ('y_pred_proba_clf1', 'y_pred_proba_clf2'):
    as_fixed_point = df[proba_column].apply(lambda p: format(float(p), ".8f"))
    df[proba_column] = as_fixed_point.astype(float)

In [None]:
# Splitter used by the threshold-optimization loops of clf1 and clf2.
cv_treasholds = TimeSeriesSplit(n_splits=5)

## Threshold optimization (clf1)

We optimize only the t_neg threshold, since for clf1 only the positive class is of importance. We want to prevent segments 4+ from being classified as segment 2 or 3, since this would mean "lost money".

In [None]:
# Threshold search for clf1: for each upper bound on the false omission rate (FOR),
# find the largest t_neg on a 0..1 grid (step 0.001) reached before the FOR exceeds the bound.
max_for_clf1_list = [0.01, 0.03, 0.05, 0.07, 0.09]

# Use all features, encode segment 4+ as positive class, otherwise negative
X_train_clf1 = df[all_features]
y_train_clf1 = df.y_clf1

# Results are collected as plain records and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
t_neg_records_clf1 = []

# NOTE(review): the outer loop repeats the identical split five times; it only adds
# information if clf1.fit is not deterministic - confirm that it is still wanted.
for i in tqdm(range(5)):
    # we use all data for threshold optimization
    for train_idx, test_idx in tqdm(cv_treasholds.split(X_train_clf1, y_train_clf1), total=cv_treasholds.n_splits):

        clf1.fit(X_train_clf1.iloc[train_idx], y_train_clf1.iloc[train_idx])

        y_test_pred_proba_clf1 = clf1.predict_proba(X_train_clf1.iloc[test_idx])[:, 1]
        y_test_clf1 = y_train_clf1.iloc[test_idx]
        is_positive = (y_test_clf1 == 1).values

        for max_for_clf1 in max_for_clf1_list:
            # Reset for every bound so that a search which stops at the first grid point
            # cannot report the result of a previous bound or fold.
            opt_t_neg_clf1 = 0
            opt_for_clf1 = 0

            for t_neg_clf1 in np.linspace(0, 1, 1001):
                predicted_negative = y_test_pred_proba_clf1 <= t_neg_clf1
                # FOR = share of true positives among the predicted negatives.
                # With no predicted negatives this is 0/0 = NaN, which never exceeds the bound.
                with np.errstate(divide='ignore', invalid='ignore'):
                    for_clf1 = (predicted_negative & is_positive).sum() / predicted_negative.sum()
                if for_clf1 > max_for_clf1:
                    break
                opt_t_neg_clf1 = t_neg_clf1
                opt_for_clf1 = for_clf1

            print(opt_t_neg_clf1, opt_for_clf1, max_for_clf1)
            t_neg_records_clf1.append({"max_for_clf1": max_for_clf1, "opt_t_neg_clf1": opt_t_neg_clf1})

opt_t_neg_list_clf1 = pd.DataFrame(t_neg_records_clf1, columns=['max_for_clf1', 'opt_t_neg_clf1'])

# Distribution of the per-fold optima for every bound.
for max_for_clf1 in max_for_clf1_list:
    print('max_for_clf1 = {}'.format(max_for_clf1))
    display(opt_t_neg_list_clf1[opt_t_neg_list_clf1.max_for_clf1 == max_for_clf1].opt_t_neg_clf1.describe())

# Final threshold per bound: the median of the per-fold optima.
threshold_records_clf1 = []
for max_for_clf1 in max_for_clf1_list:
    t_neg_clf1 = np.median(opt_t_neg_list_clf1[opt_t_neg_list_clf1.max_for_clf1 == max_for_clf1].opt_t_neg_clf1)
    threshold_records_clf1.append({"max_for_clf1": max_for_clf1, "t_neg_clf1": t_neg_clf1})

classifier_thresholds_clf1 = pd.DataFrame(threshold_records_clf1, columns=['max_for_clf1', 't_neg_clf1'])

display(classifier_thresholds_clf1)



In [None]:
def create_conf_matrix_clf1(t_neg):
    """Return the clf1 confusion matrix of the out-of-fold probabilities for threshold ``t_neg``.

    A row is predicted negative (segment 2 or 3) when its probability is <= ``t_neg`` and
    positive (segment 4+) otherwise - the same rule the threshold search and the combined
    prediction use. Rows without a probability (NaN) count as predicted negative.

    Side effect: the hard predictions are written to ``df['y_pred_clf1']`` of the
    notebook-level frame ``df``.

    Returns a 2x2 DataFrame with true classes as rows and predicted classes as columns.
    """
    df['y_pred_clf1'] = (df['y_pred_proba_clf1'] > t_neg).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when one class does not occur at this threshold.
    df_confusion_clf1 = pd.DataFrame(confusion_matrix(df.y_clf1, df.y_pred_clf1, labels=[0, 1]),
                                     index=['true neg (2,3)', 'true pos (4+)'],
                                     columns=['pred. neg (2,3)', 'pred. pos (4+)'])

    return df_confusion_clf1

# Sweep 100 thresholds between 0 and 1 and record each confusion-matrix cell
# as a share of all rows.
x = np.linspace(0, 1, 100)
total_rows = df.shape[0]

correct_positive, correct_negative, false_positive, false_negative = [], [], [], []

for v in x:
    result = create_conf_matrix_clf1(v)

    # Row-major order of the 2x2 matrix: [0,0], [0,1], [1,0], [1,1]
    share_tn, share_fp, share_fn, share_tp = (result.values / total_rows).ravel()

    correct_positive.append(share_tp)
    correct_negative.append(share_tn)
    false_positive.append(share_fp)
    false_negative.append(share_fn)

# Stacked shares over the threshold.
plt.figure(figsize=(20,7))

stack_labels = ['Correct positive',
                'Correct negative',
                'False Positive (Extra work)',
                'False Negative (Lost money)']
plt.stackplot(x, correct_positive, correct_negative, false_positive, false_negative,
              labels=stack_labels)
plt.legend(loc='lower right')

## Threshold optimization (clf2)

Both segment 2 (negative class) and segment 3 (positive class) are equally important --> we need both t_neg and t_pos

In [None]:
# Threshold search for clf2 on a 0..1 grid (step 0.001):
#   t_neg - largest threshold reached before the false omission rate (FOR) exceeds its bound
#   t_pos - smallest threshold whose false discovery rate (FDR) is within its bound
max_for_clf2_list = [0.01, 0.03, 0.05, 0.07, 0.09]
max_fdr_clf2_list = [0.01, 0.03, 0.05, 0.07, 0.09]

# Use only data records from seg. 2 and 3. Use all features.
has_clf2_target = ~df.y_clf2.isnull()
X_train_clf2 = df[has_clf2_target][all_features]
# Segment 3 is encoded as 1 (positive class), seg. 2 as 0 (negative class)
y_train_clf2 = df[has_clf2_target].y_clf2

# Results are collected as plain records and turned into frames once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
t_neg_records_clf2 = []
t_pos_records_clf2 = []

# NOTE(review): the outer loop repeats the identical split five times; it only adds
# information if clf2.fit is not deterministic - confirm that it is still wanted.
for i in tqdm(range(5)):
    for train_idx, test_idx in tqdm(cv_treasholds.split(X_train_clf2, y_train_clf2), total=cv_treasholds.n_splits):
        clf2.fit(X_train_clf2.iloc[train_idx], y_train_clf2.iloc[train_idx])

        y_test_pred_proba_clf2 = clf2.predict_proba(X_train_clf2.iloc[test_idx])[:, 1]
        y_test_clf2 = y_train_clf2.iloc[test_idx]
        is_positive = (y_test_clf2 == 1).values
        is_negative = (y_test_clf2 == 0).values

        for max_for_clf2 in max_for_clf2_list:
            opt_t_neg_clf2 = 0
            opt_for_clf2 = 0
            for t_neg_clf2 in np.linspace(0, 1, 1001):
                predicted_negative = y_test_pred_proba_clf2 <= t_neg_clf2
                # 0/0 (no predicted negatives) gives NaN, which never exceeds the bound.
                with np.errstate(divide='ignore', invalid='ignore'):
                    for_clf2 = (predicted_negative & is_positive).sum() / predicted_negative.sum()
                if for_clf2 > max_for_clf2:
                    break
                opt_t_neg_clf2 = t_neg_clf2
                opt_for_clf2 = for_clf2

            print(opt_t_neg_clf2, opt_for_clf2, max_for_clf2)
            t_neg_records_clf2.append({"max_for_clf2": max_for_clf2, "opt_t_neg_clf2": opt_t_neg_clf2})

        for max_fdr_clf2 in max_fdr_clf2_list:
            opt_t_pos_clf2 = 0
            opt_fdr_clf2 = 0
            for t_pos_clf2 in np.linspace(0, 1, 1001):
                predicted_positive = y_test_pred_proba_clf2 >= t_pos_clf2
                # 0/0 (no predicted positives) gives NaN, which never satisfies the bound,
                # so the search then runs to the end of the grid.
                with np.errstate(divide='ignore', invalid='ignore'):
                    fdr_clf2 = (predicted_positive & is_negative).sum() / predicted_positive.sum()
                opt_t_pos_clf2 = t_pos_clf2
                opt_fdr_clf2 = fdr_clf2
                if fdr_clf2 <= max_fdr_clf2:
                    break
            print(opt_t_pos_clf2, opt_fdr_clf2, max_fdr_clf2)
            t_pos_records_clf2.append({"max_fdr_clf2": max_fdr_clf2, "opt_t_pos_clf2": opt_t_pos_clf2})

opt_t_neg_list_clf2 = pd.DataFrame(t_neg_records_clf2, columns=['max_for_clf2', 'opt_t_neg_clf2'])
opt_t_pos_list_clf2 = pd.DataFrame(t_pos_records_clf2, columns=['max_fdr_clf2', 'opt_t_pos_clf2'])

# Distribution of the per-fold optima for every bound.
for max_for_clf2 in max_for_clf2_list:
    print('max_for_clf2 = {}'.format(max_for_clf2))
    display(opt_t_neg_list_clf2[opt_t_neg_list_clf2.max_for_clf2 == max_for_clf2].opt_t_neg_clf2.describe())

for max_fdr_clf2 in max_fdr_clf2_list:
    print('max_fdr_clf2 = {}'.format(max_fdr_clf2))
    display(opt_t_pos_list_clf2[opt_t_pos_list_clf2.max_fdr_clf2 == max_fdr_clf2].opt_t_pos_clf2.describe())

# Final thresholds: the median of the per-fold optima for each bound.
# The t_neg rows and the t_pos rows share one table, so each row has NaN in the
# two columns that belong to the other threshold.
threshold_records_clf2 = []

for max_for_clf2 in max_for_clf2_list:
    t_neg_clf2 = np.median(opt_t_neg_list_clf2[opt_t_neg_list_clf2.max_for_clf2 == max_for_clf2].opt_t_neg_clf2)
    threshold_records_clf2.append({"max_for_clf2": max_for_clf2, "t_neg_clf2": t_neg_clf2})

for max_fdr_clf2 in max_fdr_clf2_list:
    t_pos_clf2 = np.median(opt_t_pos_list_clf2[opt_t_pos_list_clf2.max_fdr_clf2 == max_fdr_clf2].opt_t_pos_clf2)
    threshold_records_clf2.append({"max_fdr_clf2": max_fdr_clf2, "t_pos_clf2": t_pos_clf2})

classifier_thresholds_clf2 = pd.DataFrame(
    threshold_records_clf2,
    columns=['max_for_clf2', 't_neg_clf2', 'max_fdr_clf2', 't_pos_clf2'])

display(classifier_thresholds_clf2)

# Create combined confusion matrix

In [None]:
# Bounds that select which of the optimized thresholds are applied.
max_for_clf1 = 0.03 #False omission rate (clf1)
max_for_clf2 = 0.05 #False omission rate
max_fdr_clf2 = 0.05 #False discovery rate

# Look up the threshold that belongs to each selected bound.
t_neg_clf1 = classifier_thresholds_clf1[classifier_thresholds_clf1.max_for_clf1 == max_for_clf1].t_neg_clf1.values[0]

t_neg_clf2 = classifier_thresholds_clf2[classifier_thresholds_clf2.max_for_clf2 == max_for_clf2].t_neg_clf2.values[0]
t_pos_clf2 = classifier_thresholds_clf2[classifier_thresholds_clf2.max_fdr_clf2 == max_fdr_clf2].t_pos_clf2.values[0]

# Remove entries with no predictions. The explicit copy makes the column assignments
# below write to an independent frame instead of risking a SettingWithCopyWarning.
df = df.dropna(subset=['test_iter']).copy()

# True class label: '2', '3' or '4+'.
df['y'] = df.amount_segment.apply(lambda x: '2' if x==2 else ('3' if x==3 else '4+'))

# clf1: probabilities up to t_neg_clf1 are treated as segment 2 or 3.
df['y_pred_clf1'] = df['y_pred_proba_clf1'].apply(lambda x: '2 or 3' if x <= t_neg_clf1 else '4+')

# clf2: confident '2' / '3' outside [t_neg_clf2, t_pos_clf2], uncertain '2?' / '3?' in between (split at 0.5).
df['y_pred_clf2'] = df['y_pred_proba_clf2'].apply(lambda x: '2' if x<=t_neg_clf2 else
                                                  ('2?' if x<=0.5 else
                                                   ('3?' if x<=t_pos_clf2 else '3')))

# Combined label: '4+' from clf1 wins, otherwise the clf2 label is used.
df['y_pred'] = df.apply(lambda x: x['y_pred_clf1'] if x['y_pred_clf1']=='4+' else x['y_pred_clf2'], axis=1)

In [None]:
# Show true segment, both classifier probabilities / labels and the combined label side by side.
df[['amount_segment', 'y','y_pred_proba_clf1','y_pred_clf1', 'y_pred_proba_clf2', 'y_pred_clf2', 'y_pred']]

In [None]:
# Confusion matrix of the combined prediction in absolute numbers.
combined_labels = ['2', '2?', '3?', '3', '4+']
matrix_abs = confusion_matrix(df.y, df.y_pred, labels=combined_labels)
df_confusion_comb = pd.DataFrame(matrix_abs,
                                 index=['true ' + label for label in combined_labels],
                                 columns=['pred ' + label for label in combined_labels])
print('Abs. Numbers')
# '2?' and '3?' only exist as predictions, so their "true" rows are dropped.
df_confusion_comb.drop(index=['true 2?', 'true 3?'])

In [None]:
# Confusion matrix of the combined prediction, normalized per true class.
combined_labels = ['2', '2?', '3?', '3', '4+']
matrix_rel = confusion_matrix(df.y, df.y_pred, labels=combined_labels, normalize="true")
df_confusion_comb = pd.DataFrame(matrix_rel,
                                 index=['true ' + label for label in combined_labels],
                                 columns=['pred ' + label for label in combined_labels])
print('Rel. to true count per class in %')
# '2?' and '3?' only exist as predictions, so their "true" rows are dropped.
(df_confusion_comb.drop(index=['true 2?', 'true 3?']) * 100).round(1)

In [None]:
# Per-class precision / recall / F1 of the combined prediction.
print('Classification Report')
report_text = classification_report(df.y, df.y_pred,
                                    labels=['2','2?','3?','3','4+'],
                                    zero_division=0)
print(report_text)

# Get scores for regressions (with correct class)

In [None]:
# Discrepancy (amount - prediction) of both segment-2 regressors on the rows whose
# true segment is 2. The differences are computed from the subset's own columns,
# so they do not rely on index alignment against the full frame.
df_reg2 = df[df.amount_segment==2].copy()
df_reg2['discr'] = df_reg2.amount - df_reg2.y_pred_reg2
df_reg2['discr_abs'] = df_reg2['discr'].abs()
df_reg2['discr_logit'] = df_reg2.amount - df_reg2.y_pred_reg2_logit
df_reg2['discr_logit_abs'] = df_reg2['discr_logit'].abs()

# Same for both segment-3 regressors on the rows whose true segment is 3.
df_reg3 = df[df.amount_segment==3].copy()
df_reg3['discr'] = df_reg3.amount - df_reg3.y_pred_reg3
df_reg3['discr_abs'] = df_reg3['discr'].abs()
df_reg3['discr_logit'] = df_reg3.amount - df_reg3.y_pred_reg3_logit
df_reg3['discr_logit_abs'] = df_reg3['discr_logit'].abs()

In [None]:
# Summary statistics of the four segment-2 discrepancy columns, separated by blank lines.
reg2_reports = [('Discrepancy reg2', 'discr'),
                ('Absolute discrepancy reg2', 'discr_abs'),
                ('Discrepancy reg2_logit', 'discr_logit'),
                ('Absolute discrepancy reg2_logit', 'discr_logit_abs')]
for position, (title, column) in enumerate(reg2_reports):
    if position > 0:
        print('')
    print(title)
    print(df_reg2[column].describe())

In [None]:
# Summary statistics of the four segment-3 discrepancy columns, separated by blank lines.
reg3_reports = [('Discrepancy reg3', 'discr'),
                ('Absolute discrepancy reg3', 'discr_abs'),
                ('Discrepancy reg3_logit', 'discr_logit'),
                ('Absolute discrepancy reg3_logit', 'discr_logit_abs')]
for position, (title, column) in enumerate(reg3_reports):
    if position > 0:
        print('')
    print(title)
    print(df_reg3[column].describe())

In [None]:
# Histograms of the plain and the logit-model discrepancy for true segment 2.
for column in ('discr', 'discr_logit'):
    plt.hist(df_reg2[column], bins=100)
    plt.show()

In [None]:
# Histograms of the plain and the logit-model discrepancy for true segment 3.
for column in ('discr', 'discr_logit'):
    plt.hist(df_reg3[column], bins=100)
    plt.show()

# Regression Segment 2 and 3 (Review combined result)

## Calculate discrepancy

Keep Regression results only for correct classes (reg2 for '2' and '2?', reg3 for '3' and '3?')

In [None]:
# Blank the segment-2 regression results wherever the predicted class is not '2' / '2?' ...
predicted_not_seg2 = df['y_pred'].isin(['3', '3?', '4+'])
df.loc[predicted_not_seg2, ['y_pred_reg2', 'y_pred_reg2_logit']] = np.nan

# ... and the segment-3 regression results wherever it is not '3' / '3?'.
predicted_not_seg3 = df['y_pred'].isin(['2', '2?', '4+'])
df.loc[predicted_not_seg3, ['y_pred_reg3', 'y_pred_reg3_logit']] = np.nan

In [None]:
# Signed and absolute discrepancy (amount - prediction) for each of the four regressors.
for regressor in ('reg2', 'reg2_logit', 'reg3', 'reg3_logit'):
    difference = df.amount - df['y_pred_' + regressor]
    df['discr_' + regressor] = difference
    df['discr_' + regressor + '_abs'] = abs(difference)

In [None]:
# Review table: every row that was not predicted as '4+', with all regression results.
review_columns = [
    'amount', 'amount_segment',
    'y', 'y_pred',
    'y_pred_reg2', 'discr_reg2', 'discr_reg2_abs',
    'y_pred_reg2_logit', 'discr_reg2_logit', 'discr_reg2_logit_abs',
    'y_pred_reg3', 'discr_reg3', 'discr_reg3_abs',
    'y_pred_reg3_logit', 'discr_reg3_logit', 'discr_reg3_logit_abs',
]
not_predicted_4_plus = df.y_pred != '4+'
df.loc[not_predicted_4_plus, review_columns]

In [None]:
# Summary statistics of the reg2 discrepancy by predicted class, separated by blank lines.
seg2_reports = [
    ('Discrepancy reg2, for predicted 2 and 2?', ['2', '2?'], 'discr_reg2'),
    ('Discrepancy reg2, for predicted 2', ['2'], 'discr_reg2'),
    ('Absolute discrepancy reg2, for predicted 2 and 2?', ['2', '2?'], 'discr_reg2_abs'),
    ('Absolute discrepancy reg2, for predicted 2', ['2'], 'discr_reg2_abs'),
]
for position, (title, predicted_classes, column) in enumerate(seg2_reports):
    if position > 0:
        print('')
    print(title)
    print(df[df.y_pred.isin(predicted_classes)][column].describe())

In [None]:
# Summary statistics of the reg3 discrepancy by predicted class, separated by blank lines.
seg3_reports = [
    ('Discrepancy reg3, for predicted 3 and 3?', ['3', '3?'], 'discr_reg3'),
    ('Discrepancy reg3, for predicted 3', ['3'], 'discr_reg3'),
    ('Absolute discrepancy reg3, for predicted 3 and 3?', ['3', '3?'], 'discr_reg3_abs'),
    ('Absolute discrepancy reg3, for predicted 3', ['3'], 'discr_reg3_abs'),
]
for position, (title, predicted_classes, column) in enumerate(seg3_reports):
    if position > 0:
        print('')
    print(title)
    print(df[df.y_pred.isin(predicted_classes)][column].describe())

In [None]:
# reg2 discrepancy histograms: predicted '2', predicted '2'/'2?', then both again
# restricted to absolute discrepancies of at most 100.
predicted_2 = df.y_pred.isin(['2'])
predicted_2_or_unsure = df.y_pred.isin(['2', '2?'])
within_limit = df.discr_reg2_abs <= 100

for row_mask in (predicted_2,
                 predicted_2_or_unsure,
                 predicted_2 & within_limit,
                 predicted_2_or_unsure & within_limit):
    plt.hist(df[row_mask].discr_reg2, bins=100)
    plt.show()



In [None]:
# reg2_logit discrepancy histograms: predicted '2', predicted '2'/'2?', then both again
# restricted to absolute discrepancies of at most 100.
predicted_2 = df.y_pred.isin(['2'])
predicted_2_or_unsure = df.y_pred.isin(['2', '2?'])
within_limit = df.discr_reg2_logit_abs <= 100

for row_mask in (predicted_2,
                 predicted_2_or_unsure,
                 predicted_2 & within_limit,
                 predicted_2_or_unsure & within_limit):
    plt.hist(df[row_mask].discr_reg2_logit, bins=100)
    plt.show()



In [None]:
# reg3 discrepancy histograms: predicted '3', predicted '3'/'3?', then both again
# restricted to absolute discrepancies of at most 150.
predicted_3 = df.y_pred.isin(['3'])
predicted_3_or_unsure = df.y_pred.isin(['3', '3?'])
within_limit = df.discr_reg3_abs <= 150

for row_mask in (predicted_3,
                 predicted_3_or_unsure,
                 predicted_3 & within_limit,
                 predicted_3_or_unsure & within_limit):
    plt.hist(df[row_mask].discr_reg3, bins=100)
    plt.show()



In [None]:
# reg3_logit discrepancy histograms: predicted '3', predicted '3'/'3?', then both again
# restricted to absolute discrepancies of at most 150.
predicted_3 = df.y_pred.isin(['3'])
predicted_3_or_unsure = df.y_pred.isin(['3', '3?'])
within_limit = df.discr_reg3_logit_abs <= 150

for row_mask in (predicted_3,
                 predicted_3_or_unsure,
                 predicted_3 & within_limit,
                 predicted_3_or_unsure & within_limit):
    plt.hist(df[row_mask].discr_reg3_logit, bins=100)
    plt.show()



# Feature Importance

## Feature importance

### CLF1

In [None]:
# SHAP values of clf1 (in its most recently fitted state) on all rows of df.
explainer_clf1 = shap.TreeExplainer(clf1)
shap_input_clf1 = df[all_features]
shap_values_clf1 = explainer_clf1.shap_values(shap_input_clf1)
shap.summary_plot(shap_values_clf1,
                  features=shap_input_clf1,
                  feature_names=shap_input_clf1.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_clf1, features=df[all_features], feature_names=df[all_features].columns, plot_type='bar')

### Feature importance CLF2

In [None]:
# SHAP values of clf2 (in its most recently fitted state) on the rows that have a clf2 target.
explainer_clf2 = shap.TreeExplainer(clf2)
shap_input_clf2 = df[~df.y_clf2.isnull()][all_features]
shap_values_clf2 = explainer_clf2.shap_values(shap_input_clf2)
shap.summary_plot(shap_values_clf2,
                  features=shap_input_clf2,
                  feature_names=shap_input_clf2.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_clf2, features=df[~df.y_clf2.isnull()][all_features], feature_names=df[all_features].columns, plot_type='bar')

### Feature Importance Reg2

In [None]:
# SHAP values of reg2 on the rows whose true segment is 2.
explainer_reg2 = shap.TreeExplainer(reg2)
shap_input_reg2 = df[df.amount_segment==2][all_features]
shap_values_reg2 = explainer_reg2.shap_values(shap_input_reg2)
shap.summary_plot(shap_values_reg2,
                  features=shap_input_reg2,
                  feature_names=shap_input_reg2.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_reg2, features=df[df.amount_segment==2][all_features], feature_names=df[all_features].columns, plot_type='bar')

In [None]:
# SHAP values of reg2_logit on the rows whose true segment is 2.
explainer_reg2_logit = shap.TreeExplainer(reg2_logit)
shap_input_reg2_logit = df[df.amount_segment==2][all_features]
shap_values_reg2_logit = explainer_reg2_logit.shap_values(shap_input_reg2_logit)
shap.summary_plot(shap_values_reg2_logit,
                  features=shap_input_reg2_logit,
                  feature_names=shap_input_reg2_logit.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_reg2_logit, features=df[df.amount_segment==2][all_features], feature_names=df[all_features].columns, plot_type='bar')

### Feature Importance reg3

In [None]:
# SHAP values of reg3 on the rows whose true segment is 3.
explainer_reg3 = shap.TreeExplainer(reg3)
shap_input_reg3 = df[df.amount_segment==3][all_features]
shap_values_reg3 = explainer_reg3.shap_values(shap_input_reg3)
shap.summary_plot(shap_values_reg3,
                  features=shap_input_reg3,
                  feature_names=shap_input_reg3.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_reg3, features=df[df.amount_segment==3][all_features], feature_names=df[all_features].columns, plot_type='bar')

In [None]:
# SHAP values of reg3_logit on the rows whose true segment is 3.
explainer_reg3_logit = shap.TreeExplainer(reg3_logit)
shap_input_reg3_logit = df[df.amount_segment==3][all_features]
shap_values_reg3_logit = explainer_reg3_logit.shap_values(shap_input_reg3_logit)
shap.summary_plot(shap_values_reg3_logit,
                  features=shap_input_reg3_logit,
                  feature_names=shap_input_reg3_logit.columns)

In [None]:
# Same SHAP values as a bar chart.
shap.summary_plot(shap_values_reg3_logit, features=df[df.amount_segment==3][all_features], feature_names=df[all_features].columns, plot_type='bar')

# Function

In [None]:
def make_predict(df_features, clf1, clf2, reg1, reg2, t_neg1, t_neg2, t_pos2):
    """Predict the segment label and the amount for every row of ``df_features``.

    Parameters
    ----------
    df_features : feature matrix accepted by all four models.
    clf1 : classifier with ``predict_proba``; column 1 is the probability of segment 4+.
    clf2 : classifier with ``predict_proba``; column 1 is the probability of segment 3.
    reg1 : logit-target regressor for segment 2; its prediction p maps to expit(p) * 50.
    reg2 : logit-target regressor for segment 3; its prediction p maps to expit(p) * 50 + 50.
    t_neg1 : clf1 probabilities <= t_neg1 are treated as "segment 2 or 3", all others as '4+'.
    t_neg2 : clf2 probabilities <= t_neg2 give '2'; up to 0.5 they give '2?'.
    t_pos2 : clf2 probabilities above 0.5 and <= t_pos2 give '3?'; above t_pos2 they give '3'.

    Returns
    -------
    DataFrame with a fresh RangeIndex and two columns: ``segment`` ('2', '2?', '3?', '3'
    or '4+') and ``amount`` (segment-2 amount for '2'/'2?', segment-3 amount for
    '3'/'3?', NaN for '4+').

    Notes
    -----
    The models and thresholds passed as arguments are the ones that are used. Earlier
    the body read ``reg3`` and the three thresholds from notebook-level variables and
    used ``reg2`` (the segment-3 model) for segment 2. For backward compatibility a
    non-scalar threshold argument falls back to the notebook-level variable of the
    same meaning (``t_neg_clf1`` / ``t_neg_clf2`` / ``t_pos_clf2``) with a warning.
    """
    import warnings

    def _as_threshold(value, legacy_name):
        # Scalars are used as given; anything else cannot be compared element-wise.
        if np.ndim(value) == 0:
            return float(value)
        if legacy_name in globals():
            warnings.warn("make_predict: a non-scalar threshold was passed; "
                          "falling back to the notebook-level '{}'".format(legacy_name))
            return float(globals()[legacy_name])
        raise TypeError("make_predict expects scalar thresholds")

    def _round8(probabilities):
        # Same 8-decimal fixed-point round trip that is applied to the out-of-fold probabilities.
        return np.array([float(format(float(p), ".8f")) for p in probabilities], dtype=float)

    t_neg1 = _as_threshold(t_neg1, 't_neg_clf1')
    t_neg2 = _as_threshold(t_neg2, 't_neg_clf2')
    t_pos2 = _as_threshold(t_pos2, 't_pos_clf2')

    proba_clf1 = _round8(clf1.predict_proba(df_features)[:, 1])
    proba_clf2 = _round8(clf2.predict_proba(df_features)[:, 1])

    # Back-transform the logit-scale predictions to amounts in (0, 50) and (50, 100).
    amount_seg2 = expit(np.asarray(reg1.predict(df_features), dtype=float)) * 50
    amount_seg3 = expit(np.asarray(reg2.predict(df_features), dtype=float)) * 50 + 50

    # clf2 label; a NaN probability fails every comparison and therefore ends up as '3'.
    label_clf2 = np.where(proba_clf2 <= t_neg2, '2',
                          np.where(proba_clf2 <= 0.5, '2?',
                                   np.where(proba_clf2 <= t_pos2, '3?', '3')))

    # '4+' from clf1 wins, otherwise the clf2 label is used.
    segment = np.where(proba_clf1 <= t_neg1, label_clf2, '4+')

    amount = np.where(np.isin(segment, ['2', '2?']), amount_seg2,
                      np.where(np.isin(segment, ['3', '3?']), amount_seg3, np.nan))

    prediction_result = pd.DataFrame({'segment': pd.Series(segment, dtype=object),
                                      'amount': amount})

    return prediction_result

In [None]:
# Pass the scalar thresholds selected for the combined confusion matrix
# (t_neg_clf1, t_neg_clf2, t_pos_clf2) rather than whole columns of the threshold
# tables, which hold one value per bound and NaN rows for clf2.
results = make_predict(df[all_features], clf1, clf2, reg2_logit, reg3_logit,
                       t_neg_clf1,
                       t_neg_clf2,
                       t_pos_clf2)
print(results.head(30))

In [None]:
# Summary statistics of the predicted amount for rows labelled '2' or '2?'.
results[results.segment.isin(['2', '2?'])].amount.describe()

In [None]:
# Summary statistics of the predicted amount for rows labelled '3' or '3?'.
results[results.segment.isin(['3','3?'])].amount.describe()