In [2]:
# Install the library used in this notebook (run once, then restart the kernel):
# %pip install -U binclass-tools

In [1]:
import os
import sys
import inspect

import numpy as np
import pandas as pd

### Create dataset for classification and train random forest model

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic, imbalanced binary classification problem: `weights` asks for
# approximately 80% zeros and 20% ones.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=14,
    n_redundant=0,
    weights=[0.8, 0.2],
    shuffle=False,
    random_state=12,
)

# Hold out 20% of the samples for testing; `stratify=y` keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

# Fit a depth-limited random forest on the training split only.
cls = RandomForestClassifier(max_depth=6, oob_score=True, random_state=123)
cls.fit(X_train, y_train)

In [50]:
# The plots below work on probabilities rather than hard labels, so we call
# predict_proba (not predict) and keep column 1, the probability of class 1.
train_predicted_proba = cls.predict_proba(X_train)[:, 1]  # train set
test_predicted_proba = cls.predict_proba(X_test)[:, 1]    # test set

In [53]:
# Inspect the class-1 probabilities predicted for the test set.
test_predicted_proba

array([0.06518814, 0.0195348 , 0.03998879, 0.20216257, 0.0757036 ,
       0.05142666, 0.18919332, 0.05501012, 0.06997563, 0.07937482,
       0.16630497, 0.09597257, 0.28235628, 0.06165742, 0.06285022,
       0.05849617, 0.33974954, 0.04621485, 0.31975703, 0.2203028 ,
       0.05697991, 0.05310547, 0.41221068, 0.28753749, 0.08363675,
       0.14873614, 0.08441678, 0.11912544, 0.31652685, 0.02835584,
       0.77304784, 0.20216737, 0.04535207, 0.48389732, 0.07507466,
       0.16197785, 0.05948015, 0.22678447, 0.05477332, 0.41015493,
       0.21882829, 0.02443574, 0.63936278, 0.11931886, 0.37409705,
       0.14623303, 0.06596108, 0.7855197 , 0.200591  , 0.27741669,
       0.08510619, 0.08071955, 0.06618009, 0.75875235, 0.16189331,
       0.44031524, 0.1606569 , 0.5046141 , 0.06808013, 0.32225032,
       0.12054904, 0.02368998, 0.10377898, 0.56770768, 0.15386725,
       0.19803138, 0.06134993, 0.31448911, 0.52106837, 0.63935393,
       0.38002535, 0.14153543, 0.61961283, 0.06584576, 0.03090

## Import bctools package

In [4]:
import bctools as bc

### Plot the ROC and PR curves (the latter with iso-Fbeta curves) for the test set

In [5]:
# Build the ROC figure for the test set; the call returns the figure together
# with the area under the curve.
ROC_plot, area_under_ROC = bc.curve_ROC_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
)

In [6]:
# Display the ROC figure as the cell output.
ROC_plot
# Alternatively, render it explicitly with:
# ROC_plot.show()

In [7]:
# Second value returned by bc.curve_ROC_plot for the test set.
area_under_ROC

0.9550544562049395

In [8]:
# Build the precision-recall figure for the test set; the call returns the
# figure together with the area under the curve.
# NOTE(review): `beta` presumably selects which iso-Fbeta curves are drawn
# (beta = 1 would correspond to F1) - confirm against the bctools docs.
PR_plot, area_under_PR = bc.curve_PR_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    beta=1,
)

In [9]:
# Display the precision-recall figure as the cell output.
PR_plot

In [10]:
# Second value returned by bc.curve_PR_plot for the test set.
area_under_PR

0.9021518156511643

### Interactive probabilities violin plot for the test set

In [11]:
# Step passed to the interactive plot as `threshold_step`.
threshold_step = 0.05

# Violin plot of the predicted probabilities for the test set.
violin_plot = bc.predicted_proba_violin_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    # marker_size=3,
)

In [12]:
# Display the interactive violin plot as the cell output.
violin_plot

### Interactive kernel density estimation curve (or normal distribution curve) plot for the test set

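Both curve types are built and shown below: the kernel density estimation curve (`kde`) and the normal distribution curve (`normal`).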

In [13]:
# The `curve_type` parameter can be either 'kde' (default) or 'normal'.
# Both variants are built here with the same data and threshold step; only
# `curve_type` and the title differ.
threshold_step = 0.05

curve_type = 'kde'  # default
density_curve_kde = bc.predicted_proba_density_curve_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    curve_type=curve_type,
    title='Interactive Probabilities Distribution Plot (kde)',
)

curve_type = 'normal'
density_curve_nor = bc.predicted_proba_density_curve_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    # Pass the variable assigned just above rather than repeating the literal,
    # so the assignment is not dead and both calls follow the same pattern.
    curve_type=curve_type,
    title='Interactive Probabilities Distribution Plot (normal)',
)

In [14]:
# Render both figures from one cell: first the 'kde' variant, then the 'normal' one.
density_curve_kde.show()
density_curve_nor.show()

### Confusion matrix and metrics analysis for train and test set

In [15]:
# Parameters for the analysis of the train dataset.
currency = '$'
threshold_step = 0.05
# Per-observation amounts: the absolute value of feature column 13 of the
# (synthetic) training features, used here purely as illustrative amounts.
amounts = np.abs(X_train[:, 13])

In [16]:
# The function get_cost_dict can be used to define the dictionary of costs.
# It takes as input, for each class (TN, FP, FN, TP), a float or a list of floats.
# Lists must have coherent lengths.
# Here FP has a fixed cost of 10, while the FN cost is given per observation
# as the absolute value of feature column 12 of the training set.

train_cost_dict = bc.get_cost_dict(TN = 0, FP = 10, FN = np.abs(X_train[:, 12]), TP = 0)
train_cost_dict

{'TN': 0,
 'FP': 10,
 'FN': array([1.89314484e+00, 1.29550653e+00, 1.92604283e+00, 2.58953714e-01,
        2.37838720e+00, 2.29323077e+00, 3.98421872e+00, 3.50077540e+00,
        1.11822062e+00, 1.02829921e+00, 3.83654487e+00, 1.47242727e+00,
        6.58308291e+00, 2.20268267e+00, 2.53911133e+00, 1.72256580e-01,
        1.10244159e+00, 2.95496766e+00, 3.40204966e+00, 4.04029476e+00,
        1.55839360e+00, 1.05864083e+00, 2.14028777e+00, 2.97941137e+00,
        1.33448710e-02, 2.95986366e+00, 2.05122680e-01, 3.28804097e+00,
        1.64134175e+00, 3.37352697e+00, 1.65584500e+00, 1.46636176e+00,
        1.97375947e-01, 4.51469544e-01, 4.64249261e+00, 2.65424922e+00,
        3.84352601e-01, 4.95385582e+00, 1.37124163e+00, 1.89378415e-01,
        1.59887702e+00, 2.57027646e+00, 2.68583216e-01, 9.84439148e-01,
        3.46575702e+00, 1.41441729e+00, 1.13775096e+00, 4.13376716e+00,
        8.08329463e-02, 2.25216600e+00, 4.42330162e+00, 2.40229361e+00,
        3.89878618e+00, 1.38277536e+0

In [17]:
# Plot the interactive confusion matrix for the training set. The call returns
# four objects: the figure, the variable-metrics dataframe, the
# invariant-metrics dataframe and the optimal-thresholds dataframe.
#
# `cost_dict` and `amounts`, if not given, are set to None and won't be visualized.
cf_fig, var_metrics_df, invar_metrics_df, opt_thresh_df = bc.confusion_matrix_plot(
    true_y=y_train,
    predicted_proba=train_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=train_cost_dict,
    currency=currency,
    title='Interactive Confusion Matrix for the Training Set',
)

In [18]:
# Display the interactive confusion matrix built for the training set.
cf_fig

In [19]:
# The three dataframes returned by bc.confusion_matrix_plot: threshold-dependent
# metrics, threshold-invariant metrics and the optimal threshold per metric.
display(var_metrics_df, invar_metrics_df, opt_thresh_df)

Unnamed: 0,threshold,accuracy,balanced_accuracy,cohens_kappa,f1_score,matthews_corr_coef,precision,recall,f2_score,f05_score
0,0.0,0.2025,0.5,0.0,0.3368,0.0,0.2025,1.0,0.559392,0.240928
1,0.05,0.3962,0.6215,0.115,0.4015,0.247,0.2512,1.0,0.626496,0.295446
2,0.1,0.7288,0.8299,0.44,0.5989,0.5311,0.4274,1.0,0.788677,0.482676
3,0.15,0.8875,0.9295,0.7115,0.7826,0.7431,0.6429,1.0,0.900017,0.692347
4,0.2,0.965,0.9781,0.8982,0.9205,0.9029,0.8526,1.0,0.966579,0.878498
5,0.25,0.9838,0.9852,0.9507,0.961,0.9513,0.9357,0.9877,0.976843,0.945657
6,0.3,0.9862,0.9776,0.9573,0.9659,0.9573,0.9689,0.963,0.964174,0.967714
7,0.35,0.9875,0.9714,0.9606,0.9684,0.9611,0.9935,0.9444,0.953828,0.983276
8,0.4,0.9762,0.9414,0.9231,0.9377,0.9258,1.0,0.8827,0.903906,0.974111
9,0.45,0.9675,0.9198,0.893,0.9128,0.8981,1.0,0.8395,0.867342,0.963171


Unnamed: 0,invariant_metric,value
0,roc_auc,0.9992
1,pr_auc,0.9971
2,brier_score,0.0438


Unnamed: 0,metric,optimal_threshold
0,Kappa,0.35
1,MCC,0.35
2,f1_score,0.35
3,f2_score,0.25
4,f05_score,0.35
5,Cost,0.35


In [20]:
# The same analysis can be run on the test dataset.
# Note: `amounts` is reassigned here and refers to the test set from now on.
currency = '$'
threshold_step = 0.05
amounts = np.abs(X_test[:, 13])

# Same cost structure as for the train set, with the per-observation FN cost
# taken from the test features.
test_cost_dict = bc.get_cost_dict(
    TN=0,
    FP=10,
    FN=np.abs(X_test[:, 12]),
    TP=0,
)

In [21]:
# Interactive confusion matrix for the test set. The figure and the three
# dataframes reuse (and therefore overwrite) the names used for the train set.
cf_fig, var_metrics_df, invar_metrics_df, opt_thresh_df = bc.confusion_matrix_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=test_cost_dict,
    currency=currency,
    title='Interactive Confusion Matrix for the Testing Set',
)

In [22]:
# Display the interactive confusion matrix built for the test set.
cf_fig

In [23]:
# The invariant-metrics dataframe can also be obtained on its own, without
# building the plot, via get_invariant_metrics_df from the utilities module.
bc.utilities.get_invariant_metrics_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
)

Unnamed: 0,invariant_metric,value
0,roc_auc,0.9551
1,pr_auc,0.903
2,brier_score,0.0821


In [24]:
# For one specific threshold, get_confusion_matrix_and_metrics_df (utilities
# module) returns directly:
#   - the confusion matrix, and
#   - a dataframe with the metrics shown in the first table of the
#     interactive confusion matrix plot.
conf_matrix, metrics_fixed_thresh_df = bc.utilities.get_confusion_matrix_and_metrics_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold=0.3,  # default = 0.5
)

display(conf_matrix, metrics_fixed_thresh_df)


array([[151,   8],
       [  8,  33]], dtype=int64)

Unnamed: 0,threshold_dependent_metric,value
0,accuracy,0.92
1,balanced_accuracy,0.8773
2,f1_score,0.8049
3,precision,0.8049
4,recall,0.8049
5,cohens_kappa,0.7546
6,matthews_corr_coef,0.7546


In [25]:
# the optimal thresholds dataframe can be obtained directly with
# the function get_subset_optimal_thresholds_df from the thresholds module

# this function requires a list of thresholds instead of the step, for example:
threshold_values = np.round(np.arange(0.05, 1, 0.05), 4) # array of values starting at 0.05 and stopping before 1, with step 0.05 (rounded to 4 decimals for representation reasons)

# to obtain the threshold that minimizes the cost for this train set, we need a train_cost_dict
# (built exactly as in the earlier train-set cell; repeated so this example is self-contained)
train_cost_dict = bc.get_cost_dict(TN = 0, FP = 10,
                                   FN = np.abs(X_train[:, 12]), TP = 0)

bc.thresholds.get_subset_optimal_thresholds_df(threshold_values = threshold_values,
                                                 true_y = y_train,
                                                 predicted_proba = train_predicted_proba,
                                                 cost_dict = train_cost_dict)

Unnamed: 0,metric,optimal_threshold
0,Kappa,0.35
1,MCC,0.35
2,f1_score,0.35
3,f2_score,0.25
4,f05_score,0.35
5,Cost,0.35


In [26]:
# The previously obtained thresholds maximize the related metric - and
# minimize the cost - for the given train set.
#
# With the GHOST method we can obtain thresholds that generally optimize given
# metrics for imbalanced sets of data. The function
# get_ghost_optimal_thresholds_df from the thresholds module returns a
# dataframe with the optimal thresholds obtained with the GHOST method.
#
# WARNING: could take a while

bc.thresholds.get_ghost_optimal_thresholds_df(
    optimize_threshold='all',
    threshold_values=threshold_values,
    true_y=y_train,
    predicted_proba=train_predicted_proba,
    cost_dict=train_cost_dict,
    # the next three are the defaults
    N_subsets=70,
    subsets_size=0.2,
    with_replacement=False,
    random_state=120,
)

Unnamed: 0,optimized_metric,GHOST_optimal_threshold
0,kappa,0.3
1,mcc,0.3
2,f1_score,0.25
3,f2_score,0.25
4,f05_score,0.35
5,cost,0.35


In [27]:
# To optimize a threshold for one specific metric in {'MCC', 'Kappa', 'Fscore'},
# use get_ghost_optimal_threshold from the thresholds module.
#
# If ThOpt_metrics = 'Fscore', 3 values are returned: the optimal threshold
# for beta = 1, for beta = 2 and for beta = 0.5.

bc.thresholds.get_ghost_optimal_threshold(
    y_train,
    train_predicted_proba,
    threshold_values,
    ThOpt_metrics='MCC',  # default = 'Kappa'
    # the next three are the defaults
    N_subsets=70,
    subsets_size=0.2,
    with_replacement=False,
    random_seed=120,
)

0.3

In [28]:
# To optimize a threshold for minimal cost, use get_ghost_optimal_cost from
# the thresholds module (cost_dict must be given).

bc.thresholds.get_ghost_optimal_cost(
    y_train,
    train_predicted_proba,
    threshold_values,
    cost_dict=train_cost_dict,
    # the next three are the defaults
    N_subsets=70,
    subsets_size=0.2,
    with_replacement=False,
    random_seed=120,
)

0.35

In [29]:
# Plot the "Interactive confusion line chart". The call returns the figure,
# the amount/cost-per-threshold dataframe and the total amount.
#
# - at least one of cost_dict or amounts must be given
# - either cost_dict or amounts, if not given, is set to None and won't be visualized
# - when amounts is not given, the total_amount returned will be None

cl_fig, amount_cost_df, total_amount = bc.confusion_linechart_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=test_cost_dict,
    currency=currency,
)


Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



In [30]:
# Display the interactive confusion line chart for the test set.
cl_fig

In [31]:
# Print the total amount returned by bc.confusion_linechart_plot (prefixed with
# the currency symbol), then display the amount/cost-per-threshold dataframe.
print(f'total amount: {currency}{total_amount}')
amount_cost_df

total amount: $335.85


Unnamed: 0,threshold,amount_TN,amount_FP,amount_FN,amount_TP,cost_TN,cost_FP,cost_FN,cost_TP,total_cost
0,0.0,0.0,290.087727,0.0,45.761465,0.0,1590.0,0.0,0.0,1590.0
1,0.05,29.286441,260.801286,0.0,45.761465,0.0,1380.0,0.0,0.0,1380.0
2,0.1,141.016189,149.071538,0.271689,45.489775,0.0,750.0,2.295028,0.0,752.295028
3,0.15,185.252232,104.835495,0.271689,45.489775,0.0,490.0,2.295028,0.0,492.295028
4,0.2,232.413556,57.674171,1.096405,44.66506,0.0,300.0,4.25104,0.0,304.25104
5,0.25,260.154255,29.933472,7.812413,37.949052,0.0,160.0,9.48321,0.0,169.48321
6,0.3,272.472271,17.615456,7.812413,37.949052,0.0,80.0,9.48321,0.0,89.48321
7,0.35,288.065533,2.022194,9.907729,35.853736,0.0,20.0,13.266683,0.0,33.266683
8,0.4,289.577899,0.509828,12.351725,33.40974,0.0,10.0,21.557577,0.0,31.557577
9,0.45,290.087727,0.0,17.779753,27.981711,0.0,0.0,34.519345,0.0,34.519345


In [32]:
# The amount/cost per threshold dataframe can be obtained directly with
# the function get_amount_cost_df in the utilities module.

# This function requires an array of thresholds instead of the step.
# The values start at 0 and stop before 1 with step 0.05; they are rounded to
# 4 decimals so floating-point artefacts of np.arange (e.g. 0.15000000000000002)
# do not leak into the thresholds, matching the notebook's other
# `threshold_values` definition.
threshold_values = np.round(np.arange(0, 1, 0.05), 4)

# Example without amounts: only `cost_dict` is passed, so the result contains
# the cost columns only.
bc.utilities.get_amount_cost_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_values=threshold_values,
    # amounts=amounts,
    cost_dict=test_cost_dict,
)

Unnamed: 0,threshold,cost_TN,cost_FP,cost_FN,cost_TP,total_cost
0,0.0,0.0,1590.0,0.0,0.0,1590.0
1,0.05,0.0,1380.0,0.0,0.0,1380.0
2,0.1,0.0,750.0,2.295028,0.0,752.295028
3,0.15,0.0,490.0,2.295028,0.0,492.295028
4,0.2,0.0,300.0,4.25104,0.0,304.25104
5,0.25,0.0,160.0,9.48321,0.0,169.48321
6,0.3,0.0,80.0,9.48321,0.0,89.48321
7,0.35,0.0,20.0,13.266683,0.0,33.266683
8,0.4,0.0,10.0,21.557577,0.0,31.557577
9,0.45,0.0,0.0,34.519345,0.0,34.519345


### Custom Interactive Amount/Cost line chart

In [33]:
# Plot the "Amount/Cost line chart" and get a dataframe containing amount and
# cost per threshold for selected "confusion classes" (TN, FP, FN, TP) and
# their total.
#
# - at least one of cost_dict or amounts must be given
# - either cost_dict or amounts, if not given, is set to None and won't be visualized
# - amount_classes, if not given, is set to 'all' when amounts is given, to None otherwise
# - cost_classes, if not given, is set to 'all' when cost_dict is given, to None otherwise
#
# Example: plot the sum of the amounts of the True Positive and False Positive
# data, and the sum of the costs of all the data.

cost_classes = 'all'
amount_classes = ['TP', 'FP']

ac_fig, total_cost_amount_df = bc.total_amount_cost_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=test_cost_dict,
    amount_classes=amount_classes,
    cost_classes=cost_classes,
    currency=currency,
)

ac_fig

In [34]:
# Dataframe returned by bc.total_amount_cost_plot: per-threshold amounts for
# the selected classes and costs for all classes, each with its sum.
total_cost_amount_df

Unnamed: 0,threshold,amount_TP,amount_FP,amount_sum,cost_TN,cost_FP,cost_FN,cost_TP,cost_sum
0,0.0,45.761465,290.087727,335.849192,0.0,1590.0,0.0,0.0,1590.0
1,0.05,45.761465,260.801286,306.562751,0.0,1380.0,0.0,0.0,1380.0
2,0.1,45.489775,149.071538,194.561314,0.0,750.0,2.295028,0.0,752.295028
3,0.15,45.489775,104.835495,150.325271,0.0,490.0,2.295028,0.0,492.295028
4,0.2,44.66506,57.674171,102.339231,0.0,300.0,4.25104,0.0,304.25104
5,0.25,37.949052,29.933472,67.882524,0.0,160.0,9.48321,0.0,169.48321
6,0.3,37.949052,17.615456,55.564508,0.0,80.0,9.48321,0.0,89.48321
7,0.35,35.853736,2.022194,37.87593,0.0,20.0,13.266683,0.0,33.266683
8,0.4,33.40974,0.509828,33.919568,0.0,10.0,21.557577,0.0,31.557577
9,0.45,27.981711,0.0,27.981711,0.0,0.0,34.519345,0.0,34.519345


### Additional useful function

In [35]:
# the function get_confusion_category_observations_df takes as input a "confusion category" {'TN', 'FP', 'FN', 'TP'},
# a feature dataset (X_data), the true labels (true_y), the predicted probabilities and a threshold
# and returns the portion of the feature dataset corresponding to the given category

# for example, if we want the True Positive data points with a 0.7 threshold:
confusion_category = 'TP'

bc.get_confusion_category_observations_df(
    confusion_category = confusion_category,
    X_data = X_test,
    true_y = y_test,
    predicted_proba = test_predicted_proba,
    threshold = 0.7 # default = 0.5
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
30,-2.601367,-1.51421,-0.081816,2.257485,-5.195684,-2.953742,3.949413,2.76187,1.651492,0.180683,-1.664504,-1.396264,-3.826065,0.941951,-2.004694,1.038209,-0.183376,1.504055,-0.797956,-0.512469
47,-2.568051,-4.736157,3.401512,0.614939,-0.390128,-3.364416,-3.667949,4.046054,3.568885,1.479944,3.078459,2.142917,1.48118,0.686454,0.416553,1.044883,0.718451,-1.232943,0.280403,1.074427
53,1.466142,2.557351,4.432927,-1.129646,-0.673413,-3.782365,-1.112528,3.371804,4.870778,2.628418,2.974501,3.786003,0.01542,-1.213112,0.290725,1.351958,0.576588,-1.929523,-0.327521,1.16338
100,-0.683903,-1.137473,2.989311,-2.349425,-2.312612,-5.200242,0.138438,3.786642,2.184161,4.529078,2.959609,1.633566,0.520825,1.296019,-0.086999,-0.778458,0.803152,1.031072,-0.212475,-0.237224
149,-3.892485,-0.50545,-1.10924,1.071018,-2.246515,-7.147058,4.757241,-0.231286,-1.42023,-0.60719,-1.24541,0.021053,-0.001838,0.741768,0.08832,1.367268,1.927205,-0.486881,-1.175421,1.039506
162,-3.049729,-3.784003,1.107009,-0.201179,0.873662,-3.947325,-2.886823,-0.819648,5.083153,0.85056,4.091439,0.033962,-2.147115,-2.442134,-0.254247,0.827896,-3.532146,0.291766,-0.181126,0.280283
192,0.62663,2.995032,1.472569,5.170367,-0.489948,-3.800033,-1.06871,0.274598,1.768753,1.10827,4.649526,-2.272895,1.402271,-1.062539,0.290058,0.188573,0.997652,-0.855024,-2.532455,0.466048


### Gain, Lift, Response and Cumulative Response Plot

In [36]:
# Cumulative gain chart for the test set. Unlike the plots above, this
# function receives the full predict_proba output (one column per class)
# together with the label to treat as positive.
cumgain_plot = bc.cumulative_gain_plot(
    true_y=y_test,
    full_predicted_proba=cls.predict_proba(X_test),
    pos_label=1,
)

Class 0 is associated with probabilities: full_predicted_proba[:, 0]
Class 1 is associated with probabilities: full_predicted_proba[:, 1]


In [37]:
# Display the cumulative gain chart for the test set.
cumgain_plot

In [38]:
# Lift curve for the test set, built from the full predict_proba output
# (one column per class) and the label to treat as positive.
lift_curve = bc.lift_curve_plot(
    true_y=y_test,
    full_predicted_proba=cls.predict_proba(X_test),
    pos_label=1,
)

lift_curve

Class 0 is associated with probabilities: full_predicted_proba[:, 0]
Class 1 is associated with probabilities: full_predicted_proba[:, 1]


In [39]:
# Cumulative response chart for the test set, built from the class-1
# probabilities only.
cumres_plot = bc.cumulative_response_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
)

cumres_plot

In [40]:
# Response curve for the test set.
# NOTE(review): `n_tiles = 10` presumably splits the observations into ten
# groups (deciles) - confirm against the bctools docs.
resp_curve = bc.response_curve_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    n_tiles=10,
)

resp_curve

## Calibration

In [41]:
# Calibration curve of the random forest on the test set. The call returns the
# figure and a second value stored as `ece`. All optional arguments are passed
# explicitly with their default values.
calib_curve, ece = bc.calibration_curve_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    n_bins=10,           # default
    strategy='uniform',  # default
    show_gaps=True,      # default
    ece_bins='fd',       # default
)

calib_curve

In [42]:
# Second value returned by bc.calibration_curve_plot for the test set.
ece

0.13359495868308954

In [43]:
# The ECE can also be computed on its own, without building the plot:
bc.utilities.get_expected_calibration_error(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    bins='fd',  # default
)

0.13359495868308954

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Two additional classifiers to compare against the random forest in the
# calibration plots.
clf_list = [LogisticRegression(C=1.0), GaussianNB()]
lr, gnb = clf_list

# Fit both on the same training split used for the random forest.
for clf in clf_list:
    clf.fit(X_train, y_train)

In [45]:
# Compare the calibration of the three fitted models on the test set. The call
# returns two figures and a third value stored as `ece_ls`;
# `estimator_names` follows the same order as `estimators`.
line_fig, hist_fig, ece_ls = bc.calibration_plot_from_models(
    X=X_test,
    true_y=y_test,
    estimators=[cls, lr, gnb],
    estimator_names=["Random Forest", "Logistic", "Naive Bayes"],
    n_bins=10,           # default
    strategy='uniform',  # default
    ece_bins='fd',       # default
)

In [46]:
# Render both figures returned by bc.calibration_plot_from_models.
line_fig.show()
hist_fig.show()

In [47]:
# Third value returned by bc.calibration_plot_from_models: one entry per
# estimator passed in (Random Forest, Logistic, Naive Bayes).
ece_ls

[0.13359495868308954, 0.05032756223564376, 0.053718608412928796]