From: [https://gsarantitis.wordpress.com/2020/04/29/auc-roc-gains-chart-and-lift-curve-explained-with-business-implications/](https://gsarantitis.wordpress.com/2020/04/29/auc-roc-gains-chart-and-lift-curve-explained-with-business-implications/)

Data source: [http://archive.ics.uci.edu/dataset/222/bank+marketing](http://archive.ics.uci.edu/dataset/222/bank+marketing)

In [None]:
#Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
%matplotlib inline
import scikitplot as skplt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Everything this notebook writes goes below <cwd>/artifacts; figures get
# their own sub-folder.
artifact_dir = os.path.join(os.getcwd(), "artifacts")
figures_dir = os.path.join(artifact_dir, "figures")

# makedirs also creates artifact_dir on the way; exist_ok keeps re-running
# the cell harmless.
os.makedirs(figures_dir, exist_ok=True)

In [None]:
# Load the bank-marketing data set from the local CSV (fields are separated
# by ';') and preview the first rows. No preprocessing happens in this cell.
df = pd.read_csv('data/bank-full.csv', sep=';')
 
df.head()

In [None]:
# Summary statistics of the loaded frame (pandas reports the numeric columns by default)
df.describe()

In [None]:
# Columns that are replaced by their integer category codes below
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'day', 'month', 'campaign', 'previous', 'poutcome',
]

use_cols = ['month', 'campaign', 'y']

# Binary target: 'yes' -> 1, 'no' -> 0
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Convert each listed column to pandas' categorical dtype and keep only the codes
for c in categorical_columns:
    df[c] = df[c].astype('category').cat.codes

In [None]:
def calculate_scores(x_test, y_test, y_pred, y_pred_prob, fpr, tpr, modelname):
    """Compute, print and return the headline classification metrics of a model.

    Parameters
    ----------
    x_test, y_pred_prob
        Accepted so every evaluation call has the same shape; not used here.
    y_test
        True labels of the evaluation set.
    y_pred
        Predicted class labels for the evaluation set.
    fpr, tpr
        ROC curve coordinates; the area under them is reported as AUC.
    modelname
        Label printed ahead of the metrics.

    Returns
    -------
    dict
        Metric values under the keys 'ROC_AUC', 'Acc', 'Rec', 'Prec', 'F1'.
    """
    scores = {
        'ROC_AUC': auc(fpr, tpr),
        'Acc': accuracy_score(y_test, y_pred),
        'Rec': recall_score(y_test, y_pred),
        'Prec': precision_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
    }

    # (printed label, key in `scores`) in the order they are reported
    report_rows = (
        ('AUC', 'ROC_AUC'),
        ('Accuracy', 'Acc'),
        ('Recall', 'Rec'),
        ('Precision', 'Prec'),
        ('F1-Score', 'F1'),
    )

    print(f"Modelname: {modelname}")
    for label, key in report_rows:
        print(f"{label}: {scores[key]:.2f}")

    # For a per-class breakdown, classification_report(y_test, y_pred) can be
    # printed here as well.
    return scores

In [None]:
def create_skill_plots(y_test, pred_prob, fpr, tpr, modelname):
    """Draw ROC, cumulative-gain and lift charts side by side and save them.

    Parameters
    ----------
    y_test
        True labels of the evaluation set.
    pred_prob
        Predicted probabilities, handed unchanged to the scikit-plot helpers.
    fpr, tpr
        ROC curve coordinates for the first panel.
    modelname
        Used as the file-name prefix of the saved PNG.

    The figure is written to ``figures_dir`` as ``<modelname>_skillplots.png``,
    shown, and the path is printed.
    """
    fig, (roc_ax, gain_ax, lift_ax) = plt.subplots(1, 3, figsize=(15, 7))

    # Panel 1: ROC curve with the chance diagonal as reference
    roc_ax.set_title('Receiver Operating Characteristic')
    roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc(fpr, tpr))
    roc_ax.legend(loc='lower right')
    roc_ax.plot([0, 1], [0, 1], 'r--')
    roc_ax.set_xlim([0, 1])
    roc_ax.set_ylim([0, 1])
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_xlabel('False Positive Rate')

    # Panels 2 and 3: cumulative gain and lift, drawn by scikit-plot
    skplt.metrics.plot_cumulative_gain(y_test, pred_prob, ax=gain_ax)
    skplt.metrics.plot_lift_curve(y_test, pred_prob, ax=lift_ax)

    figname = os.path.join(figures_dir, f"{modelname}_skillplots.png")
    fig.savefig(figname)
    plt.show()
    print(f'saved figure:{figname}')

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(df, test_size=0.25, random_state=1984)

# Separate each partition into the feature frame (all columns except 'y')
# and the target column 'y'
x_train, y_train = train_df.drop('y', axis=1), train_df['y']
x_test, y_test = test_df.drop('y', axis=1), test_df['y']

In [None]:
# Logistic regression without a penalty term, class weights balanced
basemodelname = "Logit"
params = {"penalty": None, "class_weight": 'balanced'}

# File-name suffix derived from the parameters: underscores are stripped from
# the keys and dots from the values, the pairs are joined with '_'
parsuf = '_'.join(
    f"{key.replace('_', '')}{str(val).replace('.', '')}" for key, val in params.items()
)
modelname = f"{basemodelname}_{parsuf}"

model = LogisticRegression(max_iter=5000, **params)
model.fit(x_train, y_train)

# Hard labels feed the threshold metrics; column 1 of the probability matrix
# feeds the ROC curve
y_pred = model.predict(x_test)
y_pred_prob = model.predict_proba(x_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob[:, 1])

scores = calculate_scores(x_test, y_test, y_pred, y_pred_prob, fpr, tpr, modelname)
create_skill_plots(y_test, y_pred_prob, fpr, tpr, modelname)

In [None]:
# Logistic regression, variant 2: L2 penalty, class weights balanced
basemodelname = "Logit"
params = {"penalty": 'l2', "class_weight": 'balanced'}

# File-name suffix derived from the parameters: underscores are stripped from
# the keys and dots from the values, the pairs are joined with '_'
parsuf = '_'.join(
    f"{key.replace('_', '')}{str(val).replace('.', '')}" for key, val in params.items()
)
modelname = f"{basemodelname}_{parsuf}"

model = LogisticRegression(max_iter=5000, **params)
model.fit(x_train, y_train)

# Hard labels feed the threshold metrics; column 1 of the probability matrix
# feeds the ROC curve
y_pred = model.predict(x_test)
y_pred_prob = model.predict_proba(x_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob[:, 1])

scores = calculate_scores(x_test, y_test, y_pred, y_pred_prob, fpr, tpr, modelname)
create_skill_plots(y_test, y_pred_prob, fpr, tpr, modelname)

In [None]:
# Build RandomForest model: shallow forest (depth 4, 20 trees)
basemodelname = "RandomForest"
params = {
    "max_depth": 4,
    "n_estimators": 20}
# File-name suffix derived from the parameters: underscores are stripped from
# the keys and dots from the values, the pairs are joined with '_'
parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])
modelname=f"{basemodelname}_{parsuf}"

# The forest is seeded with the same seed as the train/test split so that the
# printed scores and the saved figure are identical on every run. The seed is
# kept out of `params` so the figure file name does not change.
model_rf1 = RandomForestClassifier(
    max_depth=params['max_depth'],
    n_estimators=params['n_estimators'],
    random_state=1984,
)
model_rf1.fit(x_train, y_train)

# Hard labels feed the threshold metrics; column 1 of the probability matrix
# feeds the ROC curve
y_pred = model_rf1.predict(x_test)
y_pred_prob = model_rf1.predict_proba(x_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob[:,1])

scores = calculate_scores(x_test, y_test, y_pred, y_pred_prob, fpr, tpr, modelname)
create_skill_plots(y_test, y_pred_prob, fpr, tpr, modelname)

In [None]:
# Build RandomForest model: V2 (depth 6, 40 trees)
basemodelname = "RandomForest"
params = {
    "max_depth": 6,
    "n_estimators": 40}
# File-name suffix derived from the parameters: underscores are stripped from
# the keys and dots from the values, the pairs are joined with '_'
parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])
modelname=f"{basemodelname}_{parsuf}"

# The forest is seeded with the same seed as the train/test split so that the
# printed scores and the saved figure are identical on every run. The seed is
# kept out of `params` so the figure file name does not change.
model_rf2 = RandomForestClassifier(
    max_depth=params['max_depth'],
    n_estimators=params['n_estimators'],
    random_state=1984,
)
model_rf2.fit(x_train, y_train)

# Hard labels feed the threshold metrics; column 1 of the probability matrix
# feeds the ROC curve
y_pred = model_rf2.predict(x_test)
y_pred_prob = model_rf2.predict_proba(x_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob[:,1])

scores = calculate_scores(x_test, y_test, y_pred, y_pred_prob, fpr, tpr, modelname)
create_skill_plots(y_test, y_pred_prob, fpr, tpr, modelname)

In [None]:
def calculate_lift(y_val, y_pred, q=10):
    """Build a per-score-class table of response rate, gain and lift.

    Score classes are defined on the non-customers (``real == 0``): their
    scores are cut into ``q`` equally populated quantiles, and every
    observation is then placed in the highest class whose lowest non-customer
    score it reaches. Class 1 is the best (highest scores), class ``q`` the
    worst.

    Parameters
    ----------
    y_val : array-like
        Real labels (1 = customer/response, 0 = non-customer). Paired with
        ``y_pred`` by position.
    y_pred : array-like
        Predicted score per observation (e.g. probability of class 1).
        Observations with a missing score are ignored.
    q : int, default 10
        Number of score classes (10 gives deciles).

    Returns
    -------
    pandas.DataFrame
        Indexed by score class ('percentile'), with the columns
        num_customers, cum_customers, cum_pct_customers, num_responses,
        response_rate, cum_responses, pct_gain, cum_gain, lift, cum_lift.
        ``lift`` is a class's share of all responses relative to the mean
        share per class; ``cum_lift`` is the cumulative gain per class
        reached so far, relative to that same mean.

    Raises
    ------
    ValueError
        If there is no non-customer to build the classes from, or no
        customer left to compute gains for.
    """
    # Pair labels and scores by position so a Series index cannot misalign them
    aux_lift = pd.DataFrame({
        'real': np.asarray(y_val),
        'predicted': np.asarray(y_pred),
    })

    # Drop observations that have no score
    aux_lift = aux_lift.loc[aux_lift['predicted'].notnull()]

    non_customer_scores = aux_lift.loc[aux_lift['real'] == 0, 'predicted']
    if non_customer_scores.empty:
        raise ValueError("calculate_lift needs at least one scored observation with real == 0")

    # Drop customers whose score is above the highest non-customer score
    max_de = non_customer_scores.max()
    aux_lift = aux_lift.loc[aux_lift['predicted'] <= max_de]

    # Rank the scored file in descending order by score; the fresh index and
    # the new object make the column assignments below unambiguous
    aux_lift = aux_lift.sort_values(by='predicted', ascending=False).reset_index(drop=True)

    # Overall totals used as denominators further down
    total_num_customers = len(aux_lift)
    total_responses = int((aux_lift['real'] == 1).sum())
    if total_responses == 0:
        raise ValueError("calculate_lift needs at least one scored observation with real == 1")

    # Cut the non-customers into q equally populated classes (0 = lowest
    # scores); customers are NaN at this point
    non_customer_rank = aux_lift['predicted'].where(aux_lift['real'] == 0).rank(method='first')
    aux_lift['percentile'] = pd.qcut(non_customer_rank, q=q, labels=False)

    # Lowest non-customer score of each class, in ascending class order
    percentile_limits = aux_lift.groupby('percentile')['predicted'].min()

    # Put every observation into the highest class whose lower limit it
    # reaches. '>=' keeps the limit-defining non-customer inside its own class.
    for quantile, min_score in percentile_limits.items():
        aux_lift.loc[aux_lift['predicted'] >= min_score, 'percentile'] = quantile

    # Customers scoring below every non-customer belong to the worst class
    aux_lift['percentile'] = aux_lift['percentile'].fillna(percentile_limits.index.min())

    # Turn the classes around so that 1 is the best score class
    aux_lift['percentile'] = (q - aux_lift['percentile']).astype(int)

    by_class = aux_lift.groupby('percentile')

    # Observations in each class
    lift = by_class.size().astype(int).to_frame('num_customers')
    lift['cum_customers'] = lift['num_customers'].cumsum()
    lift['cum_pct_customers'] = ((lift['cum_customers'] / total_num_customers) * 100).round(2)

    # Response rate in each class
    lift['num_responses'] = by_class['real'].sum().astype(int)
    lift['response_rate'] = ((lift['num_responses'] / lift['num_customers']) * 100).round(2)
    lift['cum_responses'] = lift['num_responses'].cumsum()

    # Gain: share of all responses captured per class, and cumulatively
    lift['pct_gain'] = ((lift['num_responses'] / total_responses) * 100).round(2)
    lift['cum_gain'] = ((lift['cum_responses'] / total_responses) * 100).round(2)

    # Lift: gain of a class relative to the mean gain per class
    mean_gain = lift['pct_gain'].mean()
    lift['lift'] = (lift['pct_gain'] / mean_gain).round(2)

    # Cumulative lift: cumulative gain per class reached so far, same baseline
    lift['cum_lift'] = ((lift['cum_gain'] / lift.index.to_numpy()) / mean_gain).round(2)

    return lift

In [None]:
def plot_lift_curve_own(y_val, y_pred, q=10, proportion=False):
    """Plot the lift and cumulative-lift curves computed by ``calculate_lift``.

    Parameters
    ----------
    y_val : array-like
        Real labels of the data. If it carries a ``name`` (e.g. a pandas
        Series), the last two characters of that name end up in the file name.
    y_pred : array-like
        Predicted scores (probabilities) for the same observations.
    q : int, default 10
        Number of quantiles (10 gives deciles).
    proportion : bool, default False
        If True the x axis shows the score class divided by ``q``; otherwise
        it shows the score class itself.

    The figure is saved in the current working directory as
    ``lift_curve_<suffix>.png`` and then shown. Nothing is returned.
    """
    # Calculate lift & cumulative lift score
    lift = calculate_lift(y_val, y_pred, q)
    values = lift['cum_lift'].round(2)

    # Take the x positions from the table itself: they always match the
    # number of rows, and integer arithmetic avoids the off-by-one that a
    # float-step np.arange can produce.
    score_classes = np.asarray(lift.index, dtype=float)
    x = score_classes / q if proportion else score_classes

    sample_size = "\nSample size={:0.0f}".format(lift['num_customers'].sum())

    fig, ax = plt.subplots()
    ax.plot(x, values, c='orange', linewidth=2, label='Cumulative lift')
    ax.plot(x, lift['lift'], c='gray', linewidth=2, label='Lift')
    ax.axhline(lift['lift'].mean(), linewidth=2, color='k', linestyle='--', label='Mean lift')
    ax.legend()
    axis_title = 'Proportion of sample\n' if proportion else 'Quantile\n'
    ax.set_xlabel(axis_title + sample_size)
    ax.set_ylabel('Lift')
    ax.set_title('Lift Curve')

    # Plain arrays and unnamed Series have no usable name; fall back to a
    # fixed suffix instead of failing on the slice.
    label_name = getattr(y_val, 'name', None)
    suffix = str(label_name)[-2:] if label_name is not None else 'unnamed'
    fig.savefig('lift_curve_' + suffix + '.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Lift curve for the most recently fitted model. The lift calculation ranks
# observations by score, so it needs the positive-class probabilities
# (column 1 of y_pred_prob), not the hard 0/1 labels in y_pred.
# plot_lift_curve_own shows the figure itself, so no extra plt.show() is needed.
plot_lift_curve_own(y_test, y_pred_prob[:, 1], q=10, proportion=False)