In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
import time

# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import special, stats

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score


# SMOTe
from imblearn.over_sampling import SMOTE

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, average_precision_score, precision_recall_curve 
from sklearn.metrics import recall_score, roc_curve, roc_auc_score, precision_recall_curve, auc, plot_confusion_matrix

# ensemble
from xgboost import XGBClassifier

# warnings
import warnings
warnings.filterwarnings("ignore")

# style
import matplotlib.style as style
style.use('fivethirtyeight')

In [None]:
df = pd.read_csv('/kaggle/input/bank-marketing-campaigns-dataset/bank-additional-full.csv', sep=';')

In [None]:
df.shape

In [None]:
df.head()

# Data Dictionary

| Input variables:                                         |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------|---|
| # bank client data:                                      |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
| 1                                                        | age (numeric)                                                                                                                                                                                                                                                                                                                                                                                                                |                                                                |   |
| 2                                                        | job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')                                                                                                                                                                                                                                           |                                                                |   |
| 3                                                        | marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)                                                                                                                                                                                                                                                                                                  |                                                                |   |
| 4                                                        | education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')                                                                                                                                                                                                                                                                                     |                                                                |   |
| 5                                                        | default: has credit in default? (categorical: 'no','yes','unknown')                                                                                                                                                                                                                                                                                                                                                          |                                                                |   |
| 6                                                        | housing: has housing loan? (categorical: 'no','yes','unknown')                                                                                                                                                                                                                                                                                                                                                               |                                                                |   |
| 7                                                        | loan: has personal loan? (categorical: 'no','yes','unknown')                                                                                                                                                                                                                                                                                                                                                                 |                                                                |   |
| # related with the last contact of the current campaign: |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
| 8                                                        | contact: contact communication type (categorical: 'cellular','telephone')                                                                                                                                                                                                                                                                                                                                                    |                                                                |   |
| 9                                                        | month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')                                                                                                                                                                                                                                                                                                                                      |                                                                |   |
| 10                                                       | day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')                                                                                                                                                                                                                                                                                                                                       |                                                                |   |
| 11                                                       | duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. |                                                                |   |
| # other attributes:                                      |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
| 12                                                       | campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)                                                                                                                                                                                                                                                                                                             |                                                                |   |
| 13                                                       | pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)                                                                                                                                                                                                                                                                   |                                                                |   |
| 14                                                       | previous: number of contacts performed before this campaign and for this client (numeric)                                                                                                                                                                                                                                                                                                                                    |                                                                |   |
| 15                                                       | poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')                                                                                                                                                                                                                                                                                                                        |                                                                |   |
| # social and economic context attributes                 |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
| 16                                                       | emp.var.rate: employment variation rate                                                                                                                                                                                                                                                                                                                                                                                      | quarterly indicator (numeric)                                  |   |
| 17                                                       | cons.price.idx: consumer price index                                                                                                                                                                                                                                                                                                                                                                                         | monthly indicator (numeric)                                    |   |
| 18                                                       | cons.conf.idx: consumer confidence index                                                                                                                                                                                                                                                                                                                                                                                     | monthly indicator (numeric)                                    |   |
| 19                                                       | euribor3m: euribor 3 month rate                                                                                                                                                                                                                                                                                                                                                                                              | daily indicator (numeric)                                      |   |
| 20                                                       | nr.employed: number of employees                                                                                                                                                                                                                                                                                                                                                                                             | quarterly indicator (numeric)                                  |   |
| Output variable (desired target):                        |                                                                                                                                                                                                                                                                                                                                                                                                                              |                                                                |   |
| 21                                                       | y                                                                                                                                                                                                                                                                                                                                                                                                                            | has the client subscribed a term deposit? (binary: 'yes','no') |   |

In [None]:
df.info()

In [None]:
# Check for null values if any

# This method shows the count of null values, percent and dataTypes

def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Transposed summary with one column per column of ``data`` and three
        rows: ``Total`` (null count), ``Percent`` (nulls as a percentage of
        the row count) and ``Types`` (dtype name as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask yields the number of rows for each column
    null_percent = null_counts / null_mask.count() * 100

    summary = pd.concat([null_counts, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
missing_data(df)

# How many term depositors are there in the data ?

In [None]:
df['y'].value_counts()

In [None]:
def plot_pie(dataset, target, size=(7,7)):
    """Draw a pie chart of the value frequencies of ``dataset[target]``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to plot.
    target : str
        Name of the column whose value counts are charted.
    size : tuple, default (7, 7)
        Figure size forwarded to the pandas plotting call.

    Returns
    -------
    The object returned by ``Series.plot.pie``; each wedge is labelled with
    its percentage share (one decimal place).
    """
    class_counts = dataset[target].value_counts()
    return class_counts.plot.pie(figsize=size, autopct='%1.1f%%')

plot_pie(df, 'y')

# What is the baseline likelihood of term deposits ? This tells us what is the success rate in the absence of ML model

In [None]:
# create a new variable `target` that takes 1 for `yes` else 0
df['target'] = np.where(df['y'].isin(['yes']), 1, 0)

df.head()

In [None]:
df['target'].mean()

### In a no-model scenario, out of 100 calls, _roughly 11 end in a term-deposit subscription_

In [None]:
# Pass the column by keyword: in current seaborn the first positional parameter is
# `data`, so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(x='target', data=df)

# Inspecting Categorical variables

# 1. education

In [None]:
df['education'].value_counts()

In [None]:
# We will plot the relationship of `education` with `target`
# estimator is mean to show the likelihood of taking up the loan
def barplot_mean(x, y, df, hue=None, order=None, hue_order=None):
    """Print and plot the mean of ``y`` for every level of ``x``.

    Parameters
    ----------
    x : str
        Grouping column (drawn on the x-axis).
    y : str
        Column whose per-group mean is shown.
    df : pandas.DataFrame
        Data containing both columns.
    hue, order, hue_order : optional
        Forwarded unchanged to ``seaborn.barplot``.

    Returns
    -------
    None
        The per-group means are printed and the bar chart is shown.
    """
    group_means = df.groupby(x)[y].mean()
    print(group_means)

    # Columns with more than four distinct levels get a wide canvas
    n_levels = df[x].nunique()
    if n_levels > 4:
        plt.figure(figsize=(16,4))

    sns.barplot(
        data=df,
        x=x,
        y=y,
        hue=hue,
        order=order,
        hue_order=hue_order,
        estimator=np.mean,
    )
    plt.show()

barplot_mean('education', 'target', df)

### `university.degree` and `professional.course` are a little better than the avg. success rate of the data

### `illiterate` and `unknown` show a higher-than-average likelihood of subscribing (compared to the baseline of 0.112..), but we would not want to target this audience in a practical scenario. Always analyze graphs from a practical perspective!

In [None]:
# We will group the basic education groups into one
basic_grps = ['basic.4y', 'basic.6y', 'basic.9y']

df['education'] = np.where(df['education'].isin(basic_grps), 'Basic', df['education'])
df.head()

In [None]:
barplot_mean('education', 'target', df)

### we will also go ahead and merge `unknown` and `illiterate` into one

In [None]:
df['education'] = np.where(df['education'].isin(['illiterate']), 'unknown', df['education'])
barplot_mean('education', 'target', df)

### There is no monotonic trend visible that would help discriminate the target variable. We will do a similar analysis for the other variables as well

# 2. day_of_week

In [None]:
barplot_mean('day_of_week', 'target', df)

### We see there is no significant difference in the trend for different days of the week so we can conclude that this is a weak predictor of the dependent target

# 3. job

In [None]:
barplot_mean('job', 'target', df)

### It appears that `student` and `retired` customers have a very high likelihood of subscribing to a term deposit. We can also go ahead and group `unknown` and `unemployed` into one bucket

In [None]:
df['job'] = np.where(df['job'].isin(['unknown']), 'unemployed', df['job'])
barplot_mean('job', 'target', df)

# 4.marital_status 

In [None]:
barplot_mean('marital', 'target', df)

### We see there is no significant difference in the trend for different marital status so we can conclude that this is a weak predictor of the dependent target

# 5. default

In [None]:
barplot_mean('default', 'target', df)

### We see a trend in case of `default`. A person having `default` is having the mean as `0`. We will inspect this by doing the sum

In [None]:
print(df.groupby('default')['target'].sum())
print("*"*30)
# if we look at the count - we do have 3 cases of people who have `defaulted`
print(df.groupby('default')['target'].count())

# But we do have cases of `unknown`. 
# We can go ahead and group them in `yes` 
# because we are not sure which category they belong to

In [None]:
df['default'] = np.where(df['default'].isin(['unknown']), 'yes', df['default'])
barplot_mean('default', 'target', df)

### This is a very good variable to design the audience which can be utilised later since it is able to discriminate the dependent target variable. 

### Note: As a predictor, this is also weak because the likelihood of a person with `no default` buying a term loan is only `12.87 %` 

# 6. housing

In [None]:
barplot_mean('housing', 'target', df)

### We see there is no significant difference in the trend for different housing loan status so we can conclude that this is a weak predictor of the dependent target

# 7. loan

In [None]:
barplot_mean('loan', 'target', df)

### We see there is no significant difference in the trend for different personal loan status so we can conclude that this is a weak predictor of the dependent target

# 8. contact

In [None]:
barplot_mean('contact', 'target', df)

### This is a very good variable to design the audience which can be utilised later since it is able to discriminate the dependent target variable.

### Note: We see that there is a positive slope in the trend

# 9. month

In [None]:
barplot_mean('month', 'target', df)

### We are not able to see any monotonic trend here. One way to resolve this is to see if grouping them into quarters will help us find any signals.

In [None]:
# Month abbreviations belonging to each calendar quarter
qtr1 = ['jan', 'feb', 'mar']
qtr2 = ['apr', 'may', 'jun']
qtr3 = ['jul', 'aug', 'sep']
qtr4 = ['oct', 'nov', 'dec']

# Label every row with its quarter; '0' remains for a month found in none of the lists
df['qtr'] = '0'
for quarter_label, quarter_months in zip(['Q1', 'Q2', 'Q3', 'Q4'], [qtr1, qtr2, qtr3, qtr4]):
    df.loc[df['month'].isin(quarter_months), 'qtr'] = quarter_label

df['qtr'].value_counts()

### `Q1` has the least amount of observation while `Q2` is the majority

### We will study the behavior of the target variable with respect to quarter

In [None]:
barplot_mean('qtr', 'target', df, order=["Q1","Q2","Q3","Q4"])
# `order` as the name suggest orders the graph in similar fashion as the input list
# here we pass the order as per the quarters

### `Q1` shows a likelihood of `50 %`. That is huge! Of all the customers called at the start of the year, every second one ends up subscribing to a term deposit

### While building our strategy, we can inform the marketing team to invest their major efforts in the beginning of the year to get the best results.

### Arriving into `Q2`, there is a sharp decline in the interest of the customers. These may be the dry months for the marketing team. This can also support us building an effective strategy accordingly.

## Let us try combining `qtr` and other variable together.

In [None]:
barplot_mean('qtr', 'target', df, hue='contact', order=["Q1","Q2","Q3","Q4"])

### We have an interesting insight here! `Q2` has an overall likelihood of about `9.1%`, but if you target the customers with `contact="cellular"`, your chances are much better than `9.1%`

### We see that `contact="cellular"` leads in most of the quarters except `Q4`. There might be some external factors at play here.

### We will try to look at the profiles of cellular customer across the quarters and their likelihood of purchasing the term loan

In [None]:
df[df['contact'] == "cellular"].groupby('qtr')['target'].mean()

# avg. likelihood across qtr
# Q1    0.505495
# Q2    0.091349
# Q3    0.112053
# Q4    0.163967

# 10. poutcome

In [None]:
barplot_mean('poutcome', 'target', df)

### This is a very good variable to design the audience which can be utilised later since it is able to discriminate the dependent target variable.
### Note: Customers who were a success earlier are `5 times` more likely to subscribe to the term deposit than customers whose previous outcome was a failure

### We will also merge `nonexistent` and `failure` together

In [None]:
df['poutcome'] = np.where(df['poutcome'].isin(['nonexistent', 'failure']), 0, 1)
barplot_mean('poutcome', 'target', df)

In [None]:
barplot_mean('qtr', 'target', df, hue='poutcome', order=["Q1","Q2","Q3","Q4"])

In [None]:
df[df['poutcome'] == 1].groupby('qtr')['target'].mean()

# avg. likelihood across qtr
# Q1    0.505495
# Q2    0.091349
# Q3    0.112053
# Q4    0.163967

### We observe that by combining `poutcome` and `quarter` we can build profiles that have a 5-8 times better likelihood of success compared to the no-model scenario. `Q3` has better chances than `Q4`

### As we have seen above, `Q2` and `Q3` are not favorable in comparison to `Q1` and `Q4`, but in a practical scenario we can't stop the team from running campaigns in `Q2` and `Q3`. So we are adding surrogate variables to boost their performance.

### Hope this notebook is useful 😄

# Working with continuous variables

### We will convert continuous variables into categories by binning them into ranks

# 11. age

In [None]:
df['age_rank'] = pd.qcut(df['age'].rank(method='first').values, 5, duplicates='drop').codes+1
df['age_rank'].value_counts()

# we have divided age into 5 ranks, thereby placing 20% of the data in each rank
# we can now see if there is any trend with respect to age on target

In [None]:
barplot_mean('age_rank', 'target', df)

### we see that we have `U-shaped` curve. This variable might not help us discriminate the dependent variable. The prediction will be impacted

### let us try to combine this with `qtr`

In [None]:
barplot_mean('age_rank', 'target', df, hue='qtr', hue_order=["Q1","Q2","Q3","Q4"])

### As we can infer from this graph, `Q1` seems to stand out everytime as evident earlier since it has a likelihood of `50 %`

# 12. duration

In [None]:
df['duration_rank'] = pd.qcut(df['duration'].rank(method='first').values, 5, duplicates='drop').codes+1
df['duration_rank'].value_counts()

In [None]:
barplot_mean('duration_rank', 'target', df)

### From the perspective of discrimination, we see a very strong positive trend. As the duration increases, the likelihood of subscribing to the term deposit almost doubles. This would be an excellent predictor to use in the model

### But we have to be cautious of using `duration` because by definition `duration` is last contact duration, in seconds and we do not know the duration until a call is made.

### so we will move ahead

# 13. campaign

In [None]:
df['campaign_rank'] = pd.qcut(df['campaign'].rank(method='first').values, 5, duplicates='drop').codes+1
df['campaign_rank'].value_counts()

In [None]:
barplot_mean('campaign_rank', 'target', df)

### This plot shows a non-linear trend. Let us look at what kind of values `campaign` takes.

In [None]:
print(df.groupby('campaign_rank')['campaign'].min())
print("*"*30)
print(df.groupby('campaign_rank')['campaign'].mean())
print("*"*30)
print(df.groupby('campaign_rank')['campaign'].max())

# 14. pdays

In [None]:
df['pdays_rank'] = pd.qcut(df['pdays'].rank(method='first').values, 5, duplicates='drop').codes+1
df['pdays_rank'].value_counts()

In [None]:
barplot_mean('pdays_rank', 'target', df)

In [None]:
print(df.groupby('pdays_rank')['pdays'].min())
print("*"*30)
print(df.groupby('pdays_rank')['pdays'].mean())
print("*"*30)
print(df.groupby('pdays_rank')['pdays'].max())

# 15. previous

In [None]:
df['prev_rank'] = pd.qcut(df['previous'].rank(method='first').values, 5, duplicates='drop').codes+1
df['prev_rank'].value_counts()

In [None]:
barplot_mean('prev_rank', 'target', df)

### This is a very good variable to design the audience which can be utilised later since it is able to discriminate the dependent target variable.

In [None]:
barplot_mean('prev_rank', 'target', df, hue='qtr', hue_order=["Q1","Q2","Q3","Q4"])

# 16. emp.var.rate

In [None]:
df['emp.var.rate_rank'] = pd.qcut(df['emp.var.rate'].rank(method='first').values, 5, duplicates='drop').codes+1
barplot_mean('emp.var.rate_rank', 'target', df)

### This can behave as a good predictor in case of a decision tree since the lower ranks are capturing more and higher ranks are capturing less

# 17. cons.price.idx

In [None]:
df['cons.price.idx_rank'] = pd.qcut(df['cons.price.idx'].rank(method='first').values, 5, duplicates='drop').codes+1
barplot_mean('cons.price.idx_rank', 'target', df)

### We observe a strong slope from rank 1 to 4

# 18. cons.conf.idx

In [None]:
df['cons.conf.idx_rank'] = pd.qcut(df['cons.conf.idx'].rank(method='first').values, 5, duplicates='drop').codes+1
barplot_mean('cons.conf.idx_rank', 'target', df)

### We observe that this is a very weak predictor

# 19.euribor3m

In [None]:
df['euribor3m_rank'] = pd.qcut(df['euribor3m'].rank(method='first').values, 5, duplicates='drop').codes+1
barplot_mean('euribor3m_rank', 'target', df)

### We observe that this can be used in a decision tree to make audiences

# 20.nr.employed

In [None]:
df['nr.employed_rank'] = pd.qcut(df['nr.employed'].rank(method='first').values, 5, duplicates='drop').codes+1
barplot_mean('nr.employed_rank', 'target', df)

### We observe that this can be used in a decision tree to make audiences. Let us break it into 10 ranks instead of 5

In [None]:
df['nr.employed_rank'] = pd.qcut(df['nr.employed'].rank(method='first').values, 10, duplicates='drop').codes+1
barplot_mean('nr.employed_rank', 'target', df)

### We can see that rank 1 now has a higher likelihood and the trend is decreasing. We can transform this to add a discriminatory slope by keeping ranks 1 and 2 as separate buckets and merging rank 3 and everything after it into a single bucket

In [None]:
# `nr.employed_rank` holds integer decile codes (1-10) at this point, so match on
# ints: the string literals '1'/'2' never equal an integer code, which sent every
# row to bucket 'C'.
# Rank 1 -> 'A', rank 2 -> 'B', ranks 3 and above -> 'C'.
# NOTE: this overwrites the integer ranks with letters; re-run the 10-rank cell
# before re-running this one.
df['nr.employed_rank'] = np.where(df['nr.employed_rank'].isin([1]), 'A',
                                  np.where(df['nr.employed_rank'].isin([2]), 'B', 'C'))

df['nr.employed_rank'].value_counts()

In [None]:
barplot_mean('nr.employed_rank', 'target', df)

In [None]:
df.info()

# Modelling

In [None]:
# Categorical features fed to the models (dummy-encoded in the next step).
# We are not considering education, job, day_of_week, housing, loan
cols_cat = ['default', 'contact', 'poutcome', 'nr.employed_rank']

# Numeric features, concatenated with the dummies as-is.
# We are not considering age, duration, cons.conf.idx
cols_num = ['campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'euribor3m'] # append 'duration' here for benchmark runs only

# NOTE: `duration` is currently excluded. Adding it is reported to improve the ROC score,
# but the call length is only known after the call has been made, so it should be
# included for benchmarking only, not for a realistic predictive model.

In [None]:
# One-hot (dummy) encode the categorical features. drop_first=True drops one level per
# feature so the remaining indicator columns are not perfectly collinear.
# get_dummies vs. OneHotEncoder trade-offs:
# ref: https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
cols_cat_dummy = pd.get_dummies(
    data=df[cols_cat],
    drop_first=True,
)
cols_cat_dummy.head()

In [None]:
# Build the model matrix: numeric columns side by side with the dummy-encoded
# categorical columns. Both pieces are derived from df, so they share its index.
X_all = pd.concat(
    objs=[df[cols_num], cols_cat_dummy],
    axis='columns',
    join='inner',
)
X_all.head()

In [None]:
# Feature matrix (X) and response vector (y) used by every model below
X, y = X_all, df['target']

# Splitting the data set

In [None]:
# Train-Val split 70-30 (test_size=0.30 holds out 30% of the rows for validation);
# random_state is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

# Shapes of the four pieces, in the order they were returned
print(*(split.shape for split in (X_train, X_val, y_train, y_val)))

In [None]:
# Fit three baseline classifiers on the training split.

# Logistic regression with library defaults.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Shallow decision tree; depth and leaf size are capped to limit over-fitting.
# TODO: Do a grid search to explore best parameters
dt = DecisionTreeClassifier(criterion="gini", random_state=101, max_depth=7, min_samples_leaf=5)
dt.fit(X_train, y_train)

# Random forest of 1000 very shallow (depth-2) trees.
# max_features="sqrt" is exactly what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, so the explicit spelling keeps
# this cell working across versions without changing the model.
rf_1000 = RandomForestClassifier(n_estimators=1000, random_state=101, criterion="gini", max_features="sqrt", max_depth=2)
rf_1000.fit(X_train, y_train)

# Prediction

In [None]:
# Hard class predictions on the validation split (reused by the classification
# report further down), followed by the model's accuracy on the same split.
y_pred = lr.predict(X_val)
print(
    "Accuracy of logistic regression on test set {:.2f}".format(
        lr.score(X_val, y_val)
    )
)

In [None]:
# Hard class predictions of the decision tree on the validation split (reused by the
# classification report further down), followed by its accuracy on the same split.
y_pred_tree = dt.predict(X_val)
print(
    "Accuracy of decision tree on test set {:.2f}".format(
        dt.score(X_val, y_val)
    )
)

In [None]:
# Hard class predictions of the random forest on the validation split (reused by the
# classification report further down), followed by its accuracy on the same split.
y_pred_rf = rf_1000.predict(X_val)
print(
    "Accuracy of random forest on test set {:.2f}".format(
        rf_1000.score(X_val, y_val)
    )
)

In [None]:
# Accuracy on the rows the forest was fitted on vs. the held-out validation rows;
# a large gap between the two would point to over-fitting.
rf_1000_train_score, rf_1000_test_score = (
    rf_1000.score(X_train, y_train),
    rf_1000.score(X_val, y_val),
)

print("Training Score:", rf_1000_train_score)
print("Test Score:", rf_1000_test_score)

# Confusion Matrices -- Classification Reports

In [None]:
# Turn off the seaborn grid so it does not draw over the confusion-matrix heatmaps
sns.set_style({'axes.grid': False})

# logistic regression: confusion matrix on the validation split, then the
# per-class precision / recall / F1 report
plot_confusion_matrix(estimator=lr, X=X_val, y_true=y_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

In [None]:
# decision tree: per-class precision / recall / F1 report, then the confusion
# matrix on the validation split
print(classification_report(y_true=y_val, y_pred=y_pred_tree))
plot_confusion_matrix(estimator=dt, X=X_val, y_true=y_val)

In [None]:
# random forest: per-class precision / recall / F1 report, then the confusion
# matrix on the validation split
print(classification_report(y_true=y_val, y_pred=y_pred_rf))
plot_confusion_matrix(estimator=rf_1000, X=X_val, y_true=y_val)

# Plotting the ROC curves

In [None]:
# ROC analysis is rank-based, so every model is scored by the probability it assigns
# to the positive class (column 1 of predict_proba). Passing the hard 0/1 output of
# predict() to roc_auc_score reduces the curve to a single operating point and
# understates the AUC.
lr_val_proba = lr.predict_proba(X_val)[:, 1]
dt_val_proba = dt.predict_proba(X_val)[:, 1]
rf_val_proba = rf_1000.predict_proba(X_val)[:, 1]

lr_roc_auc = roc_auc_score(y_val, lr_val_proba)
dt_roc_auc = roc_auc_score(y_val, dt_val_proba)
rf_roc_auc = roc_auc_score(y_val, rf_val_proba)

plt.figure()

# Compute and draw each model's curve from ITS OWN scores. Previously the three
# roc_curve results were written to the same fpr/tpr variables before any plotting,
# so all three lines were the random-forest curve.
for curve_label, curve_style, val_proba, val_auc in [
    ('LR', 'b', lr_val_proba, lr_roc_auc),
    ('DT', 'r', dt_val_proba, dt_roc_auc),
    ('RF', 'g', rf_val_proba, rf_roc_auc),
]:
    fpr, tpr, thresholds = roc_curve(y_val, val_proba)
    plt.plot(fpr, tpr, curve_style, label='%s AUC = %0.2f' % (curve_label, val_auc))

plt.legend(loc = 'lower right')
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('LR DT RF ROC Curve')
plt.show()



In [None]:
### The score comes out in the high 50s; let's try to improve the model by incorporating `duration`

### After adding `duration`, we are able to raise the score up to `0.77` for the decision tree

# Making sense from the model - Lorenz curve

In [None]:
# Ranking the probabilities from the logistic regression model

# Positive-class probability for every row of X (training and validation rows alike).
y_pred_prob = lr.predict_proba(X)[:,1]
# X is built column-wise from df, so it has df's rows in df's order: assign the array
# positionally. Wrapping it in pd.DataFrame(...) would attach a fresh 0..n-1 index and
# align on labels, producing NaN / misplaced scores whenever df's index is not the
# default RangeIndex (e.g. after rows were filtered out).
df['y_pred_P'] = y_pred_prob
# Deciles of the predicted probability: 1 = lowest scores, 10 = highest scores.
df['P_rank'] = pd.qcut(df['y_pred_P'].rank(method='first').values, 10, duplicates='drop').codes+1
# Observed response rate per decile
df.groupby('P_rank')['target'].mean()

# The highest rank has a likelihood of 48.28 percent (~ 4.3 times better than the average)

In [None]:
# Ranking the probabilities from the decision tree model

# Positive-class probability for every row of X (training and validation rows alike).
y_pred_prob_dtree = dt.predict_proba(X)[:,1]
# X is built column-wise from df, so it has df's rows in df's order: assign the array
# positionally. Wrapping it in pd.DataFrame(...) would attach a fresh 0..n-1 index and
# align on labels, producing NaN / misplaced scores whenever df's index is not the
# default RangeIndex (e.g. after rows were filtered out).
df['y_pred_P_dtree'] = y_pred_prob_dtree
# Deciles of the predicted probability: 1 = lowest scores, 10 = highest scores.
df['P_rank_dtree'] = pd.qcut(df['y_pred_P_dtree'].rank(method='first').values, 10, duplicates='drop').codes+1
# Observed response rate per decile
df.groupby('P_rank_dtree')['target'].mean()

# The highest rank has a likelihood of 51.56 percent (~ 4.6 times better than the average)

### From a discrimination point of view, decision tree is able to give a better likelihood

In [None]:
# Persist the scored frame (original columns plus the rank, probability and decile
# columns added above) to the working directory.
df.to_csv('telemarketing_model_scored_file.csv')