## Problem Statement

The Telemarketing team of a bank runs campaigns to expand its term deposit portfolio. You are asked to enable prioritization for the Telemarketing team, so that the overall responses and ROI (Return On Investment) of the campaign increase.

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the Data for ML Project

In [None]:
# Load the bank telemarketing extract into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific — point it at your local copy of the CSV.
data_path = '/Users/priyankac/Downloads/bank_telemaketing.xlsx - bank-additional-full.csv'
df = pd.read_csv(data_path)

In [None]:
# Checking the number of rows and columns
df.shape

In [None]:
df.head()


In [None]:
df.dtypes

## Setting Display options to ensure feature name visibility

In [None]:
pd.set_option('display.max_columns', None)

## Warning Suppression

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Label the Target feature to 1/0

In [None]:
df['y'].value_counts()

In [None]:
# Encode the response as a binary flag: 1 where y is 'yes', 0 for every other value.
df['target'] = (df['y'] == 'yes').astype(int)

In [None]:
df.target

## Drop the original `y` column to retain only the encoded `target` column

In [None]:
df = df.drop(['y'], axis = 1)

In [None]:
df.head()

## Defining Target and Independent Features

In [None]:
Y = df[['target']]
X = df.drop(['target'], axis = 1)

## Get the Event Rate

In [None]:
Y.mean()

## Split features into Numerical and Categorical

In [None]:
num = X.select_dtypes(include = 'number')
char = X.select_dtypes(include = 'object')

In [None]:
num.dtypes

In [None]:
def unique_levels(x):
    """Return the number of distinct non-null values in the Series ``x``.

    Missing values are not counted as a level.
    """
    return x.nunique()

# Count the distinct levels of every numeric column; the result is a
# single-column frame indexed by feature name.
df_value_counts = num.apply(unique_levels).to_frame()
df_value_counts

In [None]:
df_value_counts.columns = ['feature_levels']
df_value_counts

In [None]:
# Numeric columns with at most 20 distinct values are treated as categorical.
is_low_cardinality = df_value_counts['feature_levels'] <= 20
cat_list = df_value_counts[is_low_cardinality].index
cat = num.loc[:, cat_list]
cat.dtypes

In [None]:
# Numeric columns with more than 20 distinct values stay in the numeric set.
is_high_cardinality = df_value_counts['feature_levels'] > 20
num_list = df_value_counts[is_high_cardinality].index
num = num.loc[:, num_list]

In [None]:
# Updating the char dataframe with the cat dataframe 
char = pd.concat([char, cat], axis = 1, join = 'inner')

In [None]:
char.head()

In [None]:
num.head()

## Outlier Analysis

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

## Flooring and Capping of Outliers

In [None]:
def outlier_cap(x, lower_q=0.01, upper_q=0.99):
    """Floor and cap a numeric Series at two of its own quantiles.

    Both bounds are computed from the original values of ``x`` before any
    clipping is applied, so the upper cap does not depend on the floor
    having been applied first.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to winsorise. The input is not modified.
    lower_q : float, default 0.01
        Quantile used as the floor.
    upper_q : float, default 0.99
        Quantile used as the cap.

    Returns
    -------
    pandas.Series
        A copy of ``x`` with values limited to the two quantile bounds.
    """
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower=lower_bound, upper=upper_bound)

In [None]:
num = num.apply(lambda x : outlier_cap(x))

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

## Missing Values Analysis

In [None]:
num.isnull().mean()

In [None]:
char.isnull().mean()

In [None]:
# No missing values in the data

## Feature Selection - Numerical Features

### Part 1 : Remove Features with 0 Variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-variance filter on the numeric features; only the fitted
# support mask is needed, not the transformed array.
varselector = VarianceThreshold(threshold = 0).fit(num)

# Positions of the surviving columns, used to slice the original frame so
# that column names and the index are preserved.
cols = varselector.get_support(indices = True)
num_1 = num.iloc[:, cols]

In [None]:
num_1.iloc[0]

### Part 2 : Bivariate Analysis (Feature Discretization)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

discrete = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
num_binned = pd.DataFrame(discrete.fit_transform(num_1), index = num_1.index, 
                          columns = num_1.columns).add_suffix('_Rank')
num_binned.head()

In [None]:
# How to read the charts below:
#   - Does the event rate move across the deciles of the feature (a slope)?
#   - Do some deciles sit below the population average and others above it?
#     If so, the slope is strong.
#
# Conclusion: a strong slope indicates that the feature can discriminate
#             events from non-events, which makes it a good predictor.

X_bin_combined = pd.concat([Y, num_binned], axis = 1, join = 'inner')

from numpy import mean

# Population event rate, drawn as a red reference line on every chart.
overall_event_rate = X_bin_combined['target'].mean()

for rank_col in num_binned.columns:
    plt.figure()
    sns.lineplot(x = rank_col, y = overall_event_rate, data = X_bin_combined, color = 'red')
    sns.barplot(x = rank_col, y = 'target', data = X_bin_combined, estimator = mean)
plt.show()

In [None]:
# Same bivariate view as above, drawn with a scatterplot for the population
# average and a lineplot for the per-decile event rate.
population_rate = X_bin_combined['target'].mean()

for rank_col in num_binned.columns:
    plt.figure()
    sns.scatterplot(x = rank_col, y = population_rate, data = X_bin_combined, color = 'red')
    sns.lineplot(x = rank_col, y = 'target', data = X_bin_combined, estimator = mean)
plt.show()

In [None]:
# Dropping the features age(as it follows a u pattern so not much useful) 
# And pdays (doesn't demonstrate any slope)
num_1 = num_1.drop(['age', 'pdays'], axis = 1)

In [None]:
num_1.dtypes

In [None]:
# All the features from the num_1 will get selected due to good discrimination power by all of them
select_features_df_num = num_1

In [None]:
select_features_df_num.shape

## Feature Selection - Categorical Features

### Part 1 : Bi Variate Analysis

In [None]:
# Attach the target to the categorical features so each level's event rate
# can be compared with the population average.
X_char_merged = pd.concat([Y, char], axis = 1, join = 'inner')

from numpy import mean

# Population event rate, drawn as a red reference line on every chart.
char_population_rate = X_char_merged['target'].mean()

for feature in char.columns:
    plt.figure()
    sns.lineplot(x = feature, y = char_population_rate, data = X_char_merged, color = 'red')
    sns.barplot(x = feature, y = 'target', data = X_char_merged, estimator = mean)
plt.show()

In [None]:
# Dropping the features that do not show any visible slope
char = char.drop(['housing', 'education', 'marital','loan', 'day_of_week'], axis = 1)

In [None]:
# For the feature 'default', converting the 'unknown' as yes
char['default'] = np.where(char['default'] == 'unknown', 'yes', char['default'])

In [None]:
# emp.var.rate has negative values, so convert it to object type before one-hot encoding
char['emp.var.rate'] = char['emp.var.rate'].astype('object')

In [None]:
# Create dummy features with n-1 levels
X_char_dum = pd.get_dummies(char, drop_first = True)
X_char_dum.shape

### Part 2 : Select K best

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every dummy feature against the target with the chi-squared test and
# keep the 30 highest-scoring ones; only the fitted support mask is needed.
selector = SelectKBest(chi2, k = 30).fit(X_char_dum, Y)

# Slice the original frame by position so the column names are preserved.
cols = selector.get_support(indices = True)
select_features_df_char = X_char_dum.iloc[:, cols]

In [None]:
select_features_df_char.dtypes

## Create the Master Feature Set for Model Development

In [None]:
X_all = pd.concat([select_features_df_char, select_features_df_num], axis = 1, join = 'inner')

In [None]:
X_all.shape

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all, Y, test_size = 0.3, random_state = 10)

In [None]:
# Summarise the split: sizes of both partitions and their event rates, to
# confirm the response rate is similar in train and test.
print('Shape of Training data : ',X_train.shape)
print('Shape of Testing data : ',X_test.shape)
print('Response Rate in Training data : ',y_train.mean())
print('Response Rate in Testing data : ',y_test.mean())

## Model Building Step

In [None]:
# Building Logistic Regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state = 10)
logreg.fit(X_train, y_train)

In [None]:
# Pair each model feature with its fitted logistic-regression coefficient.
coeff_df = pd.DataFrame({
    'features': X_all.columns,
    'Coefficient Estimate': logreg.coef_[0],
})
coeff_df

In [None]:
# Building Decision Tree model
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion = 'gini', random_state = 10)

In [None]:
# Using Grid Search to find the best parameters

np.random.seed(44)
from sklearn.model_selection import GridSearchCV
param_dist = {'max_depth' : [3,4,6,7,8], 'min_samples_split' : [120, 220, 300, 400, 500]}
tree_grid = GridSearchCV(dtree, cv = 10, param_grid = param_dist, n_jobs = -1)
tree_grid.fit(X_train, y_train)

print('Best parameters using Grid Search : \n', tree_grid.best_params_)

In [None]:
# Refit the decision tree with the hyper-parameters selected by the grid
# search (tree_grid), instead of hard-coding values that can go stale when
# the data or the search grid changes.
dtree = DecisionTreeClassifier(criterion = 'gini', random_state = 10, **tree_grid.best_params_)
dtree.fit(X_train, y_train)

In [None]:
from sklearn import tree
# NOTE(review): pydotplus is imported but not used by this cell — confirm whether it is still needed.
import pydotplus

# Draw the fitted decision tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index for this argument.
plt.figure(figsize=[50,10])
tree.plot_tree(dtree,filled=True,fontsize=15,rounded=True,feature_names=list(X_all.columns))
plt.show()

In [None]:
# Building random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion = 'gini', random_state = 10, max_depth = 6, min_samples_split = 120)
rf.fit(X_train, y_train)

In [None]:
# Checking the feature importances

feature_importances=pd.DataFrame(rf.feature_importances_,
                                 index=X_train.columns,
                                 columns=['importance']).sort_values('importance',ascending=False)
feature_importances

In [None]:
# Building Gradient Boosting model
from sklearn.ensemble import GradientBoostingClassifier
# 'squared_error' is the current spelling of the criterion formerly called
# 'mse'; the old name was deprecated in scikit-learn 1.0 and removed in 1.2.
# Requires scikit-learn >= 1.0.
gbm = GradientBoostingClassifier(criterion = 'squared_error', random_state = 10, max_depth = 6, min_samples_split = 120)
gbm.fit(X_train, y_train)

In [None]:
# Checking the feature importances
feature_importances=pd.DataFrame(gbm.feature_importances_,
                                 index=X_train.columns,
                                 columns=['importance']).sort_values('importance',ascending=False)
feature_importances

In [None]:
# Building the Stacking Classifier
# Base learners: a random forest and a gradient boosting model.
# 'squared_error' replaces the criterion name 'mse', which was deprecated in
# scikit-learn 1.0 and removed in 1.2. Requires scikit-learn >= 1.0.
base_learners =[
                ('rf',RandomForestClassifier(criterion='gini',random_state=0,max_depth=6,min_samples_split=120)),
                ('gbm',GradientBoostingClassifier(criterion='squared_error',random_state=0,max_depth=6,min_samples_split=120))
              ]

In [None]:
from sklearn.ensemble import StackingClassifier
clf = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())

In [None]:
clf.fit(X_train, y_train)

## Model evaluation

In [None]:
y_pred_logreg = logreg.predict(X_test)
y_pred_tree = dtree.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_gbm = gbm.predict(X_test)
y_pred_clf = clf.predict(X_test)

In [None]:
# For Logistic Regression
from sklearn import metrics
print('Accuracy : ',metrics.accuracy_score(y_test,y_pred_logreg))
print('Precision : ',metrics.precision_score(y_test,y_pred_logreg))
print('Recall : ',metrics.recall_score(y_test,y_pred_logreg))
print('f1_score : ',metrics.f1_score(y_test,y_pred_logreg))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Plotting the confusion matrix on the hold-out set, so it is consistent with
# the test-set metrics reported for this model. ConfusionMatrixDisplay is used
# because metrics.plot_confusion_matrix was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay(
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred_logreg),
    display_labels = logreg.classes_,
).plot()

In [None]:
# For Decision Tree
print('Accuracy : ',metrics.accuracy_score(y_test,y_pred_tree))
print('Precision : ',metrics.precision_score(y_test,y_pred_tree))
print('Recall : ',metrics.recall_score(y_test,y_pred_tree))
print('f1_score : ',metrics.f1_score(y_test,y_pred_tree))

In [None]:
# Plotting the confusion matrix on the hold-out set, so it is consistent with
# the test-set metrics reported for this model. ConfusionMatrixDisplay is used
# because metrics.plot_confusion_matrix was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay(
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred_tree),
    display_labels = dtree.classes_,
).plot()

In [None]:
# For Random Forest
print('Accuracy : ',metrics.accuracy_score(y_test,y_pred_rf))
print('Precision : ',metrics.precision_score(y_test,y_pred_rf))
print('Recall : ',metrics.recall_score(y_test,y_pred_rf))
print('f1_score : ',metrics.f1_score(y_test,y_pred_rf))


In [None]:
# Plotting the confusion matrix on the hold-out set, so it is consistent with
# the test-set metrics reported for this model. ConfusionMatrixDisplay is used
# because metrics.plot_confusion_matrix was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay(
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred_rf),
    display_labels = rf.classes_,
).plot()

In [None]:
# For Gradient boosting
print('Accuracy : ',metrics.accuracy_score(y_test,y_pred_gbm))
print('Precision : ',metrics.precision_score(y_test,y_pred_gbm))
print('Recall : ',metrics.recall_score(y_test,y_pred_gbm))
print('f1_score : ',metrics.f1_score(y_test,y_pred_gbm))

In [None]:
# Plotting the confusion matrix on the hold-out set, so it is consistent with
# the test-set metrics reported for this model. ConfusionMatrixDisplay is used
# because metrics.plot_confusion_matrix was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay(
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred_gbm),
    display_labels = gbm.classes_,
).plot()

In [None]:
# For Stacking
print('Accuracy : ',metrics.accuracy_score(y_test,y_pred_clf))
print('Precision : ',metrics.precision_score(y_test,y_pred_clf))
print('Recall : ',metrics.recall_score(y_test,y_pred_clf))
print('f1_score : ',metrics.f1_score(y_test,y_pred_clf))


In [None]:
# Plotting the confusion matrix on the hold-out set, so it is consistent with
# the test-set metrics reported for this model. ConfusionMatrixDisplay is used
# because metrics.plot_confusion_matrix was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay(
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred_clf),
    display_labels = clf.classes_,
).plot()

In [None]:
# Gains Chart Analysis
# This will help understand the power of discrimination offered by the model's estimated probabilities

In [None]:
# Gains chart using GBM model 

In [None]:
# Score every customer with the GBM model.
# NOTE(review): X_all contains both the training and the test rows, so the
# capture rates below are not a pure hold-out estimate.
y_pred_prob = gbm.predict_proba(X_all)[:, 1]

# Assign the scores positionally. X_all has one row per df row in the same
# order, so no index alignment (and no risk of silent NaNs) is involved.
df['pred_prob_gbm'] = y_pred_prob

# Decile rank of the predicted probability: 1 = lowest scores, 10 = highest.
# Ranking with method='first' breaks ties so the ten buckets are equally sized.
df['P_Rank_GBM'] = pd.qcut(df['pred_prob_gbm'].rank(method='first').values, 10, duplicates='drop').codes + 1

# Per-decile volume, actual event rate and mean predicted probability.
rank_df_actuals = df.groupby('P_Rank_GBM')['target'].agg(['count', 'mean'])
rank_df_predicted = df.groupby('P_Rank_GBM')['pred_prob_gbm'].agg(['mean'])

rank_df_actuals.rename(columns={'mean': 'Actual_event_rate'}, inplace=True)
rank_df_predicted.rename(columns={'mean': 'Predicted_event_rate'}, inplace=True)
rank_df = pd.concat([rank_df_actuals, rank_df_predicted], axis=1, join="inner")

# Order from the highest-scored decile down so cumulative columns read as
# "captured so far when calling the best customers first".
sorted_rank_df = rank_df.sort_values(by='P_Rank_GBM', ascending=False)

# Events: count, cumulative count, share of all events, cumulative share.
sorted_rank_df['N_events'] = sorted_rank_df['count'] * sorted_rank_df['Actual_event_rate']
sorted_rank_df['cum_events'] = sorted_rank_df['N_events'].cumsum()
total_events = sorted_rank_df['cum_events'].iloc[-1]
sorted_rank_df['event_cap'] = sorted_rank_df['N_events'] / total_events
sorted_rank_df['cum_event_cap'] = sorted_rank_df['event_cap'].cumsum()

# Non-events: the same four measures.
sorted_rank_df['N_non_events'] = sorted_rank_df['count'] - sorted_rank_df['N_events']
sorted_rank_df['cum_non_events'] = sorted_rank_df['N_non_events'].cumsum()
total_non_events = sorted_rank_df['cum_non_events'].iloc[-1]
sorted_rank_df['non_event_cap'] = sorted_rank_df['N_non_events'] / total_non_events
sorted_rank_df['cum_non_event_cap'] = sorted_rank_df['non_event_cap'].cumsum()

# KS: gap between cumulative event capture and cumulative non-event capture.
sorted_rank_df['KS'] = (sorted_rank_df['cum_event_cap'] - sorted_rank_df['cum_non_event_cap']).round(4)

# Decile 1 is the highest-scored bucket after the descending sort.
sorted_reindexed = sorted_rank_df.reset_index()
sorted_reindexed['Decile'] = sorted_reindexed.index + 1
sorted_reindexed

In [None]:
ax = sns.lineplot(x = 'Decile', y = 'cum_non_event_cap', data = sorted_reindexed, color = 'red')
ax = sns.lineplot(x = 'Decile', y = 'cum_event_cap', data = sorted_reindexed, color = 'grey')

# Project Conclusion :


 The GBM model has performed the best and will be used to prioritize customers for the bank's term deposit telemarketing campaign

In [None]:
# Since the KS value maximises in the 9th decile, the predicted response rank dataframe is created by taking the 
# top 2 and bottom 8 rank
df['Predicted_Response_Rank'] = np.where(df['P_Rank_GBM'] > 8, 'Top 2', 'Bottom 8')

In [None]:
df.groupby('Predicted_Response_Rank')['target'].agg(['mean'])

In [None]:
# From the model feature importances 'duration' seems to be very important
# Using the 'duration' feature to further analysis to find the response rate
# Creating deciles out of the 'duration' feature

df['Duration_Rank'] = pd.qcut(df['duration'].rank(method ='first').values, 10, duplicates = 'drop').codes+1

In [None]:
df.groupby('Duration_Rank')['duration'].agg(['min','mean' ,'max'])

In [None]:
df['duration'].mean()

In [None]:
# Segmenting customers into 'High Duration' (Duration_Rank > 7, i.e. the top three duration deciles) and
# 'Low Duration' (all other deciles); the cut-off was presumably chosen by comparing the decile table with the mean of 'duration'

df['duration_segment'] = np.where(df['Duration_Rank'] > 7, 'High Duration', 'Low Duration')
df.duration_segment.value_counts()

In [None]:
# Cross tabulating the duration segment with the predicted response rank 
pd.crosstab(index = df['duration_segment'], columns = df['Predicted_Response_Rank'],
            values = df['target'], aggfunc = 'mean')



In [None]:
# Counting the customers in each duration segment x predicted response rank cell
pd.crosstab(index = df['duration_segment'], columns = df['Predicted_Response_Rank'],
            values = df['Predicted_Response_Rank'], aggfunc = 'count')


In [None]:
### Recommendations ###

# Phase 1 - Speak to customers who fall in Top 2 predicted rank (deciles) and also in High Duration
# Phase 2 - Next, speak to customers in Top 2, Low Duration