**Context:**

This case is about a bank (Thera Bank) which has a growing customer base. The majority of these customers are liability customers (depositors) with deposits of varying size. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and, in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors). A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9%. This has encouraged the retail marketing department to devise campaigns with better target marketing to increase the success ratio with a minimal budget.

Bank provided data on 5000 customers. The data include customer demographic information (age, income, etc.), the customer's relationship with the bank (mortgage, securities account, etc.), and the customer response to the last personal loan campaign (Personal Loan). Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was offered to them in the earlier campaign.

**Data Description:**


1-ID: Customer ID


2- Age: Customer’s age in completed years


3- Experience: years of professional experience


4- Income: Annual income of the customer (in thousand dollars)


5- ZIP Code: Home Address ZIP code.


6- Family: the Family size of the customer


7- CCAvg: Average spending on credit cards per month (in thousand dollars)


8- Education: 1: Undergrad; 2: Graduate;3: Advanced/Professional


9- Mortgage: Value of house mortgage if any. (in thousand dollars)


10- Personal_Loan: Did this customer accept the personal loan offered in the last campaign?


11- Securities_Account: Does the customer have securities account with the bank?


12- CD_Account: Does the customer have a certificate of deposit (CD) account with the bank?


13- Online: Do customers use internet banking facilities?


14- CreditCard: Does the customer use a credit card issued by any other Bank (excluding All life Bank)?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE 
from sklearn.svm import SVC
from sklearn import model_selection

In [None]:
import itertools
from datetime import datetime
import io


In [None]:
# Load the personal-loan dataset from the ../input directory into `data`.
data=pd.read_csv("../input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv")


In [None]:
# Preview the first rows, rendered with a background colour gradient.
des=data.head()
cm = sns.light_palette("b", as_cmap=True)
s = des.style.background_gradient(cmap=cm)
s

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so each row is one column of `data`.
des=data.describe().T
cm = sns.light_palette("tan", as_cmap=True)
s = des.style.background_gradient(cmap=cm)
s

In [None]:
# Class balance of the target column.
data['Personal Loan'].value_counts().head()

In [None]:
# Summary of Experience on its own.
data.Experience.describe()

In [None]:
# How many rows carry a negative Experience value.
data[data['Experience'] < 0]['Experience'].count()

In [None]:
# Which negative Experience values occur, and how often.
data[data['Experience'] < 0]['Experience'].value_counts()


In [None]:
# Most frequent Mortgage values.
data['Mortgage'].value_counts().head()

In [None]:
# Per-column count of missing values; kept in `d` and reused for the total below.
d=data.isnull().sum()
# Heatmap of the null mask: every cell is coloured by whether it is missing.
sns.heatmap(data.isnull(),cmap='YlGnBu',cbar=False,yticklabels=False)

plt.title('missing data')
plt.show()
# Sum the per-column counts instead of scanning the whole frame a second time.
print("Missing Value: ",d.values.sum())

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Pairwise plots of every column except the first (presumably the ID column - verify column order).
sns.pairplot(data.iloc[:,1:])

In [None]:
# Annotated correlation heatmap across all columns.
plt.figure(figsize=(12, 8)) 
sns.heatmap(data.corr(), cmap="YlGnBu",  linewidths=.2,annot = True)

In [None]:
# Regression plots of Experience against each of the listed quantitative columns.
q_Var = ['Age', 'Income', 'CCAvg', 'Mortgage']
expGrid = sns.PairGrid(data, y_vars = 'Experience', x_vars = q_Var)
expGrid.map(sns.regplot)

In [None]:
# Summary table read by draw_axvlines() for the mean and quartile markers.
df_sum = data.describe()

In [None]:
from warnings import filterwarnings
filterwarnings("ignore")

def draw_axvlines(plt, col):
    """Mark the mean and the three quartiles of ``col`` with vertical lines.

    Parameters
    ----------
    plt : object exposing ``axvline`` and ``legend``
        The calls in this notebook pass the object returned by
        ``sns.distplot``. The parameter name shadows the pyplot alias, but
        only inside this function.
    col : str
        Column label looked up in the module-level ``df_sum`` summary table
        (rows ``"mean"``, ``"25%"``, ``"50%"``, ``"75%"``).
    """
    # (legend label, row of df_sum, line colour)
    markers = (
        ("Mean", "mean", "g"),
        ("25%", "25%", "b"),
        ("50%", "50%", "navy"),
        ("75%", "75%", "purple"),
    )
    handles = []
    labels = []
    for label, stat, color in markers:
        handles.append(plt.axvline(df_sum.loc[stat, col], color=color))
        labels.append(label)
    # Pass the line handles explicitly. With labels alone, the legend pairs
    # them with the first artists already on the axes (the histogram / KDE
    # drawn by the caller), so the entries would describe the wrong lines.
    plt.legend(handles, labels)

fig, axes = plt.subplots(3, 2, figsize = (20,12))
fig.suptitle('Distribution charts for Age, Experience and income.')

# One row per variable: boxplot on the left, histogram + density on the right.
# (column, shared x-axis label, extra keyword arguments for the histogram)
dist_specs = [
    ("Age", 'Distribution of Age', {"bins": 10}),
    ("Experience", 'Distribution of Experience', {"bins": 10}),
    ("Income", 'Distribution of income', {}),
]

for row, (col, xlabel, hist_kwargs) in enumerate(dist_specs):
    box_ax, hist_ax = axes[row]

    # Pass the series as `x=`: newer seaborn releases read the first
    # positional argument as `data`, which changes the box orientation.
    sns.boxplot(x = data[col], ax = box_ax, color = "mediumslateblue")
    box_ax.set(xlabel = xlabel)

    # NOTE(review): distplot is deprecated in recent seaborn; kept so the
    # rendered charts stay unchanged.
    pp = sns.distplot(data[col], ax = hist_ax, color = "mediumslateblue", **hist_kwargs)
    hist_ax.set(xlabel = xlabel)
    draw_axvlines(pp, col)

Experience: the distribution is very similar to that of Age.

---




In [None]:
# Replace negative Experience values with their magnitude. Series.abs() is the
# vectorised equivalent of applying the builtin abs() element by element.
data['Experience']=data['Experience'].abs()

In [None]:
# Missing values per column.
data.isna().sum()

In [None]:
# Most frequent Experience values.
data['Experience'].value_counts().head()

In [None]:
# Sanity check: number of rows that still have a negative Experience.
data[data['Experience'] < 0]['Experience'].count()

In [None]:
# Experience summary statistics.
data.Experience.describe()

In [None]:
# Column labels of the frame.
data.columns

**Analyze by visualizing data**

In [None]:
# Columns whose distributions are plotted in the next cell.
# NOTE(review): Education is coded 1/2/3 in the data description, so it is ordinal rather than continuous.
numcols = ['Age', 'Experience', 'Income','CCAvg','Mortgage','Education']

In [None]:
# Build the 2x3 grid up front. The earlier plt.subplots() call with no grid
# created one full-figure axes that the per-column subplots were then drawn
# on top of.
fig, axes = plt.subplots(2, 3, figsize=(20,10), dpi=50)
for ax, col in zip(axes.ravel(), numcols):
    # One histogram + density estimate per column listed in `numcols`.
    sns.distplot(data[col], color="b", ax=ax)
    ax.set_xlabel(col)
  

In [None]:
data.columns

In [None]:

cat_columns = ['Family',   'Education',   'Personal Loan',   'Securities Account',
               'CD Account',   'Online',   'CreditCard']
title=['Number of Family', 'Education','Customers who took Personal Loan',
       ' Customer has Securities Account','Customers has a CD Account',
       'Customers  who transcat  Online',' Customers who has  Credit Card']
plt.figure(figsize=(7,14))

sns.set_theme(style="white") # white background for the count plots
sns.set_palette('Set2')      # the palette is the same for every subplot, so set it once

n_rows = len(data)           # denominator for the percentage labels

for i, (variable, plot_title) in enumerate(zip(cat_columns, title)):
    plt.subplot(5,2,i+1)
    ax=sns.countplot(x=variable, data=data)
    sns.despine(top=True,right=True,left=True) # drop the top, right and left spines
    # Write each bar's share of all rows just above the bar.
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/n_rows)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        plt.annotate(percentage, (x, y),ha='center')
    plt.title(plot_title.upper())

# Lay out once, after every subplot and its title exist.
plt.tight_layout()
                                     

In [None]:
# Tabulate how many rows fall in each Personal Loan class, then rename the two resulting columns.
loan_counts = pd.DataFrame(data["Personal Loan"].value_counts()).reset_index()
loan_counts.columns =["Labels","Personal Loan"]
loan_counts

In [None]:
#Report business
fig, ax = plt.subplots(nrows=1, ncols=2,squeeze=True)
fig.set_size_inches(14,6)
frequency_colums= pd.crosstab(index=data["Personal Loan"],columns="count")
frequency_colums.plot(kind='bar',ax=ax[0],color="c",legend=False,rot=True,fontsize=10)
frequency_colums.plot(kind='pie',ax=ax[1],subplots=True,legend=False,fontsize=10,autopct='%.2f')
ax[0].set_title('Frequency Distribution of Dependent variable: Survived',fontsize=10)
ax[1].set_title('Pie chart representation of Dependent variable: Survived',fontsize=10)

#adding the text labels
rects = ax[0].patches
labels = frequency_colums["count"].values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax[0].text(rect.get_x() + rect.get_width()/2, height +1,label, ha='center', va='bottom',fontsize=10)
plt.show()

In [None]:
# Lower-triangle pair plot of all columns, coloured by the Personal Loan class.
sns.set_palette(sns.color_palette("Set2", 8))
sns.pairplot(data, hue="Personal Loan",corner=True)
plt.show()

In [None]:
def cat_view(x = 'Education'):
    """
    Draw a pie chart and a grouped bar chart for one categorical column.

    The left subplot shows the share and absolute count of each category of
    ``x`` in the module-level ``data`` frame. The right subplot shows, for
    each category, the number of rows in the first and second
    ``Personal Loan`` class (labelled 'No Loan' and 'Loan').

    Parameters
    ----------
    x : str
        Name of the column of ``data`` to summarise.
    """
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))

    # ---- Pie chart on the first subplot ----
    sizes = data.groupby(x).size()
    mydata_values = sizes.values.tolist()
    mydata_index = sizes.index.tolist()

    def func(pct, allvals):
        # Round instead of truncating: pct comes back as a float, so
        # pct/100*total can land just below the true integer count and
        # int() alone would then show a count that is one too small.
        absolute = int(round(pct/100.*np.sum(allvals)))
        return "{:.1f}%\n({:d})".format(pct, absolute)

    wedges, texts, autotexts = ax[0].pie(mydata_values, autopct=lambda pct: func(pct, mydata_values),
                                      textprops=dict(color="w"))

    ax[0].legend(wedges, mydata_index,
              title="Index",
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 1))

    plt.setp(autotexts, size=12, weight="bold")

    ax[0].set_title(f'{x.capitalize()} Piechart')

    # ---- Grouped bar graph on the second subplot ----
    # Rows counted per (category, loan class). fill_value=0 keeps a
    # combination with no rows as a zero-height bar instead of NaN.
    loan_table = pd.pivot_table(data, index = [x], columns = ['Personal Loan'],
                                values = ['Income'], aggfunc = len, fill_value = 0)

    labels = loan_table.index.tolist()
    loan_no = loan_table.values[:, 0].tolist()
    loan_yes = loan_table.values[:, 1].tolist()

    positions = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    rects1 = ax[1].bar(positions - width/2, loan_no, width, label='No Loan', color = 'tan')
    rects2 = ax[1].bar(positions + width/2, loan_yes, width, label='Loan', color = 'pink')

    # The bars are row counts, so label the axis accordingly.
    ax[1].set_ylabel('Number of customers')
    ax[1].set_title(f'{x.capitalize()} Bar Graph')
    ax[1].set_xticks(positions)
    ax[1].set_xticklabels(labels)
    ax[1].legend()

    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax[1].annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        fontsize = 'large',
                        ha='center', va='bottom')

    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()
    plt.show()


In [None]:
# Pie + bar summary for each of the categorical columns below, one cell per column.
cat_view('Family')


In [None]:
cat_view('Education')

In [None]:
cat_view('Online')

In [None]:
cat_view('Securities Account')

In [None]:
cat_view('CD Account')

In [None]:
# Each cell below draws 20-bin histograms of one column, with one panel per Personal Loan class.
A= sns.FacetGrid(data, col='Personal Loan')
A.map(plt.hist,'Age', bins=20,color='b')

In [None]:
E= sns.FacetGrid(data, col='Personal Loan')
E.map(plt.hist,'Experience', bins=20,color='b')

In [None]:
F= sns.FacetGrid(data, col='Personal Loan')
F.map(plt.hist,'Family', bins=20,color='b')

In [None]:
I = sns.FacetGrid(data, col='Personal Loan')
I.map(plt.hist,'Income', bins=20,color='b')

In [None]:
C= sns.FacetGrid(data, col='Personal Loan')
C.map(plt.hist,'CCAvg', bins=20,color='b')

In [None]:
M= sns.FacetGrid(data, col='Personal Loan')
M.map(plt.hist,'Mortgage', bins=20,color='b')

In [None]:
# Silence library warnings for the plots below.
filterwarnings("ignore")

# Income per family size, coloured by loan class.
# NOTE(review): a swarm plot places every row as its own point; this may be slow on the full dataset.
sns.catplot(x='Family', y='Income', hue='Personal Loan', data = data, kind='swarm') #Report businesss

In [None]:
# Income distribution per education level, split by loan class.
sns.boxplot(x='Education', y='Income', hue='Personal Loan', data = data)

In [None]:
# Credit-card spending per family size, split by loan class.
sns.boxplot(x='Family',y='CCAvg',hue='Personal Loan',data=data)

In [None]:
# Credit-card spending by the CreditCard flag, split by loan class.
sns.boxplot(x="CreditCard", y='CCAvg', hue="Personal Loan", data=data)

In [None]:
plt.figure(figsize=(10,4))
# x and y must be passed by keyword: recent seaborn releases accept only
# `data` positionally and raise TypeError on positional x/y vectors.
sns.scatterplot(x=data.CCAvg, y=data.Income, hue = data['Personal Loan'], palette= ['Brown','g'])

plt.title('Income and CCAvg Scatter Distribution',fontsize=20)

In [None]:
plt.figure(figsize=(10,4))
# x and y must be passed by keyword: recent seaborn releases accept only
# `data` positionally and raise TypeError on positional x/y vectors.
sns.scatterplot(x=data.Income, y=data.Mortgage, hue = data['Personal Loan'], palette= ['Silver','g'])

plt.title('Income and  Mortgage Distribution',fontsize=20)

In [None]:
fig, axes = plt.subplots(1, 2, figsize = (20,5))

# Work on a copy so adding the bin column never writes into a slice of `data`.
xx = data[["CCAvg", "CreditCard", "Personal Loan"]].copy()
# include_lowest=True keeps customers with CCAvg == 0 in the first bin (they
# became NaN before). The labels now state the actual bin edges.
xx['ccavg_bin'] = pd.cut(xx['CCAvg'], bins = [0, 2, 4, 6, 100],
                         labels = ['0-2', '2-4', '4-6', '6+'], include_lowest = True)
# Total CCAvg per (bin, CreditCard flag). observed=False keeps empty bins in the result.
xx = xx.groupby(["ccavg_bin", "CreditCard"], observed = False)["CCAvg"].sum().reset_index()
# Keyword x/y: recent seaborn releases reject positional vectors.
sns.barplot(x = "ccavg_bin", y = "CCAvg", hue = "CreditCard", data = xx, palette = "cividis", ax = axes[0])
# The bars are sums of CCAvg, not customer counts, so label the axis as such.
axes[0].set(xlabel = 'CC avg bins', ylabel = 'Total CCAvg (thousand dollars)')

sns.scatterplot(x = "Income", y = "CCAvg", data = data, hue = "Personal Loan", ax = axes[1], palette=["skyblue", "darkgreen"], alpha = 0.7)

**Model Building**

In [None]:

def plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):
    """Render a confusion matrix as an annotated image on the current axes.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels
        (see the axis labels set at the end).
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, divide every row by its sum before plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; leave that row at
        # 0 rather than producing NaN from 0/0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    ax = plt.gca()
    # Size the y-range from the number of classes. The fixed (-.5, 5.5) range
    # only fits six classes and leaves empty space for any other count.
    ax.set_ylim(len(classes) - 0.5, -0.5)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def perform_model(model, X_train, y_train, X_test, y_test,class_labels, cm_normalize=True,print_cm=True, cm_cmap=plt.cm.Reds):
    """Fit ``model``, predict on the test split, print and return the metrics.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and target passed to ``model.fit``
    X_test, y_test : held-out features and target used for scoring
    class_labels, cm_normalize, cm_cmap :
        Accepted for interface compatibility; this function does not read them.
    print_cm : bool, default True
        Whether to print the raw confusion matrix.

    Returns
    -------
    dict
        Keys: ``training_time`` and ``testing_time`` (``timedelta``),
        ``predicted``, ``accuracy``, ``confusion_matrix``,
        ``classification_report`` and ``model`` (the fitted estimator).
    """
    # to store results at various phases
    results = dict()

    # Time only the fit call: the end stamp is taken before any printing so
    # console I/O is not counted (the test timing below does the same).
    train_start_time = datetime.now()
    print('training the model..')
    model.fit(X_train, y_train)
    train_end_time = datetime.now()
    print('Done \n \n')
    results['training_time'] =  train_end_time - train_start_time
    print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))

    # predict test data
    print('Predicting test data')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    test_end_time = datetime.now()
    print('Done \n \n')
    results['testing_time'] = test_end_time - test_start_time
    print('testing time(HH:MM:SS:ms) - {}\n\n'.format(results['testing_time']))
    results['predicted'] = y_pred

    # overall accuracy of the model
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    results['accuracy'] = accuracy
    print('---------------------')
    print('|      Accuracy      |')
    print('---------------------')
    print('\n    {}\n\n'.format(accuracy))

    # confusion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)
    results['confusion_matrix'] = cm
    if print_cm:
        print('--------------------')
        print('| Confusion Matrix |')
        print('--------------------')
        print('\n {}'.format(cm))

    # classification report; the local is named `report` so it does not
    # shadow the `classification_report` function imported at file level
    print('-------------------------')
    print('| Classifiction Report |')
    print('-------------------------')
    report = metrics.classification_report(y_test, y_pred)
    results['classification_report'] = report
    print(report)

    # add the trained model to the results
    results['model'] = model

    return results

In [None]:
def print_grid_search_attributes(model):
    """Print a four-section report of a fitted grid-search object.

    Each section is an ASCII banner followed by one attribute of ``model``:
    ``best_estimator_``, ``best_params_``, ``n_splits_`` and ``best_score_``,
    in that order. Attributes are read one section at a time, just before
    they are printed.
    """
    # (top rule, heading, bottom rule, body template, attribute to read)
    report_sections = (
        ('--------------------------',
         '|      Best Estimator     |',
         '--------------------------',
         '\n\t{}\n',
         'best_estimator_'),
        ('--------------------------',
         '|     Best parameters     |',
         '--------------------------',
         '\tParameters of best estimator : \n\n\t{}\n',
         'best_params_'),
        ('---------------------------------',
         '|   No of CrossValidation sets   |',
         '--------------------------------',
         '\n\tTotal numbre of cross validation sets: {}\n',
         'n_splits_'),
        ('--------------------------',
         '|        Best Score       |',
         '--------------------------',
         '\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n',
         'best_score_'),
    )

    for top_rule, heading, bottom_rule, body, attribute in report_sections:
        for banner_line in (top_rule, heading, bottom_rule):
            print(banner_line)
        print(body.format(getattr(model, attribute)))


In [None]:
# Drop the ID and ZIP Code columns before modelling.
df = data.drop(['ID','ZIP Code'], axis=1)

In [None]:
# Peek at the first remaining row.
df.head(1)

In [None]:
# Class labels handed to perform_model() as `class_labels`.
labels=[0,1]

In [None]:
# Features are every remaining column except the target; the target is Personal Loan.
X = df.drop('Personal Loan',axis=1)
y=df['Personal Loan']

In [None]:
from imblearn.over_sampling import SMOTE 

In [None]:
ms=SMOTE(random_state=1)
X_ms , y_ms = ms.fit_resample(X,y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_ms, y_ms, test_size=0.30, random_state=1)
print('x train data {}'.format(X_train.shape))
print('y train data {}'.format(y_train.shape))
print('x test data  {}'.format(X_test.shape))
print('y test data  {}'.format(y_test.shape))

# KNN

In [None]:
# Grid-search the neighbour count for KNN with 3-fold CV, then score on the test split.
# The n_neighbors=6 given to the constructor is replaced by each grid value during the search.
# NOTE(review): features are not scaled anywhere in this file; distance-based models are usually scale-sensitive - confirm this is intended.
parameters = {'n_neighbors': [1,3, 10, 11,]}
log_knn = KNeighborsClassifier(n_neighbors=6)
log_knn_grid = GridSearchCV(log_knn, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
log_knn_grid_results =  perform_model(log_knn_grid, X_train, y_train, X_test, y_test, class_labels=labels)

In [None]:
# Report the best estimator, parameters and CV score found by the KNN search.
print_grid_search_attributes(log_knn_grid_results['model'])

# Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings; no grid search is run for this model.
log_NB = GaussianNB()
log_NB_grid_results =  perform_model(log_NB, X_train, y_train, X_test, y_test, class_labels=labels)

# Linear SVC

In [None]:
# Grid-search the regularisation strength C for a linear SVM (default CV splitting).
parameters = {'C':[0.125, 0.5, 1, 2, 8, 16]}
lr_svc = LinearSVC(tol=0.00005)
lr_svc_grid = GridSearchCV(lr_svc, param_grid=parameters, n_jobs=-1, verbose=1)
lr_svc_grid_results = perform_model(lr_svc_grid, X_train, y_train, X_test, y_test, class_labels=labels)

# Kernel SVM

In [None]:
# Grid-search C and gamma for an RBF-kernel SVM (3 x 3 combinations).
parameters = {'C':[2,8,16],\
              'gamma': [ 0.0078125, 0.125, 2]}
rbf_svm = SVC(kernel='rbf')
rbf_svm_grid = GridSearchCV(rbf_svm,param_grid=parameters, n_jobs=-1)
rbf_svm_grid_results = perform_model(rbf_svm_grid, X_train, y_train, X_test, y_test, class_labels=labels)

In [None]:
# Report the best estimator, parameters and CV score found by the RBF-SVM search.
print_grid_search_attributes(rbf_svm_grid_results['model'])

# Random Forest 

In [None]:
# Grid-search tree count (10 to 490 in steps of 20) and maximum depth (3 to 13 in steps of 2).
# NOTE(review): 25 x 6 parameter combinations, each refitted once per CV fold - expect a long run.
params = {'n_estimators': np.arange(10,500,20), 'max_depth':np.arange(3,15,2)}
rfc = RandomForestClassifier()
rfc_grid = GridSearchCV(rfc, param_grid=params, n_jobs=-1)
rfc_grid_results = perform_model(rfc_grid, X_train, y_train, X_test, y_test, class_labels=labels)
print_grid_search_attributes(rfc_grid_results['model'])

# Comparing all models

In [None]:
from warnings import filterwarnings
filterwarnings("ignore")

# Default-configured estimators, compared with 10-fold cross-validation on
# the original features X and target y.
models = [
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('LSVC', LinearSVC()),
    ('KSVC', SVC()),
    ('RF', RandomForestClassifier()),
]

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
# random_state only takes effect when shuffle=True, and current scikit-learn
# raises ValueError if it is given without shuffling. The splitter is the same
# for every model, so build it once.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=12345)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean accuracy and its standard deviation; this line was built but never shown
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Test-split accuracy and error of the tuned models fitted in the cells above.
print('\n                     Accuracy     Error')
print('                     ----------   --------')

print('KNN                 : {:.04}%       {:.04}% '.format(log_knn_grid_results['accuracy'] * 100,\
                                                        100-(log_knn_grid_results['accuracy'] * 100)))
print('Naive Bayes         : {:.04}%       {:.04}% '.format(log_NB_grid_results['accuracy'] * 100,\
                                                        100-(log_NB_grid_results['accuracy'] * 100)))
print('Linear SVC          : {:.04}%        {:.04}% '.format(lr_svc_grid_results['accuracy'] * 100,\
                                                           100-(lr_svc_grid_results['accuracy'] * 100)))
print('Kernel SVM          : {:.04}%       {:.04}% '.format(rbf_svm_grid_results['accuracy'] * 100,\
                                                           100-(rbf_svm_grid_results['accuracy'] * 100)))
print('Random Forest       : {:.04}%       {:.04} % '.format(rfc_grid_results['accuracy'] * 100,\
                                                           100-(rfc_grid_results['accuracy'] * 100)))
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

