<a href="https://colab.research.google.com/github/Rubina-Fathima/Bank-Marketing-Effectiveness-Prediction/blob/main/Individual_Collab_Bank_Marketing_Prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Title : Predicting the effectiveness of bank marketing campaigns 

Problem Description
# The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed. The classification goal is to predict if the client will subscribe a term deposit (variable y).



# Data Description
# Input variables:
# Bank Client data:

#    age (numeric)
#    job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
#    marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
#    education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
#    default: has credit in default? (categorical: 'no','yes','unknown')
#    housing: has housing loan? (categorical: 'no','yes','unknown')
#   loan: has personal loan? (categorical: 'no','yes','unknown')
### <b> Related with the last contact of the current campaign:</b>
* ### contact: contact communication type (categorical: 'cellular','telephone')
* ### month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
* ### day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
* ### duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

##<b>Other attributes: </b>
* ### campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
* ### pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
* ### previous: number of contacts performed before this campaign and for this client (numeric)
* ### poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')


### <b>Output variable (desired target):</b>
* ### y - has the client subscribed a term deposit? (binary: 'yes','no')

# Loading Libraries & Data

In [12]:
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline

from prettytable import PrettyTable
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, log_loss, classification_report, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import NearMiss
from collections import Counter
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [13]:
#mounting drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
#Loading Dataset
bank_df= pd.read_csv('/content/drive/MyDrive/Bank Marketing Effectiveness Prediction/Copy of bank-full.csv',delimiter=';')

In [16]:
bank_df.head(10)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [17]:
bank_df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [18]:
bank_df.select_dtypes(include=['object'])

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


 Random Forest Accuracy = 1.0


Xgboost Accuracy = 1.0


In [None]:
!pip install lime
# Extract features
float_columns=[]
cat_columns=[]
int_columns=[]
# Putting features into respective float, cat , int list.
for i in bank_df.columns:
    if bank_df[i].dtype == 'float' : 
        float_columns.append(i)
    elif bank_df[i].dtype == 'int64':
        int_columns.append(i)
    elif bank_df[i].dtype == 'object':
        cat_columns.append(i)
bank_cat_features = bank_df[cat_columns]
bank_int_features = bank_df[int_columns]
## Transformation of categorical columns
# Label Encoding:
from sklearn import preprocessing
bank_cat_features_ver2 = pd.get_dummies(bank_cat_features, columns=['job','marital','education','default','housing','loan','contact','month','poutcome'])
bank_cat_features_ver2 = bank_cat_features.apply(preprocessing.LabelEncoder().fit_transform)
X= bank_df.drop(['y',], axis=1)
y= bank_df['y']

#Splitting the Dataset inro Train Set and Test Set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
#### Finalize X & Y
temp_1 = np.concatenate((bank_cat_features_ver2,bank_int_features),axis=1)
bank_df_transformed_features = np.concatenate((temp_1,bank_int_features),axis=1)
bank_df_transformed_features = pd.DataFrame(data=bank_df_transformed_features)
    
array = bank_df_transformed_features.values
number_of_features = len(array[0])
X = array[:,0:number_of_features]
train_target = np.ravel(np.array(bank_df['y'].values))
Y = train_target
# Split into training and validation set
from sklearn.model_selection import train_test_split
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Model 2 - XGB Classifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train, Y_train)
print("Xgboost Accuracy =",accuracy_score(Y_validation, model_xgb.predict(X_validation)))
# Model 1 - RandomForest Classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, Y_train)
print(" Random Forest Accuracy =",accuracy_score(Y_validation, model_rf.predict(X_validation)))




In [65]:
X_train

array([[ 1,  1,  1, ...,  1, -1,  0],
       [ 9,  2,  1, ...,  6, -1,  0],
       [ 0,  0,  1, ...,  2, -1,  0],
       ...,
       [10,  0,  1, ...,  1, -1,  0],
       [ 1,  2,  0, ...,  5, -1,  0],
       [ 4,  1,  2, ...,  5, -1,  0]])

In [64]:
# LIME SECTION
import sklearn
import sklearn.datasets
import sklearn.ensemble
import numpy as np
import lime
import lime.lime_tabular
from __future__ import print_function
predict_fn_rf = lambda x: model_rf.predict_proba(x).astype(float)
predict_fn_xgb = lambda x: model_xgb.predict_proba(x).astype(float)
# Line-up the feature names
feature_names_cat = list(bank_cat_features_ver2)
feature_names_int= list(bank_int_features)
feature_names1=sum([feature_names_cat,feature_names_int],[])
print(feature_names1)
X_train.drop('')

explainer=lime.lime_tabular.LimeTabularExplainer(X_train,feature_names=feature_names1,class_names=['1','2'] ,mode='classification')
# Pick the observation in the validation set for which explanation is required
observation_1 = 2
# Pick the observation in the validation set for which explanation is required
observation_2 = 45
# Get the explanation for RandomForest
exp = explainer.explain_instance(X_validation[observation_1], predict_fn_rf, num_features=6)
exp.show_in_notebook(show_all=False)
# Get the explanation for XGBoost
exp = explainer.explain_instance(X_validation[observation_1], predict_fn_xgb, num_features=6)
exp.show_in_notebook(show_all=False)
# Look at the actual value in the validation set
print(Y_validation[observation_1])
# Get the explanation for RandomForest
exp = explainer.explain_instance(X_validation[observation_2], predict_fn_rf, num_features=6)
exp.show_in_notebook(show_all=False)
# Get the explanation for XGBoost
exp = explainer.explain_instance(X_validation[observation_2], predict_fn_xgb, num_features=6)
exp.show_in_notebook(show_all=False)
# Look at the actual value in the validation set
print(Y_validation[observation_2])

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


IndexError: ignored

In [60]:
import matplotlib.pyplot as plt
exp = self.as_list(label=label, **kwargs)
fig = plt.figure(figsize)
fig = plt.figure(figsize=figsize)
vals = [x[1] for x in exp]
names = [x[0] for x in exp]
vals.reverse()
names.reverse()
colors = ['green' if x > 0 else 'red' for x in vals]
pos = np.arange(len(exp)) + .5
plt.barh(pos, vals, align='center', color=colors)
        plt.yticks(pos, names)
        if self.mode == "classification":
            title = 'Local explanation for class %s' % self.class_names[label]
        else:
            title = 'Local explanation'
        plt.title(title)
        return fig
    def show_in_notebook(self,
                         labels=None,
                         predict_proba=True,
                         show_predicted_value=True,
                         **kwargs):
        """Shows html explanation in ipython notebook.
        See as_html() for parameters.
        This will throw an error if you don't have IPython installed"""
        from IPython.core.display import display, HTML
        display(HTML(self.as_html(labels=labels,
                                  predict_proba=predict_proba,
                                  show_predicted_value=show_predicted_value,
                                  **kwargs)))

IndentationError: ignored

In [56]:
!pip install shap
import pandas as pd
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
import shap 
import graphviz
sns.set_style('darkgrid') 
X_train1=pd.DataFrame(X_train,columns=["age","job","marital","education","default","balance","housing","loan","contact","day","month","duration","campaign","pdays","previous","poutcome"])
X_test=pd.DataFrame(X_validation, columns=["age","job","marital","education","default","balance","housing","loan","contact","day","month","duration","campaign","pdays","previous","poutcome"])
Y_train1=pd.DataFrame(Y_train, columns=["y"])
Y_test=pd.DataFrame(Y_validation, columns=["y"])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


ValueError: ignored

In [None]:
bank_df.describe(include='all')

In [55]:
X_train

array([[ 1,  1,  1, ...,  1, -1,  0],
       [ 9,  2,  1, ...,  6, -1,  0],
       [ 0,  0,  1, ...,  2, -1,  0],
       ...,
       [10,  0,  1, ...,  1, -1,  0],
       [ 1,  2,  0, ...,  5, -1,  0],
       [ 4,  1,  2, ...,  5, -1,  0]])

In [None]:
#null values of entire dataframe
pd.isnull(bank_df)

In [None]:
#checking null values and printing in a list
bank_df.columns[bank_df.isna().any()].to_list()

In [None]:
#Datatypes of columns in dataframe.
bank_df.info()

In [None]:
bank_df.duplicate()

# SVM SUPPORT VECTOR MACHINE

In [None]:
from sklearn.svm import SVC
clf = SVC()

clf.fit(x_train, y_train)

pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)



