In [None]:
# Import required libraries 
import pandas as pd 
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from IPython.display import Image
from sklearn import tree
from os import system

In [None]:
# Load the bank marketing dataset into a DataFrame.
df = pd.read_csv("bank-full.csv")

In [None]:
# Univariate analysis 

In [None]:
# Confirm the loaded object's type.
type(df)

In [None]:
# Display the frame itself as the cell output.
df

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Summary statistics produced by describe() with its default settings.
df.describe()

In [None]:
# Skewness of the numeric columns only. The frame also holds text columns
# (job, marital, ...), and recent pandas versions raise instead of silently
# skipping them when numeric_only is not set.
df.skew(numeric_only=True)

In [None]:
# Count missing values per column (the author observed none).
df.isna().sum()

In [None]:
# IQR method of finding outliers
def iqr_limits(series, k=1.5):
    """Return ``(q1, q3, iqr, lower, upper)`` for a numeric Series.

    ``q1``/``q3`` are the 25th/75th percentiles, ``iqr`` is their difference,
    and ``lower``/``upper`` are the fences ``q1 - k * iqr`` and ``q3 + k * iqr``.
    Values outside the fences are treated as outliers by the cells below.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


# One call per numeric column; the unpacked names are the ones the
# outlier-filter cells further down read.
Q1age, Q3age, IQRage, lower_limit_age, upper_limit_age = iqr_limits(df.age)
Q1balance, Q3balance, IQRbalance, lower_limit_balance, upper_limit_balance = iqr_limits(df.balance)
Q1day, Q3day, IQRday, lower_limit_day, upper_limit_day = iqr_limits(df.day)
Q1duration, Q3duration, IQRduration, lower_limit_duration, upper_limit_duration = iqr_limits(df.duration)
Q1campaign, Q3campaign, IQRcampaign, lower_limit_campaign, upper_limit_campaign = iqr_limits(df.campaign)
Q1pdays, Q3pdays, IQRpdays, lower_limit_pdays, upper_limit_pdays = iqr_limits(df.pdays)
Q1previous, Q3previous, IQRprevious, lower_limit_previous, upper_limit_previous = iqr_limits(df.previous)

In [None]:
# Rows that are an outlier in AT LEAST ONE numeric column.
# The per-column tests must be OR-ed: AND-ing them only keeps rows that are
# outliers in all seven columns at once, which is practically never the case.
outlier_limits = {
    "age": (lower_limit_age, upper_limit_age),
    "balance": (lower_limit_balance, upper_limit_balance),
    "day": (lower_limit_day, upper_limit_day),
    "duration": (lower_limit_duration, upper_limit_duration),
    "campaign": (lower_limit_campaign, upper_limit_campaign),
    "pdays": (lower_limit_pdays, upper_limit_pdays),
    "previous": (lower_limit_previous, upper_limit_previous),
}
outlier_mask = pd.Series(False, index=df.index)
for column, (lower, upper) in outlier_limits.items():
    outlier_mask |= (df[column] < lower) | (df[column] > upper)
df[outlier_mask]

In [None]:
#Strategy to address Data Pollution

In [None]:
#1) Removing the columns that have almost all values as "unknown"
# These columns are 'contact' and 'poutcome'
# 'contact' should be deleted as the contact communication type does not affect our goal
# 'poutcome' should be deleted as most of the inputs are unknown and also the outcome of the previous marketing campaign 
#  does not guarantee nor exclude the success of this campaign

In [None]:
# Remove the 'contact' and 'poutcome' columns and display the reduced frame.
df2 = df.drop(columns=["contact", "poutcome"])
df2

In [None]:
#2)Removing columns that are related to time i.e "day","month" and "duration" as these parameters will not help to get the goal.

In [None]:
# Remove the time-related columns from df2 and display the reduced frame.
df3 = df2.drop(columns=["day", "month", "duration"])
df3

In [None]:
#3) All the "Other attributes" should be deleted as well

In [None]:
# Remove the remaining campaign-history columns from df3.
df4 = df3.drop(columns=["campaign", "pdays", "previous"])

In [None]:
# Univariate view: histograms over every column except the last one.
columns = df.columns[:-1].tolist()
df[columns].hist(stacked=False, bins=100, figsize=(30, 50), layout=(7, 2));

In [None]:
#Strategy to address outlier’s treatment

In [None]:
# We used the IQR method to find the outlier's and now we can delete them as they are already identified

In [None]:
# Build the "is an outlier in any numeric column" mask, then keep the rows
# that are NOT flagged. The previous expression selected the outliers
# themselves (and only those flagged in all seven columns simultaneously),
# which contradicted the variable name.
outlier_limits = {
    "age": (lower_limit_age, upper_limit_age),
    "balance": (lower_limit_balance, upper_limit_balance),
    "day": (lower_limit_day, upper_limit_day),
    "duration": (lower_limit_duration, upper_limit_duration),
    "campaign": (lower_limit_campaign, upper_limit_campaign),
    "pdays": (lower_limit_pdays, upper_limit_pdays),
    "previous": (lower_limit_previous, upper_limit_previous),
}
is_outlier = pd.Series(False, index=df.index)
for column, (lower, upper) in outlier_limits.items():
    is_outlier |= (df[column] < lower) | (df[column] > upper)

# Name kept for compatibility; the filter covers every numeric column, not just age.
df_no_outlier_age = df[~is_outlier]
df_no_outlier_age

In [None]:
# Missing values treatment

In [None]:
# There are no missing values so we do not have to take any additional steps.

In [None]:
# Multivariate analysis

In [None]:
# Pairwise scatter plots coloured by the target class.
# `height` is the current name of the per-facet size argument; the old
# `size` keyword was deprecated and later removed from seaborn.
sns.pairplot(df, hue="Target", height=3)
plt.show()

In [None]:
# Class balance of the target: histogram first, then the raw counts as a table.
plt.hist(df["Target"])
plt.show()
df["Target"].value_counts().to_frame()

In [None]:
# From the above pairplot and histogram, we see that the target variable has far more "no" values (39922)
# than "yes" values (5289). This means that fewer people have subscribed to a term deposit.
# This distribution is clearly seen in the histogram, where the difference between the two bars is large.
# In the pairplot, however, the points overlap each other, which makes the graph crowded and unclear.

In [None]:
# Pearson correlation between the numeric features only. The frame still
# contains text columns here, and recent pandas versions raise on them
# instead of dropping them implicitly, so select numeric columns explicitly.
corr = df.drop('Target', axis=1).select_dtypes(include='number').corr()

plt.figure(figsize=(12,8))
sns.heatmap(corr, annot = True)
plt.show()

In [None]:
# From the above heatmap, we see that there is no high correlation between the features

In [None]:
# Checking if attribute types are correct:
# list every column's dtype so it can be compared against the expected type.
df.dtypes

In [None]:
# We see that all the attributes are correct

In [None]:
# Get data model ready
# One-hot encode (dummy variables) every nominal categorical column.
# NOTE(review): `Target` is encoded too, producing the complementary pair
# `Target_no` / `Target_yes`; only one of them may be used as the label and
# neither belongs among the features.
# NOTE(review): `df` is overwritten, so re-running this cell on the already
# encoded frame fails because the listed columns no longer exist.

nominal_columns = ['job', 'marital', 'education', 'default', 'housing',
                   'loan', 'contact', 'month', 'poutcome', 'Target']
df = pd.get_dummies(df, columns=nominal_columns)

In [None]:
# Preview the frame after the one-hot encoding step.
df.head()

In [None]:
# Here we see that all the nominal categorical variables have now been split into multiple columns: one-hot encoding

In [None]:
# Transforming the data is not required

In [None]:
# Creating the training set and test set in ratio of 70:30

In [None]:
# Features = everything except the one-hot target columns.
# get_dummies turned `Target` into the complementary pair
# `Target_no` / `Target_yes`; dropping only `Target_yes` leaves `Target_no`
# in X, which leaks the label and makes every model score a meaningless 100%.
target_columns = [col for col in df.columns if col.startswith("Target_")]
X = df.drop(columns=target_columns)
# Cast to int so the label is 0/1 regardless of the dtype get_dummies produced.
Y = df["Target_yes"].astype(int)

# 70:30 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# Baseline logistic regression and its test-set predictions.
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [None]:
# Report how the rows were divided between the training and test sets.
total_rows = len(df.index)
train_share = (len(x_train) / total_rows) * 100
test_share = (len(x_test) / total_rows) * 100
print("{0:0.2f}% data is in training set".format(train_share))
print("{0:0.2f}% data is in test set".format(test_share))

In [None]:
# Peek at the first rows of the training features.
x_train.head() 
# This shows that the sample is randomized 
# (the author reads the row index shown above as evidence of shuffling),
# which is a good variation for modelling 

In [None]:
# Peek at the first rows of the held-out test features.
x_test.head()

In [None]:
#Logistic Regression Model

In [None]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score

logreg = LogisticRegression(random_state=42)
logreg.fit(x_train, y_train)
# Predict with THIS model. The previous `y_predict` came from a different
# estimator (`model`), so the matrix did not describe `logreg`.
y_predict = logreg.predict(x_test)

def draw_cm( actual, predicted ):
    """Draw a confusion-matrix heatmap on the current matplotlib axes.

    actual:    observed class labels (rows of the matrix).
    predicted: predicted class labels (columns of the matrix).
    Returns None; the figure is the output.
    """
    cm = confusion_matrix( actual, predicted)
    # Cells are counts, so format them as whole numbers.
    sns.heatmap(cm, annot=True,  fmt='g', xticklabels = [0,1] , yticklabels = [0,1] )
    plt.ylabel('Observed')
    plt.xlabel('Predicted')

# Call directly: the function returns None, so wrapping it in print() only
# emitted a stray "None".
draw_cm(y_test,y_predict)
plt.show()

In [None]:
# Score every metric against the same estimator. Accuracy was taken from
# `logreg` while recall/precision/F1 used predictions of another model.
logreg_pred = logreg.predict(x_test)
print("Training accuracy",logreg.score(x_train,y_train))  
print("Testing accuracy",logreg.score(x_test, y_test))
print("Recall:",recall_score(y_test,logreg_pred))
print("Precision:",precision_score(y_test,logreg_pred))
print("F1 Score:",f1_score(y_test,logreg_pred))

In [None]:
# From above we can see all the required data we needed like the accuracy, precision, F1 Score etc. 

In [None]:
# Decision Tree Model

In [None]:
# Decision tree training: Gini criterion, no depth limit, fixed seed.

tree_params = {
    "criterion": 'gini',
    "random_state": 100,
    "max_depth": None,
    "min_samples_leaf": 1,
}
classifier = DecisionTreeClassifier(**tree_params)
classifier.fit(x_train, y_train)


In [None]:
# Predict test-set labels with the fitted decision tree.
y_pred=classifier.predict(x_test)

In [None]:
# Show the tree's predictions followed by the true test labels.
for values in (y_pred, y_test):
    print(values)

In [None]:
# Confusion matrix and per-class precision/recall/F1 for the decision tree.
tree_cm = confusion_matrix(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
print(tree_cm)
print(tree_report)

In [None]:
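# We see that this Decision Tree Model has a 100% accuracy.
# NOTE(review): a perfect score on real-world data is a red flag for target leakage - confirm that no
# Target-derived column (e.g. `Target_no`) is among the features before trusting this figure.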

In [None]:
# Ensemble Model 1 - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Pass the base learner positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# old spelling is rejected there. The first positional slot works in both.
bgcl = BaggingClassifier(classifier, n_estimators=50, random_state=1)
bgcl = bgcl.fit(x_train, y_train)

In [None]:
# Evaluate the bagging ensemble on the held-out test set.
y_predict = bgcl.predict(x_test)

print(bgcl.score(x_test , y_test))

# Rows = observed class, columns = predicted class, ordered No (0), Yes (1).
cm=metrics.confusion_matrix(y_test, y_predict,labels=[0, 1])

df_cm = pd.DataFrame(cm, index=["No","Yes"], columns=["No","Yes"])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
# Report on THIS model's predictions, with the true labels first. The
# previous call passed the decision tree's `y_pred` in the y_true slot.
print(metrics.classification_report(y_test, y_predict))

In [None]:
# Accuracy for this ensemble model is also 100%

In [None]:
# Ensemble Model 2 - AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Pass the base learner positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# old spelling is rejected there. The first positional slot works in both.
abcl = AdaBoostClassifier(classifier, n_estimators=50, random_state=1)
abcl = abcl.fit(x_train, y_train)

In [None]:
# Evaluate the AdaBoost ensemble on the held-out test set.
y_predict = abcl.predict(x_test)
print(abcl.score(x_test , y_test))

# The classes are 0 (No) and 1 (Yes). The previous labels=[0, 2] referred to
# a class that does not exist, so the "Yes" row/column was always empty.
cm=metrics.confusion_matrix(y_test, y_predict,labels=[0, 1])

df_cm = pd.DataFrame(cm, index=["No","Yes"], columns=["No","Yes"])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
# Report on THIS model's predictions, with the true labels first. The
# previous call passed the decision tree's `y_pred` in the y_true slot.
print(metrics.classification_report(y_test, y_predict))

In [None]:
# For the second ensemble model the accuracy is 100% as well

In [None]:
# Ensemble Model 3 - GradientBoost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 50 stages and a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
gbcl = GradientBoostingClassifier(n_estimators=50, random_state=1).fit(x_train, y_train)

In [None]:
# Evaluate the gradient boosting ensemble on the held-out test set.
y_predict = gbcl.predict(x_test)
print(gbcl.score(x_test, y_test))
# Order the classes No (0), Yes (1) to match the tick labels below. The
# previous labels=[1, 0] put the "Yes" counts under the "No" heading.
cm=metrics.confusion_matrix(y_test, y_predict,labels=[0, 1])

df_cm = pd.DataFrame(cm, index=["No","Yes"], columns=["No","Yes"])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
# Report on THIS model's predictions, with the true labels first. The
# previous call passed the decision tree's `y_pred` in the y_true slot.
print(metrics.classification_report(y_test, y_predict))

In [None]:
# Even for the third ensemble model, the accuracy is 100%

In [None]:
# To conclude, we found that all five models have a high accuracy and 4 of them (Decision Tree and
#  the three ensemble methods) had 100% accuracy. Out of all of them, I think the best algorithm to use is the bagging 
#  ensemble method because it works best for complex data; in Data Science problems we will usually have complex models
#  and complex data, so the most efficient method to deal with them is Bagging.
#  NOTE(review): 100% accuracy across several models is a red flag for target leakage - confirm that no
#  Target-derived column (e.g. `Target_no`) is among the features before drawing conclusions from these scores.