In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
bank_dataset=pd.read_csv('../ClassificationDataSet/bank-additional-full.csv')

In [3]:
bank_dataset.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
bank_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [7]:
len(bank_dataset['age'].unique())

78

### Feature engineering

In [4]:
# selecting all values as predictor apart from 'duration' & target attribute 'y'

X=bank_dataset.columns.drop("duration","y")
y=bank_dataset['y']



In [10]:
# Encoding all the predictor variables to convert the categorical values to numerical values.

bank_data_encoded = pd.get_dummies(bank_dataset[X])
print("Total number of predictors after encoding = ", len(bank_data_encoded.columns))

# Printing the list of columns after encoding to understand the encoding process
bank_data_encoded.columns

Total number of predictors after encoding =  64


Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_s

In [11]:
# Import the required function
from sklearn.model_selection import train_test_split

#splitting data into train and test datasets
X_train,X_test,y_train,y_test = train_test_split(bank_data_encoded, y,test_size=0.15,random_state=100) 

# Printing the shape of the resulting datasets
print("Shape of X_train and y_train are:", X_train.shape, "and", y_train.shape, " respectively")
print("Shape of X_test and y_test are:", X_test.shape, "and", y_test.shape, " respectively")


Shape of X_train and y_train are: (35009, 64) and (35009,)  respectively
Shape of X_test and y_test are: (6179, 64) and (6179,)  respectively


### Building the model using Scikit-Learn:

In [12]:
# Importing required class 
from sklearn.tree import DecisionTreeClassifier

# Creating an object of the DecisionTreeClassifier model
model = DecisionTreeClassifier(random_state = 1)

# Training model on the training data
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [13]:
# Predicting target values using the model built on training data
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)


### Visulazing the split

In [15]:
!pip install graphviz

Collecting graphviz
  Downloading graphviz-0.19-py3-none-any.whl (46 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.19


In [18]:
'''# Importing the required libraries (Ensure that they are already installed.)
from sklearn.tree import export_graphviz
import graphviz
# Generating the tree
dot_data = export_graphviz(model, out_file=None,
                          feature_names=bank_data_encoded.columns,  
                          class_names=model.classes_,  
                          ) 
graph = graphviz.Source(dot_data) 
graph
'''

'# Importing the required libraries (Ensure that they are already installed.)\nfrom sklearn.tree import export_graphviz\nimport graphviz\n# Generating the tree\ndot_data = export_graphviz(model, out_file=None,\n                          feature_names=bank_data_encoded.columns,  \n                          class_names=model.classes_,  \n                          ) \ngraph = graphviz.Source(dot_data) \ngraph\n'

### Evaluating Performance of a model

In [19]:
# Getting the accuracy on train data
train_accuracy = model.score(X_train,y_train)
print("Accuracy of the model on train data = ",train_accuracy)

# Getting the accuracy on test data
test_accuracy = model.score(X_test,y_test)
print("Accuracy of the model on test data = ",test_accuracy)


Accuracy of the model on train data =  1.0
Accuracy of the model on test data =  1.0


In [20]:
from sklearn.metrics import confusion_matrix

#confusion matrix compares the actual target values and the predicted target values
train_conf_matrix = confusion_matrix(y_train,train_predictions)
test_conf_matrix = confusion_matrix(y_test,test_predictions)

#### Confusion Matrix
Confusion matrix helps to assess how good the model works on individual classes in the outcome

In [21]:
#confusion matrix on the train data
pd.DataFrame(train_conf_matrix,columns=model.classes_,index=model.classes_)

Unnamed: 0,no,yes
no,31055,0
yes,0,3954


In [22]:
#confusion matrix on the test data
pd.DataFrame(test_conf_matrix,columns=model.classes_,index=model.classes_)

Unnamed: 0,no,yes
no,5493,0
yes,0,686


$
\begin{align}
Accuracy = \frac{No. of correct predictions}{No. of predictions made}
\end{align}
$

In [23]:
#train accuracy calculated from confusion matrix
train_correct_predictions = train_conf_matrix[0][0]+train_conf_matrix[1][1]
train_total_predictions = train_conf_matrix.sum()
train_accuracy = train_correct_predictions/train_total_predictions
print(train_accuracy)

1.0


In [24]:
#test accuracy calculated from confusion matrix
test_correct_predictions = test_conf_matrix[0][0]+test_conf_matrix[1][1]
total_predictions = test_conf_matrix.sum()
test_accuracy = test_correct_predictions/total_predictions
print(test_accuracy)

1.0


#### Classification report
precision for class 'A' = (number of outcomes correctly predicted as class 'A' by the model) / (total number of instances predicted as class 'A' by the model)
recall for class 'A' = (number of outcomes correctly predicted as class 'A' by the model) / (total number of class 'A' instances present in the dataset)
f1-score for class 'A'- harmonic mean of precision and recall for class 'A'
support for class 'A'- number of instances classified as class 'A'

In [25]:
from sklearn.metrics import classification_report

# report based on tain data
print("Report based on tain data")
print(classification_report(y_train,train_predictions))

print("\n")

# report based on test data
print("Report based on test data")
print(classification_report(y_test,test_predictions))

Report based on tain data
              precision    recall  f1-score   support

          no       1.00      1.00      1.00     31055
         yes       1.00      1.00      1.00      3954

    accuracy                           1.00     35009
   macro avg       1.00      1.00      1.00     35009
weighted avg       1.00      1.00      1.00     35009



Report based on test data
              precision    recall  f1-score   support

          no       1.00      1.00      1.00      5493
         yes       1.00      1.00      1.00       686

    accuracy                           1.00      6179
   macro avg       1.00      1.00      1.00      6179
weighted avg       1.00      1.00      1.00      6179

