In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
import pickle
import zipfile

In [2]:
# file path
FILE_PATH = "/Users/pradeep/PycharmProjects/ontrack3/data/bank-full.csv"

In [3]:
# Load into dataframe and see if it works
df = pd.read_csv(FILE_PATH, sep=';')

# see top 5 rows
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Feature selection - select only relevent columns
features = df[['age', 'job', 'marital', 'default', 'housing', 'loan', 'poutcome', 'y']]
features

Unnamed: 0,age,job,marital,default,housing,loan,poutcome,y
0,58,management,married,no,yes,no,unknown,no
1,44,technician,single,no,yes,no,unknown,no
2,33,entrepreneur,married,no,yes,yes,unknown,no
3,47,blue-collar,married,no,yes,no,unknown,no
4,33,unknown,single,no,no,no,unknown,no
...,...,...,...,...,...,...,...,...
45206,51,technician,married,no,no,no,unknown,yes
45207,71,retired,divorced,no,no,no,unknown,yes
45208,72,retired,married,no,no,no,success,yes
45209,57,blue-collar,married,no,no,no,unknown,no


In [5]:
# Feature Selection
# select important features
X = features[['age', 'job', 'marital', 'default', 'housing', 'loan', 'poutcome']]

# set target
y = features[['y']] # target variable

In [6]:
# check data types for columns
features.dtypes

age          int64
job         object
marital     object
default     object
housing     object
loan        object
poutcome    object
y           object
dtype: object

In [7]:
# numerical features
num_features = ['age']

# make a pipeline to perform scaling
num_pipeline = Pipeline([
    ('scale', StandardScaler())
])
num_pipeline

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True))],
         verbose=False)

In [8]:
# categorical features
cat_features = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome']

# make a pipeline to perfom one hot encoding
cat_pipeline = Pipeline([
    ('encode', OneHotEncoder())
])
cat_pipeline

Pipeline(memory=None,
         steps=[('encode',
                 OneHotEncoder(categories='auto', drop=None,
                               dtype=<class 'numpy.float64'>,
                               handle_unknown='error', sparse=True))],
         verbose=False)

In [9]:
# make full pipeline for converting categorical and numrical features accordingly
column_transform = ColumnTransformer([
    ('numerical', num_pipeline, num_features),
    ('categorical', cat_pipeline, cat_features)
])
column_transform

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('numerical',
                                 Pipeline(memory=None,
                                          steps=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True))],
                                          verbose=False),
                                 ['age']),
                                ('categorical',
                                 Pipeline(memory=None,
                                          steps=[('encode',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                               

In [10]:
# pipeline to transform columns and decision tree instance
clf = Pipeline([('pipeline', column_transform),('classifier', DecisionTreeClassifier())])
clf

Pipeline(memory=None,
         steps=[('pipeline',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipeline(memo

In [11]:
# split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
# train the model
clf = clf.fit(X_train, y_train)
clf

Pipeline(memory=None,
         steps=[('pipeline',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipeline(memo

In [13]:
# make a prediction
y_pred = clf.predict(X_test)
y_pred

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [14]:
# get accuracy score
accuracy_score(y_test, y_pred)

0.8789118655313503

In [15]:
# see confusion matrix
confusion_matrix(y_test, y_pred)

array([[7775,  175],
       [ 920,  173]])

In [16]:
# see classification report in detail
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          no       0.89      0.98      0.93      7950\n         yes       0.50      0.16      0.24      1093\n\n    accuracy                           0.88      9043\n   macro avg       0.70      0.57      0.59      9043\nweighted avg       0.85      0.88      0.85      9043\n'

In [17]:
'''
Hypertuning - let's find the optimal values for some of the parameters
'''
# define some parameters and some values
params = {
        'classifier__criterion': ["gini", "entropy"],
 
        'classifier__splitter': ["best", "random"],
 
        'classifier__max_depth': [2,4,6,8,10,20,30,50],

        'classifier__max_leaf_nodes' : [2,4,6,8,10,20,30,50]
    }

In [18]:
# Instanciate the GridSearch object and pass estimator and grid parameters
grid_cv = GridSearchCV(clf, params, cv=10).fit(X_train, y_train)
grid_cv

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pipeline',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('numerical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scale',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                                      

In [19]:
#find optimal values
grid_cv.best_params_

{'classifier__criterion': 'gini',
 'classifier__max_depth': 20,
 'classifier__max_leaf_nodes': 50,
 'classifier__splitter': 'random'}

In [44]:
# instanciate decision tree classifier with optimal values
dt = DecisionTreeClassifier(criterion='gini', max_depth=20, max_leaf_nodes=50, splitter='random')

# build pipeline with column transformer and classifier
clf_2 = Pipeline([('pipeline', column_transform),('classifier', dt)])

# fit data
clf_2 = clf.fit(X_train, y_train)
clf_2

Pipeline(memory=None,
         steps=[('pipeline',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipeline(memo

In [45]:
# predict
y_pred = clf_2.predict(X_test)
y_pred

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [46]:
# Check accuracy
accuracy_score(y_test, y_pred)

0.878801282760146

In [23]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[7776,  174],
       [ 922,  171]])

In [24]:
# classification report
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          no       0.89      0.98      0.93      7950\n         yes       0.50      0.16      0.24      1093\n\n    accuracy                           0.88      9043\n   macro avg       0.69      0.57      0.59      9043\nweighted avg       0.85      0.88      0.85      9043\n'

In [25]:
'''Random Forest Section'''

'Random Forest Section'

In [26]:
# build pipeline with column transformer and random forest classifier
rf_clf = Pipeline([
    ('columnTransformer', column_transform),
    ('classifier', RandomForestClassifier(random_state=1))
])
rf_clf

Pipeline(memory=None,
         steps=[('columnTransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipe

In [27]:
# train with data
rf_clf = rf_clf.fit(X_train, y_train.values.ravel())
rf_clf

Pipeline(memory=None,
         steps=[('columnTransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipe

In [28]:
# make prediction
y_pred = rf_clf.predict(X_test)
y_pred

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [29]:
# accuracy score
accuracy_score(y_test, y_pred)

0.878801282760146

In [30]:
# print confusion matrix
confusion_matrix(y_test, y_pred)

array([[7754,  196],
       [ 900,  193]])

In [31]:
# Hyperparameter tuning
# define some parameters to tune
params = {
    'classifier__n_estimators' : [1, 5, 10, 20, 50],
    'classifier__criterion' : ["gini", "entropy"],
    'classifier__max_depth': [2,4,6,8,10,20,30],
    'classifier__max_leaf_nodes' : [2,4,6,8,10,20,30]
}

In [32]:
# grid search with parameters and cross validation of 2 times
rf_grid_search = GridSearchCV(rf_clf, params, cv=2)
rf_grid_search

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columnTransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('numerical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scale',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                              

In [33]:
# fit the training data
rf_grid_search = rf_grid_search.fit(X_train, y_train.values.ravel())
rf_grid_search

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columnTransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('numerical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scale',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                              

In [34]:
# get optimal values
rf_grid_search.best_params_

{'classifier__criterion': 'gini',
 'classifier__max_depth': 6,
 'classifier__max_leaf_nodes': 30,
 'classifier__n_estimators': 10}

In [47]:
# random forest classifier with best params
rf_clf_2 = RandomForestClassifier(criterion='gini', max_depth=6, max_leaf_nodes=30, n_estimators=10, random_state=1)
rf_clf_2

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=30, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [48]:
# build pipeline with column transformater and classifier
rf_cl2 = Pipeline([
    ('columnTransformer', column_transform),
    ('classifier', rf_clf_2)
])
rf_cl2

Pipeline(memory=None,
         steps=[('columnTransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipe

In [49]:
# fit the traind data
rf_cl2 = rf_cl2.fit(X_train, y_train.values.ravel())
rf_cl2

Pipeline(memory=None,
         steps=[('columnTransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('scale',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age']),
                                                 ('categorical',
                                                  Pipe

In [50]:
# pred the test data
y_pred = rf_cl2.predict(X_test)
y_pred

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [51]:
# accuracy score
accuracy_score(y_test, y_pred)

0.8879796527700984

In [52]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[7885,   65],
       [ 948,  145]])

In [41]:
'''
Summary
Decision Tree Classifier
- without optimal values for parameters (Hyperparameters)
    Accuracy = 88.05%
    confusion matrix:
    array([[7794,  214],
       [ 862,  173]])
       
- wtih Hyperparameter tuning
    Accuracy = 88.09%
    confusion matrix:
    array([[7794,  214],
       [ 863,  172]])
       
       
Random Forest Classifier
- without optimal values for parameters (Hyperparameters)
    Accuracy = 87.91%
    confusion matrix:
    array([[7759,  249],
       [ 844,  191]])
       
- wtih Hyperparameter tuning
    Accuracy = 89.42%
    confusion matrix:
    array([[7950,   58],
       [ 898,  137]])
       
       
Random Forest Classifier perfomed slighlty better than Decision Tree Classifier. 
confusion matrix:
    array([[7950,   58],
       [ 898,  137]])
This confusion matrix tells that 7950 items were correctly identified as yes and 137 items as no. Similarly, 898 items
were incorrectly predicted as no and 58 as yes.

'''

'\nSummary\nDecision Tree Classifier\n- without optimal values for parameters (Hyperparameters)\n    Accuracy = 88.05%\n    confusion matrix:\n    array([[7794,  214],\n       [ 862,  173]])\n       \n- wtih Hyperparameter tuning\n    Accuracy = 88.09%\n    confusion matrix:\n    array([[7794,  214],\n       [ 863,  172]])\n       \n       \nRandom Forest Classifier\n- without optimal values for parameters (Hyperparameters)\n    Accuracy = 87.91%\n    confusion matrix:\n    array([[7759,  249],\n       [ 844,  191]])\n       \n- wtih Hyperparameter tuning\n    Accuracy = 89.42%\n    confusion matrix:\n    array([[7950,   58],\n       [ 898,  137]])\n       \n       \nRandom Forest Classifier perfomed slighlty better than Decision Tree Classifier. \nconfusion matrix:\n    array([[7950,   58],\n       [ 898,  137]])\nThis confusion matrix tells that 7950 items were correctly identified as yes and 137 items as no. Similarly, 898 items\nwere incorrectly predicted as no and 58 as yes.\n\n'

In [42]:
# save best model
f = open('myModel.pkl', 'wb')
pickle.dump(rf_cl2,f)
f.close()

In [43]:
# Zip fie
zipfile.ZipFile('myModel.zip', 'w').write('mymodel.pkl')