In [116]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
import pickle
import zipfile

In [117]:
# file path
FILE_PATH = "/Users/pradeep/PycharmProjects/ontrack3/data/bank-full.csv"

In [118]:
# Load into dataframe and see if it works
df = pd.read_csv(FILE_PATH, sep=';')

# see top 5 rows
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [119]:
# Feature selection - select only relevent columns
features = df[['age', 'job', 'marital', 'default', 'housing', 'loan', 'poutcome', 'y']]
features

Unnamed: 0,age,job,marital,default,housing,loan,poutcome,y
0,58,management,married,no,yes,no,unknown,no
1,44,technician,single,no,yes,no,unknown,no
2,33,entrepreneur,married,no,yes,yes,unknown,no
3,47,blue-collar,married,no,yes,no,unknown,no
4,33,unknown,single,no,no,no,unknown,no
...,...,...,...,...,...,...,...,...
45206,51,technician,married,no,no,no,unknown,yes
45207,71,retired,divorced,no,no,no,unknown,yes
45208,72,retired,married,no,no,no,success,yes
45209,57,blue-collar,married,no,no,no,unknown,no


In [120]:
# Feature Selection
# select important features
X = features[['age', 'job', 'marital', 'default', 'housing', 'loan', 'poutcome']]

# set target
y = features[['y']] # target variable

In [121]:
# check data types for columns
features.dtypes

age          int64
job         object
marital     object
default     object
housing     object
loan        object
poutcome    object
y           object
dtype: object

In [122]:
# numerical features
num_features = ['age']

# make a pipeline to perform scaling
num_pipeline = Pipeline([
    ('scale', StandardScaler())
])

In [165]:
# categorical features
cat_features = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome']

# make a pipeline to perfom one hot encoding
cat_pipeline = Pipeline([
    ('encode', OneHotEncoder())
])


In [162]:
# make full pipeline for converting categorical and numrical features accordingly
column_transform = ColumnTransformer([
    ('numerical', num_pipeline, num_features),
    ('categorical', cat_pipeline, cat_features)
])

In [163]:
# pipeline to transform columns and decision tree instance
clf = Pipeline([('pipeline', column_transform),('classifier', DecisionTreeClassifier())])

In [126]:
# split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [166]:
# train the model
clf = clf.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

In [128]:
# make a prediction
y_pred = clf.predict(X_test)

In [129]:
# get accuracy score
accuracy_score(y_test, y_pred)

0.877031958420878

In [130]:
# see confusion matrix
confusion_matrix(y_test, y_pred)

array([[7764,  207],
       [ 905,  167]])

In [131]:
# see classification report in detail
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          no       0.90      0.97      0.93      7971\n         yes       0.45      0.16      0.23      1072\n\n    accuracy                           0.88      9043\n   macro avg       0.67      0.56      0.58      9043\nweighted avg       0.84      0.88      0.85      9043\n'

In [132]:
'''
Hypertuning - let's find the optimal values for some of the parameters
'''
# define some parameters and some values
params = {
        'classifier__criterion': ["gini", "entropy"],
 
        'classifier__splitter': ["best", "random"],
 
        'classifier__max_depth': [2,4,6,8,10,20,30,50],

        'classifier__max_leaf_nodes' : [2,4,6,8,10,20,30,50]
    }

In [133]:
# Instanciate the GridSearch object and pass estimator and grid parameters
grid_cv = GridSearchCV(clf, params, cv=10).fit(X_train, y_train)

In [134]:
#find optimal values
grid_cv.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 4,
 'classifier__max_leaf_nodes': 30,
 'classifier__splitter': 'random'}

In [135]:
# instanciate decision tree classifier with optimal values
dt = DecisionTreeClassifier(criterion='entropy', max_depth=8, max_leaf_nodes=20, splitter='best')

# build pipeline with column transformer and classifier
clf_2 = Pipeline([('pipeline', column_transform),('classifier', dt)])

# fit data
clf_2 = clf.fit(X_train, y_train)

In [136]:
# predict
y_pred = clf_2.predict(X_test)

In [137]:
# Check accuracy
accuracy_score(y_test, y_pred)

0.8769213756496738

In [138]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[7763,  208],
       [ 905,  167]])

In [139]:
# classification report
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          no       0.90      0.97      0.93      7971\n         yes       0.45      0.16      0.23      1072\n\n    accuracy                           0.88      9043\n   macro avg       0.67      0.56      0.58      9043\nweighted avg       0.84      0.88      0.85      9043\n'

In [140]:
'''Random Forest Section'''

'Random Forest Section'

In [141]:
# build pipeline with column transformer and random forest classifier
rf_clf = Pipeline([
    ('columnTransformer', column_transform),
    ('classifier', RandomForestClassifier(random_state=1))
])

In [142]:
# train with data
rf_clf = rf_clf.fit(X_train, y_train.values.ravel())

In [143]:
# make prediction
y_pred = rf_clf.predict(X_test)

In [144]:
# accuracy score
accuracy_score(y_test, y_pred)

0.8794647793873714

In [145]:
# print confusion matrix
confusion_matrix(y_test, y_pred)

array([[7767,  204],
       [ 886,  186]])

In [146]:
# Hyperparameter tuning
# define some parameters to tune
params = {
    'classifier__n_estimators' : [1, 5, 10, 20, 50],
    'classifier__criterion' : ["gini", "entropy"],
    'classifier__max_depth': [2,4,6,8,10,20,30],
    'classifier__max_leaf_nodes' : [2,4,6,8,10,20,30]
}

In [147]:
# grid search with parameters and cross validation of 2 times
rf_grid_search = GridSearchCV(rf_clf, params, cv=2)

In [148]:
# fit the training data
rf_grid_search = rf_grid_search.fit(X_train, y_train.values.ravel())

In [149]:
# get optimal values
rf_grid_search.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 20,
 'classifier__max_leaf_nodes': 30,
 'classifier__n_estimators': 20}

In [150]:
# random forest classifier with best params
rf_clf_2 = RandomForestClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=30, n_estimators=50, random_state=1)

In [151]:
# build pipeline with column transformater and classifier
rf_cl2 = Pipeline([
    ('columnTransformer', column_transform),
    ('classifier', rf_clf_2)
])

In [152]:
# fit the traind data
rf_cl2 = rf_cl2.fit(X_train, y_train.values.ravel())

In [153]:
# pred the test data
y_pred = rf_cl2.predict(X_test)

In [154]:
# accuracy score
accuracy_score(y_test, y_pred)

0.8920712153046555

In [155]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[7917,   54],
       [ 922,  150]])

In [156]:
'''
Summary
Decision Tree Classifier
- without optimal values for parameters (Hyperparameters)
    Accuracy = 88.05%
    confusion matrix:
    array([[7794,  214],
       [ 862,  173]])
       
- wtih Hyperparameter tuning
    Accuracy = 88.09%
    confusion matrix:
    array([[7794,  214],
       [ 863,  172]])
       
       
Random Forest Classifier
- without optimal values for parameters (Hyperparameters)
    Accuracy = 87.91%
    confusion matrix:
    array([[7759,  249],
       [ 844,  191]])
       
- wtih Hyperparameter tuning
    Accuracy = 89.42%
    confusion matrix:
    array([[7950,   58],
       [ 898,  137]])
       
       
Random Forest Classifier perfomed slighlty better than Decision Tree Classifier. 
confusion matrix:
    array([[7950,   58],
       [ 898,  137]])
This confusion matrix tells that 7950 items were correctly identified as yes and 137 items as no. Similarly, 898 items
were incorrectly predicted as no and 58 as yes.

'''

'\nSummary\nDecision Tree Classifier\n- without optimal values for parameters (Hyperparameters)\n    Accuracy = 88.05%\n    confusion matrix:\n    array([[7794,  214],\n       [ 862,  173]])\n       \n- wtih Hyperparameter tuning\n    Accuracy = 88.09%\n    confusion matrix:\n    array([[7794,  214],\n       [ 863,  172]])\n       \n       \nRandom Forest Classifier\n- without optimal values for parameters (Hyperparameters)\n    Accuracy = 87.91%\n    confusion matrix:\n    array([[7759,  249],\n       [ 844,  191]])\n       \n- wtih Hyperparameter tuning\n    Accuracy = 89.42%\n    confusion matrix:\n    array([[7950,   58],\n       [ 898,  137]])\n       \n       \nRandom Forest Classifier perfomed slighlty better than Decision Tree Classifier. \nconfusion matrix:\n    array([[7950,   58],\n       [ 898,  137]])\nThis confusion matrix tells that 7950 items were correctly identified as yes and 137 items as no. Similarly, 898 items\nwere incorrectly predicted as no and 58 as yes.\n\n'

In [157]:
# save best model
f = open('myModel.pkl', 'wb')
pickle.dump(rf_cl2,f)
f.close()

In [158]:
# Zip fie
zipfile.ZipFile('myModel.zip', 'w').write('mymodel.pkl')