In [2]:
#Import needed modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from matplotlib import pyplot as plt

In [3]:
#Load the data
df = pd.read_csv("bank-additional.csv", header= 'infer', sep = ';')


In [4]:
#Inspect the top 10 rows of data
df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
5,32,services,single,university.degree,no,no,no,cellular,sep,thu,...,3,999,2,failure,-1.1,94.199,-37.5,0.884,4963.6,no
6,32,admin.,single,university.degree,no,yes,no,cellular,sep,mon,...,4,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,no
7,41,entrepreneur,married,university.degree,unknown,yes,no,cellular,nov,mon,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
8,31,services,divorced,professional.course,no,no,no,cellular,nov,tue,...,1,999,1,failure,-0.1,93.2,-42.0,4.153,5195.8,no
9,35,blue-collar,married,basic.9y,unknown,no,no,telephone,may,thu,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no


In [5]:
df['y'].value_counts()

no     3668
yes     451
Name: y, dtype: int64

In [6]:
#Convert the y column to a bit operator, indiactes whether or not customer took out a savings bond
df.loc[df['y'] == 'yes', 'sale'] = 1
df.loc[df['y'] == 'no', 'sale'] = 0

In [7]:
#One hot encoding goes here
df = df.drop('y', axis =1)
df_ohe = pd.get_dummies(df)

In [8]:
#Create our input values matrix (X) and our outcome variable (y)
X = df_ohe.drop('sale', axis = 1)
y = df['sale'].values #could also just use y from original data frame

In [9]:
#Split into training and test sets, 70:30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
#Build fit and return the predictions from our initial random forest estimator
classifier = RandomForestClassifier(n_estimators=200,  random_state=123)
classifier.fit(X_train, y_train)
y_pred = classifier.predict_proba(X_test)[:,1]
#y_pred = classifier.predict(X_test) <- Do not use, incredibly, this gives the wrong answer

In [11]:
#Trying to use appropriate metrics for a binary classification model
print('AUC:', metrics.roc_auc_score(y_test, y_pred))
print('Log Loss:', metrics.log_loss(y_test, y_pred))


AUC: 0.8941048356240986
Log Loss: 0.2609925013726643


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 1000, num = 100)]

# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 10, 15, 20] #Default is 2

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16] #Default is 1

# Create the random grid
param_grid = {'n_estimators': n_estimators
              ,'min_samples_split': min_samples_split
              ,'min_samples_leaf' : min_samples_leaf
             }

In [13]:
#build our random search cross validator
rf_random = RandomizedSearchCV(estimator = classifier, 
                               param_distributions = param_grid, 
                               n_iter = 20, 
                               cv = 3, verbose=2, 
                               scoring = 'roc_auc'
                               ,random_state=123
                               ,n_jobs = -1)#, refit=False)
# Fit the random search model
rf_random.fit(X, y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   18.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=200,
                                                    n_jobs=None,
 

In [14]:
#Print results.
print(rf_random.best_params_)
best = rf_random.best_params_
results = rf_random.cv_results_
print(rf_random.best_estimator_.score)

{'n_estimators': 623, 'min_samples_split': 10, 'min_samples_leaf': 8}
<bound method ClassifierMixin.score of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=623,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)>


In [15]:
#Build a classifier with our best fitting parameters
classifier2 = RandomForestClassifier(n_estimators=best['n_estimators']
                                   ,min_samples_split= best['min_samples_split']
                                   ,min_samples_leaf=best['min_samples_leaf']
                                   ,random_state=123)
classifier2.fit(X_train, y_train)
y_pred_2 = classifier2.predict_proba(X_test)[:,1]

In [17]:
#compare output metrics
print('AUC:', metrics.roc_auc_score(y_test, y_pred_2))

print('Log Loss:', metrics.log_loss(y_test, y_pred_2))


AUC: 0.9076728375077259
Log Loss: 0.21011647270212302


In [18]:
print('AUC:', metrics.roc_auc_score(y_test, y_pred))

print('Log Loss:', metrics.log_loss(y_test, y_pred))


AUC: 0.8941048356240986
Log Loss: 0.2609925013726643


In [19]:
#Setting up the parameter grid for Grid Search
# Number of trees in random forest
n_estimators_2 = list(range(615,633,2))

# Minimum number of samples required to split a node
min_samples_split_2 = [9,10,11]

# Minimum number of samples required at each leaf node
min_samples_leaf_2 = [7,8,9]

# Create the random grid
param_grid_2 = {'n_estimators': n_estimators_2
              ,'min_samples_split': min_samples_split_2
              , 'min_samples_leaf' : min_samples_leaf_2
             }

In [20]:
#Building the grid search optimiser
rf_grid = GridSearchCV(estimator = classifier, 
                               param_grid = param_grid_2, 
                               cv = 3, verbose=2, 
                               scoring = 'roc_auc',
                               n_jobs = -1)#, refit=False)
# Fit the random search model
rf_grid.fit(X, y)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False, random_state=123,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'min_

In [21]:
#Print model results
print(rf_grid.best_params_)
best = rf_grid.best_params_
results = rf_grid.cv_results_
print(rf_grid.best_estimator_.score)

{'min_samples_leaf': 8, 'min_samples_split': 9, 'n_estimators': 617}
<bound method ClassifierMixin.score of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=617,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)>


In [22]:
#Building and fitting a futher classifier with our very finely tuned model
classifier3 = RandomForestClassifier(n_estimators= best['n_estimators']
                                   ,min_samples_split= best['min_samples_split']
                                   ,min_samples_leaf=best['min_samples_leaf']
                                   ,random_state=123)
classifier3.fit(X_train, y_train)
y_pred_3 = classifier3.predict_proba(X_test)[:,1]

In [23]:
#Print and compare model metrics

print('AUC:', metrics.roc_auc_score(y_test, y_pred_3))
print('Log Loss:', metrics.log_loss(y_test, y_pred_3))

AUC: 0.9077022691820938
Log Loss: 0.2100596103811192


In [24]:
print('AUC:', metrics.roc_auc_score(y_test, y_pred_2))
print('Log Loss:', metrics.log_loss(y_test, y_pred_2))

AUC: 0.9076728375077259
Log Loss: 0.21011647270212302
