# Random Forest Third Trial
## Summary of Tasks:
1. Encode feature variables - improvement over trial 1 - encode job using dummy encoding (better than frequency encoding)
2. Stratified K-Fold cross validation to handle the data imbalance
2. Train random forest model
3. Use grid search to find best hyperparameters that produce best f1 score

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [38]:
data = pd.read_csv('../data/bank.csv')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


## Encoding categorical variables
One hot encoding: marital, poutcome, contact

Frequency encoding: job

Binary encoding: loan, housing, default, y

Ordinal encoding: education

sin/cosine encoding: month, day_of_week

In [39]:
# Encoding categorical data
# Using ordinal encoding for y
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[['no','yes']])
data['y_encoded'] = encoder.fit_transform(data[['y']])
data['pdays'] = data['pdays'].replace(999, 0)


# Using dummy encoding to encode marital, poutcome, contact, job
to_dummy_cols = pd.DataFrame(data[['marital', 'poutcome','contact', 'job']])
dummy_cols = pd.get_dummies(to_dummy_cols)


# Using binary encoding for loan, housing, default, y
from category_encoders import BinaryEncoder 
encoder = BinaryEncoder(cols =['loan', 'housing', 'default']) 
# Transforming the column after fitting
newdata = encoder.fit_transform(data[['loan', 'housing', 'default']])


# Using ordinal encoding for education
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[["illiterate","unknown","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"]])
data['education_encoded'] = encoder.fit_transform(data[['education']])

# Sin/cosine encoding for month, day_of_week
# Map categorical values to numerical values
month_mapping = {month: i+1 for i, month in enumerate(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 
                                                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])}
day_mapping = {day: i+1 for i, day in enumerate(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])}

data['month_num'] = data['month'].map(month_mapping)
data['day_num'] = data['day_of_week'].map(day_mapping)

# Sine and Cosine Encoding
data['month_sin'] = np.sin(2 * np.pi * data['month_num'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month_num'] / 12)
data['day_sin'] = np.sin(2 * np.pi * data['day_num'] / 7)
data['day_cos'] = np.cos(2 * np.pi * data['day_num'] / 7)


# concatenating dataframe
data = pd.concat([data, newdata, dummy_cols], axis = 1) 
# dropping old column 
data = data.drop(['loan', 'housing', 'default','y','marital', 'poutcome','contact','job','month', 'day_of_week', 'month_num', 'day_num','education'], axis = 1)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 43 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   41188 non-null  int64  
 1   duration              41188 non-null  int64  
 2   campaign              41188 non-null  int64  
 3   pdays                 41188 non-null  int64  
 4   previous              41188 non-null  int64  
 5   emp.var.rate          41188 non-null  float64
 6   cons.price.idx        41188 non-null  float64
 7   cons.conf.idx         41188 non-null  float64
 8   euribor3m             41188 non-null  float64
 9   nr.employed           41188 non-null  int64  
 10  y_encoded             41188 non-null  float64
 11  education_encoded     41188 non-null  float64
 12  month_sin             41188 non-null  float64
 13  month_cos             41188 non-null  float64
 14  day_sin               41188 non-null  float64
 15  day_cos            

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X = np.array(data.drop(['y_encoded'], axis=1))
y = np.array(data[['y_encoded']])
# Feature Scaling for input features.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)



In [46]:
# STRATIFIES K-FOLD CROSS VALIDATION { 10-fold }

# Import Required Modules.
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Create classifier object.
rfc_skf = RandomForestClassifier(n_jobs=-1)

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
f1_stratified = []

for train_index, test_index in skf.split(X, y):
	X_train_fold, X_test_fold = X[train_index], X[test_index]
	y_train_fold, y_test_fold = y[train_index], y[test_index]
	rfc_skf.fit(X_train_fold, y_train_fold)
	y_predict = rfc_skf.predict(X_test_fold)
	fold_f1 = f1_score(y_test_fold, y_predict, average='binary')  # Use 'binary' for binary classification
	f1_stratified.append(fold_f1)

# Print the output.
print('List of possible f1:', f1_stratified)
print('\nMaximum f1 That can be obtained from this model is:',
	max(f1_stratified)*100, '%')
print('\nMinimum f1:',
	min(f1_stratified)*100, '%')
print('\nOverall f1:',
	mean(f1_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(f1_stratified))


List of possible f1: [0.5833333333333334, 0.5463917525773195, 0.5422446406052963, 0.5506172839506173, 0.5797101449275363, 0.5595984943538268, 0.5410872313527181, 0.5392405063291139, 0.6081730769230769, 0.5580838323353293]

Maximum f1 That can be obtained from this model is: 60.817307692307686 %

Minimum f1: 53.92405063291139 %

Overall f1: 56.084802966881675 %

Standard Deviation is: 0.022674858570793068


In [None]:
# Train basic random forest model now
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import *

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=31, test_size=0.2)

rfc = RandomForestClassifier(n_jobs=-1, random_state=31, class_weight='balanced')
rfc.fit(X_train, y_train)


# Checking scores
y_predict = rfc.predict(X_test)
y_predict_train = rfc.predict(X_train)
accuracy_test = accuracy_score(y_predict, y_test)
accuracy_train = accuracy_score(y_predict_train, y_train)
precision_test = precision_score(y_predict, y_test)
precision_train = precision_score(y_predict_train, y_train)
recall_test = recall_score(y_predict, y_test)
recall_train = recall_score(y_predict_train, y_train)
f1_test = f1_score(y_predict, y_test)
f1_train = f1_score(y_predict_train, y_train)

# Printing out results
# Predicting the accuracy score for Random Forest Classifier 
print('\n')
print('-------Random Forest Classifier-------')
print('accuracy for test is', accuracy_test)
print('accuracy for train is', accuracy_train)
print('\n')
print('precision for test is', precision_test)
print('precision for train is', precision_train)
print('\n')
print('recall for test is', recall_test)
print('recall for train is', recall_train)
print('\n')
print('f1 for test is', f1_test)
print('f1 for train is', f1_train)
print('\n')


print(classification_report(y_test, y_predict))




-------Random Forest Classifier-------
accuracy for test is 0.9121145909201263
accuracy for train is 0.9998786039453718


precision for test is 0.41713014460511677
precision for train is 0.9991980753809142


recall for test is 0.6521739130434783
recall for train is 0.9997325488098422


f1 for test is 0.508819538670285
f1 for train is 0.9994652406417113


              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      7339
         1.0       0.65      0.42      0.51       899

    accuracy                           0.91      8238
   macro avg       0.79      0.69      0.73      8238
weighted avg       0.90      0.91      0.90      8238



In [None]:
rf = RandomForestClassifier(random_state=31, n_jobs=-1)

params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200],
    'max_features': ['auto', 'sqrt', 'log2'],

}

from sklearn.model_selection import GridSearchCV

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 5,
                           n_jobs=-1, verbose=1, scoring="f1")

grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
rf_best = grid_search.best_estimator_
print(rf_best)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
