# Random Forest Implementation in Python

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
df = pd.read_csv('bank_sample.csv')
df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,39,156,5,999,0,1.4,94.465,-41.8,4.865,5228.1,...,4,1,1,1,0,0,1,4,1,1
1,25,103,1,999,0,1.4,93.918,-42.7,4.963,5228.1,...,0,2,3,0,2,0,0,3,2,1
2,50,121,2,999,0,1.1,93.994,-36.4,4.859,5191.0,...,9,1,5,0,0,0,1,6,0,1
3,45,187,1,999,0,1.4,93.444,-36.1,4.965,5228.1,...,1,1,0,1,2,2,0,1,1,1
4,46,124,4,999,0,-1.8,93.075,-47.1,1.405,5099.1,...,7,1,3,0,0,0,0,0,1,1


In [4]:
df.isnull().sum()

age               0
duration          0
campaign          0
pdays             0
previous          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
purchased         0
id                0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
poutcome          0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3295 entries, 0 to 3294
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             3295 non-null   int64  
 1   duration        3295 non-null   int64  
 2   campaign        3295 non-null   int64  
 3   pdays           3295 non-null   int64  
 4   previous        3295 non-null   int64  
 5   emp.var.rate    3295 non-null   float64
 6   cons.price.idx  3295 non-null   float64
 7   cons.conf.idx   3295 non-null   float64
 8   euribor3m       3295 non-null   float64
 9   nr.employed     3295 non-null   float64
 10  purchased       3295 non-null   int64  
 11  id              3295 non-null   int64  
 12  job             3295 non-null   int64  
 13  marital         3295 non-null   int64  
 14  education       3295 non-null   int64  
 15  default         3295 non-null   int64  
 16  housing         3295 non-null   int64  
 17  loan            3295 non-null   i

In [24]:
df.purchased.value_counts()

0    2940
1     355
Name: purchased, dtype: int64

In [25]:
X = df.drop('purchased',axis=1)
y = df['purchased']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,random_state=42)
X_train.shape, X_test.shape

((2306, 21), (989, 21))

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42,bootstrap=True,criterion='gini',max_depth=3,n_estimators=10,
                           oob_score=True)

In [28]:
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=3, n_estimators=10, oob_score=True,
                       random_state=42)

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
def evaluate_model(classifier):
    print("Train Accuracy :", accuracy_score(y_train, classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, classifier.predict(X_test)))

In [30]:
evaluate_model(rf)

Train Accuracy : 0.9032957502168256
Train Confusion Matrix:
[[2048    3]
 [ 220   35]]
--------------------------------------------------
Test Accuracy : 0.9110212335692619
Test Confusion Matrix:
[[884   5]
 [ 83  17]]


In [31]:
rf.oob_score_

0.8946227233304423

In [35]:
from sklearn.metrics import classification_report
train_report = classification_report(y_train,rf.predict(X_train))
print(train_report)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2051
           1       0.92      0.14      0.24       255

    accuracy                           0.90      2306
   macro avg       0.91      0.57      0.59      2306
weighted avg       0.90      0.90      0.87      2306



In [36]:
print(classification_report(y_test,rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       889
           1       0.77      0.17      0.28       100

    accuracy                           0.91       989
   macro avg       0.84      0.58      0.62       989
weighted avg       0.90      0.91      0.88       989



In [37]:
from sklearn.metrics import f1_score
f1_score(y_test,rf.predict(X_test))

0.27868852459016397

In [38]:
rf.feature_importances_

array([0.02301768, 0.28975178, 0.0045601 , 0.015324  , 0.01024751,
       0.08175584, 0.05330772, 0.03108918, 0.27704523, 0.13017772,
       0.02278901, 0.00516279, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01377033, 0.        , 0.00320168,
       0.03879941])

In [40]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rf.feature_importances_
})

In [41]:
imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
1,duration,0.289752
8,euribor3m,0.277045
9,nr.employed,0.130178
5,emp.var.rate,0.081756
6,cons.price.idx,0.053308
20,poutcome,0.038799
7,cons.conf.idx,0.031089
0,age,0.023018
10,id,0.022789
3,pdays,0.015324


# According to our model, duration is the most important feature.

In [42]:
#The End