# Random Forest Classification on Bill_authentication Dataset.

# Importing the Libraries.

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Inporting the Datasets.

In [4]:
df = pd.read_csv('bill_authentication.csv')

In [5]:
df

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


# Performing EDA.

In [6]:
df.shape

(1372, 5)

In [7]:
df.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Variance  1372 non-null   float64
 1   Skewness  1372 non-null   float64
 2   Curtosis  1372 non-null   float64
 3   Entropy   1372 non-null   float64
 4   Class     1372 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [9]:
df.describe()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,0.444606
std,2.842763,5.869047,4.31003,2.101013,0.497103
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.773,-1.7082,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [10]:
df.isnull().sum()

Variance    0
Skewness    0
Curtosis    0
Entropy     0
Class       0
dtype: int64

In [11]:
#pretty much balanced dataset
df['Class'].value_counts()

0    762
1    610
Name: Class, dtype: int64

In [12]:
df.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [17]:
#seperating X and y
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [18]:
#Splitting Into train and test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 0,test_size = 0.3)

# Using Single Decision Tree. 

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score

model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2_score(y_test,y_pred)

0.9013409961685824

# RandomForest Without Hyperparameter tuning.

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [22]:
r2_score(y_test,y_pred)

0.960536398467433

# Hyperparameter tuning.

In [23]:
from sklearn.model_selection import RandomizedSearchCV
param_distributions = {
    'n_estimators':np.arange(50,251,50),
    'criterion':['gini','entropy'],
    'max_depth':np.arange(1,10),
    'max_features':np.arange(1,4),
    'min_samples_leaf':np.arange(1,11)
}

In [24]:
estimator = RandomForestClassifier(random_state=0)

In [25]:
rscv = RandomizedSearchCV(estimator,param_distributions,cv=10)

In [26]:
rscv.fit(X_train,y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(random_state=0),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'max_features': array([1, 2, 3]),
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                                        'n_estimators': array([ 50, 100, 150, 200, 250])})

In [27]:
rscv.best_params_

{'n_estimators': 200,
 'min_samples_leaf': 2,
 'max_features': 2,
 'max_depth': 8,
 'criterion': 'gini'}

In [28]:
rscv.best_estimator_

RandomForestClassifier(max_depth=8, max_features=2, min_samples_leaf=2,
                       n_estimators=200, random_state=0)

# Model Building with Hyperparameter tuning.

In [29]:
model.fit(X_train,y_train)

RandomForestClassifier()

In [30]:
y_pred = model.predict(X_test)

In [31]:
r2_score(y_test,y_pred)

0.960536398467433