## Random Forest algorithm example

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the "penguins" example dataset through seaborn and preview the first rows.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(344, 7)

In [4]:
# Per-column dtype and non-null count; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Number of missing values in each column.
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# Discard every row that has at least one missing value (modifies df itself).
df.dropna(axis='index', how='any', inplace=True)

In [7]:
# Re-count missing values per column to confirm none remain after the drop.
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### One hot encoding to convert categorical data into numeric

In [9]:
# Distinct values present in the 'sex' column.
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [10]:
# Preview the full one-hot encoding of 'sex' (one indicator column per value).
pd.get_dummies(df.sex).head(5)

Unnamed: 0,Female,Male
0,False,True
1,True,False
2,True,False
4,True,False
5,False,True


In [11]:
# Keep only k-1 indicator columns: drop_first removes the first level,
# so a single 'Male' column encodes the two-valued feature.
sex = pd.get_dummies(df.sex, drop_first=True)
sex.head(5)

Unnamed: 0,Male
0,True
1,False
2,False
4,False
5,True


### Applying one hot encoding to the island feature

In [13]:
# Distinct values present in the 'island' column.
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [14]:
# Preview the full one-hot encoding of 'island' (one indicator column per value).
pd.get_dummies(df.island).head(5)

Unnamed: 0,Biscoe,Dream,Torgersen
0,False,False,True
1,False,False,True
2,False,False,True
4,False,False,True
5,False,False,True


In [16]:
# One-hot encode 'island', dropping the first level so only k-1 indicator
# columns are kept.
island = pd.get_dummies(df.island, drop_first=True)
island.head()

Unnamed: 0,Dream,Torgersen
0,False,True
1,False,True
2,False,True
4,False,True
5,False,True


### Concatenate the above two data frames with the original df

In [18]:
# Append the indicator columns to the cleaned frame, then remove the original
# categorical columns they replace. Built as a single chained expression so
# no intermediate frame is mutated in place.
new_data = (
    pd.concat([df, island, sex], axis='columns')
    .drop(columns=['sex', 'island'])
)
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181.0,3750.0,False,True,True
1,Adelie,39.5,17.4,186.0,3800.0,False,True,False
2,Adelie,40.3,18.0,195.0,3250.0,False,True,False
4,Adelie,36.7,19.3,193.0,3450.0,False,True,False
5,Adelie,39.3,20.6,190.0,3650.0,False,True,True


In [19]:
# Target vector: the species label of every remaining row.
y = new_data['species']
y.head(5)

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [20]:
# Distinct class labels in the target; these are the keys the integer mapping must cover.
y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [21]:
# Encode the species names as integer class ids.
# The mapping is applied to the source column rather than to `y` itself so the
# cell is idempotent: re-running `y = y.map(...)` on already-encoded labels
# would silently turn every value into NaN.
y = new_data['species'].map({'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2})
# Guard against a species name that is missing from the mapping above.
assert y.notna().all(), 'unmapped species label'
y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [30]:
# Feature matrix: every column except the target.
# `new_data` still contains the string-valued `species` column, so using it
# directly would feed the label itself to the classifier (and make the fit
# fail on non-numeric input). Drop it explicitly, without mutating new_data,
# so the cell gives the same result on a fresh kernel.
X = new_data.drop(columns=['species'])
X.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181.0,3750.0,False,True,True
1,39.5,17.4,186.0,3800.0,False,True,False
2,40.3,18.0,195.0,3250.0,False,True,False
4,36.7,19.3,193.0,3450.0,False,True,False
5,39.3,20.6,190.0,3650.0,False,True,True


### Split data into train and test

In [32]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8
)
# Print each split preceded by its name.
for split_name, split in zip(
    ('X_train', 'X_test', 'y_train', 'y_test'),
    (X_train, X_test, y_train, y_test),
):
    print(split_name, split)

X_train      bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  Dream  \
220            46.1           13.2              211.0       4500.0  False   
65             41.6           18.0              192.0       3950.0  False   
214            45.7           17.0              195.0       3650.0   True   
218            50.8           19.0              210.0       4100.0   True   
230            40.9           13.7              214.0       4650.0  False   
..              ...            ...                ...          ...    ...   
91             41.1           18.1              205.0       4300.0   True   
54             34.5           18.1              187.0       2900.0  False   
197            50.8           18.5              201.0       4450.0   True   
142            32.1           15.5              188.0       3050.0   True   
139            39.7           17.9              193.0       4250.0   True   

     Torgersen   Male  
220      False  False  
65       False   Tr

### Training a random forest classifier on the training set

In [36]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 5 trees, entropy as the split criterion and a fixed seed,
# fitted on the training split.
classifier = RandomForestClassifier(
    random_state=0,
    criterion='entropy',
    n_estimators=5,
)
classifier.fit(X=X_train, y=y_train)

### Predicting the test results

In [38]:
# Predicted class ids for the held-out rows.
y_pred = classifier.predict(X=X_test)
y_pred

array([0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 0,
       1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 1, 2, 2, 1, 2, 1, 2, 2, 0,
       2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0,
       0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 1, 0, 0, 1, 2, 2, 1, 2, 2, 2, 1, 0,
       1, 0, 0, 2, 2, 0, 2, 0, 1, 0, 0, 0])

### Confusion matrix

In [41]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
# Confusion matrix of true vs. predicted class ids on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[43  0  0]
 [ 0 19  0]
 [ 0  0 38]]


In [42]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [43]:
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        38

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



### Try with a different number of trees and the Gini criterion

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
# Second experiment: 7 trees with the Gini split criterion, same seed,
# fitted on the same training split (replaces the previous `classifier`).
classifier = RandomForestClassifier(
    random_state=0,
    criterion='gini',
    n_estimators=7,
)
classifier.fit(X=X_train, y=y_train)

In [47]:
# Predicted class ids for the held-out rows from the current classifier.
y_pred = classifier.predict(X=X_test)
y_pred

array([0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 0,
       1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 1, 2, 2, 1, 2, 1, 2, 2, 0,
       2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0,
       0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 1, 0, 0, 1, 2, 2, 1, 2, 2, 2, 1, 0,
       1, 0, 0, 2, 2, 0, 2, 0, 1, 0, 0, 0])

In [49]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
# Confusion matrix of true vs. predicted class ids for the current predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[43  0  0]
 [ 0 19  0]
 [ 0  0 38]]


In [50]:
# Fraction of test rows classified correctly by the current predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [51]:
# Per-class precision / recall / F1 for the current predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        38

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

