In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
df.Species.value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [4]:
df['Species'] = df['Species'].replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], [0,1,2])

In [5]:
df.sample(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
55,56,5.7,2.8,4.5,1.3,1
76,77,6.8,2.8,4.8,1.4,1
66,67,5.6,3.0,4.5,1.5,1
103,104,6.3,2.9,5.6,1.8,2
132,133,6.4,2.8,5.6,2.2,2


In [6]:
X = df.drop(['Id','Species'],axis=1)
y = df['Species']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

# Grid search for hyper-parameter tuning

In [9]:
# Create the parameter grid based on the results of random search 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html (link to show parameters)

params = {
    'n_estimators':[1,20,50,100,200],
    'criterion':["gini", "entropy", 'log_loss'],
    'max_depth':[1, 2, 5, 10, 20],
    'min_samples_split':[*range(1,20,4)],
    'min_samples_leaf':[5, 10, 20, 50, 100],
    'max_features' : ["auto", "sqrt", "log2"]
}

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Instantiate the grid search model

grid_search = GridSearchCV(estimator=rfc, param_grid=params, 
                          cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")

In [12]:
%%time
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5625 candidates, totalling 28125 fits
Wall time: 8min 40s


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [1, 2, 5, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 10, 20, 50, 100],
                         'min_samples_split': [1, 5, 9, 13, 17],
                         'n_estimators': [1, 20, 50, 100, 200]},
             scoring='accuracy', verbose=1)

In [13]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=10,
                       min_samples_split=9, n_estimators=1)

In [14]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 10,
 'min_samples_split': 9,
 'n_estimators': 1}

In [17]:
rfc = RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=10,
                       min_samples_split=9, n_estimators=1,criterion='gini')

In [18]:
rfc.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=10,
                       min_samples_split=9, n_estimators=1)

In [20]:
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

In [32]:
print("Train Performance :", accuracy_score(y_train, y_train_pred))
print("Train Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("-"*50)
print("Test Performance :", accuracy_score(y_test, y_test_pred))
print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("-"*50)
print(classification_report(y_test, y_test_pred))

Train Performance : 0.94
Train Confusion Matrix:
[[31  0  0]
 [ 0 30  5]
 [ 0  1 33]]
--------------------------------------------------
Test Performance : 0.98
Test Confusion Matrix:
[[19  0  0]
 [ 0 14  1]
 [ 0  0 16]]
--------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.93      0.97        15
           2       0.94      1.00      0.97        16

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [34]:
rfc.n_features_ 

4

In [36]:
rfc.feature_names_in_

array(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
      dtype=object)

In [38]:
rfc.n_outputs_ 

1

# Variable importance in RandomForest and Decision trees

In [39]:
rfc.feature_importances_

array([0.        , 0.00093377, 0.43828206, 0.56078418])

In [41]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rfc.feature_importances_
})

In [42]:
imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
3,PetalWidthCm,0.560784
2,PetalLengthCm,0.438282
1,SepalWidthCm,0.000934
0,SepalLengthCm,0.0
