In [1]:
# Importing the required Package
import warnings
warnings.filterwarnings("ignore")

from sklearn import datasets

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

import statistics

import numpy as np

import pandas as pd

In [2]:
# Load data
iris = datasets.load_iris()

In [3]:
# Description About data set
print(iris.DESCR)

# After seeing below we need to build a model for classifying the classes of iris, so it is classification Problem.

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
# Training Data
X = pd.DataFrame(iris.data,columns = iris.feature_names)

# Testing Data
y = pd.DataFrame(iris.target,columns = ['species'])

<h3>K-Fold Cross Validation</h3>

In [5]:
kf = KFold(n_splits=10)

k_fold_score = []
for train_index, test_index in kf.split(X,y):
    
    # print(X.iloc[list(train_index),:])
    
    X_train, X_test = X.iloc[list(train_index),:], X.iloc[list(test_index),:]
    y_train, y_test = y.iloc[list(train_index),:], y.iloc[list(test_index),:]
    
    model = RandomForestClassifier()
    model.fit(X_train,y_train)
    y_predict = model.predict(X_test)
    k_fold_score.append(accuracy_score(y_test,y_predict))
   

In [6]:
print("Minimum accuracy we get is {}".format(min(k_fold_score)))
print("Maximun accuracy we get is {}".format(max(k_fold_score)))
print("We can get average accuracy is {}".format(statistics.mean(k_fold_score)))

Minimum accuracy we get is 0.8
Maximun accuracy we get is 1.0
We can get average accuracy is 0.9466666666666667


<h3>Stratified K Fold Cross Validation</h3>

In [7]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)

Stratified_score = []
for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X.iloc[list(train_index),:], X.iloc[list(test_index),:]
    y_train, y_test = y.iloc[list(train_index),:], y.iloc[list(test_index),:]
    
    model = RandomForestClassifier()
    model.fit(X_train,y_train)
    y_predict = model.predict(X_test)
    Stratified_score.append(accuracy_score(y_test,y_predict))

In [8]:
print("Minimum accuracy we get is {}".format(min(Stratified_score)))
print("Maximun accuracy we get is {}".format(max(Stratified_score)))
print("We can get average accuracy is {}".format(
    statistics.mean(Stratified_score)))

Minimum accuracy we get is 0.8666666666666667
Maximun accuracy we get is 1.0
We can get average accuracy is 0.96


**<center><h1>Parameter Tunning</h1></center>**

**Parameters**

1. **n_estimators:**
  
  It defines the number of decision trees to be created in a random forest.
Generally, a higher number makes the predictions stronger and more stable, but a very large number can result in higher training time.
2. **criterion:**

  It defines the function that is to be used for splitting.
The function measures the quality of a split for each feature and chooses the best split.
3. **max_features:**

  It defines the maximum number of features allowed for the split in each decision tree.
Increasing max features usually improve performance but a very high number can decrease the diversity of each tree.
4. **max_depth:**

  Random forest has multiple decision trees. This parameter defines the maximum depth of the trees.
5. **min_samples_split:**

  Used to define the minimum number of samples required in a leaf node before a split is attempted.
If the number of samples is less than the required number, the node is not split.
6. **min_samples_leaf:**

  This defines the minimum number of samples required to be at a leaf node.
Smaller leaf size makes the model more prone to capturing noise in train data.
7. **max_leaf_nodes:**

  This parameter specifies the maximum number of leaf nodes for each tree.
The tree stops splitting when the number of leaf nodes becomes equal to the max leaf node.
8. **n_jobs:**

  This indicates the number of jobs to run in parallel.
Set value to -1 if you want it to run on all cores in the system.
9. **random_state:**

  This parameter is used to define the random selection.
It is used for comparison between various models.


We will try adjusting the following set of hyperparameters:

- n_estimators: Number of trees in the foreset
- max_features: Max number of features considered for splitting a node
- max_depth: Max number of levels in each decision tree
- min_samples_split: Min number of data points placed in a node before the node is split
- min_samples_leaf: Min number of data points allowed in a leaf node
- bootstrap: Method for sampling data points (with or without replacement)

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


<u>Randomised search</u>

In [10]:
random_search = RandomizedSearchCV(RandomForestClassifier(), 
                                   random_grid, 
                                   random_state=1, 
                                   n_iter=100, 
                                   cv=5, 
                                   verbose=0, 
                                   n_jobs=-1)

random_search.fit(X_train,y_train)

#Print The value of best Hyperparameters
print(random_search.best_params_)

{'n_estimators': 1200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}


In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                 test_size=0.2,
                                                 random_state=10)
model = RandomForestClassifier(n_estimators =  1400, 
                               min_samples_split =  10, 
                               min_samples_leaf =  2, 
                               max_features = 'auto', 
                               max_depth =  40, 
                               bootstrap =  False)
model.fit(X_train,y_train)

y_predict = model.predict(X_test)
y_proba = model.predict_proba(X_test)
print("Accuracy of random forest tree model for classifying iris species",
      accuracy_score(y_test,y_predict))

Accuracy of random forest tree model for classifying iris species 1.0
