In [1]:
# Importing the required Package

import statistics
import warnings

import numpy as np
import pandas as pd

from sklearn import datasets

from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [2]:
# Fetch the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Show the dataset's own description text.
# It lists three iris species as the class attribute, so the task is a
# multi-class classification problem.
print(iris["DESCR"])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [3]:
# Feature matrix: one column per measurement named in iris.feature_names
train = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Class labels, kept as a single-column frame named 'species'
target = pd.DataFrame({'species': iris.target})

# Preview the first rows of the features, then of the labels
print(train.head(), target.head(), sep="\n")

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
   species
0        0
1        0
2        0
3        0
4        0


In [4]:
# Hold out 20% of the samples for evaluation. Stratifying on the label keeps
# the class proportions equal in the train and test partitions.
X_train,X_test,y_train,y_test = train_test_split(train,target,
                                                 test_size=0.2,
                                                 stratify=target['species'],
                                                 random_state=10)

# Seed the booster so that re-running the notebook gives the same model.
model = AdaBoostClassifier(random_state=10)
# y_train is a single-column DataFrame; flatten it to a 1-D label vector
# instead of handing fit() a column vector.
model.fit(X_train,y_train.values.ravel())

y_predict = model.predict(X_test)

In [5]:
# 10-fold stratified cross-validation of a default AdaBoost classifier.
skf = StratifiedKFold(n_splits=10)

# Accuracy measured on the held-out part of each fold, in fold order.
Stratified_score = []
for train_index, test_index in skf.split(train, target):
    # Use fold-local names here. Re-using X_train / X_test / y_train / y_test /
    # model / y_predict would overwrite the hold-out split and fitted model
    # created before this loop, so every later cell would silently work on
    # the last fold instead.
    X_fold_train, X_fold_test = train.iloc[train_index], train.iloc[test_index]
    # Select the label column as a Series so fit() receives a 1-D target.
    y_fold_train = target['species'].iloc[train_index]
    y_fold_test = target['species'].iloc[test_index]

    # Fixed seed so the per-fold scores are reproducible.
    fold_model = AdaBoostClassifier(random_state=10)
    fold_model.fit(X_fold_train, y_fold_train)
    fold_predict = fold_model.predict(X_fold_test)
    Stratified_score.append(accuracy_score(y_fold_test, fold_predict))

In [6]:
# Summarise the per-fold accuracies collected in Stratified_score.
print("Minimum accuracy we get is {}".format(min(Stratified_score)))
print("Maximum accuracy we get is {}".format(max(Stratified_score)))
print("We can get average accuracy is {}".\
      format(statistics.mean(Stratified_score)))

# The fitted estimator is an AdaBoostClassifier, so the message names it as
# such. The score is computed on whatever y_test / y_predict currently hold.
print("Accuracy of AdaBoost model for classifying iris species",
      accuracy_score(y_test,y_predict))

Minimum accuracy we get is 0.8666666666666667
Maximun accuracy we get is 1.0
We can get average accuracy is 0.9533333333333334
Accuracy of random forest tree model for classifying iris species 1.0


PARAMETER TUNING

**Parameters**

1. **base_estimator:**
It specifies the base estimator, that is, the machine learning algorithm used as the weak learner that is boosted.
2. **n_estimators:**
It defines the maximum number of base estimators at which boosting is terminated.
The default value for AdaBoostClassifier is 50; a higher value can improve performance at the cost of longer training time.
3. **learning_rate:**
This parameter controls the contribution of the estimators in the final combination.
There is a trade-off between learning_rate and n_estimators.
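4. **max_depth:**
Defines the maximum depth of each individual base estimator.
It is a parameter of the base decision tree rather than of AdaBoostClassifier itself, so it is set on the base estimator; tune it for best performance.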
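5. **n_jobs:**
Specifies the number of processors that may be used in parallel.
AdaBoostClassifier itself does not take this parameter; in this notebook it is passed to RandomizedSearchCV, where a value of -1 uses all available processors.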
6. **random_state:**
An integer seed for the random number generator used by the estimator.
A fixed value of random_state will always produce the same results when given the same parameters and training data.


In [7]:
# Candidate numbers of boosting rounds: 200, 400, ..., 2000 (ten values)
n_estimators = list(range(200, 2001, 200))

# Candidate learning rates for the boosted ensemble
learning_rate = [0.1,0.01,0.02]

# Search space handed to the randomized hyper-parameter search
random_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'learning_rate': [0.1, 0.01, 0.02]}


In [8]:
# The grid is finite, so never request more draws than there are distinct
# parameter combinations (asking for more only triggers a warning and a
# fallback to the full grid).
n_candidates = int(np.prod([len(values) for values in random_grid.values()]))

# Randomized search over random_grid with 5-fold cross-validation on the
# training data. The base model is seeded so the search is reproducible.
random_search = RandomizedSearchCV(AdaBoostClassifier(random_state=1),
                                   random_grid,
                                   random_state=1,
                                   n_iter=min(100, n_candidates),
                                   cv=5,
                                   verbose=0,
                                   n_jobs=-1)

# Flatten the labels to a 1-D vector; y_train may be a single-column frame.
random_search.fit(X_train,np.ravel(y_train))

# Print the best hyper-parameter combination found by the search
print(random_search.best_params_)

{'n_estimators': 200, 'learning_rate': 0.1}


In [9]:
# Build the classifier from the parameters selected by the search rather than
# re-typing them, so this cell stays correct if the search result changes.
model = AdaBoostClassifier(**random_search.best_params_)

# Flatten the labels to a 1-D vector; y_train may be a single-column frame.
model.fit(X_train,np.ravel(y_train))

y_predict=model.predict(X_test)

# The estimator is an AdaBoostClassifier, so the message names it as such.
print("Accuracy of AdaBoost model for classifying iris species",
      accuracy_score(y_test,y_predict))
print("\nCurrently used params\n\n",model.get_params())

Accuracy of random forest tree model for classifying iris species 1.0

Currently used params

 {'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 0.1, 'n_estimators': 200, 'random_state': None}
