# 10 | Performance metrics + Ensemble Learning

## Importing Libraries

In [1]:
import numpy as np                                            #data manipulation
import pandas as pd                                           #data manipulation
from sklearn import datasets                                  #readymade datasets
from sklearn.model_selection import train_test_split          #splitting train-test data
from sklearn.model_selection import cross_val_score as cvs    #cross validation 
from sklearn import metrics                                   #performance metrics
from sklearn.metrics import confusion_matrix as cm            #confusion matrix
from sklearn.svm import SVC                                   #support vector classification
from sklearn.naive_bayes import GaussianNB                    #gaussian naive bayes
from sklearn.tree import DecisionTreeClassifier as dtc        #decision tree 
from sklearn.neighbors import KNeighborsClassifier as knc     #k nearest neighnors
from sklearn.linear_model import LogisticRegression as lr     #regression
from sklearn.ensemble import VotingClassifier as vc           #bagging ensemble
from sklearn.ensemble import AdaBoostClassifier               #boosting ensemble

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
X.shape , y.shape

((150, 4), (150,))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=0)
gnb=GaussianNB()
clf=gnb.fit(X_train,y_train)
y_predict=gnb.predict(X)

## Cross Validation

In [4]:
scores=cvs(clf, X, y, cv=5) # 5 folds of training-testing data
scores

array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [5]:
scores.mean()

0.9533333333333334

## Confusion matrix

In [6]:
print(cm(y,y_predict))
print("Accuracy=",clf.score(X_test,y_test))

[[50  0  0]
 [ 0 48  2]
 [ 0  4 46]]
Accuracy= 0.9333333333333333


# Ensemble

To combine predictions of several best estimations (multiple algorithms used together for predicton)

## Averaging method: Bagging Algorithm

Majority of predictions if discrete labels --> Classification

Average of predictions if continuous labels --> Regression

In [7]:
estimators=[]
m1= knc(n_neighbors=3)
estimators.append(('KNN', m1))

m2= dtc()
estimators.append(('DTC', m2))

m3= SVC()
estimators.append(('SVM', m3))

m4= lr()
estimators.append(('LR', m4))

In [8]:
ensemble = vc(estimators)
eclf1=ensemble.fit(X_train,y_train)



In [9]:
y_pred=eclf1.predict(X_test)
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2])

In [10]:
print("Bagging Accuracy=")
metrics.accuracy_score(y_test, y_pred)

Bagging Accuracy=


0.95

## Boosting method: Adaboost Algorithm

The classification models work on the weak learers of previous stages.
So more accuracy due to feedback mechanism.

The next model tries to improve the inaccurate predictions of previous models.

In [11]:
#base esimator = SVM classifier
svc = SVC(probability=True, kernel='linear')

#Model creation with 50 classifiers 
abc = AdaBoostClassifier(n_estimators=50, base_estimator=svc)

#training adaboost model
model = abc.fit(X_train, y_train)

#predict the response for test dataset
y_pred = model.predict(X_test)
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2])

In [12]:
print("SVM Adaboost Accuracy=")
metrics.accuracy_score(y_test, y_pred)

SVM Adaboost Accuracy=


0.9666666666666667

In [13]:
#Default estimator = Decision Tree

#Model creation with 50 classifiers 
abc = AdaBoostClassifier(n_estimators=50)

#training adaboost model
model = abc.fit(X_train, y_train)

#predict the response for test dataset
y_pred = model.predict(X_test)
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2])

In [14]:
print("Adaboost Accuracy=")
metrics.accuracy_score(y_test, y_pred)

Adaboost Accuracy=


0.8666666666666667