### Implementation of Ensemble Learning Model


![](ensemble_learning.jpg)

# Performance of Weak Learners

In [None]:
# A weak learner is a model that performs only slightly better than random guessing, so its predictions still contain errors.

In [None]:
# We are Importing some libraries to build the models

import numpy as np # use for numerical operations
import pandas as pd # use for dataframe

# Here we are importing the classifier classes
from sklearn.tree import DecisionTreeClassifier # Decision Tree classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.neighbors import KNeighborsClassifier # KNN Classifier


In [None]:
# Read the dataset from the CSV file into a DataFrame
dataset = pd.read_csv('Ensemble.csv')

print(dataset)

# The dataset has been updated on purpose with some incomplete or missing information.
# The missing entries are represented by NaN (Not a Number) / null values.

    Height (in cms)  Weight (in kgs) T Shirt Size
0             158.0             58.0            M
1             158.0             59.0            M
2             158.0             63.0            M
3             160.0             59.0            M
4             160.0             60.0            M
5             189.0              NaN            M
6             100.0             67.0          NaN
7             168.0             66.0            L
8             170.0             63.0            L
9             170.0             64.0            L
10            200.0              NaN            M
11            210.0             89.0          NaN
12            163.0             60.0            M
13            163.0             61.0            M
14            160.0             64.0            L
15            163.0             64.0            L
16            165.0             61.0            L
17            163.0              NaN          NaN
18            165.0             61.0          NaN


In [None]:
# The size of the dataset is available through its `shape` attribute
dataset.shape # (rows, columns): 29 rows and 3 columns in the recorded run

(29, 3)

In [None]:
# This dataset has incomplete or missing information.
# One way to obtain a complete dataset is to remove the incomplete records.
# Called without arguments, as here, dropna() removes rows (not columns) that contain a NaN.

# drop all rows with any NaN values

ds1 = dataset.dropna()
print(ds1)

# ds1 is the complete-case dataset: none of its rows contains a NaN value

    Height (in cms)  Weight (in kgs) T Shirt Size
0             158.0             58.0            M
1             158.0             59.0            M
2             158.0             63.0            M
3             160.0             59.0            M
4             160.0             60.0            M
7             168.0             66.0            L
8             170.0             63.0            L
9             170.0             64.0            L
12            163.0             60.0            M
13            163.0             61.0            M
14            160.0             64.0            L
15            163.0             64.0            L
16            165.0             61.0            L
21            165.0             62.0            L
22            165.0             65.0            L
23            168.0             62.0            L
24            168.0             63.0            L
28            170.0             68.0            L


In [None]:
# Check how many rows remain once the rows with null values have been removed
ds1.shape

(18, 3)

In [None]:
# There are two ways to make the dataset complete:
# 1. Remove the rows that contain NaN / null values (ds1)
# 2. Replace the NaN values with substitute values (ds2)

# Forward fill: every NaN is replaced by the closest non-missing value above it
# in the same column.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases.
# NOTE(review): this also fills the missing 'T Shirt Size' entries, i.e. target
# labels are imputed from the previous row - confirm that this is intended.
ds2 = dataset.ffill()
print (ds2)

    Height (in cms)  Weight (in kgs) T Shirt Size
0             158.0             58.0            M
1             158.0             59.0            M
2             158.0             63.0            M
3             160.0             59.0            M
4             160.0             60.0            M
5             189.0             60.0            M
6             100.0             67.0            M
7             168.0             66.0            L
8             170.0             63.0            L
9             170.0             64.0            L
10            200.0             64.0            M
11            210.0             89.0            M
12            163.0             60.0            M
13            163.0             61.0            M
14            160.0             64.0            L
15            163.0             64.0            L
16            165.0             61.0            L
17            163.0             61.0            L
18            165.0             61.0            L


In [None]:
# dataset - the raw data, including the NaN values
# ds1     - only the rows that have no NaN value
# ds2     - all rows and columns, with the NaN values filled in by forward fill

ds2.shape
# In the printed ds2 no null or NaN values remain

(29, 3)

In [None]:
# ds1 - 18 rows (the rows containing NaN were removed)
# ds2 - 29 rows, with the NaN values filled in

ds2.columns
# ds2 is the filled dataset that is used from here on,
# while the dataset variable still holds the original data including the NaN values

Index(['Height (in cms)', 'Weight (in kgs)', 'T Shirt Size'], dtype='object')

In [None]:
X = ds2.iloc[:, [0, 1]].values # Independent variables: columns 0 and 1 (height and weight)
y = ds2.iloc[:, 2].values # Dependent variable / target class: column 2 (T-shirt size)

print (X)
print (y)

[[158.  58.]
 [158.  59.]
 [158.  63.]
 [160.  59.]
 [160.  60.]
 [189.  60.]
 [100.  67.]
 [168.  66.]
 [170.  63.]
 [170.  64.]
 [200.  64.]
 [210.  89.]
 [163.  60.]
 [163.  61.]
 [160.  64.]
 [163.  64.]
 [165.  61.]
 [163.  61.]
 [165.  61.]
 [165.  64.]
 [168.  64.]
 [165.  62.]
 [165.  65.]
 [168.  62.]
 [168.  63.]
 [168.  67.]
 [210.  67.]
 [210.  89.]
 [170.  68.]]
['M' 'M' 'M' 'M' 'M' 'M' 'M' 'L' 'L' 'L' 'M' 'M' 'M' 'M' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'M' 'L' 'L']


In [None]:
# The T-shirt sizes are strings ('M' / 'L'). To be able to compute with them,
# the string labels are transformed into integer codes.

# LabelEncoder (from sklearn.preprocessing) maps each distinct label to an integer
from sklearn.preprocessing import LabelEncoder

# le is the encoder object
le = LabelEncoder()

# y keeps the original M/L labels; y_new receives the encoded values.
# fit_transform learns the distinct labels and returns their integer codes in one step.
y_new = le.fit_transform(y)

print (y) # original values in M/L

print(y_new) # encoded values in 0 and 1

# 1 means M and 0 means L

['M' 'M' 'M' 'M' 'M' 'M' 'M' 'L' 'L' 'L' 'M' 'M' 'M' 'M' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'M' 'L' 'L']
[1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


In [None]:
# The M/L values have been transformed into 1 and 0.
# Update y so that it holds the encoded values.

y = y_new # y now refers to the 0/1 encoded labels stored in y_new

In [None]:
# So far the dataset has been prepared and the non-integer labels have been encoded as integers.
# The next step is to split the dataset into a training part and a testing part.

# Split the dataset into the training set and the test set.
# test_size = 0.45 requests 45% of the rows for testing; random_state is fixed at 50.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.45, random_state = 50)

In [None]:
print (X_train.shape) # size of the training data
print (X_test.shape) # Size of the testing data

(15, 2)
(14, 2)


In [None]:
# Now we are building our models

# Here we are importing the classifier classes
from sklearn.tree import DecisionTreeClassifier # Decision Tree classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.neighbors import KNeighborsClassifier # KNN Classifier

# Define the machine learning models by creating their objects
model1 = LogisticRegression()
# max_depth caps how deep the tree is allowed to grow.
# random_state is fixed so that the fitted tree - and the scores reported further
# down - are the same on every Restart & Run All.
model2 = DecisionTreeClassifier(max_depth = 5, random_state = 50)
model3 = SVC()
model4 = KNeighborsClassifier(n_neighbors = 6, metric = 'euclidean')

In [None]:
# Each model has to be fitted (trained) on the training dataset
# Training the machine learning models
model1.fit(X_train, y_train) # Logistic Regression (LR)

LogisticRegression()

In [None]:
model2.fit(X_train, y_train) # Decision Tree Classifier (DTC)


DecisionTreeClassifier(max_depth=5)

In [None]:
model3.fit(X_train, y_train) # Support Vector Classifier (SVC)


SVC()

In [None]:
model4.fit(X_train, y_train) # K-Nearest Neighbours classifier (KNN)

KNeighborsClassifier(metric='euclidean', n_neighbors=6)

In [None]:
# Predict the class of every test sample with each of the four fitted models.
# Order of the results: LR, DTC, SVC, KNN
y_pred1, y_pred2, y_pred3, y_pred4 = (
    fitted.predict(X_test) for fitted in (model1, model2, model3, model4)
)


In [None]:
# Show the actual test labels followed by the predictions of each model
for label, values in (
    ("Actual values of the testing data: ", y_test),
    ("Prediction by LR:", y_pred1),
    ("Prediction by DT:", y_pred2),
    ("Prediction by SVC:", y_pred3),
    ("Prediction by KNN:", y_pred4),
):
    print (label, values)


Actual values of the testing data:  [0 0 0 0 0 0 0 0 0 0 1 1 1 1]
Prediction by LR: [1 1 1 1 1 1 1 1 1 1 0 0 0 1]
Prediction by DT: [0 0 1 0 1 0 0 0 1 0 1 1 1 1]
Prediction by SVC: [1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Prediction by KNN: [0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [None]:
# A confusion matrix is used to measure the performance of each model:
# rows are the actual classes, columns are the predicted classes.

from sklearn.metrics import confusion_matrix

cm_LogisticRegression = confusion_matrix(y_true = y_test, y_pred = y_pred1)
cm_DecisionTree = confusion_matrix(y_true = y_test, y_pred = y_pred2)
cm_SupportVectorClass = confusion_matrix(y_true = y_test, y_pred = y_pred3)
cm_KNN = confusion_matrix(y_true = y_test, y_pred = y_pred4)

In [None]:
# Print the four confusion matrices one after another (LR, DT, SVC, KNN)
print (
    cm_LogisticRegression,
    cm_DecisionTree,
    cm_SupportVectorClass,
    cm_KNN,
    sep = "\n",
)

[[ 0 10]
 [ 3  1]]
[[7 3]
 [0 4]]
[[ 0 10]
 [ 0  4]]
[[10  0]
 [ 3  1]]


In [None]:
# The accuracy of a classification model is calculated on the testing dataset,
# i.e. on data that was not used to fit the model.

# The performance of each model is measured by its accuracy score
from sklearn.metrics import accuracy_score

# Accuracy scores.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the actual labels
# come first, the same order that is used for confusion_matrix in this notebook.
log_acc = accuracy_score(y_test, y_pred1) # (actual value, prediction value)
dt_acc = accuracy_score(y_test, y_pred2)
svm_acc = accuracy_score(y_test, y_pred3)
knn_acc = accuracy_score(y_test, y_pred4)

print ("Accuracy of Logistic Regression Model in %:",log_acc*100)
print ("Accuracy of Decision Tree Model in %:",dt_acc*100)
print("Accuracy of Support Vector Machine Model in %:",svm_acc*100)
print("Accuracy of KNN Model in %:",knn_acc*100)

Accuracy of Logistic Regression Model in %: 7.142857142857142
Accuracy of Decision Tree Model in %: 78.57142857142857
Accuracy of Support Vector Machine Model in %: 28.57142857142857
Accuracy of KNN Model in %: 78.57142857142857


In [None]:
# A model whose accuracy score is below 100% still makes some errors on the test data.
# Models that individually perform only modestly, like these, are known as weak learners.


### Ensemble Learning Model

### In this work, each of the four machine learning models is defined 5 times, which results in a total of (4 ML models x 5 times) = 20 weak learners. Finally, the Max Voting Classifier method is used: the class predicted by the majority of the weak learners becomes the final class prediction of the ensemble model.

In [None]:
# Hybrid ensemble learning model: "hybrid" because four different kinds of model
# are aggregated into one group (the ensemble).
# estimators is the list that collects every sub-model as a (name, model) pair.

# Five Logistic Regression sub-models, all created with the default settings.
# model11 ... model15 are the LogisticRegression objects.
model11, model12, model13, model14, model15 = (LogisticRegression() for _ in range(5))

# Start the list with the five logistic regression entries
estimators = [
    ('logistic1', model11),
    ('logistic2', model12),
    ('logistic3', model13),
    ('logistic4', model14),
    ('logistic5', model15),
]

# Show the current content of estimators
estimators

[('logistic1', LogisticRegression()),
 ('logistic2', LogisticRegression()),
 ('logistic3', LogisticRegression()),
 ('logistic4', LogisticRegression()),
 ('logistic5', LogisticRegression())]

In [None]:
# Defining 5 Decision Tree Classifiers

# CART = Classification and Regression Trees.
# A decision tree is the standard example of CART: the same tree-building approach
# can be used for classification and for regression.

# Remove any 'cart*' entries left by an earlier run of this cell. VotingClassifier
# rejects duplicate estimator names, so the cell must be safe to run more than once.
estimators = [(name, est) for name, est in estimators if not name.startswith('cart')]

# The five trees differ only in max_depth; random_state is fixed so that every fit
# is reproducible.
model16 = DecisionTreeClassifier(max_depth = 4, random_state = 50)
estimators.append(('cart1', model16))
model17 = DecisionTreeClassifier(max_depth = 3, random_state = 50)
estimators.append(('cart2', model17))
model18 = DecisionTreeClassifier(max_depth = 6, random_state = 50)
estimators.append(('cart3', model18))
model19 = DecisionTreeClassifier(max_depth = 7, random_state = 50)
estimators.append(('cart4', model19))
model20 = DecisionTreeClassifier(max_depth = 2, random_state = 50)
estimators.append(('cart5', model20))

estimators

[('logistic1', LogisticRegression()),
 ('logistic2', LogisticRegression()),
 ('logistic3', LogisticRegression()),
 ('logistic4', LogisticRegression()),
 ('logistic5', LogisticRegression()),
 ('cart1', DecisionTreeClassifier(max_depth=4)),
 ('cart2', DecisionTreeClassifier(max_depth=3)),
 ('cart3', DecisionTreeClassifier(max_depth=6)),
 ('cart4', DecisionTreeClassifier(max_depth=7)),
 ('cart5', DecisionTreeClassifier(max_depth=2))]

In [None]:
# Defining 5 Support Vector Classifiers

# Remove any 'svm*' entries left by an earlier run of this cell. VotingClassifier
# rejects duplicate estimator names, so the cell must be safe to run more than once.
estimators = [(name, est) for name, est in estimators if not name.startswith('svm')]

# Kernels used: linear, poly, rbf, rbf, linear
# (svm1/svm5 and svm3/svm4 are configured identically).
model21 = SVC(kernel = 'linear')
estimators.append(('svm1', model21))
model22 = SVC(kernel = 'poly')
estimators.append(('svm2', model22))
model23 = SVC(kernel = 'rbf')
estimators.append(('svm3', model23))
model24 = SVC(kernel = 'rbf')
estimators.append(('svm4', model24))
model25 = SVC(kernel = 'linear')
estimators.append(('svm5', model25))

estimators


[('logistic1', LogisticRegression()),
 ('logistic2', LogisticRegression()),
 ('logistic3', LogisticRegression()),
 ('logistic4', LogisticRegression()),
 ('logistic5', LogisticRegression()),
 ('cart1', DecisionTreeClassifier(max_depth=4)),
 ('cart2', DecisionTreeClassifier(max_depth=3)),
 ('cart3', DecisionTreeClassifier(max_depth=6)),
 ('cart4', DecisionTreeClassifier(max_depth=7)),
 ('cart5', DecisionTreeClassifier(max_depth=2)),
 ('svm1', SVC(kernel='linear')),
 ('svm2', SVC(kernel='poly')),
 ('svm3', SVC()),
 ('svm4', SVC()),
 ('svm5', SVC(kernel='linear'))]

In [None]:
# Defining 5 K-NN classifiers

# Remove any 'knn*' entries left by an earlier run of this cell. VotingClassifier
# rejects duplicate estimator names, so the cell must be safe to run more than once.
estimators = [(name, est) for name, est in estimators if not name.startswith('knn')]

# The five classifiers share the euclidean metric and differ only in n_neighbors
model26 = KNeighborsClassifier(n_neighbors = 3, metric = 'euclidean')
estimators.append(('knn1', model26))
model27 = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
estimators.append(('knn2', model27))
model28 = KNeighborsClassifier(n_neighbors = 6, metric = 'euclidean')
estimators.append(('knn3', model28))
model29 = KNeighborsClassifier(n_neighbors = 4, metric = 'euclidean')
estimators.append(('knn4', model29))
model30 = KNeighborsClassifier(n_neighbors = 7, metric = 'euclidean')
estimators.append(('knn5', model30))

estimators

# At this point estimators holds all the models as (name, object) pairs,
# stored together in one group (a list variable)

[('logistic1', LogisticRegression()),
 ('logistic2', LogisticRegression()),
 ('logistic3', LogisticRegression()),
 ('logistic4', LogisticRegression()),
 ('logistic5', LogisticRegression()),
 ('cart1', DecisionTreeClassifier(max_depth=4)),
 ('cart2', DecisionTreeClassifier(max_depth=3)),
 ('cart3', DecisionTreeClassifier(max_depth=6)),
 ('cart4', DecisionTreeClassifier(max_depth=7)),
 ('cart5', DecisionTreeClassifier(max_depth=2)),
 ('svm1', SVC(kernel='linear')),
 ('svm2', SVC(kernel='poly')),
 ('svm3', SVC()),
 ('svm4', SVC()),
 ('svm5', SVC(kernel='linear')),
 ('knn1', KNeighborsClassifier(metric='euclidean', n_neighbors=3)),
 ('knn2', KNeighborsClassifier(metric='euclidean')),
 ('knn3', KNeighborsClassifier(metric='euclidean', n_neighbors=6)),
 ('knn4', KNeighborsClassifier(metric='euclidean', n_neighbors=4)),
 ('knn5', KNeighborsClassifier(metric='euclidean', n_neighbors=7))]

In [None]:
# Finally, aggregate the individual models into one ensemble through the
# estimators list and fit the resulting model.

# VotingClassifier is used because it is the basic ensemble learning model; the
# other families are bagging (e.g. Random Forest) and boosting (e.g. AdaBoost).

# A Random Forest cannot be used for this hybrid ensemble, because every member
# of a Random Forest is a decision tree, whereas this ensemble mixes four kinds
# of model.

from sklearn.ensemble import VotingClassifier

# estimators is the list of (name, model) pairs built above;
# voting = 'hard' (the default) means the majority class wins.
ensemble = VotingClassifier(estimators = estimators, voting = 'hard')

# Like the individual models, the ensemble is fitted on the same training dataset ...
ensemble.fit(X_train, y_train)

# ... and used to predict the same testing dataset
y_pred = ensemble.predict(X_test)

print (y_test) # actual values of the testing data
print (y_pred) # values predicted by the ensemble model using the voting classifier

[0 0 0 0 0 0 0 0 0 0 1 1 1 1]
[0 0 1 0 1 0 0 0 1 0 1 1 0 1]


In [None]:
# Confusion matrix of the ensemble model
cm_Ensembler = confusion_matrix(y_test, y_pred)
cm_Ensembler

# NOTE: in the recorded run this matrix ([[7, 3], [1, 3]]) is NOT the same as the logistic
# regression matrix ([[0, 10], [3, 1]]); it is closest to the decision tree matrix ([[7, 3], [0, 4]]).
# The ensemble prediction for a sample is the class that receives the majority of the votes.


array([[7, 3],
       [1, 3]], dtype=int64)

In [None]:
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the actual labels come first
acc = accuracy_score(y_test, y_pred) # actual testing values, then the values predicted by the ensemble model

In [None]:
# Display the accuracy, in percent, of the ensemble learning model defined above (VotingClassifier)
acc*100

71.42857142857143