### 1. Voting Ensemble:

In [35]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.

#### 1.1. Read in data:

In [36]:
# Load data.
data = load_breast_cancer()
data.data


array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [37]:
# Explanatory variables.
X = data.data
#Print feature names
print(data.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [38]:
X.shape

(569, 30)

In [39]:
# Response variable.
# Relabel such that 0 = 'benign' and 1 = malignant.
Y = 1 - data['target']
label = list(data['target_names'])
label.reverse()
print(label)

['benign', 'malignant']


In [40]:
#Split with test_size=0.3, random_state=1234
X_train, X_test, Y_train,Y_test=train_test_split(X,Y, test_size=0.3, random_state=1234)
X_train.shape,X_test.shape, Y_train.shape, Y_test.shape

((398, 30), (171, 30), (398,), (171,))

#### 1.2. Predicting with individual estimator:


Question 1 : Use Decision tree with max_depth=10, KNN with k = 5 and Logistic Regression to classify your data.

In [41]:
# Classification Tree.
tree = DecisionTreeClassifier(max_depth=10)

#Train part
model=tree.fit(X_train,Y_train)
#Prediction
Y_pred = model.predict(X_test)
print( f"Tree accuracy : {metrics.accuracy_score(Y_test,Y_pred)}" )

Tree accuracy : 0.9298245614035088


In [42]:
# Classification with KNN
knn = KNeighborsClassifier(n_neighbors=5)

#Train part
model2=knn.fit(X_train, Y_train)
#Prediction
Y_pred = model2.predict(X_test)

#Accuracy
print(f"KNN accuracy: {metrics.accuracy_score(Y_test,Y_pred)} " )

KNN accuracy: 0.935672514619883 


In [43]:
# Classification with Logistic Regression.
lg = LogisticRegression()

#Train part
model3=lg.fit(X_train, Y_train)
#Prediction
Y_pred = lg.predict(X_test)

#Accuracy
print( f"Logistic Reg accuracy : {metrics.accuracy_score(Y_test,Y_pred)}" )

Logistic Reg accuracy : 0.9239766081871345


#### 1.3. Predicting with a voting ensemble:

The EnsembleVoteClassifier is a meta-classifier for combining similar or conceptually different machine learning classifiers for classification via majority or plurality voting.In hard voting, we predict the final class label as the class label that has been predicted most frequently by the classification models

Question:  

- Use VotingClassifier () ensemble method with DTC, KNN, LR set voting to hard, then fit your training data. (hint : consider using the  `estimator` parameter )

- Use VotingClassifier () ensemble method with DTC, KNN, LR set voting to soft, then fit your training data. 

-Compare the two results.


https://www.geeksforgeeks.org/ml-voting-classifier-using-sklearn/

In [44]:
#Hard voting
VC = VotingClassifier(estimators=[('knn',knn),('lg',lg),('tree',tree)], voting='hard')

#Train part
model4=VC.fit(X_train,Y_train)
#Prediction
Y_pred = VC.predict(X_test)

#Accuracy
print( f"Voting Classifier Accuracy  using hard strategy : {metrics.accuracy_score(Y_test,Y_pred)}" )

Voting Classifier Accuracy  using hard strategy : 0.9532163742690059


In [47]:
#Soft voting
VC =VotingClassifier(estimators=[('knn',knn),('lg',lg),('tree',tree)], voting='soft')

#Train part
model4=VC.fit(X_train,Y_train)

#Prediction
Y_pred = VC.predict(X_test)
#Accuracy
print( f"Voting Classifier Accuracy using soft strategy :{metrics.accuracy_score(Y_test,Y_pred)} " )

Voting Classifier Accuracy using soft strategy :0.9473684210526315 


Draw a conclusion !

Ensemble methods bring together the results of two or more separate machine learning algorithm in an attempt to produce a collective result that is more accurate than any of the individual algorithms.
In soft voting the probabilities of each of the classes are averaged to produce a result. 
In hard voting the predictions of each algorithm are considered with the ensemble selecting the class with the highest number of votes.