## Voting for Classification 

### We will be using the IRIS dataset 


In [1]:
#Lets import the libraries and datasets 
import numpy as np

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split

import statistics as stat

In [2]:
#Lets load the dataset
iris = datasets.load_iris()
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [3]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
type(iris.data)

numpy.ndarray

In [5]:
X = iris.data[:, [0, 2]]
y = iris.target

In [6]:
#Lets check the shape of our data 
print(X.shape)
print(y.shape)

(150, 2)
(150,)


In [7]:
#Output classes
list(iris.target_names)

['setosa', 'versicolor', 'virginica']

In [8]:
#Lets split our data in train and test 
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.30, random_state=101)

In [9]:
#We initialize the models
model1 = DecisionTreeClassifier(max_depth=4)
model2 = KNeighborsClassifier(n_neighbors=7)
model3 = SVC(kernel='rbf', probability=True)

In [10]:
#We then fit the models 
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### Lets look at the accuracy for each model 

In [11]:
#The accuracy score for the first model is
score1=model1.score(X_test, y_test)
print(score1)

0.9333333333333333


In [12]:
#The accuracy score for the second model is
score2=model2.score(X_test, y_test)
print(score2)

0.9333333333333333


In [13]:
#The accuracy score for the third model is
score3=model3.score(X_test, y_test)
print(score3)

0.9555555555555556


### Lets combine the models using voting in python 

In [14]:
#Lets find out what each model predicts 
pred1=model1.predict(X_test)
pred2=model2.predict(X_test)
pred3=model3.predict(X_test)

#Lets take the vote using the mode function (assume unique)
final_pred = np.array([])
for i in range(0,len(X_test)):
    print(pred1[i], pred2[i], pred3[i])
    final_pred = np.append(final_pred, stat.mode([pred1[i], pred2[i], pred3[i]]))
    print(stat.mode([pred1[i], pred2[i], pred3[i]]))

0 0 0
0
0 0 0
0
0 0 0
0
2 2 2
2
1 1 1
1
1 1 1
1
1 1 1
1
1 1 1
1
2 2 2
2
0 0 0
0
2 2 2
2
0 0 0
0
0 0 0
0
2 2 2
2
2 1 2
2
1 1 1
1
1 1 1
1
1 1 1
1
0 0 0
0
1 2 2
2
1 1 1
1
0 0 0
0
1 1 1
1
1 1 1
1
1 1 1
1
1 1 1
1
1 1 1
1
2 2 2
2
0 0 0
0
0 0 0
0
2 2 2
2
1 1 1
1
2 2 2
2
1 1 1
1
2 2 2
2
1 1 1
1
1 1 1
1
1 1 1
1
1 1 1
1
2 2 2
2
0 0 0
0
0 0 0
0
0 0 0
0
2 2 2
2
1 1 1
1


In [15]:
pred2

array([0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 2, 0, 0, 2, 1, 1, 1, 1, 0, 2, 1, 0,
       1, 1, 1, 1, 1, 2, 0, 0, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 0, 2,
       1])

In [16]:
stat.mode([1,2,3,1])

1

In [17]:
len(X_test)

45

### This is also called as a hard voting since we are using the model predictions to get the final combined prediction 

In [18]:
total=np.sum(y_test==final_pred)

print("Accuracy:",total,"/",len(final_pred),"* 100 =","{0:.3f}".format(total/len(final_pred)*100),"%")

Accuracy: 43 / 45 * 100 = 95.556 %


### Soft voting in python

soft voting - average<br>
hard voting - count<br>
Example - [0.4,0.4,0.9]<br>
hard voting - False (2/3)<br>
soft voting - True (0.57)

In [19]:
spred1=model1.predict_proba(X_test)
spred2=model2.predict_proba(X_test)
spred3=model3.predict_proba(X_test)

finalpred=(spred1+spred2+spred3)/3

In [20]:
spred2[:10]

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.71428571, 0.28571429],
       [0.        , 0.57142857, 0.42857143],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.42857143, 0.57142857],
       [1.        , 0.        , 0.        ]])

In [21]:
finalpred[:3]

array([[0.98432491, 0.00774715, 0.00792793],
       [0.98586455, 0.00689541, 0.00724004],
       [0.98638238, 0.00688109, 0.00673653]])

In [22]:
final_classes = finalpred.argmax(axis=-1) 
final_classes

array([0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 2, 0, 0, 2, 2, 1, 1, 1, 0, 2, 1, 0,
       1, 1, 1, 1, 1, 2, 0, 0, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 0, 2,
       1], dtype=int64)

In [23]:
total=np.sum([y_test[i]==final_classes[i] for i in range(len(final_classes))])

print("Accuracy:",total,"/",len(final_classes),"* 100 =","{0:.3f}".format(total/len(final_classes)*100),"%")

Accuracy: 43 / 45 * 100 = 95.556 %


### We can also use VotingClassifier from sklearn to combine the models

In [24]:
emodel = VotingClassifier(estimators=[('dt', model1), ('knn', model2),
                                    ('svc', model3)],
                        voting='soft')
emodel.fit(X_train, y_train)




VotingClassifier(estimators=[('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=4,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
         

In [25]:
#The accuracy score for the ensemble model is
escore=emodel.score(X_test, y_test)
print(escore)

0.9555555555555556


In [26]:
##########################################
# try on logistig regression ex-1 emails.csv, RF, NB, logREG
# compare to sole logReg classifier, and soft vs. hard
# add 2 more models - KNN and SVM - compare results
# which model is the best? why?

In [27]:
import pandas as pd

In [28]:
df = pd.read_csv('emails.csv')

In [29]:
df.head()

Unnamed: 0,email,rec,elen,attch,slen,spam
0,eli302@outlook.com,10,235,1,35,1
1,eli194@outlook.com,3,211,1,19,0
2,eli415@me.com,17,232,1,69,1
3,eli105@aws.com,5,175,0,14,0
4,eli40@aws.com,5,67,1,10,0


In [30]:
X = df.drop(['email', 'spam'], axis=1)
y = df['spam']

In [31]:
#Lets split our data in train and test 
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.30, random_state=101)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [33]:
clf1 = LogisticRegression()
clf2 = GaussianNB()

In [34]:
clf1.fit(X_train, y_train)
print(clf1.score(X_test, y_test))

0.8699029126213592




In [35]:
clf2.fit(X_train, y_train)
print(clf2.score(X_test, y_test))

0.9067961165048544


In [36]:
emodel = VotingClassifier(estimators=[
    ('lr', clf1), 
    ('g', clf2),
],
                          voting='soft')
emodel.fit(X_train, y_train)
emodel.score(X_test, y_test)



0.9067961165048544

In [37]:
emodel = VotingClassifier(estimators=[
    ('lr', clf1), 
    ('g', clf2),
    ('knn', KNeighborsClassifier(n_neighbors=7)),
    ('SVM', SVC(kernel='rbf', probability=True)),
],
                          voting='soft')
emodel.fit(X_train, y_train)
emodel.score(X_test, y_test)



0.996116504854369