In [6]:
import pandas as pd 
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB



A voting classifier is a machine learning model that trains on an ensemble of numerous models and predicts an output (class) based on their highest probability of chosen class as the output

It simply aggregates the findings of each classifier passed into voting classifier and predicts the output class based on the highest majority of voting.

Two types of voting:

Hard voting predicted output class will work on majority.

Soft voting predicted output class will work on probability.

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
df  = pd.read_csv('Assets/heart.csv'
                  )

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [8]:
df.shape

(303, 14)

In [9]:
X = df.drop(columns='target', axis = 1)

In [10]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [16]:
y = df['target']

In [17]:
y.ndim

1

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= .2, random_state= 42)


### Stacking
- When you use different models to create a strong model. Like using Kneighbours (KNN), Logistic Regression, RandomForestClassifier

In [67]:
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
nb_clf = GaussianNB()
rf_clf = RandomForestClassifier(n_estimators=5,criterion='gini')

In [68]:
for clf in (log_clf, knn_clf, nb_clf,rf_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.8852459016393442
KNeighborsClassifier 0.6885245901639344
GaussianNB 0.8688524590163934
RandomForestClassifier 0.8524590163934426


In [69]:
vt_clf = VotingClassifier(estimators= [('lr',log_clf),('nbg',nb_clf),
                                       ('knn',knn_clf),('rf',rf_clf)],voting='hard')



In [70]:
for clf in (log_clf, knn_clf, nb_clf,vt_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.8852459016393442
KNeighborsClassifier 0.6885245901639344
GaussianNB 0.8688524590163934
VotingClassifier 0.8688524590163934


In [71]:
vt_clf = VotingClassifier(estimators= [('lr',log_clf),('nbg',nb_clf),
                                       ('knn',knn_clf),('rf',rf_clf)],voting='soft')

for clf in (log_clf, knn_clf, nb_clf,rf_clf,vt_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.8852459016393442
KNeighborsClassifier 0.6885245901639344
GaussianNB 0.8688524590163934
RandomForestClassifier 0.819672131147541
VotingClassifier 0.9016393442622951
