https://github.com/erykml/medium_articles/blob/master/Machine%20Learning/voting_classifier_article.ipynb

In [127]:
# ML Algorithms used 
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

In [128]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import matplotlib.style as style
%matplotlib inline
style.use('ggplot')

from collections import Counter

import warnings
warnings.simplefilter("ignore")

pd.pandas.set_option('display.max_columns',None)

In [129]:
df = pd.read_csv('classification_dataset.csv')
df.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
2,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
3,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0


In [130]:
df = df.drop_duplicates(keep='first', inplace=False)

In [131]:
feature_scale = [feature for feature in df.columns if feature not in ['Target']]

scaler=MinMaxScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

data.head()

Unnamed: 0,Target,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month
0,1,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
1,0,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
2,0,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
3,1,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
4,0,0.505636,0.196981,0.742734,0.61194,0.731343,0.0,0.432258,0.26,0.792453,0.175824,0.61,0.763441,0.097011,0.558824,0.8,0.291429,0.333333,0.272727


In [134]:
X = data.drop('Target',axis=1)
y = data['Target']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=10,stratify=y) #train test split in 50:50 ratio

In [135]:
X_train.shape, y_train.shape

((37672, 18), (37672,))

In [136]:
Counter(y)

Counter({1: 34611, 0: 40734})

# Fitting the models

In [137]:
clf_list = [('Catboost', CatBoostClassifier(depth=3,learning_rate=0.02,l2_leaf_reg=10,loss_function='Logloss', eval_metric='Accuracy',verbose=False)),
            ('Adaboost', AdaBoostClassifier(learning_rate=0.1,n_estimators=500)),
            ('Xgboost', XGBClassifier(colsample_bytree=0.7,gamma=0.2,learning_rate=0.3,max_depth=3,min_child_weight=5,objective= 'binary:logistic', eval_metric = 'logloss')),
            ('Random forest', RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt',n_estimators=500)),
            ('Decision tree', DecisionTreeClassifier(criterion='gini',max_depth=5, min_samples_leaf=10,min_samples_split= 5, splitter= 'best'))]

In [138]:
for model_tuple in clf_list:
    model = model_tuple[1]
    if 'random_state' in model.get_params().keys():
        model.set_params(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_pred, y_test)
    print(f"{model_tuple[0]}'s accuracy: {acc:.4f}")

Catboost's accuracy: 0.6040
Adaboost's accuracy: 0.5986
Xgboost's accuracy: 0.6033
Random forest's accuracy: 0.5957
Decision tree's accuracy: 0.5928


# Using the VotingClassifier

In [139]:
voting_clf_hard = VotingClassifier(clf_list, voting='hard')
voting_clf_hard.fit(X_train, y_train)
y_pred_hard = voting_clf_hard.predict(X_test)
print(f"Voting Classifier's accuracy: {accuracy_score(y_pred_hard, y_test):.2f}")

Voting Classifier's accuracy: 0.60


In [140]:
voting_clf_soft = VotingClassifier(clf_list, voting='soft')
voting_clf_soft.fit(X_train, y_train)
y_pred_soft = voting_clf_soft.predict(X_test)
print(f"Voting Classifier's accuracy: {accuracy_score(y_pred_soft, y_test):.2f}")

Voting Classifier's accuracy: 0.60


# Make prediction

In [141]:
df.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0
5,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,1
8,12625.800781,-34.603684,-58.381559,22,18,0.0,8.6,26,15,17,61,78,33.1,1015,8,105,2016,4,0


In [150]:
pred_data = df.copy()
pred_data = pred_data.iloc[:50]
X_preddata = pred_data.drop('Target',axis=1)
y_preddata = pred_data['Target']

In [155]:
for model_tuple in clf_list:
    model = model_tuple[1]
    y_pred = model.predict(X_preddata)
    print(f"{model_tuple[0]} : {y_pred}")

Catboost : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
Adaboost : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Xgboost : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
Random forest : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Decision tree : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [156]:
max_vote = voting_clf_hard.predict(X_preddata)
print("Voting Classifier",max_vote)

Voting Classifier [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [157]:
data.head()

Unnamed: 0,Target,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month
0,1,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
1,0,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
2,0,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
3,1,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
4,0,0.505636,0.196981,0.742734,0.61194,0.731343,0.0,0.432258,0.26,0.792453,0.175824,0.61,0.763441,0.097011,0.558824,0.8,0.291429,0.333333,0.272727


In [169]:
pred_data = data.copy()
pred_data = pred_data.iloc[:50]
X_preddata = pred_data.drop('Target',axis=1)
y_preddata = pred_data['Target']

In [170]:
for model_tuple in clf_list:
    model = model_tuple[1]
    y_pred = model.predict(X_preddata)
    print(f"{model_tuple[0]}'s accuracy: {y_pred}")

Catboost's accuracy: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Adaboost's accuracy: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Xgboost's accuracy: [0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0]
Random forest's accuracy: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Decision tree's accuracy: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [171]:
max_vote = voting_clf_hard.predict(X_preddata)
print("Voting Classifier",max_vote)

Voting Classifier [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
