# Voting classifier - ensemble learning
## Summary:

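Interestingly, the hard-voting classifier (a majority vote over the predicted class labels) performs better than the soft-voting classifier (which averages the predicted class probabilities, so each vote is effectively weighted by the model's confidence): 0.736 versus 0.724 accuracy. AdaBoost does slightly better than both, at 0.744. The best classifiers are Random Forest and Logistic Regression, each with 0.748 accuracy.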

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the header-less, whitespace-delimited data file into a DataFrame.
# `sep=r'\s+'` splits columns on runs of whitespace; it replaces
# `delim_whitespace=True`, which pandas has deprecated in favour of this form.
data = pd.read_csv('german.data-numeric.txt', header=None, sep=r'\s+')

In [3]:
# Feature matrix: the first 24 columns (selected by position).
# Label vector: the column labelled 24 (integer labels, since the file was
# read with header=None).
X, y = data.iloc[:, :24].values, data[24].values

In [4]:
from sklearn.model_selection import train_test_split
# Shuffle and split into train / test partitions. No test_size is given, so
# the library's default split ratio applies; random_state fixes the shuffle
# so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Voting classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Base learners for the ensemble. Seeds are fixed wherever the estimator
# accepts one so repeated runs give the same models.
gnb_clf = GaussianNB()
knn_clf = KNeighborsClassifier(n_neighbors=4)
log_clf = LogisticRegression(random_state=42)
sgd_clf = SGDClassifier(loss='modified_huber', max_iter=5, random_state=42)
svm_clf = SVC(random_state=42)

# (name, estimator) pairs handed to the ensemble.
hard_voting_members = [
    ('gnb', gnb_clf),
    ('knn', knn_clf),
    ('lr', log_clf),
    ('sgd', sgd_clf),
    ('svc', svm_clf),
]

# Hard voting: the ensemble predicts from the members' predicted labels.
voting_clf = VotingClassifier(estimators=hard_voting_members, voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('gnb', GaussianNB(priors=None)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=Tr...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [6]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Fit every base learner and the hard-voting ensemble on the training split,
# then report accuracy, the per-class report and the confusion matrix on the
# test split. Each line is prefixed with the estimator's class name.
hard_voting_candidates = (gnb_clf, knn_clf, log_clf, sgd_clf, svm_clf, voting_clf)
for clf in hard_voting_candidates:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    clf_label = clf.__class__.__name__
    print(clf_label, accuracy_score(y_test, y_pred))
    print(clf_label, metrics.classification_report(y_test, y_pred))
    print(clf_label, metrics.confusion_matrix(y_test, y_pred))

GaussianNB 0.736
GaussianNB              precision    recall  f1-score   support

          1       0.85      0.77      0.81       178
          2       0.53      0.65      0.59        72

avg / total       0.76      0.74      0.74       250

GaussianNB [[137  41]
 [ 25  47]]
KNeighborsClassifier 0.696
KNeighborsClassifier              precision    recall  f1-score   support

          1       0.72      0.93      0.81       178
          2       0.41      0.12      0.19        72

avg / total       0.63      0.70      0.63       250

KNeighborsClassifier [[165  13]
 [ 63   9]]
LogisticRegression 0.748
LogisticRegression              precision    recall  f1-score   support

          1       0.79      0.88      0.83       178
          2       0.58      0.43      0.50        72

avg / total       0.73      0.75      0.74       250

LogisticRegression [[156  22]
 [ 41  31]]
SGDClassifier 0.632
SGDClassifier              precision    recall  f1-score   support

          1       0.79     

In [7]:
# Rebuild the SVC with probability=True so it can supply class probabilities
# to the soft-voting ensemble; the other base learners are reused as-is.
svm_clf = SVC(random_state=42, probability=True)

# (name, estimator) pairs handed to the ensemble.
soft_voting_members = [
    ('gnb', gnb_clf),
    ('knn', knn_clf),
    ('lr', log_clf),
    ('sgd', sgd_clf),
    ('svc', svm_clf),
]

# Soft voting: the ensemble predicts from the members' class probabilities.
voting_clf = VotingClassifier(estimators=soft_voting_members, voting='soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('gnb', GaussianNB(priors=None)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=Tr...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [8]:
from sklearn.metrics import accuracy_score

# Refit every base learner and the soft-voting ensemble on the training
# split, then report accuracy, the per-class report and the confusion matrix
# on the test split. Each line is prefixed with the estimator's class name.
soft_voting_candidates = (gnb_clf, knn_clf, log_clf, sgd_clf, svm_clf, voting_clf)
for clf in soft_voting_candidates:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    clf_label = clf.__class__.__name__
    print(clf_label, accuracy_score(y_test, y_pred))
    print(clf_label, metrics.classification_report(y_test, y_pred))
    print(clf_label, metrics.confusion_matrix(y_test, y_pred))

GaussianNB 0.736
GaussianNB              precision    recall  f1-score   support

          1       0.85      0.77      0.81       178
          2       0.53      0.65      0.59        72

avg / total       0.76      0.74      0.74       250

GaussianNB [[137  41]
 [ 25  47]]
KNeighborsClassifier 0.696
KNeighborsClassifier              precision    recall  f1-score   support

          1       0.72      0.93      0.81       178
          2       0.41      0.12      0.19        72

avg / total       0.63      0.70      0.63       250

KNeighborsClassifier [[165  13]
 [ 63   9]]
LogisticRegression 0.748
LogisticRegression              precision    recall  f1-score   support

          1       0.79      0.88      0.83       178
          2       0.58      0.43      0.50        72

avg / total       0.73      0.75      0.74       250

LogisticRegression [[156  22]
 [ 41  31]]
SGDClassifier 0.632
SGDClassifier              precision    recall  f1-score   support

          1       0.79     

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so the chained call binds the fitted
# model; the bare name on the last line displays its repr as the cell output.
# NOTE(review): the recorded repr shows n_estimators=10; the default may
# differ in other scikit-learn releases, so re-running can change the scores.
rnd_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rnd_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [11]:
from sklearn.metrics import accuracy_score
# Evaluate the random forest on the held-out test split.
# The label comes from `rnd_clf` itself rather than from `clf`, the variable
# leaked by the earlier evaluation loop: `clf` still refers to the voting
# ensemble, so using it would print these metrics under the wrong name (and
# would raise NameError if that loop had not been run in this kernel).
y_pred = rnd_clf.predict(X_test)
rnd_label = rnd_clf.__class__.__name__
print(rnd_label, accuracy_score(y_test, y_pred))
print(rnd_label, metrics.classification_report(y_test, y_pred))
print(rnd_label, metrics.confusion_matrix(y_test, y_pred))

VotingClassifier 0.748
VotingClassifier              precision    recall  f1-score   support

          1       0.78      0.91      0.84       178
          2       0.61      0.35      0.44        72

avg / total       0.73      0.75      0.72       250

VotingClassifier [[162  16]
 [ 47  25]]


# AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over decision stumps (trees limited to depth 1), with a fixed seed.
# NOTE(review): algorithm="SAMME.R" is reported as deprecated in newer
# scikit-learn releases — confirm against the installed version before
# re-running.
stump = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(
    stump,
    algorithm="SAMME.R",
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=50, random_state=42)

In [13]:
from sklearn.metrics import accuracy_score
# Evaluate the AdaBoost model on the held-out test split.
# The label comes from `ada_clf` itself rather than from `clf`, the variable
# leaked by the earlier evaluation loop: `clf` still refers to the voting
# ensemble, so using it would print these metrics under the wrong name (and
# would raise NameError if that loop had not been run in this kernel).
y_pred = ada_clf.predict(X_test)
ada_label = ada_clf.__class__.__name__
print(ada_label, accuracy_score(y_test, y_pred))
print(ada_label, metrics.classification_report(y_test, y_pred))
print(ada_label, metrics.confusion_matrix(y_test, y_pred))

VotingClassifier 0.744
VotingClassifier              precision    recall  f1-score   support

          1       0.78      0.89      0.83       178
          2       0.58      0.39      0.47        72

avg / total       0.72      0.74      0.73       250

VotingClassifier [[158  20]
 [ 44  28]]
