In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, matthews_corrcoef, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

import pandas as pd
import numpy as np

np.random.seed(seed=2017)

# Load the iris data 

In [7]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [13]:
missing_rate = sorted(np.sum(np.isnan(X), axis =1)/X.shape[0])
print('the max missing rate among the columns is:', max(missing_rate))

the max missing rate among the columns is: 0.0


# Data splitting

In [16]:
# Apply stratifiedShuffleSplit to conserve the imbalanced set
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)

# get the index
train_index, test_index  = next(iter(sss.split(X,y)))

# split the data
X_train = X[train_index, :]
y_train = y[train_index]
X_test = X[test_index, :]
y_test = y[test_index]

# Show the results of the split
from collections import Counter
distribution_test = Counter(y_test)
distribution_train = Counter(y_train)

for distribution, length in zip([distribution_train, distribution_test], [len(y_train), len(y_test)]):
    for key, value in distribution.items():
        distribution[key] = value/length

print(distribution_train)
print(distribution_test)

Counter({2: 0.3392857142857143, 1: 0.33035714285714285, 0: 0.33035714285714285})
Counter({0: 0.34210526315789475, 1: 0.34210526315789475, 2: 0.3157894736842105})


# Stacking

The basic classifiers considered for stacking are:  Linear Discriminant Analysis, Quadratic Discriminant Analysis, Logistic Regression, SVM,    Random Forest, Gaussian NB, and MLP.

In [34]:
# a list of classifiers
clfs = [LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'), 
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(), 
        SVC(probability = True), 
        RandomForestClassifier(bootstrap=True), 
        GaussianNB(), 
        MLPClassifier(solver='lbfgs', 
                      alpha=1e-5, 
                      hidden_layer_sizes=(16, len(np.unique(y_train))), activation = 'relu')]

# a list of the name of classifiers
labels = ['LDA', 'QDA', 'LR', 'SVC', 'RF', 'GNB', 'MLP']

# a simple logistic regression for meta-classifer
lr = LogisticRegression()

# ensemble classifier
eclf = VotingClassifier(estimators=list(zip(labels, clfs)), 
                        voting='soft', 
                        weights=None)

# stacking classifier
sclf = StackingClassifier(classifiers=clfs,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

# using matthews_corrcoef for prediction score
scorer = make_scorer(matthews_corrcoef)

# run a CV with 10-fold
for clf, label in zip(clfs+ [eclf, sclf], labels + ['Ensemble', 'Stacking']):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring=scorer)
    print("Matthews_corrcoef: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


Matthews_corrcoef: 0.93 (+/- 0.06) [LDA]
Matthews_corrcoef: 0.96 (+/- 0.06) [QDA]
Matthews_corrcoef: 0.93 (+/- 0.08) [LR]
Matthews_corrcoef: 0.91 (+/- 0.10) [SVC]
Matthews_corrcoef: 0.93 (+/- 0.06) [RF]
Matthews_corrcoef: 0.93 (+/- 0.06) [GNB]


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Matthews_corrcoef: 0.75 (+/- 0.38) [MLP]
Matthews_corrcoef: 0.93 (+/- 0.06) [Ensemble]
Matthews_corrcoef: 0.94 (+/- 0.06) [Stacking]


# Parameter tuning

Here, we tune the parameters for the ensemble classifier using GridSearchCV. 

In [37]:
# remove the MLP
eclf.set_params(MLP=None)

# the parameter space
params = {'LR__C': [0.2, 1, 5],
          'RF__n_estimators': [10, 100],
          'SVC__C': [0.2, 1, 5]}

# run a grid search on the eclf
grid = GridSearchCV(estimator=eclf, 
                    param_grid=params, 
                    cv=10,
                    scoring = scorer)

# using the training data
grid.fit(X_train, y_train)

# information of interest
cv_keys = ('mean_test_score', 'std_test_score', 'params')

for i, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][i],
             grid.cv_results_[cv_keys[1]][i] / 2.0,
             grid.cv_results_[cv_keys[2]][i]))

print('Best parameters: %s' % grid.best_params_)
print('Best gridsearch cv Matthews_corrcoef: %.2f' % grid.best_score_)

# set the parameters
eclf.set_params(**grid.best_params_)

# fit and generate confusion matrix
eclf.fit(X_train, y_train)
eclf_predict = eclf.predict(X_test)
eclf_cm = confusion_matrix(y_test, eclf_predict)
eclf_mc = matthews_corrcoef(y_test, eclf_predict)

print('confusion matrix:\n', eclf_cm, '\n', 'matthews_corrcoef:', eclf_mc)

0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 10, 'SVC__C': 0.2}
0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 10, 'SVC__C': 1}
0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 10, 'SVC__C': 5}
0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 100, 'SVC__C': 0.2}
0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 100, 'SVC__C': 1}
0.926 +/- 0.03 {'LR__C': 0.2, 'RF__n_estimators': 100, 'SVC__C': 5}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 10, 'SVC__C': 0.2}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 10, 'SVC__C': 1}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 10, 'SVC__C': 5}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 100, 'SVC__C': 0.2}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 100, 'SVC__C': 1}
0.926 +/- 0.03 {'LR__C': 1, 'RF__n_estimators': 100, 'SVC__C': 5}
0.926 +/- 0.03 {'LR__C': 5, 'RF__n_estimators': 10, 'SVC__C': 0.2}
0.926 +/- 0.03 {'LR__C': 5, 'RF__n_estimators': 10, 'SVC__C': 1}
0.926 +/- 0.03 {'LR__C': 5, 'RF__n_estimators': 10, 'SVC__C': 