In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
from random import randint
import math

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from scipy.spatial import distance
from sklearn.model_selection import KFold
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors
from sklearn.utils import resample
from sklearn.metrics import cohen_kappa_score

%matplotlib inline




In [0]:
import io
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def create_classifier(classifier_type, tree_min_samples_split = 20):

    if classifier_type == "svm":
        c = svm.SVC(probability=True)

    elif classifier_type == "logreg":
        c = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)

    elif classifier_type == "knn":
        c = neighbors.KNeighborsClassifier()

    elif classifier_type == "tree":
        c = tree.DecisionTreeClassifier(min_samples_split = tree_min_samples_split)

    elif classifier_type == "randomforest":
        c = ensemble.RandomForestClassifier()
        
    else:
        c = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)
    
    return c

##StackedEnsembleClassifier

In [0]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class StackedEnsembleClassifier(BaseEstimator, ClassifierMixin):
    
    """An ensemble classifier that uses heterogeneous models at the base layer and a aggregatnio model at the aggregation layer. A k-fold cross validation is used to gnerate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list 
        A list of the classifiers in the ase layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_models.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.tree.RandomForestClassifier    
    classifier_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators is included in the ensemble
    stack_layer_classifier: string, optional (default = "logreg')
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported        
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
        
    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
        The classes labels (single output problem).


    Notes
    -----
    The default values for most base learners are used.

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = StackedEnsembleClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
        """Setup a SuperLearner classifier .
        Parameters
        ----------
        base_estimator_types: The types of classifiers to include at the base layer
        base_estimator_duplicates: The number of duplicates of each type of classiifer to include
        stack_layer_classifier_type: The type of classifier to include at the stack layer 
        
        Returns
        -------
        Nothing
        """     

        # Initialise class variabels
        self.base_estimator_types = base_estimator_types
        self.base_estimator_type_list = list()
        self.base_estimator_duplicates = base_estimator_duplicates
        self.stack_layer_classifier_type = stack_layer_classifier_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        ########################
        # LEVEL 0
        ########################
        
        # Set up the base classifeirs in the ensemble
        self.classifiers_ = list()
        
        for i in range(0, self.base_estimator_duplicates):
            for t in self.base_estimator_types:

                self.base_estimator_type_list.append(t)      
                c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
                self.classifiers_.append(c)
        
        # Store the number of classifers in the ensemble
        self.n_estimators_ = len(self.classifiers_)

        # Use all training data to train base classifiers
        X_train = X
        y_train = y
        
        # Set up empty arrays to hold stack layer training data
        self.X_stack_train = None #(dtype = float)
        self.y_stack_train = y_train
          
        # Train each base calssifier and generate the stack layer training dataset
        for classifier in self.classifiers_:

            # Extract a bootstrap sample
            X_train_samp, y_train_samp = resample(X_train, y_train, replace=True)    
            
            # Train a base classifier
            classifier.fit(X_train_samp, y_train_samp)
            
            # Make predictions for all instances in the training set
            y_pred = classifier.predict_proba(X_train)

            # Append the predictions ot the stack layer traing set (a bit of hacking here!)
            try:
                self.X_stack_train = np.c_[self.X_stack_train, y_pred]
            except ValueError:
                self.X_stack_train = y_pred
      
        ########################
        # LEVEL 1
        ########################
        
        # Create the stack layer classifier
        self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

        # Train the stack layer using the newly created dataset
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
            
        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
   
        X_stack_queries = None
              
        # Make a prediction with each base classifier and assemble the stack layer query
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
            
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred
        
        # Return the prediction made by the stack layer classifier
        return self.stack_layer_classifier_.predict(X_stack_queries)
    
    # The predict function to make a set of predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        X_stack_queries = None
        
        # Make a prediction with each base classifier
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
                
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred

        # Return the prediction made by the stack layer classifier        
        return self.stack_layer_classifier_.predict_proba(X_stack_queries)


##StackedEnsembleHoldout Classifier

In [0]:
class StackedEnsembleHoldOut(BaseEstimator, ClassifierMixin):
  # Constructor for the classifier object
  def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
    # Initialise class variabels
    self.base_estimator_types = base_estimator_types
    self.base_estimator_type_list = list()
    self.base_estimator_duplicates = base_estimator_duplicates
    self.stack_layer_classifier_type = stack_layer_classifier_type

  # The fit function to train a classifier
  def fit(self, X, y):
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)
        
    # Store the classes seen during fit
    self.classes_ = unique_labels(y)
        
    # Set up the base classifeirs in the ensemble
    self.classifiers_ = list()
    
    for i in range(0, self.base_estimator_duplicates):
      
      for t in self.base_estimator_types:
        
        self.base_estimator_type_list.append(t)      
        c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
        self.classifiers_.append(c)
        
    # Store the number of classifers in the ensemble
    self.n_estimators_ = len(self.classifiers_)
        
    # Set up empty arrays to hold stack layer training data
    self.X_stack_train = None #(dtype = float)
    self.y_stack_train = None
  
    #create a hold out set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,\
                                                              train_size = 0.8)
    
    self.X_train = X_train
    # Append the true value of hold out set to y_stack_train
    try:
      self.y_stack_train = np.c_[self.y_stack_train, y_test]
    except ValueError:
      self.y_stack_train = y_test #* might need to change
    
    for classifier in self.classifiers_:
      # Train a base classifier
      classifier.fit(X_train, y_train)
      
      # Make predictions for all instances in the hold out set
      y_pred = classifier.predict_proba(X_test)

      # Append the predictions ot the stack layer traing set (a bit of hacking here!)
      try:
        self.X_stack_train = np.c_[self.X_stack_train, y_pred]
      except ValueError:
        self.X_stack_train = y_pred #* might need to change
    
    # Create the stack layer classifier
    self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

    # Train the stack layer using the newly created dataset
    self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
                
    return self
    
  # The predict function to make a set of predictions for a set of query instances
  def predict(self, X):
    """Predict class labels of the input samples X.
    Parameters
    ----------
    X : array-like matrix of shape = [n_samples, n_features]
    
    The input samples. 
    Returns
    -------
    p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)
   
    X_stack_queries = None
              
    # Make a prediction with each base classifier and assemble the stack layer query
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)
      
      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred
        
    # Return the prediction made by the stack layer classifier
    return self.stack_layer_classifier_.predict(X_stack_queries)
      
  # The predict function to make a set of predictions for a set of query instances
  def predict_proba(self, X):
    """Predict class probabilities of the input samples X.
    Parameters
    ----------
    X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
    Returns
    -------
    p : array of shape = [n_samples, n_labels].
         The predicted class label probabilities of the input samples. 
        """
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)
        
    X_stack_queries = None
        
    # Make a prediction with each base classifier
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)
      
      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred

    # Return the prediction made by the stack layer classifier        
    return self.stack_layer_classifier_.predict_proba(X_stack_queries)
    

##StackedEnsembleKFold

In [0]:
class StackedEnsembleKFold(BaseEstimator, ClassifierMixin):
  #fields
  
  # Constructor for the classifier object
  def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
    # Initialise class variabels
    self.base_estimator_types = base_estimator_types
    self.base_estimator_type_list = list()
    self.base_estimator_duplicates = base_estimator_duplicates
    self.stack_layer_classifier_type = stack_layer_classifier_type
    
  # The fit function to train a classifier
  def fit(self, X, y):
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)
    self.X = X
    self.y = y
    # Store the classes seen during fit
    self.classes_ = unique_labels(y)
        
    # Set up the base classifeirs in the ensemble
    self.classifiers_ = list()
    
    for i in range(0, self.base_estimator_duplicates):
      for t in self.base_estimator_types:
        self.base_estimator_type_list.append(t)      
        c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
        self.classifiers_.append(c)
        
    # Store the number of classifers in the ensemble
    self.n_estimators_ = len(self.classifiers_)
        
    # Set up empty arrays to hold stack layer training data
    self.X_stack_train = None #(dtype = float)
    self.y_stack_train = None
    
        
    # Train each base classifier and generate the stack layer training dataset
    kf = KFold(n_splits=3)
        
    for train_index, test_index in kf.split(X):
      X_train, X_test = X[train_index], X[test_index]
      y_train, y_test = y[train_index], y[test_index]
      X_fold =None

      # Append the true value for X_test in i fold to y_stack_train (a bit of hacking here!)
      try:
        self.y_stack_train = np.concatenate((self.y_stack_train, y_test), axis=0)
      except ValueError:
        self.y_stack_train = y_test
             
      for classifier in self.classifiers_:
        
        # Train a base classifier
        classifier.fit(X_train, y_train)

        # Make predictions for all instances in the training set
        y_pred = classifier.predict_proba(X_test)
        
        try:
          X_fold= np.c_[X_fold, y_pred]
        except ValueError:
          X_fold = y_pred
      
      try:
          self.X_stack_train = np.concatenate((self.X_stack_train, X_fold), axis=0)
      except ValueError:
          self.X_stack_train = X_fold
      
      
    
    # Create the stack layer classifier
    self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

    # Train the stack layer using the newly created dataset
    self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
        
    return self

  # The predict function to make a set of predictions for a set of query instances
  def predict(self, X):
    
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)
   
    X_stack_queries = None
              
    # Make a prediction with each base classifier and assemble the stack layer query
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)
      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred
      
    # Return the prediction made by the stack layer classifier
    return self.stack_layer_classifier_.predict(X_stack_queries)
      
  # The predict function to make a set of predictions for a set of query instances
  def predict_proba(self, X):
    
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)
        
    X_stack_queries = None
        
    # Make a prediction with each base classifier
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)
      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred

    # Return the prediction made by the stack layer classifier        
    return self.stack_layer_classifier_.predict_proba(X_stack_queries)
    

##StackedEnsembleOneVsOne Classifier

In [0]:
class StackedEnsembleOnevsOne(BaseEstimator, ClassifierMixin):
  
  # Constructor for the classifier object
  def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
    
    # Initialise class variabels
    self.base_estimator_types = base_estimator_types
    self.base_estimator_type_list = list()
    self.base_estimator_duplicates = base_estimator_duplicates
    self.stack_layer_classifier_type = stack_layer_classifier_type

  # The fit function to train a classifier
  def fit(self, X, y):
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)

    # Store the classes seen during fit
    self.classes_ = unique_labels(y)

    #store the number of unique class
    num_class = len(self.classes_)

    # Set up the base classifeirs in the ensemble
    self.classifiers_ = list()
    
    #calculate the number of models needed using the formula: n!/k!(n-k)!
    n_models = math.factorial(num_class) / (math.factorial(2) * math.factorial((num_class - 2)))
    
    duplicates = int(n_models / len(self.base_estimator_types))
    for i in range(0,duplicates):
      for t in self.base_estimator_types:
          self.base_estimator_type_list.append(t)      
          c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
          self.classifiers_.append(c)
    
    # Store the number of classifers in the ensemble
    self.n_estimators_ = len(self.classifiers_)

    # Set up empty arrays to hold stack layer training data
    self.X_stack_train = None #(dtype = float)
    self.y_stack_train = None

    #create a hold out set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,\
                                                                train_size = 0.8)

    # Append the truth value to the stack layer training set (a bit of hacking here!)
    try: 
      self.y_stack_train = np.c_[self.y_stack_train, y]
    except ValueError:
      self.y_stack_train = y

    #append y_train column to X_train
    X_train = np.c_[X_train, y_train]

    X_train = pd.DataFrame(X_train)

    #get the index for the target column
    target = len(X_train.columns) - 1

    n=0
    for i in range(num_class):
      #check if num of models created is equal to the size needed
      if n < n_models:
        
        for j in range(i+1, num_class):
          
          #get the instances that contain i class
          i_instances = X_train[X_train[target] == self.classes_[i]]

          #get the instances that contain j class
          j_instances = X_train[X_train[target] == self.classes_[j]]

          #add both i and j instances
          combine= pd.concat([i_instances,j_instances])
          X_train_model = combine[combine.columns.difference([target])]
          y_train_model = combine[target]

          #create a model for X_train_model and y_train_model
          classifier = self.classifiers_[n]

          n += 1
          
          #Train a base classifier
          classifier.fit(np.array(X_train_model), np.array(y_train_model))

          #Get the output for the model
          y_pred = classifier.predict_proba(X)

          # Append the predictions to the stack layer training set (a bit of hacking here!)
          try:
            self.X_stack_train = np.c_[self.X_stack_train, y_pred]
          except ValueError:
            self.X_stack_train = y_pred

    # Create the stack layer classifier
    self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

    # Train the stack layer using the newly created dataset
    self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
   
    return self

  # The predict function to make a set of predictions for a set of query instances
  def predict(self, X):
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)

    X_stack_queries = None

    # Make a prediction with each base classifier and assemble the stack layer query
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)

      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred

    # Return the prediction made by the stack layer classifier
    return self.stack_layer_classifier_.predict(X_stack_queries)

  # The predict function to make a set of predictions for a set of query instances
  def predict_proba(self, X):
    # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
    check_is_fitted(self, ['stack_layer_classifier_'])

    # Check that the input features match the type and shape of the training features
    X = check_array(X)

    X_stack_queries = None

    # Make a prediction with each base classifier
    for classifier in self.classifiers_:
      y_pred = classifier.predict_proba(X)
      try:
        X_stack_queries = np.c_[X_stack_queries, y_pred]
      except ValueError:
        X_stack_queries = y_pred

    # Return the prediction made by the stack layer classifier        
    return self.stack_layer_classifier_.predict_proba(X_stack_queries)


##Performance of different Stack Layer Approaches

In [0]:
train_data = pd.read_csv('/content/drive/My Drive/Dataset/fashion-mnist_train.csv')
test_data = pd.read_csv('/content/drive/My Drive/Dataset/fashion-mnist_test.csv')


In [0]:
train_sampling_rate = 0.01
test_sampling_rate = 0.03
target = "label"

#create a sample of train_data for fast training
train_samp = train_data.sample(frac=train_sampling_rate)
X = [i for i in train_samp.columns if i not in target]
X = train_samp[X]
y = train_samp[target]

#create a sample of train_data for fast training
test_samp = test_data.sample(frac=test_sampling_rate)
X_test = [i for i in test_samp.columns if i not in target]
X_test = test_samp[X_test]
y_test = test_samp[target]

In [0]:
len(X_test)

500

StackedEnsemble Classifier

In [0]:
clf_standard = StackedEnsembleClassifier()
clf_standard.fit(X, y)

#Evaluation Metrics
y_pred = clf_standard.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

#Accuracy
scores = cross_val_score(clf_standard, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy: ",scores.mean(), "\nSt. Dev (+/-) :", scores.std())



              precision    recall  f1-score   support

           0       0.71      0.80      0.75        25
           1       1.00      0.97      0.98        30
           2       0.73      0.66      0.69        29
           3       0.78      0.83      0.80        42
           4       0.67      0.62      0.64        26
           5       0.89      0.84      0.86        38
           6       0.55      0.46      0.50        26
           7       0.88      0.82      0.85        28
           8       0.94      0.97      0.95        30
           9       0.73      0.92      0.81        26

   micro avg       0.80      0.80      0.80       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.80      0.80      0.79       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,20,0,1,2,0,1,1,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,19,2,5,0,3,0,0,0,29
3,2,0,1,35,2,1,1,0,0,0,42
4,0,0,2,3,16,0,5,0,0,0,26
5,0,0,0,1,0,32,0,1,0,4,38
6,6,0,3,1,1,0,12,0,2,1,26
7,0,0,0,0,0,2,0,23,0,3,28
8,0,0,0,0,0,0,0,0,29,1,30
9,0,0,0,0,0,0,0,2,0,24,26




Overall Accuracy:  0.7490015924935279 
St. Dev (+/-) : 0.0511210666105625


StackedEnsemble Holdout Approach

In [0]:
clf_holdout = StackedEnsembleHoldOut()
clf_holdout.fit(X, y)

y_pred = clf_holdout.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

#Accuracy
scores = cross_val_score(clf_holdout, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy",scores.mean(), "\nSt. Dev (+/-) :", scores.std())



              precision    recall  f1-score   support

           0       0.60      0.72      0.65        25
           1       0.97      0.97      0.97        30
           2       0.60      0.52      0.56        29
           3       0.76      0.76      0.76        42
           4       0.52      0.42      0.47        26
           5       0.79      0.89      0.84        38
           6       0.38      0.38      0.38        26
           7       0.89      0.86      0.87        28
           8       0.90      0.93      0.92        30
           9       0.80      0.77      0.78        26

   micro avg       0.74      0.74      0.74       300
   macro avg       0.72      0.72      0.72       300
weighted avg       0.73      0.74      0.73       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18,0,1,2,2,1,1,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,15,2,6,0,6,0,0,0,29
3,3,0,2,32,2,1,2,0,0,0,42
4,0,0,5,3,11,0,7,0,0,0,26
5,0,0,0,1,0,34,0,1,0,2,38
6,9,0,2,1,0,0,10,0,3,1,26
7,0,0,0,0,0,2,0,24,0,2,28
8,0,1,0,0,0,1,0,0,28,0,30
9,0,0,0,0,0,4,0,2,0,20,26




Overall Accuracy 0.635889834627738 
St. Dev (+/-) : 0.06185652752568185


StackedEnsemble K-Fold Approach

In [0]:
clf_kfold = StackedEnsembleKFold()
clf_kfold.fit(X, y)

y_pred = clf_kfold.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

#Accuracy
scores = cross_val_score(clf_kfold, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy",scores.mean(), "\nSt. Dev (+/-) :", scores.std())



              precision    recall  f1-score   support

           0       0.56      0.60      0.58        25
           1       0.94      0.97      0.95        30
           2       0.72      0.62      0.67        29
           3       0.78      0.76      0.77        42
           4       0.63      0.65      0.64        26
           5       0.80      0.84      0.82        38
           6       0.38      0.38      0.38        26
           7       0.83      0.86      0.84        28
           8       0.93      0.93      0.93        30
           9       0.83      0.77      0.80        26

   micro avg       0.75      0.75      0.75       300
   macro avg       0.74      0.74      0.74       300
weighted avg       0.75      0.75      0.75       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,15,0,0,1,0,1,8,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,18,2,7,0,2,0,0,0,29
3,4,2,1,32,2,0,1,0,0,0,42
4,0,0,3,2,17,0,4,0,0,0,26
5,0,0,0,1,0,32,0,3,0,2,38
6,8,0,3,1,1,0,10,0,2,1,26
7,0,0,0,0,0,3,0,24,0,1,28
8,0,0,0,1,0,0,1,0,28,0,30
9,0,0,0,0,0,4,0,2,0,20,26




Overall Accuracy 0.6746286186931348 
St. Dev (+/-) : 0.06044130125974202


StackedEnsemble OneVsOne Approach

In [0]:
clf_1v1 = StackedEnsembleOnevsOne()
clf_1v1.fit(X, y)

y_pred = clf_1v1.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

#Accuracy
scores = cross_val_score(clf_1v1, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy",scores.mean(), "\nSt. Dev (+/-) :", scores.std())



              precision    recall  f1-score   support

           0       0.69      0.72      0.71        25
           1       0.91      0.97      0.94        30
           2       0.72      0.62      0.67        29
           3       0.89      0.79      0.84        42
           4       0.63      0.65      0.64        26
           5       0.81      0.45      0.58        38
           6       0.41      0.46      0.44        26
           7       0.61      0.79      0.69        28
           8       0.93      0.93      0.93        30
           9       0.65      0.92      0.76        26

   micro avg       0.73      0.73      0.73       300
   macro avg       0.73      0.73      0.72       300
weighted avg       0.74      0.73      0.72       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18,0,0,1,0,1,5,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,18,0,7,0,4,0,0,0,29
3,1,2,1,33,1,0,4,0,0,0,42
4,0,0,5,1,17,0,3,0,0,0,26
5,0,1,0,0,0,17,0,13,0,7,38
6,7,0,1,1,2,0,12,0,2,1,26
7,0,0,0,0,0,2,0,22,0,4,28
8,0,0,0,0,0,0,1,0,28,1,30
9,0,0,0,0,0,1,0,1,0,24,26




Overall Accuracy 0.6731276331195686 
St. Dev (+/-) : 0.04836858760715362




##Performance of other types of classifiers

Decision Tree

In [0]:
tree_model = tree.DecisionTreeClassifier(max_depth = 12)
tree_model.fit(X,y)

y_pred = tree_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

scores = cross_val_score(tree_model, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy: ",scores.mean(), "St. Dev (+/-) :", scores.std())

              precision    recall  f1-score   support

           0       0.62      0.72      0.67        25
           1       0.94      0.97      0.95        30
           2       0.67      0.62      0.64        29
           3       0.82      0.64      0.72        42
           4       0.52      0.50      0.51        26
           5       0.80      0.84      0.82        38
           6       0.34      0.42      0.38        26
           7       0.66      0.75      0.70        28
           8       0.77      0.57      0.65        30
           9       0.69      0.77      0.73        26

   micro avg       0.69      0.69      0.69       300
   macro avg       0.68      0.68      0.68       300
weighted avg       0.70      0.69      0.69       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18,0,1,1,0,1,3,0,1,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,1,0,18,0,5,0,4,0,1,0,29
3,2,1,0,27,4,1,6,0,1,0,42
4,0,0,7,1,13,0,5,0,0,0,26
5,0,0,0,1,0,32,0,5,0,0,38
6,8,0,0,1,3,0,11,1,2,0,26
7,0,0,0,0,0,2,0,21,0,5,28
8,0,1,1,1,0,1,3,2,17,4,30
9,0,0,0,0,0,3,0,3,0,20,26


Overall Accuracy:  0.6186188972318005 St. Dev (+/-) : 0.09380150173234014


Bagging

In [0]:

bag_model = ensemble.BaggingClassifier(base_estimator = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000), \
                                      n_estimators=12)
bag_model.fit(X,y)

y_pred = bag_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

scores = cross_val_score(bag_model, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy: ",scores.mean(), "\nSt. Dev (+/-) :", scores.std())

              precision    recall  f1-score   support

           0       0.60      0.72      0.65        25
           1       0.97      0.97      0.97        30
           2       0.66      0.66      0.66        29
           3       0.82      0.79      0.80        42
           4       0.64      0.54      0.58        26
           5       0.96      0.71      0.82        38
           6       0.46      0.42      0.44        26
           7       0.86      0.89      0.88        28
           8       0.85      0.97      0.91        30
           9       0.74      0.96      0.83        26

   micro avg       0.77      0.77      0.77       300
   macro avg       0.76      0.76      0.75       300
weighted avg       0.77      0.77      0.76       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18,0,3,2,0,0,2,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,19,1,5,0,3,0,1,0,29
3,4,1,1,33,2,0,1,0,0,0,42
4,0,0,4,2,14,0,6,0,0,0,26
5,0,0,0,1,0,27,1,3,1,5,38
6,8,0,2,0,1,0,11,0,3,1,26
7,0,0,0,0,0,1,0,25,0,2,28
8,0,0,0,0,0,0,0,0,29,1,30
9,0,0,0,0,0,0,0,1,0,25,26


Overall Accuracy:  0.728090179981309 
St. Dev (+/-) : 0.06298225119905927


Grid Search with Decision Tree

In [0]:
# Set up the parameter grid to seaerch
param_grid ={'criterion': ['gini', "entropy"], \
             'max_depth': list(range(3, 12, 3)), \
             'min_samples_split': [50] }

# Perform the search
tuned_tree = GridSearchCV(tree.DecisionTreeClassifier(), \
                                param_grid, cv=2, verbose = 0, \
                            return_train_score=True)
tuned_tree.fit(X_test, y_test)

y_pred = tuned_tree.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

scores = cross_val_score(tuned_tree, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy: ",scores.mean(), "\nSt. Dev (+/-) :", scores.std())

              precision    recall  f1-score   support

           0       0.56      0.92      0.70        25
           1       0.97      0.93      0.95        30
           2       0.54      0.76      0.63        29
           3       0.77      0.81      0.79        42
           4       0.66      0.81      0.72        26
           5       1.00      0.66      0.79        38
           6       0.00      0.00      0.00        26
           7       0.69      0.89      0.78        28
           8       0.95      0.60      0.73        30
           9       0.70      0.88      0.78        26

   micro avg       0.73      0.73      0.73       300
   macro avg       0.68      0.73      0.69       300
weighted avg       0.71      0.73      0.70       300

Confusion Matrix


  'precision', 'predicted', average, warn_for)


Predicted,0,1,2,3,4,5,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,23,0,0,1,0,0,0,1,0,25
1,0,28,0,2,0,0,0,0,0,30
2,2,0,22,0,5,0,0,0,0,29
3,5,0,1,34,1,0,0,0,1,42
4,0,0,3,2,21,0,0,0,0,26
5,1,0,0,0,0,25,10,0,2,38
6,9,0,12,2,3,0,0,0,0,26
7,0,0,0,0,0,0,25,0,3,28
8,0,1,2,3,2,0,0,18,4,30
9,1,0,1,0,0,0,1,0,23,26




Overall Accuracy:  0.5931988173117206 
St. Dev (+/-) : 0.08432085885501962




Grid Search with Bagging

In [0]:
estimators = []
estimators.append(tree.DecisionTreeClassifier())
estimators.append(neighbors.KNeighborsClassifier())

estimator_grids = []
tree_grid = {'base_estimator__criterion':['gini', 'entropy'],\
             'base_estimator__max_depth': list(range(3, 20, 3)), \
             'base_estimator__min_samples_split': [50]}

knn_grid = {'base_estimator__n_neighbors': list(range(1, 10, 3)),\
            'base_estimator__metric': ['minkowski', 'euclidean']}

estimator_grids.append(tree_grid)
estimator_grids.append(knn_grid)

tuned_models = []
for i in range(len(estimators)):
  # Perform the search
  tuned_models.append(GridSearchCV(ensemble.BaggingClassifier(estimators[i], n_estimators=10),\
                                estimator_grids[i], cv=2, verbose = 0, \
                                return_train_score=True))
  tuned_models[i].fit(X,y)

#get best descision tree model
tuned_tree = tuned_models[0]

#get best knn model
tuned_knn = tuned_models[1]

#find the best classifier for bagging
if tuned_tree.best_score_ > tuned_knn.best_score_:
  best_classifier = tuned_tree
else:
  best_classifier = tuned_knn





Bagging Grid Search Performance

In [0]:
y_pred = best_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

scores = cross_val_score(best_classifier, X_test, y_test, cv=10)
scores = pd.Series(scores)
print("Overall Accuracy: ",scores.mean(), "\nSt. Dev (+/-) :", scores.std())

              precision    recall  f1-score   support

           0       0.59      0.76      0.67        25
           1       0.94      0.97      0.95        30
           2       0.59      0.55      0.57        29
           3       0.86      0.74      0.79        42
           4       0.72      0.50      0.59        26
           5       1.00      0.66      0.79        38
           6       0.32      0.50      0.39        26
           7       0.84      0.93      0.88        28
           8       0.93      0.90      0.92        30
           9       0.80      0.92      0.86        26

   micro avg       0.74      0.74      0.74       300
   macro avg       0.76      0.74      0.74       300
weighted avg       0.78      0.74      0.75       300

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,19,0,2,0,0,0,4,0,0,0,25
1,0,29,0,1,0,0,0,0,0,0,30
2,0,0,16,1,2,0,10,0,0,0,29
3,6,2,0,31,2,0,1,0,0,0,42
4,0,0,6,2,13,0,5,0,0,0,26
5,0,0,0,1,0,25,6,3,0,3,38
6,7,0,3,0,1,0,13,0,2,0,26
7,0,0,0,0,0,0,0,26,0,2,28
8,0,0,0,0,0,0,2,0,27,1,30
9,0,0,0,0,0,0,0,2,0,24,26




Overall Accuracy:  0.7047446733374153 
St. Dev (+/-) : 0.06159170077957284


