# Support Vector Machine Embedded Forests
### Vincent Buekers
### Master of statistics thesis, KUL

In [8]:
import numpy as np
import pandas as pd

from sklearn import datasets, ensemble, tree, svm, model_selection, metrics

import multiprocessing as multip
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings("ignore")

In [9]:
# load data (a single call returns both the feature matrix and the labels)
X, y = datasets.load_iris(return_X_y=True)
# keep the labels as a column vector so they can be concatenated next to X
y = y.reshape(-1,1)

# number of trees in forest
trees = 100
# use all cores for multiprocessing later on
nr_cores = multip.cpu_count()

## Partition input space into subsets obtained from the leaf nodes of an extremely randomized tree
Note: these subsets are non-overlapping, because the recursive splitting used in such trees assigns every observation to exactly one leaf node.

In [10]:
def get_groups(X,y):
    """Partition the data by the leaf nodes of one extremely randomized tree.

    A single ``ExtraTreeClassifier`` (``min_samples_leaf=1/10``) is fitted on
    ``(X, y)`` and every observation is grouped with the other observations
    that end up in the same leaf.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a flat vector and a column vector are both accepted.

    Returns
    -------
    list of numpy.ndarray
        One array per leaf. Its columns are, in order: the row index of the
        observation in ``X``, the feature columns, the true label and the
        label predicted by the tree.
    """
    X = np.asarray(X)
    # normalise the labels to a column so they can sit next to the features
    y = np.asarray(y).reshape(-1, 1)

    # extremely randomized tree
    extra = tree.ExtraTreeClassifier(min_samples_leaf=1/10)
    extra.fit(X, y.ravel())
    # predictions
    preds_tree = extra.predict(X).reshape(-1, 1)

    # leaf id every datapoint ends up in
    leaf_indexes = extra.apply(X)
    # distinct leaves, sorted by leaf id
    leaves = np.unique(leaf_indexes)

    print("The input space has been partitioned into {} leaf nodes, i.e. samples.".format(len(leaves)))

    # Obtain subsets created by the leaf node partitioning.
    # Grouping is done with numpy so that no pandas groupby (and its
    # deprecated ``axis`` keyword) is needed.
    subsets = []
    for leaf in leaves:
        rows = np.flatnonzero(leaf_indexes == leaf)
        sub = np.concatenate(
            (rows.reshape(-1, 1), X[rows], y[rows], preds_tree[rows]),
            axis=1,
        )
        subsets.append(sub)

    return subsets

# Embedded SVM classifiers
For each subset an SVM classifier is trained, provided the subset contains at least two distinct class labels. Otherwise, the predictions from the randomized tree are inherited.

In [11]:
def svm_tree(X,y):
    """Predict every observation with an SVM fitted inside its tree leaf.

    The data are first split into leaf subsets by ``get_groups``. A leaf that
    contains at least two distinct labels gets an ``svm.SVC`` fitted on (and
    used to predict) its own observations; a leaf with a single label keeps
    the prediction of the randomized tree.

    Returns a two-column array ``[row index, predicted label]`` ordered by
    row index.
    """
    leaf_subsets = get_groups(X,y)

    # one classifier object, refitted for every impure leaf
    classifier = svm.SVC()

    leaf_predictions = []
    n_svm_leaves = 0

    for leaf in leaf_subsets:
        # column layout: index | features ... | true label | tree prediction
        row_ids = leaf[:, 0].reshape(-1, 1)
        features = leaf[:, 1:-2]
        labels = leaf[:, -2]
        tree_labels = leaf[:, -1]

        if len(np.unique(labels)) < 2:
            # pure leaf: nothing to separate, inherit the tree prediction
            chosen = tree_labels
        else:
            # impure leaf: fit the SVM on the leaf and predict the same points
            classifier.fit(features, labels)
            chosen = classifier.predict(features)
            n_svm_leaves += 1

        leaf_predictions.append(
            np.concatenate((row_ids, chosen.reshape(-1, 1)), axis=1)
        )

    print("{} of which have been used for training an SVM"
          " based on sufficient class labels.".format(n_svm_leaves))
    print("The remaining {} samples have the inherited"
          " prediction from the randomized tree.".format(len(leaf_subsets) - n_svm_leaves))

    # stack the per-leaf results and restore the original row order
    stacked = np.concatenate(leaf_predictions, axis=0)
    row_order = np.argsort(stacked[:, 0])
    return stacked[row_order]

## Run the above procedure for multiple trees in parallel
This results in a support vector machine embedded random forest

In [13]:
# Run trees in parallel using all cores
def svm_forest(X,y):
    """Grow ``trees`` SVM-embedded trees in parallel on ``nr_cores`` workers.

    Returns a list with one ``svm_tree(X, y)`` result per tree.
    """
    # range(trees), not range(1, trees): the latter grows one tree too few
    forest = Parallel(n_jobs=nr_cores)(delayed(svm_tree)(X,y) for _ in range(trees))
    return forest

In [14]:
# Place the per-tree [index, prediction] arrays side by side, so that columns
# alternate between a row-index column and a prediction column
forest_pred = np.concatenate(svm_forest(X,y), axis=1)

# Drop the repeated index columns (2, 4, 6, ...). Column 0 keeps the row
# index, so column k holds the prediction of tree k
forest_pred = np.delete(forest_pred, np.s_[2::2], axis=1)

In [15]:
from statistics import mode, StatisticsError

majority = []

# Obtain majority vote for each datapoint
def majority_vote(l):
    """Return the most frequent value in ``l``.

    Ties go to the value that appears first in ``l``. Counting is done with
    ``collections.Counter`` rather than ``statistics.mode`` so the result does
    not depend on the Python version: before 3.8 ``mode`` raised on ties,
    which made every tied vote fall back to class 0.

    An empty ``l`` returns 0.
    """
    from collections import Counter

    top = Counter(l).most_common(1)
    if not top:
        # no votes at all: keep the historical fallback value
        return 0
    return top[0][0]
    
# Column 0 of forest_pred is the row index, not a prediction: leave it out of
# the vote so that it cannot be counted as a class label
for i in range(0, forest_pred.shape[0]):
    majority.append(majority_vote(forest_pred[i, 1:]))

# Accuracy

In [16]:
from sklearn import metrics

# compute accuracy of svm forest
# (accuracy_score takes the true labels first, then the predictions)
metrics.accuracy_score(y.ravel(), majority)

0.9733333333333334