# Support Vector Machine Embedded Forests
### Vincent Buekers
### Master of statistics thesis, KUL

In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets, ensemble, tree, svm, metrics
from sklearn.model_selection import train_test_split

import multiprocessing as multip
from joblib import Parallel, delayed
import time

import warnings
warnings.filterwarnings("ignore")

In [7]:
# load data
X, y = datasets.load_boston(return_X_y=True)
y = y.reshape(-1,1)

# amount of trees in forest
trees = 100
# min samples in leaf
min_leaf_size = int(np.sqrt(len(X)))

# use all cores for multiprocessing later on
nr_cores = multip.cpu_count()

## Partition input space into subsets obtained from the leaf nodes of an extremely randomized tree
Note: these are non-overlapping subsets due to the recursive branching mechanism used in such trees

In [2]:
def get_groups(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test):
    
    subsets = []
        
    # totally randomized tree (max_features=1)
    extra = tree.ExtraTreeRegressor(max_features=1, min_samples_leaf = int(np.sqrt(len(X))) )
    extra.fit(X_train,y_train)
    
    # obtain leaf indices the datapoints appear in
    leaf_idx_train, leaf_idx_test = extra.apply(X_train), extra.apply(X_test)
    
    # Keep track of observation indexes and prepare for pandas' .groupby
    leaf_idx_train = pd.DataFrame(leaf_idx_train, index=idx_train)
    leaf_idx_test = pd.DataFrame(leaf_idx_test, index=idx_test)
    
    # Group train and test observations by their leaf node
    groups_train = leaf_idx_train.groupby(leaf_idx_train[0],axis=0).groups
    groups_test = leaf_idx_test.groupby(leaf_idx_test[0],axis=0).groups
        
    # Obtain train and test subsets created by the leaf node partitioning
    # iterables are a list of Int64index objects for the data in each leaf node
    for value_train, value_test in zip(list(groups_train.values()),list(groups_test.values())) :
        
        # subset the data
        X_train_sub, y_train_sub = X[value_train], y[value_train]
        X_test_sub, y_test_sub = X[value_test] ,y[value_test]
        
        # original indexes of the observations appearing in this leaf
        train_indexes = np.array(value_train).reshape(-1,1)
        test_indexes = np.array(value_test).reshape(-1,1)
        
        # training subset including original observation indexes
        sub_train = np.concatenate((train_indexes, X_train_sub, y_train_sub), axis=1)
        # testing subset including original observation indexes and tree predictions
        sub_test = np.concatenate((test_indexes, X_test_sub, y_test_sub), axis=1)

        subsets.append([sub_train,sub_test])
    
    return subsets

# Embedded SVM Regressor
for each subset an svm classifier is trained on the training subset and used to predict the corresponding leaf test test.

## Serialized Version

In [3]:
def svm_tree(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test):
    
    subsets = get_groups(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test)
    
    preds_leaf = []
    
    for subset in subsets:
        
        # Train data stored in first element of each leaf (subset)
        idx_train, X_train, y_train = subset[0][:,0], subset[0][:,1:-1], subset[0][:,-1]
        # test data  stored in second element of each leaf (subset)
        idx_test, X_test, y_test = subset[1][:,0], subset[1][:,1:-1], subset[1][:,-1]
        
        # svm classifier with Radial basis function kernel
        reg = svm.SVR(kernel='rbf')
        # fit svm to subset
        reg.fit(X_train,y_train)
            
        # obtain predictions for subset
        svm_pred = reg.predict(X_test)
        # keep track of observaton indexes
        svm_pred = np.concatenate((idx_test.reshape(-1,1), svm_pred.reshape(-1,1)), axis=1)
        preds_leaf.append(svm_pred)

    # aggregate leaf predictions for all test samples across different nodes
    preds_all = np.concatenate(preds_leaf,axis=0)
    # sort predictions by their index
    preds_sorted = preds_all[np.argsort(preds_all[:,0])]
    return preds_sorted[:,1].reshape(-1,1) # return predictions without index

## Run the above procedure for multiple trees in parallel
This results in a support vector machine embedded random forest

In [4]:
# Run trees in parallel using all cores
def svm_forest(X,y):
   
    X_train,X_test, y_train,y_test, idx_train,idx_test = train_test_split(X,y
                                                                          ,np.arange(len(X))
                                                                          ,test_size=1/3)

    t = time.time()
    
    forest_pred = Parallel(nr_cores)(delayed(svm_tree)(X,y,X_train,X_test
                                                  , y_train,y_test
                                                  , idx_train,idx_test) for i in range(1,100))
    
    # training time
    training_time = time.time() - t
    
    # reshape array such that column k denotes prediction for tree k
    forest_pred = np.concatenate(forest_pred,axis=1)
    
    mean_pred = np.apply_along_axis(np.mean, 1, forest_pred) 
    
    test = y_test[np.argsort(idx_test)]
    
    R_2 = metrics.r2_score(test, mean_pred)
    
    return R_2, training_time

In [11]:
# Forest predictions and training time
svm_rf = svm_forest(X,y)
print("R_2: {}".format(svm_rf[0]))
print("training time: {0:.2f} seconds".format(svm_rf[1]))

R_2: 0.2739225850395439
training time: 0.51 seconds
