# A Support Vector Machine Embedded Random Forest Classifier
## Thesis: Master of statistics, KUL
### Vincent Buekers
Promotor: Prof. dr. Johan A.K. Suykens

Supervision: Yingyi Chen

In [2]:
import numpy as np
import pandas as pd

from sklearn import datasets, tree, svm, metrics, preprocessing, pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from joblib import Parallel, delayed

import time

import warnings
warnings.filterwarnings("ignore")

In [11]:
%cd '/Users/Vincent/Desktop/Thesis/Python/Data'

/Users/Vincent/Desktop/Thesis/Python/Data


## Partition input space into subsets 
Subsets are obtained from the leaf nodes of an extremely randomized tree. For purposes of theoretical consistency, only one candidate feature is selected from all d features using the option max_features = 1, yielding totally random trees.

Note: these are non-overlapping subsets due to the recursive branching mechanism in decision trees

In [4]:
def get_groups(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test):
    
    subsets = []
        
    # totally randomized tree (max_features=1)
    extra = tree.ExtraTreeClassifier(max_features=1,min_samples_leaf = int(np.sqrt(len(X))) )
    extra.fit(X_train,y_train)
    
    # tree predictions
    preds_tree = extra.predict(X).reshape(-1,1)
    
    # obtain leaf indices the datapoints appear in
    leaf_idx_train, leaf_idx_test = extra.apply(X_train), extra.apply(X_test)
    
    # Keep track of observation indexes and prepare for pandas' .groupby
    leaf_idx_train = pd.DataFrame(leaf_idx_train, index=idx_train)
    leaf_idx_test = pd.DataFrame(leaf_idx_test, index=idx_test)
    
    # Group train and test observations by their leaf node
    groups_train = leaf_idx_train.groupby(leaf_idx_train[0],axis=0).groups
    groups_test = leaf_idx_test.groupby(leaf_idx_test[0],axis=0).groups
        
    # Obtain train and test subsets created by the leaf node partitioning
    # iterables are a list of Int64index objects for the data in each leaf node
    for value_train, value_test in zip(list(groups_train.values()),list(groups_test.values())) :
        
        # subset the data
        X_train_sub, y_train_sub = X[value_train], y[value_train]
        X_test_sub, y_test_sub, y_tree = X[value_test] ,y[value_test], preds_tree[value_test]
        
        # original indexes of the observations appearing in this leaf
        train_indexes = np.array(value_train).reshape(-1,1)
        test_indexes = np.array(value_test).reshape(-1,1)
        
        # training subset including original observation indexes
        sub_train = np.concatenate((train_indexes, X_train_sub, y_train_sub), axis=1)
        # testing subset including original observation indexes and tree predictions
        sub_test = np.concatenate((test_indexes, X_test_sub, y_test_sub, y_tree), axis=1)

        subsets.append([sub_train,sub_test])
    
    return subsets

# Embedded SVM classifiers
for each subset an svm classifier is trained on the training subset and used to predict the corresponding leaf test test (if the leaf is not yet homogenous in terms of class labels). 

For heterogeneous leafs, there might still be an issue with class imbalance since the forest partitioning tries aims at leaf purity. Therefore, the C parameter is automatically weighted inversely proportional to class frequencies.

Since support vector machines are not scale-invariant, the data are scaled to have zero mean and unit variance for each local SVM 
 

## Svm fitting and testing

In [5]:
def fit_svm(subset):
    
    idx_train, X_train, y_train = subset[0][:,0], subset[0][:,1:-1], subset[0][:,-1]
    idx_test,X_test,y_test,y_tree = subset[1][:,0],subset[1][:,1:-2],subset[1][:,-2],subset[1][:,-1]
        
    # check if leaf node is heterogeneous (i.e. consists of more than one class)
    if (len(np.unique(y_train)) >= 2):
        
         # preprocess data
        scaler = preprocessing.StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        # decide whether to solve in primal or dual
        QP_bool = False if (X_train.shape[0] > X_train.shape[0]) else True
        
        # fit svm to subset
        clf = svm.LinearSVC(fit_intercept=False,class_weight='balanced', dual=QP_bool)
        clf.fit(X_train,y_train)
        
        # fit svm to subset
        #clf = svm.SVC(kernel="rbf", class_weight='balanced')
        #pipe = pipeline.Pipeline([('Scaler', preprocessing.StandardScaler()), ('svc', clf)])
        #pipe.fit(X_train,y_train)
        
        # obtain predictions for subset
        svm_pred = clf.predict(X_test)
        svm_pred = np.concatenate((idx_test.reshape(-1,1), svm_pred.reshape(-1,1)), axis=1)
        
        return svm_pred
    
    # use tree predictions if leaf (subset) is already pure   
    else:
        tree_pred = np.concatenate((idx_test.reshape(-1,1), y_tree.reshape(-1,1)), axis=1)
        
        return tree_pred

## Concurrent execution of svms within tree
Within each decision tree, the SVMs can be trained in parallel to achieve optimal computational efficiency.

In [6]:
def svm_parallel(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test):
    # Obtain subsets
    subsets = get_groups(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test)
    
    preds_leaf = []
    
    # Run SVM's in parallel
    with Parallel() as parallel:
        result = parallel(delayed(fit_svm)(subset) for subset in subsets)
        preds_leaf.append(result)
        
    # aggregate predictions of the leafs into one set of predictions for the tree
    preds_all = np.concatenate(preds_leaf[0],axis=0)
    
    # sort predictions by their index
    preds_sorted = preds_all[np.argsort(preds_all[:,0])]
    
    return preds_sorted[:,1].reshape(-1,1) # return predictions without index

## Serialized version

In [18]:
def svm_tree(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test):
    
    subsets = get_groups(X,y, X_train,X_test, y_train,y_test, idx_train,idx_test)
    
    preds_leaf = []
    
    count = 0
    
    for subset in subsets:
        
        # Train data stored in first element of each leaf (subset)
        idx_train, X_train, y_train = subset[0][:,0], subset[0][:,1:-1], subset[0][:,-1]
        # test data  stored in second element of each leaf (subset)
        idx_test,X_test,y_test,y_tree = subset[1][:,0],subset[1][:,1:-2],subset[1][:,-2],subset[1][:,-1]
        
        # check if leaf node is heterogeneous (i.e. consists of more than one class)
        if (len(np.unique(y_train)) >= 2):
            
            # svm classifier with Radial basis function kernel
            clf = svm.SVC(kernel='rbf')
            pipe = pipeline.Pipeline([('Scaler', preprocessing.StandardScaler()), ('svc', clf)])
            # fit svm to subset
            pipe.fit(X_train,y_train)
            
            # obtain predictions for subset
            svm_pred = pipe.predict(X_test)
            # keep track of observaton indexes
            svm_pred = np.concatenate((idx_test.reshape(-1,1), svm_pred.reshape(-1,1)), axis=1)
            preds_leaf.append(svm_pred)
            
            count += 1
        
        # use tree predictions if leaf (subset) is already pure   
        else:
            tree_pred = np.concatenate((idx_test.reshape(-1,1), y_tree.reshape(-1,1)), axis=1)
            preds_leaf.append(tree_pred)
    
    print("The input space has been partitioned into {} leaf nodes, i.e. samples.".format(len(subsets)))

    print("{} of which have been used for training an SVM"\
          " given the heterogeneity of those nodes.".format(count))
    print("The remaining {} samples have the inherited"\
          " prediction from the randomized tree.".format(len(subsets)-count))
    
    # aggregate leaf predictions for all test samples across different nodes
    preds_all = np.concatenate(preds_leaf,axis=0)
    # sort predictions by their index
    preds_sorted = preds_all[np.argsort(preds_all[:,0])]
    
    return preds_sorted[:,1].reshape(-1,1) # return predictions without index

# Majority Vote
For classification, a majority vote is implemented to ultimately obtain the forest prediction, as is commonmy done in ensemble methods.

In [7]:
from statistics import mode, StatisticsError

# Obtain majority vote for each datapoint
def majority_vote(l):
    try:
        return mode(l)
    except StatisticsError:
        return 0

## SVM embedded Forest
Finally, the above procedure can be extended to an ensemble of randomized trees with embedded SVMs. Apart from the parallel execution of the embedded svm predictors, the forest ensemble itself can also be concurrently executed. 

In [8]:
# Run trees in parallel using all cores
def svm_forest(X,y):
    
    trees = 100
    
    forest_pred=[]
    
    X_train,X_test, y_train,y_test, idx_train,idx_test = train_test_split(X,y
                                                                          ,np.arange(len(X))
                                                                       ,test_size=1/3)
    
    t = time.time()
    
    with Parallel() as parallel:
        forest_pred = parallel(delayed(svm_parallel)(X,y 
                                                     ,X_train,X_test 
                                                     ,y_train,y_test
                                                     ,idx_train,idx_test) for i in range(1,trees))
        
    # training time
    training_time = time.time() - t
    
    # reshape array such that column k denotes prediction for tree k
    forest_pred = np.concatenate(forest_pred,axis=1)
    
    majority = np.apply_along_axis(majority_vote, 1, forest_pred) 
    
    test = y_test[np.argsort(idx_test)]
    
    accuracy = metrics.accuracy_score(test, majority)
    
    return accuracy, training_time

In [9]:
# load breast cancer
X, y = datasets.load_breast_cancer(return_X_y=True)
y = y.reshape(-1,1)

svm_rf = svm_forest(X,y)
print("Accuracy: {}".format(svm_rf[0]))
print("training time: {0:.2f} seconds".format(svm_rf[1]))

Accuracy: 0.9631578947368421
training time: 1.05 seconds


In [12]:
# load credit card defaults
credit = pd.read_csv('credit-card-full.csv', index_col=0)
X, y = np.array(credit.iloc[:,:-1]), np.array(credit.iloc[:,-1])
y = y.reshape(-1,1)

svm_rf = svm_forest(X,y)
print("Accuracy: {}".format(svm_rf[0]))
print("training time: {0:.2f} seconds".format(svm_rf[1]))

Accuracy: 0.589
training time: 416.73 seconds
