# Random forest support vector regressor (RF-SVR)
## Vincent Buekers
Promotor: Prof. dr. Johan A.K. Suykens

Supervision: Yingyi Chen

In [None]:
import numpy as np
import pandas as pd

from sklearn import tree, svm, linear_model
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit

from joblib import Parallel, delayed

# ExtraTree Partition
Subsets are obtained from the leaf nodes of an extremely randomized tree. For purposes of theoretical consistency: 

- only one candidate feature is selected from all d features using the option max_features = 1, yielding totally random trees
- the minimum leaf size (`min_samples_leaf`) is set to int(sqrt(n)), where n is the number of training observations

Note: these are non-overlapping subsets due to the recursive branching mechanism in decision trees

In [2]:
def extra_partition(X_train,X_test, y_train,y_test, idx_train,idx_test):
    """Partition train and test data by the leaves of one totally random tree.

    An ``ExtraTreeRegressor`` with ``max_features=1`` and
    ``min_samples_leaf=int(sqrt(n_train))`` is fitted on the training data.
    Train and test observations are then grouped by the leaf they fall in.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test observations.
    y_train, y_test : array-like
        Targets of the training and test observations.
    idx_train, idx_test : array-like of int
        Original observation indexes of the rows in the train / test data.
        They are used as row positions into the re-assembled data set, so
        together they have to form the positions ``0 .. n-1``.

    Returns
    -------
    list of [sub_train, sub_test]
        One entry per leaf that contains at least one test observation.
        Both arrays have the layout ``[original index | features | target]``.
    """
    # totally randomized tree (max_features=1)
    extra = tree.ExtraTreeRegressor(
        max_features=1,
        min_samples_leaf=int(np.sqrt(len(X_train))),
    )
    extra.fit(X_train, y_train)

    # leaf id of every observation, labelled with its original observation index
    leaves_train = pd.Series(extra.apply(X_train), index=idx_train)
    leaves_test = pd.Series(extra.apply(X_test), index=idx_test)

    # {leaf id: Index of the original observation indexes falling in that leaf}
    groups_train = leaves_train.groupby(leaves_train.to_numpy()).groups
    groups_test = leaves_test.groupby(leaves_test.to_numpy()).groups

    # collect all data back into one array, sorted by original observation indexes
    X_train, X_test = np.c_[idx_train, X_train], np.c_[idx_test, X_test]
    y_train, y_test = np.c_[idx_train, y_train], np.c_[idx_test, y_test]
    X, y = np.r_[X_train, X_test], np.r_[y_train, y_test]
    X, y = X[np.argsort(X[:, 0])], y[np.argsort(y[:, 0])]
    X, y = np.delete(X, 0, 1), np.delete(y, 0, 1)

    subsets = []
    # Pair the groups by leaf id rather than by position: a leaf without any
    # test observation is absent from groups_test, so a positional zip of the
    # two dicts would match test points with the training data of another leaf.
    # Every leaf reached by a test point was built from training points, so
    # the lookup in groups_train cannot miss.
    for leaf_id, leaf_test in groups_test.items():
        rows_train = np.asarray(groups_train[leaf_id])
        rows_test = np.asarray(leaf_test)

        # training subset including original observation indexes
        sub_train = np.concatenate(
            (rows_train.reshape(-1, 1), X[rows_train], y[rows_train]), axis=1
        )
        # testing subset including original observation indexes
        sub_test = np.concatenate(
            (rows_test.reshape(-1, 1), X[rows_test], y[rows_test]), axis=1
        )

        subsets.append([sub_train, sub_test])

    return subsets

# SVR Training
For each leaf, an SVM regressor is trained on the corresponding subset.

- fit_svr_linear: LinearSVR (LibLinear)
- fit_svr_sgd: SGDRegressor corresponds to stochastic gradient Linear SVM
- fit_svr_kernel: kernel SVM regressor (`svm.SVR` with its default hyperparameters; no tuning is performed in this function)

In [1]:
def fit_svr_linear(subset):
    """Fit a linear SVR (``svm.LinearSVR``) on the training part of a leaf subset.

    Parameters
    ----------
    subset : sequence
        ``subset[0]`` is the training array of the leaf with the layout
        ``[original index | features | target]``.

    Returns
    -------
    svm.LinearSVR
        The regressor fitted on the leaf's training data.
    """
    X_train, y_train = subset[0][:, 1:-1], subset[0][:, -1]

    # Decide whether to solve in primal or dual: the primal is preferable when
    # there are more samples than features. (The previous check compared the
    # sample count with itself, so the dual was always used.)
    n_samples, n_features = X_train.shape

    if n_samples > n_features:
        # LinearSVR only supports dual=False together with the squared
        # epsilon-insensitive loss, so the loss is switched along with it.
        reg = svm.LinearSVR(dual=False, loss="squared_epsilon_insensitive")
    else:
        reg = svm.LinearSVR(dual=True)

    # fit svm to subset
    reg.fit(X_train, y_train)

    return reg

In [11]:
def fit_svr_sgd(subset):
    """Fit a linear SVR by stochastic gradient descent on a leaf subset.

    Parameters
    ----------
    subset : sequence
        ``subset[0]`` is the training array of the leaf with the layout
        ``[original index | features | target]``. Only this training part
        is used.

    Returns
    -------
    linear_model.SGDRegressor
        The regressor fitted on the leaf's training data.
    """
    X_train, y_train = subset[0][:, 1:-1], subset[0][:, -1]

    # SGDRegressor minimises the squared error by default; the
    # epsilon-insensitive loss is the one that makes it a linear SVR.
    reg = linear_model.SGDRegressor(loss="epsilon_insensitive", early_stopping=True)
    reg.fit(X_train, y_train)

    return reg

In [11]:
def fit_svr_kernel(subset):
    """Fit a kernel SVR (``svm.SVR`` with default settings) on a leaf subset.

    Parameters
    ----------
    subset : sequence
        ``subset[0]`` is the training array of the leaf with the layout
        ``[original index | features | target]``. Only this training part
        is used.

    Returns
    -------
    svm.SVR
        The regressor fitted on the leaf's training data.
    """
    X_train, y_train = subset[0][:, 1:-1], subset[0][:, -1]

    # fit svm to subset
    reg = svm.SVR()
    reg.fit(X_train, y_train)

    return reg

# Parallel SVR training across nodes

In [3]:
def fit_svr_tree(subsets, variant):
    """Fit one SVR per leaf subset of a tree.

    Parameters
    ----------
    subsets : list
        Leaf subsets, each a ``[sub_train, sub_test]`` pair.
    variant : str
        Which leaf model to fit: ``"linear"`` (``fit_svr_linear``),
        ``"sgd"`` (``fit_svr_sgd``) or ``"tuned kernel"`` (``fit_svr_kernel``).

    Returns
    -------
    dict
        Maps the position of each subset in ``subsets`` to its fitted model.

    Raises
    ------
    ValueError
        If ``variant`` is not one of the supported names.
    """
    # Pick the per-leaf fitting routine once instead of repeating the whole
    # fit/collect logic in every branch.
    if variant == "linear":
        fit_leaf = fit_svr_linear
    elif variant == "sgd":
        fit_leaf = fit_svr_sgd
    elif variant == "tuned kernel":
        # the kernel fitter defined in this file is fit_svr_kernel
        fit_leaf = fit_svr_kernel
    else:
        # an unknown variant used to fall through and return None silently
        raise ValueError(
            f"unknown variant {variant!r}; expected 'linear', 'sgd' or 'tuned kernel'"
        )

    # Fit the leaf models through joblib (Parallel is created with its defaults)
    with Parallel() as parallel:
        submodels = parallel(delayed(fit_leaf)(subset) for subset in subsets)

    # key every model by the position of its subset / leaf
    return dict(enumerate(submodels))
  

# Prediction

In [7]:
def predict_leaf(submodel, subset):
    """Predict the test observations of one leaf with that leaf's model.

    ``subset[1]`` is the test array of the leaf: its first column holds the
    original observation indexes, its last column is left out, and the
    columns in between are passed to ``submodel.predict``.

    Returns a two-column array ``[original index | prediction]``.
    """
    test_block = subset[1]

    # first column: observation indexes; middle columns: model inputs
    obs_index = test_block[:, 0]
    features = test_block[:, 1:-1]

    predictions = submodel.predict(features)

    # put index and prediction side by side as two column vectors
    index_col = obs_index.reshape(-1, 1)
    pred_col = predictions.reshape(-1, 1)
    return np.hstack([index_col, pred_col])

In [7]:
def predict_svr_tree(submodels,subsets):
    """Predict all test observations of a tree from its leaf models.

    The models in ``submodels`` (in the dict's iteration order) are paired
    by position with the entries of ``subsets``, and each pair is handed to
    ``predict_leaf``. The per-leaf results are stacked, ordered by original
    observation index, and returned as a single column of predictions.
    """
    # one deferred prediction task per (leaf model, leaf subset) pair
    tasks = (
        delayed(predict_leaf)(leaf_model, leaf_data)
        for leaf_model, leaf_data in zip(submodels.values(), subsets)
    )

    # run the tasks through joblib
    with Parallel() as pool:
        leaf_preds = pool(tasks)

    # stack the [index | prediction] rows of all leaves
    stacked = np.concatenate(leaf_preds, axis=0)

    # restore the original observation order, then keep only the predictions
    order = np.argsort(stacked[:, 0])
    return stacked[order][:, 1].reshape(-1, 1)

## Extend model to ensemble of SVR trees

In [5]:
# Run trees in parallel using all cores
def fit_svr_forest(variant, n_trees):
    with Parallel() as parallel:
        forest = parallel(delayed(fit_svr_tree)(subsets,variant) for i in range(1,n_trees))