# Random forest support vector machine regressor (RF-SVR)
## Vincent Buekers
Promotor: Prof. dr. Johan A.K. Suykens

Supervision: Yingyi Chen

In [225]:
import numpy as np
import pandas as pd

from sklearn import ensemble, svm, linear_model, datasets
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split

from joblib import Parallel, delayed

## ExtraTree Partition
Leaves are obtained from the leaf nodes of an extremely randomized tree. For purposes of theoretical consistency:

- only one candidate feature is selected from all d features using the option max_features = 1, yielding totally random trees
- the minimum leaf size (`min_samples_leaf`) is set to sqrt(n), where n is the number of training samples

Note: these leaves are non-overlapping due to the recursive branching mechanism of decision trees.
Note 2: store the ExtraTrees predictions so they can be used when a leaf node is pure.

In [304]:
def extra_partition(X_train, X_test, y_train, n_estimators=10):
    """Partition train and test data by the leaves of totally randomized trees.

    An ``ExtraTreesRegressor`` with ``max_features=1`` and
    ``min_samples_leaf=int(sqrt(n_train))`` is fitted on the training data.
    For every tree, training and test rows are grouped by the leaf they fall
    into, and each test leaf is paired with the training rows of the *same*
    leaf.

    Parameters
    ----------
    X_train, X_test, y_train
        Training features, test features and training targets. They are
        indexed with arrays of row positions, so they are presumably NumPy
        arrays -- TODO confirm against the caller.
    n_estimators : int, default 10
        Number of trees, i.e. number of partitions returned.

    Returns
    -------
    dict
        ``{"tree_<k>": {"leaf_<i>": leaf}}`` where ``leaf`` is a dict with
        keys ``'X_train'``, ``'y_train'``, ``'X_test'`` and ``'index_test'``
        (the row positions of the test rows in that leaf). Only leaves that
        contain at least one test row are included.
    """
    # totally randomized forest (max_features=1)
    extra = ensemble.ExtraTreesRegressor(n_estimators=n_estimators,
                                         max_features=1,
                                         min_samples_leaf=int(np.sqrt(len(X_train))),
                                         n_jobs=-1,
                                         )

    extra.fit(X_train, y_train)

    # one column per tree, one row per observation: id of the leaf it lands in
    leaf_idx_train = pd.DataFrame(extra.apply(X_train))
    leaf_idx_test = pd.DataFrame(extra.apply(X_test))

    partitions = {}

    for k in leaf_idx_train.columns:

        # {leaf id: row positions}; grouping is row-wise by default
        leafs_train = leaf_idx_train.groupby(leaf_idx_train[k]).groups
        leafs_test = leaf_idx_test.groupby(leaf_idx_test[k]).groups

        partition = {}

        # Pair by leaf id, not by position: a leaf without test rows is absent
        # from leafs_test, and positional zipping would then match every later
        # test leaf with the training rows of a different leaf.
        for i, (leaf_id, leaf_test) in enumerate(leafs_test.items()):

            leaf_train = leafs_train[leaf_id]

            partition["leaf_" + str(i)] = {'X_train': X_train[leaf_train],
                                           'y_train': y_train[leaf_train],
                                           'X_test': X_test[leaf_test],
                                           'index_test': leaf_test}

        partitions["tree_" + str(k)] = partition

    return partitions

## Embedded SVM regressors
For each leaf of each tree, an SVM regressor is trained on that leaf's training subset and subsequently used to predict the corresponding leaf's test set.

- fit_svr_linear: LinearSVR (LibLinear)
- fit_svr_sgd: SGDRegressor

Whether the regressors should be tuned or not can be specified by setting tune = True

In [2]:
def fit_svr_linear(leaf, tune):
    """Fit a linear SVR (``svm.LinearSVR``) on one leaf's training subset.

    Parameters
    ----------
    leaf : dict
        Leaf description providing ``"X_train"`` and ``"y_train"``.
    tune : bool
        When truthy, the regularization strength ``C`` is chosen by grid
        search over six log-spaced values between 0.1 and 10, using five
        shuffle splits with 20% held out. Otherwise the estimator is fitted
        with its default ``C``.

    Returns
    -------
    The fitted ``GridSearchCV`` object when tuning, else the fitted
    ``LinearSVR``; both expose ``predict``.
    """
    X_train, y_train = leaf["X_train"], leaf["y_train"]

    # both paths use the same base estimator
    reg = svm.LinearSVR(loss='squared_epsilon_insensitive', dual=False)

    if not tune:
        reg.fit(X_train, y_train)
        return reg

    # regularization values
    grid = dict(C=np.logspace(-1, 1, 6))

    # fit svm to leaf
    cv = ShuffleSplit(n_splits=5, test_size=0.2)
    tuned = GridSearchCV(reg, param_grid=grid, cv=cv, n_jobs=-1)
    tuned.fit(X_train, y_train)

    return tuned

In [311]:
def fit_svr_sgd(leaf, tune):
    """Fit an SGD-trained linear SVR (``SGDRegressor``) on one leaf.

    Parameters
    ----------
    leaf : dict
        Leaf description providing ``"X_train"`` and ``"y_train"``;
        ``X_train`` must expose ``.shape``.
    tune : bool
        When truthy, the regularization term ``alpha`` is chosen by grid
        search over ``1e-1 ... 1e-6`` using five shuffle splits with 20% held
        out, and early stopping is disabled. Otherwise a single model is
        fitted with early stopping enabled.

    Returns
    -------
    The fitted ``GridSearchCV`` object when tuning, else the fitted
    ``SGDRegressor``; both expose ``predict``.
    """
    X_train, y_train = leaf["X_train"], leaf["y_train"]
    n = X_train.shape[0]

    # Roughly 10**6 sample updates in total. np.ceil returns a NumPy float,
    # but SGDRegressor expects an integer number of epochs, so cast explicitly.
    max_iter = int(np.ceil(10**6 / n))

    if tune:

        alpha_range = 10.0**-np.arange(1, 7)
        grid = dict(alpha=alpha_range)

        cv = ShuffleSplit(n_splits=5, test_size=0.2)
        reg = linear_model.SGDRegressor(loss="epsilon_insensitive",
                                        early_stopping=False,
                                        max_iter=max_iter)

        tuned = GridSearchCV(reg, param_grid=grid, cv=cv, n_jobs=-1)
        tuned.fit(X_train, y_train)

        return tuned

    reg = linear_model.SGDRegressor(loss="epsilon_insensitive",
                                    early_stopping=True,
                                    max_iter=max_iter)
    reg.fit(X_train, y_train)

    return reg

## Parallel training across nodes

In [312]:
def fit_svr_tree(leafs, variant, tune):
    """Fit one SVR per leaf of a single tree partition.

    Parameters
    ----------
    leafs : dict
        Mapping of leaf name to leaf dict (each with ``"X_train"`` and
        ``"y_train"``).
    variant : {"linear", "sgd"}
        Which per-leaf fitter to use: ``fit_svr_linear`` or ``fit_svr_sgd``.
    tune : bool
        Forwarded to the per-leaf fitter.

    Returns
    -------
    dict
        Mapping of leaf name to the fitted model for that leaf, in the same
        order as ``leafs``.

    Raises
    ------
    ValueError
        If ``variant`` is not one of the supported names. (Previously the
        function fell through and returned ``None``.)
    """
    if variant == "linear":
        fit_leaf = fit_svr_linear
    elif variant == "sgd":
        fit_leaf = fit_svr_sgd
    else:
        raise ValueError("unknown variant %r; expected 'linear' or 'sgd'" % (variant,))

    # Run the per-leaf fits through joblib.
    # NOTE(review): Parallel() is created without n_jobs, so the degree of
    # parallelism is whatever joblib's default/active configuration gives --
    # confirm this is intended.
    with Parallel() as parallel:
        leaf_models = parallel(delayed(fit_leaf)(leaf, tune) for leaf in leafs.values())

    # re-attach the leaf names to the fitted models
    return dict(zip(leafs.keys(), leaf_models))

## Parallel training across trees

In [313]:
def fit_rf_svr(partitions, variant, tune):
    """Fit the per-leaf SVRs of every tree partition.

    Each value of ``partitions`` is passed to ``fit_svr_tree`` together with
    ``variant`` and ``tune``; the results are returned as a dict keyed by the
    same tree names as ``partitions``.
    """
    # one joblib task per tree partition
    tree_jobs = (delayed(fit_svr_tree)(tree_leafs, variant, tune)
                 for tree_leafs in partitions.values())

    with Parallel() as pool:
        fitted_trees = pool(tree_jobs)

    # tree name -> {leaf name -> fitted model}
    return {tree_name: models
            for tree_name, models in zip(partitions.keys(), fitted_trees)}

# Prediction

In [191]:
def predict_leaf(leaf, leaf_model):
    """Predict the test rows of one leaf with that leaf's model.

    ``leaf`` supplies ``"index_test"`` (identifiers of the test rows) and
    ``"X_test"`` (their features). Returns a dict mapping each test-row
    identifier to the value ``leaf_model.predict`` produced for it.
    """
    test_index = leaf["index_test"]
    test_rows = leaf["X_test"]

    predictions = leaf_model.predict(test_rows)

    # key every prediction by the original observation index
    return {row_id: value for row_id, value in zip(test_index, predictions)}

In [340]:
def predict_tree(leafs,leaf_models):
    """Predict all test rows covered by one tree partition.

    Leaves and models are paired by position, each pair is handed to
    ``predict_leaf``, and the per-leaf ``{index: prediction}`` dicts are
    merged. Returns a NumPy array of the predictions ordered by ascending
    test index.
    """
    # one joblib task per (leaf, model) pair
    leaf_jobs = (delayed(predict_leaf)(node, model)
                 for node, model in zip(leafs.values(), leaf_models.values()))

    with Parallel() as pool:
        per_leaf = pool(leaf_jobs)

    # merge the per-leaf dicts; a later leaf wins if an index repeats
    merged = {idx: value for chunk in per_leaf for idx, value in chunk.items()}

    # restore the original observation order
    ordered = [merged[idx] for idx in sorted(merged)]

    return np.array(ordered)

In [437]:
def predict_forest(partitions,forest):
    """Average the per-tree predictions into the forest prediction.

    Parameters
    ----------
    partitions : dict
        Mapping of tree name to that tree's leaves.
    forest : dict
        Mapping of tree name to that tree's fitted leaf models, in the same
        order as ``partitions`` (the two are paired by position).

    Returns
    -------
    numpy.ndarray
        One value per test observation: the mean over trees of the values
        returned by ``predict_tree``.
    """
    # one column vector of predictions per tree
    tree_preds = [predict_tree(leafs, leaf_models).reshape(-1, 1)
                  for leafs, leaf_models in zip(partitions.values(), forest.values())]

    # column k holds the predictions of tree k
    forest_pred = np.concatenate(tree_preds, axis=1)

    # vectorized row mean instead of a per-row Python call
    return np.mean(forest_pred, axis=1)