# Random forest support vector machine classifier (RF-SVC)
## Vincent Buekers
Promotor: Prof. dr. Johan A.K. Suykens

Supervision: Yingyi Chen

In [5]:
import numpy as np
import pandas as pd

from sklearn import tree, svm, linear_model
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit

from statistics import mode, StatisticsError

from joblib import Parallel, delayed

# ExtraTree Partition
Subsets are obtained from the leaf nodes of an extremely randomized tree. For purposes of theoretical consistency: 

- only one candidate feature is selected from all d features using the option max_features = 1, yielding totally random trees
- the leaf size is set to sqrt(n)

Note: these are non-overlapping subsets due to the recursive branching mechanism in decision trees
Note 2: the ExtraTree predictions are stored so that they can be used when a leaf node is pure.

In [36]:
def extra_partition(X_train,X_test, y_train,y_test, idx_train,idx_test):
    """Partition train and test data by the leaves of one totally randomized tree.

    An ``ExtraTreeClassifier`` with ``max_features=1`` and
    ``min_samples_leaf=int(sqrt(n_train))`` is fitted on the training data.
    Train and test observations are then grouped by the leaf they fall into.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test observations.
    y_train, y_test : array-like
        Class labels of the training and test observations.
    idx_train, idx_test : array-like of int
        Original observation indexes. The code sorts the stacked data by these
        indexes and then uses them as row positions, so together they must
        form a permutation of ``0 .. n_train + n_test - 1``.

    Returns
    -------
    dict
        ``{'train': {...}, 'test': {...}}``. Both inner dicts share the keys
        ``'leaf_1', 'leaf_2', ...``. A train array has the columns
        ``[index, features..., label]``; a test array has the columns
        ``[index, features..., label, tree_prediction]``. Leaves that receive
        no test observation are omitted, since no prediction is needed there.
    """
    # totally randomized tree (max_features=1) with leaf size sqrt(n)
    extra = tree.ExtraTreeClassifier(max_features=1,
                                     min_samples_leaf=int(np.sqrt(len(X_train))))
    extra.fit(X_train, y_train)

    # leaf id of every observation, labelled with the original observation index
    leaf_idx_train = pd.DataFrame(extra.apply(X_train), index=idx_train)
    leaf_idx_test = pd.DataFrame(extra.apply(X_test), index=idx_test)

    # {leaf id: Index of original observation indexes}; rows are grouped by
    # default, so the deprecated ``axis`` argument is not passed
    groups_train = leaf_idx_train.groupby(leaf_idx_train[0]).groups
    groups_test = leaf_idx_test.groupby(leaf_idx_test[0]).groups

    # collect all data back into one array, sorted by original observation indexes
    X_train, X_test = np.c_[idx_train, X_train], np.c_[idx_test, X_test]
    y_train, y_test = np.c_[idx_train, y_train], np.c_[idx_test, y_test]
    X, y = np.r_[X_train, X_test], np.r_[y_train, y_test]
    X, y = X[np.argsort(X[:, 0])], y[np.argsort(y[:, 0])]
    X, y = np.delete(X, 0, 1), np.delete(y, 0, 1)

    # tree predictions (only test observations will be retrieved later on)
    preds_tree = extra.predict(X).reshape(-1, 1)

    subsets_train = {}
    subsets_test = {}

    # Pair train and test observations by leaf id, not by position: a leaf
    # without test observations is absent from groups_test, and positional
    # zipping would then match test data with the wrong training leaf.
    # Every leaf of the fitted tree contains training samples, so the lookup
    # in groups_train always succeeds.
    for leaf_count, (leaf_id, leaf_test) in enumerate(groups_test.items(), start=1):
        train_rows = np.asarray(groups_train[leaf_id])
        test_rows = np.asarray(leaf_test)

        # training subset including original observation indexes
        sub_train = np.c_[train_rows.reshape(-1, 1), X[train_rows], y[train_rows]]
        # testing subset including original observation indexes and tree predictions
        sub_test = np.c_[test_rows.reshape(-1, 1), X[test_rows], y[test_rows],
                         preds_tree[test_rows]]

        key = 'leaf_'+str(leaf_count)
        subsets_train[key] = sub_train
        subsets_test[key] = sub_test

    return {'train': subsets_train, 'test': subsets_test}

# Embedded SVM classifiers
For each subset, an SVM classifier is trained on the training subset and used to predict the corresponding leaf test set (if the leaf is not yet homogeneous in terms of class labels).

- fit_svc_linear: LinearSVC (LibLinear)
- fit_svc_sgd: SGDClassifier 
- fit_svc_tuned: tuned kernel SVM (SVC)

In [42]:
def fit_svc_linear(subset):
    """Fit a C-tuned ``LinearSVC`` on the training data of one leaf.

    Parameters
    ----------
    subset : ndarray
        Leaf training data. Column 0 is the observation index, the last
        column is the class label and the columns in between are features.

    Returns
    -------
    RandomizedSearchCV or None
        The fitted search object, or ``None`` when the leaf holds fewer than
        two classes or when any class present has fewer than two samples.
    """
    X_train, y_train = subset[:,1:-1], subset[:,-1]

    # Count only the classes that actually occur in the leaf. np.bincount
    # would also report a zero count for every absent label below the maximum
    # (rejecting e.g. labels {1, 2}) and fails on negative labels.
    classes, counts = np.unique(y_train, return_counts=True)
    if len(classes) < 2 or (counts < 2).any():
        return None

    # solve the primal when there are more samples than features, else the dual
    n_samples, n_features = X_train.shape
    solve_dual = n_samples <= n_features

    # regularization values
    param_grid = dict(C=np.logspace(-2, 10, 13))

    # fit svm to subset
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
    clf = svm.LinearSVC(class_weight='balanced', dual=solve_dual)
    tuned = RandomizedSearchCV(clf, param_distributions=param_grid, cv=cv, n_jobs=-1)
    tuned.fit(X_train, y_train)

    return tuned

In [42]:
def fit_svc_sgd(subset):
    """Fit an ``SGDClassifier`` on the training data of one leaf.

    Parameters
    ----------
    subset : ndarray
        Leaf training data. Column 0 is the observation index, the last
        column is the class label and the columns in between are features.

    Returns
    -------
    SGDClassifier or None
        The fitted classifier, or ``None`` when the leaf holds fewer than two
        classes or when any class present has fewer than two samples.
    """
    X_train, y_train = subset[:,1:-1], subset[:,-1]

    # Count only the classes that actually occur in the leaf. np.bincount
    # would also report a zero count for every absent label below the maximum
    # (rejecting e.g. labels {1, 2}) and fails on negative labels.
    classes, counts = np.unique(y_train, return_counts=True)
    if len(classes) < 2 or (counts < 2).any():
        return None

    # fit sgd classifier
    # validation_fraction=0.2 holds out 20% of the leaf as a single validation
    # split for early stopping (this is not k-fold cross-validation)
    # balanced class_weight = class-specific regularization
    clf = linear_model.SGDClassifier(class_weight='balanced',
                                     early_stopping=True,
                                     validation_fraction=0.2)
    clf.fit(X_train, y_train)

    return clf

In [42]:
def fit_svc_tuned(subset):
    """Fit a kernel ``SVC`` with randomized hyper-parameter search on one leaf.

    The search draws from ``C``, ``gamma`` and ``kernel``
    (``'linear'``, ``'rbf'``, ``'poly'``).

    Parameters
    ----------
    subset : ndarray
        Leaf training data. Column 0 is the observation index, the last
        column is the class label and the columns in between are features.

    Returns
    -------
    RandomizedSearchCV or None
        The fitted search object, or ``None`` when the leaf holds fewer than
        two classes or when any class present has fewer than two samples.
    """
    X_train, y_train = subset[:,1:-1], subset[:,-1]

    # Count only the classes that actually occur in the leaf. np.bincount
    # would also report a zero count for every absent label below the maximum
    # (rejecting e.g. labels {1, 2}) and fails on negative labels.
    classes, counts = np.unique(y_train, return_counts=True)
    if len(classes) < 2 or (counts < 2).any():
        return None

    # search space, only built once the leaf is known to be trainable
    param_grid = dict(gamma=np.logspace(-9, 3, 13),
                      C=np.logspace(-2, 10, 13),
                      kernel=['linear','rbf','poly'])

    # fit svm to subset
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
    clf = svm.SVC(class_weight='balanced')
    tuned = RandomizedSearchCV(clf, param_distributions=param_grid, cv=cv, n_jobs=-1)
    tuned.fit(X_train, y_train)

    return tuned

# Parallel training across nodes

In [55]:
def fit_svc_tree(subsets, variant):
    """Train one SVM per leaf of a single tree partition.

    Parameters
    ----------
    subsets : dict
        Partition of one tree; only ``subsets['train']`` (a dict mapping leaf
        key to training array) is read.
    variant : str
        Which leaf classifier to fit: ``'linear'`` (fit_svc_linear),
        ``'sgd'`` (fit_svc_sgd) or ``'tuned kernel'`` (fit_svc_tuned).

    Returns
    -------
    dict
        Maps each leaf key of ``subsets['train']`` to whatever the selected
        fit function returned for that leaf.

    Raises
    ------
    ValueError
        If ``variant`` is not one of the supported names.
    """
    # pick the leaf-level training routine; reject unknown names up front
    # instead of silently returning None
    if variant == "linear":
        fit_leaf = fit_svc_linear
    elif variant == 'sgd':
        fit_leaf = fit_svc_sgd
    elif variant == 'tuned kernel':
        fit_leaf = fit_svc_tuned
    else:
        raise ValueError(
            "unknown variant %r; expected 'linear', 'sgd' or 'tuned kernel'" % (variant,)
        )

    # retrieve training subsets
    subsets_train = subsets['train']

    # fit the leaf models through joblib (Parallel is created with its default settings)
    with Parallel() as parallel:
        submodels = parallel(delayed(fit_leaf)(subset) for subset in subsets_train.values())

    # results come back in submission order, so they line up with the leaf keys
    return dict(zip(subsets_train.keys(), submodels))

# Prediction

In [55]:
def predict_leaf(submodel, subset):
    """Predict the test observations of one leaf.

    Parameters
    ----------
    submodel : object or None
        Fitted leaf model exposing ``predict``, or ``None`` when no model is
        available for this leaf.
    subset : ndarray
        Leaf test data. Column 0 is the observation index, the last column is
        the stored tree prediction, the second-to-last column is skipped and
        the columns in between are passed to the model as features.

    Returns
    -------
    ndarray of shape (n_obs, 2)
        Column 0 is the observation index, column 1 the predicted label.
    """
    idx_test, X_test, y_tree = subset[:,0], subset[:,1:-2], subset[:,-1]

    # no leaf model available: fall back to the stored tree predictions
    if submodel is None:
        return np.column_stack((idx_test, y_tree))

    # leaf model predictions, paired with the original observation index
    svm_pred = submodel.predict(X_test)
    return np.column_stack((idx_test, svm_pred))

In [55]:
def predict_svc_tree(submodels,subsets):
    """Predict all test observations of one tree from its leaf models.

    Leaf models and the test subsets in ``subsets['test']`` are paired in
    iteration order, and each pair is handed to ``predict_leaf``. The per-leaf
    results are stacked, ordered by observation index and returned as a
    column vector of predictions.
    """
    leaf_tests = subsets['test']

    # one (model, test subset) job per leaf, dispatched through joblib
    jobs = (delayed(predict_leaf)(leaf_model, leaf_data)
            for leaf_model, leaf_data in zip(submodels.values(), leaf_tests.values()))
    with Parallel() as parallel:
        leaf_preds = parallel(jobs)

    # stack the leaf results into one (index, prediction) table for the tree
    stacked = np.concatenate(leaf_preds, axis=0)

    # restore the original observation order, then keep only the predictions
    order = np.argsort(stacked[:,0])
    return stacked[order][:,1].reshape(-1,1)

# RF-SVC ensemble
Finally, the above procedure can be extended to an ensemble of randomized trees

In [3]:
def forest_partition(X_train,X_test,y_train,y_test,idx_train,idx_test, n_trees):
    """Build ``n_trees`` independent random-tree partitions of the data.

    Each partition is produced by one call to ``extra_partition`` with the
    same data arguments.

    Parameters
    ----------
    X_train, X_test, y_train, y_test, idx_train, idx_test
        Passed unchanged to ``extra_partition``.
    n_trees : int
        Number of partitions (trees) to generate.

    Returns
    -------
    dict
        Maps ``'partition_0' .. 'partition_{n_trees-1}'`` to the result of
        ``extra_partition``.
    """
    partitions = {}

    # loop over the n_trees parameter (the original referenced an undefined name `trees`)
    for i in range(n_trees):
        partitions['partition_'+str(i)] = extra_partition(
            X_train, X_test, y_train, y_test, idx_train, idx_test
        )

    return partitions

In [1]:
def fit_rf_svc(partitions, variant):
    """Fit the leaf models of every tree partition in the forest.

    Returns a dict mapping ``'tree_0', 'tree_1', ...`` (in the iteration
    order of ``partitions``) to the result of ``fit_svc_tree`` for that
    partition and the given ``variant``.
    """
    return {
        'tree_'+str(position): fit_svc_tree(tree_partition, variant)
        for position, tree_partition in enumerate(partitions.values())
    }

# ensemble prediction

In [63]:
# Obtain majority vote for each datapoint
def majority_vote(l):
    """Return the most frequent value in ``l``.

    Ties go to the tied value that appears first in ``l``. An empty input
    yields ``0``.

    The vote is computed with a ``Counter`` rather than ``statistics.mode``.
    Before Python 3.8, ``mode`` raised ``StatisticsError`` on ties, which made
    this function return label 0 for tied votes on those versions.
    """
    from collections import Counter

    counts = Counter(l)
    if not counts:
        return 0
    # most_common(1) returns the first value that reaches the highest count
    return counts.most_common(1)[0][0]

In [None]:
def predict_rf_svc(forest, partitions):
    """Predict with the whole RF-SVC ensemble by majority vote.

    Trees in ``forest`` and their partitions in ``partitions`` are paired in
    iteration order. Each tree contributes one column of predictions via
    ``predict_svc_tree``; the final label of every observation is obtained by
    applying ``majority_vote`` across the columns of its row.
    """
    # one column of predictions per tree
    per_tree = [
        predict_svc_tree(tree_models, tree_subsets)
        for tree_models, tree_subsets in zip(forest.values(), partitions.values())
    ]

    # column k holds the predictions of tree k
    votes = np.concatenate(per_tree, axis=1)

    # majority vote across trees, row by row
    winners = np.apply_along_axis(majority_vote, 1, votes)

    return winners.reshape(-1,1)