# RSVT Classifier
## - Vincent Buekers

In [1]:
import numpy as np
import pandas as pd

from sklearn import ensemble, svm, linear_model, datasets
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split

from statistics import mode, StatisticsError

from joblib import Parallel, delayed

## ExtraTree Partition

- only one candidate feature is selected from all d features using the option max_features = 1, yielding totally random trees
- the minimum leaf size (min_samples_leaf) is set proportional to sqrt(n), where n is the number of training samples
- the trees are trained in parallel for computational efficiency

In [304]:
def extra_partition(X_train, X_test, y_train, n_estimators=10, random_state=None, min_samples_factor=1):
    """Partition train and test data by the leaves of an extremely randomized forest.

    An ExtraTreesClassifier with ``max_features=1`` is fitted on the training
    data. For every tree, each leaf reached by at least one test sample is
    stored together with the training samples that fall into the same leaf.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices. Rows are selected with positional fancy indexing,
        so NumPy arrays are assumed -- TODO confirm against callers.
    y_train : array-like
        Training labels, indexed positionally like ``X_train``.
    n_estimators : int, default=10
        Number of trees in the forest.
    random_state : int, RandomState instance or None, default=None
        Passed through to ExtraTreesClassifier.
    min_samples_factor : float, default=1
        ``min_samples_leaf`` is ``int(min_samples_factor * sqrt(n_train))``,
        with a lower bound of 1.

    Returns
    -------
    dict
        ``{"tree_<k>": {"leaf_<i>": {...}}}`` where each leaf dict holds
        ``X_train``, ``y_train``, ``X_test``, ``index_test`` (row positions
        of the test samples in ``X_test``) and ``y_pred_extra`` (forest
        predictions for those test samples).
    """
    # scikit-learn rejects min_samples_leaf=0, so never go below one sample.
    min_samples = max(1, int(min_samples_factor * np.sqrt(X_train.shape[0])))

    extra = ensemble.ExtraTreesClassifier(n_estimators=n_estimators,
                                          max_features=1,
                                          min_samples_leaf=min_samples,
                                          n_jobs=-1,
                                          random_state=random_state)

    extra.fit(X_train, y_train)
    y_pred_extra = extra.predict(X_test)

    # Column k holds, for every sample, the id of the leaf it reaches in tree k.
    leaf_idx_train = pd.DataFrame(extra.apply(X_train))
    leaf_idx_test = pd.DataFrame(extra.apply(X_test))

    partitions = {}

    for k in leaf_idx_train.columns:

        # Mapping leaf id -> row positions of the samples in that leaf.
        leafs_train = leaf_idx_train.groupby(leaf_idx_train[k]).groups
        leafs_test = leaf_idx_test.groupby(leaf_idx_test[k]).groups

        partition = {}

        # Pair train and test samples by leaf id. The test set may reach only
        # a subset of the leaves, so pairing the two groupings by position
        # would match test rows with training rows from a different leaf.
        # Leaves without any test sample are skipped: nothing to predict there.
        for i, (leaf_id, leaf_test) in enumerate(leafs_test.items()):

            # The forest is built without bootstrapping (the estimator's
            # default), so every leaf contains training samples.
            leaf_train = leafs_train[leaf_id]

            partition["leaf_" + str(i)] = {'X_train': X_train[leaf_train],
                                           'y_train': y_train[leaf_train],
                                           'X_test': X_test[leaf_test],
                                           'index_test': leaf_test,
                                           'y_pred_extra': y_pred_extra[leaf_test]}

        partitions["tree_" + str(k)] = partition

    return partitions

## SVM training
- For each leaf, or local region, an SVM classifier is trained on the training data stored in that leaf.
- Each SVC is tuned specifically to its own local region.
- Note that an SVM cannot be trained on a leaf node that is already pure, or in which some class has fewer than two samples; in those cases the prediction of the extremely randomized forest is adopted instead.

In [2]:
def fit_svc(leaf):
    """Fit a tuned linear SVM on the training data stored in one leaf.

    Parameters
    ----------
    leaf : dict
        Must provide ``"X_train"`` (2-D, has ``.shape``) and ``"y_train"``.

    Returns
    -------
    estimator or None
        The best LinearSVC found by a grid search over ``C``, or ``None``
        when the leaf holds fewer than two classes or any class has fewer
        than two samples (no model is trained in that case).
    """
    features = leaf["X_train"]
    labels = leaf["y_train"]

    # A pure (or empty) leaf offers nothing to separate.
    if len(np.unique(labels)) < 2:
        return None

    # Every class needs at least two samples for the stratified split below.
    if not (pd.Series(labels).value_counts() >= 2).all():
        return None

    # Solve the primal when there are more samples than features, else the dual.
    n_samples, n_features = features.shape[0], features.shape[1]
    solve_dual = not (n_samples > n_features)

    # Candidate regularization strengths: 2^-2, 2^-1, ..., 2^2.
    param_grid = {"C": np.logspace(-2, 2, 5, base=2)}

    # Tune C with three stratified shuffle splits, holding out a third each time.
    splitter = StratifiedShuffleSplit(n_splits=3, test_size=1/3)
    base_model = svm.LinearSVC(class_weight='balanced', dual=solve_dual)
    search = GridSearchCV(base_model, param_grid=param_grid, cv=splitter, n_jobs=-1)
    search.fit(features, labels)

    return search.best_estimator_

## Parallel training across nodes and trees
- the individual leaf SVMs are trained in parallel.

In [312]:
def fit_svc_tree(leafs):
    """Fit one leaf model per leaf of a single tree.

    Parameters
    ----------
    leafs : dict
        Mapping leaf name -> leaf dict as consumed by ``fit_svc``.

    Returns
    -------
    dict
        Mapping leaf name -> result of ``fit_svc`` for that leaf, in the
        same key order as ``leafs``.
    """
    leaf_names = list(leafs)

    # Dispatch through joblib; Parallel() is created without arguments, so
    # the number of workers is whatever joblib's defaults select.
    with Parallel() as pool:
        fitted = pool(delayed(fit_svc)(leafs[name]) for name in leaf_names)

    return dict(zip(leaf_names, fitted))

In [313]:
def fit_rsvt(partitions):
    """Fit the leaf models of every tree in the partitioned forest.

    Parameters
    ----------
    partitions : dict
        Mapping tree name -> dict of leaves, as consumed by ``fit_svc_tree``.

    Returns
    -------
    dict
        Mapping tree name -> result of ``fit_svc_tree`` for that tree, in
        the same key order as ``partitions``.
    """
    tree_names = list(partitions)

    # Dispatch through joblib; Parallel() is created without arguments, so
    # the number of workers is whatever joblib's defaults select.
    with Parallel() as pool:
        fitted_trees = pool(delayed(fit_svc_tree)(partitions[name]) for name in tree_names)

    return dict(zip(tree_names, fitted_trees))

# Prediction
- The test subset stored within each leaf is predicted using its corresponding leaf model. 
- If no SVM could be trained for a leaf node (it was already pure, i.e. consisted of only one class, or a class had too few samples), the prediction of the extra forest is used instead.

In [191]:
def predict_leaf(leaf, leaf_model):
    """Predict the test samples stored in one leaf.

    Parameters
    ----------
    leaf : dict
        Must provide ``"index_test"`` (identifiers of the test samples),
        ``"X_test"`` (their features) and ``"y_pred_extra"`` (fallback
        predictions, aligned with ``"index_test"``).
    leaf_model : object or None
        Object exposing ``predict(X)``, or ``None`` when no model was
        trained for this leaf.

    Returns
    -------
    dict
        Mapping test-sample identifier -> predicted label. Empty when the
        leaf holds no test samples.
    """
    idx_test = leaf["index_test"]

    # Nothing to predict; also avoids calling predict() on an empty matrix,
    # which estimators may reject.
    if len(idx_test) == 0:
        return {}

    if leaf_model is None:
        # No model for this leaf: fall back to the stored forest predictions.
        labels = leaf["y_pred_extra"]
    else:
        labels = leaf_model.predict(leaf["X_test"])

    # Key each prediction by the identifier of its test sample so that
    # predictions from different leaves can be merged and re-ordered later.
    return dict(zip(idx_test, labels))

In [340]:
def predict_tree(leafs, leaf_models):
    """Predict all test samples of one tree, ordered by sample identifier.

    Parameters
    ----------
    leafs : dict
        Mapping leaf name -> leaf dict as consumed by ``predict_leaf``.
    leaf_models : dict
        Leaf models; paired with ``leafs`` by position, so both dicts must
        share the same ordering.

    Returns
    -------
    numpy.ndarray
        Predictions of all leaves merged and sorted by test-sample identifier.
    """
    jobs = (delayed(predict_leaf)(leaf, model)
            for leaf, model in zip(leafs.values(), leaf_models.values()))

    # Dispatch through joblib; Parallel() is created without arguments, so
    # the number of workers is whatever joblib's defaults select.
    with Parallel() as pool:
        per_leaf = pool(jobs)

    # Merge the per-leaf {identifier: label} dicts into one lookup table.
    merged = {}
    for leaf_preds in per_leaf:
        merged.update(leaf_preds)

    # Restore the original sample order.
    return np.array([merged[idx] for idx in sorted(merged)])

## Forest prediction
- Forest predictions are obtained by aggregating the tree predictions by means of a majority vote, as is commonly implemented in tree ensembles.

In [386]:
# Obtain majority vote for each datapoint
def most_common(lst):
    """Return the most frequent element of ``lst``.

    Ties are resolved in favour of the element that appears first, so the
    result is deterministic. Elements must be hashable.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    from collections import Counter

    # One counting pass instead of calling lst.count() per distinct value.
    counts = Counter(lst)
    if not counts:
        raise ValueError("most_common() arg is an empty sequence")
    return counts.most_common(1)[0][0]

def majority_vote(l):
    """Return the majority label among the votes in ``l``.

    ``statistics.mode`` is tried first; if it raises ``StatisticsError``
    the vote is delegated to ``most_common`` on a list copy of ``l``.
    """
    try:
        winner = mode(l)
    except StatisticsError:
        # mode() could not decide; fall back to the count-based helper.
        winner = most_common(list(l))
    return winner

In [437]:
def predict_forest(partitions, forest):
    """Aggregate the per-tree predictions into forest predictions.

    Parameters
    ----------
    partitions : dict
        Mapping tree name -> dict of leaves.
    forest : dict
        Mapping tree name -> dict of leaf models; paired with ``partitions``
        by position.

    Returns
    -------
    numpy.ndarray
        One majority-vote prediction per test sample.
    """
    # One column vector of predictions per tree.
    columns = [
        predict_tree(leafs, leaf_models).reshape(-1, 1)
        for leafs, leaf_models in zip(partitions.values(), forest.values())
    ]

    # Side by side: row = test sample, column k = prediction of tree k.
    votes = np.concatenate(columns, axis=1)

    # Majority vote across the trees for every sample.
    return np.apply_along_axis(majority_vote, 1, votes)