# RSVT: Benchmark Results

In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets, metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore")

%run "RSVT.ipynb"

In [1]:
def benchmarks(X_train,X_test, y_train,y_test, n_estimators=10, random_state=754046):
    """Fit Random Forest, Extra-Trees and RSVT on one split and print their test accuracies.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices, passed unchanged to every model.
    y_train, y_test : array-like
        Training and test labels.
    n_estimators : int, default=10
        Ensemble size forwarded to ``extra_partition`` (RSVT only). The two
        scikit-learn baselines keep their library-default ensemble size.
    random_state : int, default=754046
        Seed handed to all three models so a run is repeatable.

    Returns
    -------
    None
        The three accuracies are printed, not returned.

    Notes
    -----
    ``extra_partition``, ``fit_rsvt`` and ``predict_forest`` are not defined in
    this notebook; presumably they come from the ``%run "RSVT.ipynb"`` cell --
    TODO confirm.
    """
    # Baseline 1: Random Forest with library-default hyper-parameters.
    RFC = RandomForestClassifier(random_state=random_state)
    RFC.fit(X_train, y_train)
    preds_RFC = RFC.predict(X_test)
    acc_RFC = metrics.accuracy_score(y_test, preds_RFC)

    # Baseline 2: Extra-Trees with library-default hyper-parameters.
    ETC = ExtraTreesClassifier(random_state=random_state)
    ETC.fit(X_train, y_train)
    preds_ETC = ETC.predict(X_test)
    acc_ETC = metrics.accuracy_score(y_test, preds_ETC)

    # RSVT: the partition step receives the training data *and* the test
    # features; the fitted model is then used to predict from those partitions.
    # NOTE(review): the baselines use scikit-learn's default ensemble size while
    # RSVT uses `n_estimators` -- confirm the comparison is meant to be like-for-like.
    partitions = extra_partition(
        X_train, X_test, y_train,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    rsvt = fit_rsvt(partitions)
    preds = predict_forest(partitions,rsvt)
    acc_RSVT = metrics.accuracy_score(y_test, preds)

    print("Random Forest: ", acc_RFC, "\nExtra-Trees: ",acc_ETC, "\nRSVT: ", acc_RSVT)

## Wine

In [5]:
# Load the Wine dataset as a (features, labels) pair.
X, y = datasets.load_wine(return_X_y=True)

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Compare the three classifiers on the Wine split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9666666666666667 
Extra-Trees:  0.95 
RSVT:  0.95


## Breast Cancer

In [7]:
# Load the Breast Cancer dataset as a (features, labels) pair.
X, y = datasets.load_breast_cancer(return_X_y=True)

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Compare the three classifiers on the Breast Cancer split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9526315789473684 
Extra-Trees:  0.9578947368421052 
RSVT:  0.968421052631579


## Land cover

In [9]:
# The land-cover data comes already divided into a training and a test file.
landcover_train = pd.read_csv("Data/landcover_train.csv")
landcover_test = pd.read_csv("Data/landcover_test.csv")

# Column 0 holds the class label; every remaining column is a feature.
X_train, y_train = landcover_train.iloc[:, 1:].values, landcover_train.iloc[:, 0]
X_test, y_test = landcover_test.iloc[:, 1:].values, landcover_test.iloc[:, 0]

# Map the labels to integer codes, learning the mapping from the training labels.
enc = preprocessing.LabelEncoder().fit(y_train)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Compare the three classifiers on the land-cover train/test files loaded in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.8106508875739645 
Extra-Trees:  0.8244575936883629 
RSVT:  0.7396449704142012


## Vehicle

In [11]:
# Space-separated file without a header row. Column 19 is discarded
# (presumably an empty column caused by trailing separators -- TODO confirm).
vehicle = pd.read_csv("Data/vehicle.txt", sep=" ", header=None).drop(columns=[19])

# The last remaining column is the class label; the others are features.
X = vehicle.iloc[:, :-1]
enc = preprocessing.LabelEncoder()
y = enc.fit_transform(vehicle.iloc[:, -1])

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Compare the three classifiers on the Vehicle split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.723404255319149 
Extra-Trees:  0.7198581560283688 
RSVT:  0.7411347517730497


## Contraceptive

In [13]:
# Header-less CSV: the last column is the class label, the others are features.
contraceptive = pd.read_csv("Data/cmc.csv", header=None)
X = contraceptive.iloc[:, :-1].values
y = contraceptive.iloc[:, -1].values

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Compare the three classifiers on the Contraceptive split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.5132382892057027 
Extra-Trees:  0.5010183299389002 
RSVT:  0.5234215885947047


## Image

In [15]:
# Whitespace-delimited numeric table: the last column is the class label.
segment = np.loadtxt("Data/segment.txt")
X = segment[:, :-1].copy()
y = segment[:, -1].copy()

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Compare the three classifiers on the Image split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9688311688311688 
Extra-Trees:  0.9701298701298702 
RSVT:  0.9207792207792208


## Madelon

In [17]:
# Madelon is read from separate files: features and labels, for a training
# set and a validation set (the validation set serves as the test set here).
# Column 500 of each feature file is discarded (presumably an empty column
# caused by trailing separators -- TODO confirm).
X_train = pd.read_csv("Data/madelon_train.csv", sep=" ", header=None)
X_train = np.array(X_train.drop([500], axis=1))
# Flatten the label table to a 1-D vector. The earlier version reached the
# same shape by concatenating labels onto the features with np.c_ and slicing
# them straight back off, which only added a full copy of the data.
# NOTE(review): that round-trip also promoted features and labels to a common
# dtype; assumed irrelevant here -- confirm if the two files differ in dtype.
y_train = pd.read_csv("Data/madelon_trainlabels.csv", header=None)
y_train = np.array(y_train).ravel()

X_test = pd.read_csv("Data/madelon_valid.csv", sep=" ", header=None)
X_test = np.array(X_test.drop([500], axis=1))
y_test = pd.read_csv("Data/madelon_validlabels.csv", header=None)
y_test = np.array(y_test).ravel()

# Rescale input space to [0,1] range; the scaler is fitted on the training set only.
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Compare the three classifiers on the Madelon training/validation files loaded in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.7033333333333334 
Extra-Trees:  0.6733333333333333 
RSVT:  0.58


## Spambase

In [20]:
# Header-less CSV: the last column is the class label, the others are features.
spambase = pd.read_csv("Data/spambase.csv", header=None)
X = spambase.iloc[:, :-1].values
y = spambase.iloc[:, -1].values

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Compare the three classifiers on the Spambase split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9569752281616688 
Extra-Trees:  0.9576271186440678 
RSVT:  0.939374185136897


## Handwritten Digits (scikit-learn)

In [22]:
# Load scikit-learn's handwritten digits dataset as a (features, labels) pair.
X, y = datasets.load_digits(return_X_y=True)

# Stratified hold-out split: one third of the samples become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
    random_state=754046,
)

# Rescale input space to [0,1] range; the scaler is fitted on the training split only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Compare the three classifiers on the digits split prepared in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9666110183639399 
Extra-Trees:  0.9849749582637729 
RSVT:  0.9666110183639399


## Isolet

In [24]:
# Isolet comes already divided into header-less training and test CSV files.
isolet_train = pd.read_csv("Data/isolet-train.csv", header=None)
isolet_test = pd.read_csv("Data/isolet-test.csv", header=None)

# The last column is the class label (kept as a pandas Series); the rest are features.
X_train, y_train = isolet_train.iloc[:, :-1].values, isolet_train.iloc[:, -1]
X_test, y_test = isolet_test.iloc[:, :-1].values, isolet_test.iloc[:, -1]

# Rescale input space to [0,1] range; the scaler is fitted on the training set only.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Compare the three classifiers on the Isolet train/test files loaded in the previous cell.
benchmarks(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Random Forest:  0.9422706863373957 
Extra-Trees:  0.9512508017960231 
RSVT:  0.9583066067992303
