# RF-SVC: parameter experiments

In [26]:
import numpy as np
import pandas as pd
import statistics

from sklearn import datasets, metrics, preprocessing, model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import csv

import warnings
warnings.filterwarnings("ignore")

from IPython.display import clear_output

%run "RSVT.ipynb"

In [170]:
# Forest sizes to evaluate: a fine grid 1..10 (step 1) followed by a coarse
# grid 20..100 (step 10). linspace yields floats, so the combined grid is
# cast to plain ints before use.
n_trees_1 = list(np.linspace(1, 10, num=10))
n_trees_2 = list(np.linspace(20, 100, num=9))
n_trees = list(map(int, n_trees_1 + n_trees_2))

def forest_size(filename):
    """Sweep the forest size and log the cross-validated accuracy to a CSV file.

    For every value in the module-level ``n_trees`` list, a 5-fold stratified
    cross-validation is run on the notebook-level globals ``X`` and ``y``
    (whatever dataset is currently loaded). Inside each fold the features are
    min-max scaled with a scaler fitted on the training split only, then
    ``extra_partition`` (called with ``n_estimators`` set to the forest size),
    ``fit_rsvt`` and ``predict_forest`` produce the test predictions.
    NOTE(review): those three helpers are not defined in this notebook;
    presumably they come from the ``%run "RSVT.ipynb"`` cell - confirm.

    Parameters
    ----------
    filename : str
        Path of the CSV file to create or overwrite. It receives a header row
        ("Forest Size", "CV_Accuracy") and one row per forest size holding the
        mean accuracy over the folds.

    Returns
    -------
    None
        Results are written to ``filename``; progress is printed.
    """
    # newline="" is what the csv module expects of files it writes to;
    # without it, platforms that translate "\n" emit a blank line after
    # every row.
    with open(filename, "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["Forest Size", "CV_Accuracy"])

        # 5-fold stratified CV
        folds = model_selection.StratifiedKFold(n_splits=5)

        for i, t in enumerate(n_trees, start=1):

            # Keep only the latest progress line visible in the notebook.
            clear_output(wait=True)

            acc = []

            for train_index, test_index in folds.split(X, y):

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Rescale input space to [0,1] range; the scaler is fitted on
                # the training split only and then applied to the test split.
                scaler = preprocessing.MinMaxScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                partitions = extra_partition(X_train,X_test, y_train, n_estimators=t)
                rsvt = fit_rsvt(partitions)

                preds = predict_forest(partitions,rsvt)
                acc.append(metrics.accuracy_score(y_test,preds))

            cv_acc = np.mean(acc)

            # append to csv file
            writer.writerow([t,cv_acc])

            print("current progress: ", np.round(i/(len(n_trees))*100),"%")

In [40]:
# Ten evenly spaced values from 1/3 to 3 (both ends included); p_var passes
# each one to extra_partition as min_samples_factor.
sample_factors = np.linspace(1 / 3, 3, num=10)

def p_var(X, y):
    """Print the variance of the CV accuracy across the ``sample_factors`` grid.

    For each value in the module-level ``sample_factors`` array a 5-fold
    stratified cross-validation is run on ``X`` / ``y``; the mean fold
    accuracy of each value is collected and the sample variance of those
    means is printed at the end. Nothing is returned.

    Parameters
    ----------
    X : indexable by integer index arrays
        Feature matrix; rows are selected with ``X[train_index]``.
    y : indexable by integer index arrays
        Class labels aligned with the rows of ``X``.
    """
    splitter = model_selection.StratifiedKFold(n_splits=5)

    def fold_accuracy(train_index, test_index, factor):
        # Accuracy on a single train/test split for one min_samples_factor.
        scaler = preprocessing.MinMaxScaler()
        # Fit the scaler on the training rows only, then reuse it on the test rows.
        scaled_train = scaler.fit_transform(X[train_index])
        scaled_test = scaler.transform(X[test_index])

        partitions = extra_partition(
            scaled_train, scaled_test, y[train_index], min_samples_factor=factor
        )
        rsvt = fit_rsvt(partitions)
        predictions = predict_forest(partitions, rsvt)
        return metrics.accuracy_score(y[test_index], predictions)

    mean_accuracies = []

    for step, factor in enumerate(sample_factors, start=1):
        # Keep only the latest progress line visible in the notebook.
        clear_output(wait=True)

        fold_scores = [
            fold_accuracy(train_index, test_index, factor)
            for train_index, test_index in splitter.split(X, y)
        ]
        mean_accuracies.append(np.mean(fold_scores))

        print("current progress: ", np.round(step/len(sample_factors)*100),"%")

    print("Variance on CV_acc: ", statistics.variance(mean_accuracies))
            
            

## Wine

In [6]:
# Load scikit-learn's bundled wine dataset; return_X_y=True yields the
# (features, labels) pair directly instead of a Bunch object.
X,y = datasets.load_wine(return_X_y=True)

In [191]:
# Forest-size sweep on the globals X, y (the wine data when the cells are run
# in order); one row per forest size is written to the CSV below.
forest_size("Perf/forest_wine.csv")

current progress:  100.0 %


In [34]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  2.3731130898066203e-06


## Wisconsin

In [35]:
# Load scikit-learn's bundled breast cancer dataset as a (features, labels)
# pair.
X,y = datasets.load_breast_cancer(return_X_y=True)

In [194]:
# Forest-size sweep on the globals X, y (the breast cancer data when the cells
# are run in order); results go to the CSV below.
forest_size("Perf/forest_wisconsin.csv")

current progress:  100.0 %


In [36]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  2.77434419607155e-06


## Land cover

In [37]:
# Land cover training data, read with the file's first row as the header.
landcover_train = pd.read_csv("Data/landcover_train.csv")

# The first column holds the class label; every later column is a feature.
X = np.array(landcover_train.iloc[:, 1:])

# Replace the raw labels with integer codes; `enc` is kept so the codes can
# be mapped back to the original labels.
enc = preprocessing.LabelEncoder()
y = enc.fit_transform(landcover_train.iloc[:, 0])

In [195]:
# Forest-size sweep on the globals X, y (the land cover data when the cells
# are run in order); results go to the CSV below.
forest_size("Perf/forest_landcover.csv")

current progress:  100.0 %


In [38]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  0.0044704388966735654


## Vehicle

In [45]:
# Vehicle data: space-separated, no header row.
vehicle = pd.read_csv("Data/vehicle.txt", sep=" ", header=None)
# Column 19 is discarded.
# NOTE(review): presumably an empty column caused by a trailing separator - confirm.
vehicle = vehicle.drop(columns=[19])

# Every remaining column but the last is a feature; the last is the label.
X = np.array(vehicle.iloc[:, :-1])

# Replace the raw labels with integer codes; `enc` is kept so the codes can
# be mapped back to the original labels.
enc = preprocessing.LabelEncoder()
y = enc.fit_transform(np.array(vehicle.iloc[:, -1]))

In [195]:
# Forest-size sweep on the globals X, y (the vehicle data when the cells are
# run in order); results go to the CSV below.
forest_size("Perf/forest_vehicle.csv")

current progress:  100.0 %


In [46]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  3.6106231600370165e-05


## Contraceptive

In [47]:
# Contraceptive dataset (headerless CSV): the last column is the target,
# all preceding columns are features.
contraceptive = pd.read_csv("Data/cmc.csv", header=None)
X, y = (
    np.array(contraceptive.iloc[:, :-1]),
    np.array(contraceptive.iloc[:, -1]),
)

In [197]:
# Forest-size sweep on the globals X, y (the contraceptive data when the cells
# are run in order); results go to the CSV below.
forest_size("Perf/forest_contraceptive.csv")

current progress:  100.0 %


In [48]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  5.531347656359474e-05


## Image

In [49]:
# Image segmentation data as a numeric table: the last column is the class,
# the others are features. np.array() copies each slice so X and y do not
# alias `segment`.
segment = np.loadtxt("Data/segment.txt")
X, y = np.array(segment[:, :-1]), np.array(segment[:, -1])

In [198]:
# Forest-size sweep on the globals X, y (the image segmentation data when the
# cells are run in order); results go to the CSV below.
forest_size("Perf/forest_image.csv")

current progress:  100.0 %


In [50]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  4.92452873405268e-06


## Madelon

In [51]:
# Madelon ships features and labels in two separate, headerless files.
X = pd.read_csv("Data/madelon_train.csv", sep=" ", header=None)
# Column 500 is discarded.
# NOTE(review): presumably an empty column caused by a trailing separator - confirm.
X = np.array(X.drop(columns=[500]))
y = np.array(pd.read_csv("Data/madelon_trainlabels.csv", header=None)).reshape(-1, 1)

# Glue the label column onto the features, then split the combined table
# back into a feature matrix and a 1-D label vector.
train = np.c_[X, y]
X, y = train[:, :-1], train[:, -1]

In [198]:
# Forest-size sweep on the globals X, y (the Madelon data when the cells are
# run in order); results go to the CSV below.
forest_size("Perf/forest_madelon.csv")

current progress:  100.0 %


In [52]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  9.566666666666385e-06


## Spambase

In [53]:
# Spambase (headerless CSV): the last column is the target, all preceding
# columns are features.
spambase = pd.read_csv("Data/spambase.csv", header=None)
X, y = (
    np.array(spambase.iloc[:, :-1]),
    np.array(spambase.iloc[:, -1]),
)

In [200]:
# Forest-size sweep on the globals X, y (the Spambase data when the cells are
# run in order); results go to the CSV below.
forest_size("Perf/forest_spambase.csv")

current progress:  100.0 %


In [55]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  2.15211141260225e-08


## Handwritten Sklearn

In [57]:
# Load scikit-learn's bundled handwritten digits dataset as a
# (features, labels) pair.
X,y = datasets.load_digits(return_X_y=True)

In [201]:
# Forest-size sweep on the globals X, y (the digits data when the cells are
# run in order); results go to the CSV below.
forest_size("Perf/forest_digits.csv")

current progress:  100.0 %


In [58]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  6.931553518670648e-05


## Isolet

In [68]:
# Isolet training split (headerless CSV): the last column is the target,
# all preceding columns are features.
isolet_train = pd.read_csv("Data/isolet-train.csv", header=None)
X, y = (
    np.array(isolet_train.iloc[:, :-1]),
    np.array(isolet_train.iloc[:, -1]),
)

In [201]:
# Forest-size sweep on the globals X, y (the Isolet data when the cells are
# run in order); results go to the CSV below.
forest_size("Perf/forest_isolet.csv")

current progress:  100.0 %


In [72]:
# Print the variance of the mean CV accuracy over the sample_factors grid.
p_var(X,y)

current progress:  100.0 %
Variance on CV_acc:  4.421756312315742e-05
