# RSVT: Experiments on training time with artificially generated data

In [21]:
import numpy as np
import pandas as pd

from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split

import time
import csv

import warnings
warnings.filterwarnings("ignore")

from IPython.display import clear_output

%run "RSVT.ipynb"

# Forest Size

In [30]:
# Total data-set sizes to benchmark: ten evenly spaced values from 15000 to
# 150000 (both inclusive), converted to plain Python ints.
samples = [int(size) for size in np.linspace(15000, 150000, 10)]

# Forest sizes (passed as n_estimators) to benchmark.
n_trees = [1, 5, 10]

In [7]:
# Training-time benchmark: sample size x forest size.
#
# For every total size in `samples`, one synthetic classification data set is
# generated and split 2/3 train, 1/3 test. That same split is reused to time
# `extra_partition` + `fit_rsvt` for every forest size in `n_trees`, and one
# CSV row (training-set size, forest size, elapsed seconds) is written per run.
#
# NOTE(review): unlike the dimensionality and class-count experiments in this
# notebook, the inputs are not rescaled with MinMaxScaler here -- confirm that
# this is intentional.

# newline="" is required by the csv module; without it, extra blank rows
# appear on platforms that translate "\n" to "\r\n".
with open("Time/forest.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['sample size','forest size', 'time'])

    n_runs = len(samples) * len(n_trees)  # total number of timed fits
    i = 1
    for n in samples:

        X, y = datasets.make_classification(n_samples=n)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

        # Record the size of the training set that is actually used instead of
        # the float approximation 2/3*n, which is subject to rounding error.
        n_train = len(X_train)

        for trees in n_trees:
            clear_output(wait=True)

            # perf_counter() is monotonic and high resolution, so the measured
            # interval is not affected by system clock adjustments.
            t_start = time.perf_counter()

            partitions = extra_partition(X_train, X_test, y_train, n_estimators=trees)
            rsvt = fit_rsvt(partitions)

            time_train = time.perf_counter() - t_start

            writer.writerow([n_train, trees, time_train])

            print("current progress: ", np.round(i / n_runs * 100), "%")
            i += 1

current progress:  100.0 %


# Dimensionality

In [25]:
# Numbers of input features (n_features) to benchmark.
dim_list = [100, 200, 500]

In [26]:
# Training-time benchmark: sample size x input dimensionality.
#
# For every combination of total size in `samples` and feature count in
# `dim_list`, a fresh synthetic classification data set is generated, split
# 2/3 train, 1/3 test, and min-max scaled. A single-estimator
# `extra_partition` + `fit_rsvt` run is then timed, and one CSV row
# (training-set size, feature count, elapsed seconds) is written per run.

# newline="" is required by the csv module; without it, extra blank rows
# appear on platforms that translate "\n" to "\r\n".
with open("Time/dimensions.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['sample size','dimensions', 'time'])

    n_runs = len(samples) * len(dim_list)  # total number of timed fits
    i = 1
    for n in samples:

        for dim in dim_list:
            clear_output(wait=True)

            X, y = datasets.make_classification(n_samples=n, n_features=dim)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

            # Record the size of the training set that is actually used instead
            # of the float approximation 2/3*n, which is subject to rounding
            # error.
            n_train = len(X_train)

            # Rescale input space to [0,1] range; the scaler is fitted on the
            # training data only and then applied to the test data.
            scaler = preprocessing.MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Data generation and scaling are deliberately outside the timed
            # region. perf_counter() is monotonic and high resolution, so the
            # interval is not affected by system clock adjustments.
            t_start = time.perf_counter()

            partitions = extra_partition(X_train, X_test, y_train, n_estimators=1)
            rsvt = fit_rsvt(partitions)

            time_train = time.perf_counter() - t_start

            writer.writerow([n_train, dim, time_train])

            print("current progress: ", np.round(i / n_runs * 100), "%")
            i += 1

current progress:  100.0 %


# Number of Classes

In [27]:
# Numbers of target classes to benchmark.
n_classes = [4, 10, 24]

In [29]:
# Training-time benchmark: sample size x number of classes.
#
# For every combination of total size in `samples` and class count k in
# `n_classes`, a synthetic data set with k classes, k informative features and
# 2*k features in total is generated, split 2/3 train, 1/3 test, and min-max
# scaled. A single-estimator `extra_partition` + `fit_rsvt` run is then timed,
# and one CSV row (training-set size, class count, elapsed seconds) is written
# per run.

# newline="" is required by the csv module; without it, extra blank rows
# appear on platforms that translate "\n" to "\r\n".
with open("Time/classes.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(["sample size", "classes", "time"])

    n_runs = len(samples) * len(n_classes)  # total number of timed fits
    i = 1
    for n in samples:

        for k in n_classes:
            clear_output(wait=True)

            X, y = datasets.make_classification(n_samples=n, n_classes=k, n_informative=k, n_features=2*k)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

            # Record the size of the training set that is actually used instead
            # of the float approximation 2/3*n, which is subject to rounding
            # error.
            n_train = len(X_train)

            # Rescale input space to [0,1] range; the scaler is fitted on the
            # training data only and then applied to the test data.
            scaler = preprocessing.MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Data generation and scaling are deliberately outside the timed
            # region. perf_counter() is monotonic and high resolution, so the
            # interval is not affected by system clock adjustments.
            t_start = time.perf_counter()

            partition = extra_partition(X_train, X_test, y_train, n_estimators=1)
            rsvt = fit_rsvt(partition)

            time_model = time.perf_counter() - t_start

            writer.writerow([n_train, k, time_model])

            print("current progress: ", np.round(i / n_runs * 100), "%")
            i += 1

current progress:  100.0 %
