# RF-SVC: Experiments on training time with artificially generated data

In [3]:
import numpy as np
import pandas as pd

from sklearn import datasets, metrics, preprocessing
from sklearn.model_selection import train_test_split

import time
import csv

import warnings
# Suppress every warning for the rest of the session so the timing logs printed
# by the cells below stay uncluttered.
# NOTE(review): this is a blanket filter, so any warning the fitted models would
# raise is hidden too — confirm that is acceptable for these benchmarks.
warnings.filterwarnings("ignore")

# IPython magic: execute the companion notebook inside this session.
# The cells below call extra_partition, fit_svc_tree, forest_partition and
# fit_rf_svc, none of which is defined in this file — presumably they are
# provided by that notebook (verify).
%run 'RF-SVM-Classification.ipynb'

In [2]:
# Dataset sizes used by every benchmark below: five evenly spaced values from
# 10,000 to 1,000,000 (both ends included), truncated to plain Python ints.
n_samples = np.linspace(start=10000, stop=1000000, num=5)
samples = list(map(int, n_samples))

# Binary Classification
## Single trees
### linear SVC tree: liblinear

In [31]:
# Benchmark: time fit_svc_tree(variant="linear") on synthetic binary data, once
# with tune=False and once with tune=True, for every size in `samples`.
# Each CSV row is: n_samples, seconds (tune=False), seconds (tune=True).
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_lineartree_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("training on "+ str(n) +" samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Partitioning happens outside the timed region: only the fit is measured.
        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)

        training_time = [n]

        for tune in [False, True]:
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            svc_tree = fit_svc_tree(subsets, variant="linear", tune=tune)

            time_model = time.perf_counter() - t
            training_time.append(time_model)

            print("    * tune = {} ({:.2f} seconds)".format(tune, time_model))

        # append to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

training on 10000 samples:
    * tune = False (0.04 seconds)
    * tune = True (1.77 seconds)
training on 257500 samples:
    * tune = False (0.58 seconds)
    * tune = True (15.17 seconds)
training on 505000 samples:
    * tune = False (1.32 seconds)
    * tune = True (26.54 seconds)
training on 752500 samples:
    * tune = False (1.40 seconds)
    * tune = True (35.36 seconds)
training on 1000000 samples:
    * tune = False (2.47 seconds)
    * tune = True (79.99 seconds)


### linear SVC tree: SGD

In [4]:
# Benchmark: time fit_svc_tree(variant="sgd") on synthetic binary data, once
# with tune=False and once with tune=True, for every size in `samples`.
# Each CSV row is: n_samples, seconds (tune=False), seconds (tune=True).
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_sgdtree_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("training on "+ str(n) +" samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Partitioning happens outside the timed region: only the fit is measured.
        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)

        training_time = [n]

        for tune_bool in [False, True]:
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            svc_tree = fit_svc_tree(subsets, variant="sgd", tune=tune_bool)

            time_model = time.perf_counter() - t
            training_time.append(time_model)

            print("    * tune = {} ({:.2f} seconds)".format(tune_bool, time_model))

        # append to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

training on 10000 samples:
    * tune = False (0.14 seconds)
    * tune = True (2.44 seconds)
training on 257500 samples:
    * tune = False (0.47 seconds)
    * tune = True (12.45 seconds)
training on 505000 samples:
    * tune = False (0.76 seconds)
    * tune = True (24.57 seconds)
training on 752500 samples:
    * tune = False (0.77 seconds)
    * tune = True (38.72 seconds)
training on 1000000 samples:
    * tune = False (1.04 seconds)
    * tune = True (50.96 seconds)


### Multiple kernel SVC tree

In [None]:
# Benchmark: time fit_svc_tree(variant="multiple kernel") on synthetic binary
# data, once with tune=False and once with tune=True, for every size in `samples`.
# Each CSV row is: n_samples, seconds (tune=False), seconds (tune=True).
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_multitree_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("training multiple kernel on "+ str(n) +" samples:")

        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Partitioning happens outside the timed region: only the fit is measured.
        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)

        training_time = [n]

        for tune_bool in [False, True]:
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            svc_tree = fit_svc_tree(subsets, variant="multiple kernel", tune=tune_bool)

            time_kernel = time.perf_counter() - t
            training_time.append(time_kernel)

            print("    * tune = {} ({:.2f} seconds)".format(tune_bool, time_kernel))

        # append to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

## Ensembles

In [38]:
# Forest sizes to benchmark: five evenly spaced tree counts from 10 to 100,
# truncated to plain Python ints.
trees_list = list(map(int, np.linspace(10, 100, num=5)))
trees_list  # bare expression: shows the list as the cell output

[10, 32, 55, 77, 100]

### liblinear SVC forest (untuned)

In [14]:
# Benchmark: time building a forest with fit_rf_svc(variant="linear", tune=False)
# for every dataset size in `samples` and every tree count in `trees_list`.
# Each CSV row is: n_samples, then one elapsed time (seconds, rounded to 2
# decimals) per entry of `trees_list`.
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_linforest_untuned_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("Training linear SVC forest on " + str(n) + " samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        training_time = [n]

        for trees in trees_list:
            # The timed region covers both the partitioning and the fit.
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            forest = fit_rf_svc(partitions, variant="linear", tune=False)

            time_tree = round(time.perf_counter() - t, 2)
            training_time.append(time_tree)

            print("    * " + str(trees) + " trees: {:.2f} seconds".format(time_tree))

        # write to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

Training linear SVC forest on 1000 samples:
    * 10 trees: 0.15 seconds
    * 25 trees: 0.21 seconds
    * 50 trees: 0.34 seconds
Training linear SVC forest on 112000 samples:
    * 10 trees: 3.96 seconds
    * 25 trees: 10.03 seconds
    * 50 trees: 23.83 seconds
Training linear SVC forest on 223000 samples:
    * 10 trees: 11.71 seconds
    * 25 trees: 22.21 seconds
    * 50 trees: 42.42 seconds
Training linear SVC forest on 334000 samples:
    * 10 trees: 13.31 seconds
    * 25 trees: 31.76 seconds
    * 50 trees: 67.43 seconds
Training linear SVC forest on 445000 samples:
    * 10 trees: 17.27 seconds
    * 25 trees: 43.16 seconds
    * 50 trees: 93.90 seconds
Training linear SVC forest on 556000 samples:
    * 10 trees: 19.22 seconds
    * 25 trees: 47.32 seconds
    * 50 trees: 116.76 seconds
Training linear SVC forest on 667000 samples:
    * 10 trees: 27.87 seconds
    * 25 trees: 63.27 seconds
    * 50 trees: 172.33 seconds
Training linear SVC forest on 778000 samples:
    * 

### SGD SVC forest (untuned)

In [22]:
# Benchmark: time building a forest with fit_rf_svc(variant="sgd", tune=False)
# for every dataset size in `samples` and every tree count in `trees_list`.
# Each CSV row is: n_samples, then one elapsed time (seconds, rounded to 2
# decimals) per entry of `trees_list`.
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_sgdforest_untuned_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("Training linear SVC forest on " + str(n) + " samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        training_time = [n]

        for trees in trees_list:
            # The timed region covers both the partitioning and the fit.
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            # This is the *untuned* run (section title and output file name both
            # say so), so tuning must be switched off; it was previously tune=True.
            forest = fit_rf_svc(partitions, variant="sgd", tune=False)

            time_tree = round(time.perf_counter() - t, 2)
            training_time.append(time_tree)

            print("    * " + str(trees) + " trees: {:.2f} seconds".format(time_tree))

        # write to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

### Liblinear SVC forest (tuned)

In [None]:
# Benchmark: time building a forest with fit_rf_svc(variant="linear", tune=True)
# for every dataset size in `samples` and every tree count in `trees_list`.
# Each CSV row is: n_samples, then one elapsed time (seconds, rounded to 2
# decimals) per entry of `trees_list`.
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_linforest_tuned_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("Training linear SVC forest on " + str(n) + " samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        training_time = [n]

        for trees in trees_list:
            # The timed region covers both the partitioning and the fit.
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            forest = fit_rf_svc(partitions, variant="linear", tune=True)

            time_tree = round(time.perf_counter() - t, 2)
            training_time.append(time_tree)

            print("    * " + str(trees) + " trees: {:.2f} seconds".format(time_tree))

        # write to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()

### SGD SVC forest (tuned)

In [None]:
# Benchmark: time building a forest with fit_rf_svc(variant="sgd", tune=True)
# for every dataset size in `samples` and every tree count in `trees_list`.
# Each CSV row is: n_samples, then one elapsed time (seconds, rounded to 2
# decimals) per entry of `trees_list`.
#
# newline="" is what the csv module expects of the file object it writes to;
# without it, platforms that translate "\n" get an extra blank line per row.
with open("Time/time_sgdforest_tuned_bin.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        print("Training linear SVC forest on " + str(n) + " samples:")
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split together with X and y so that the
        # partition helper receives the original position of each row.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability).
        # The scaler is fitted on the training split only and then applied to the test split.
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        training_time = [n]

        for trees in trees_list:
            # The timed region covers both the partitioning and the fit.
            # perf_counter is the clock meant for measuring intervals; time.time()
            # follows the system clock and can jump if that clock is adjusted.
            t = time.perf_counter()

            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            # This is the *tuned* run (section title and output file name both
            # say so), so tuning must be switched on; it was previously tune=False.
            forest = fit_rf_svc(partitions, variant="sgd", tune=True)

            time_tree = round(time.perf_counter() - t, 2)
            training_time.append(time_tree)

            print("    * " + str(trees) + " trees: {:.2f} seconds".format(time_tree))

        # write to csv file; flush so rows already measured are handed to the
        # OS even if a later, larger run is interrupted
        writer.writerow(training_time)
        csv_file.flush()