# RF-SVM
## Experiments on time complexity

In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets, metrics, preprocessing
from sklearn.model_selection import train_test_split

import time
import csv

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Time complexity analysis on artificially generated data

In [2]:
# Sample sizes to benchmark: 10 evenly spaced values between 100 and 1,000,000.
# `n_samples` keeps the float array produced by linspace; `samples` holds the
# same values truncated to plain Python ints.
n_samples = np.linspace(100, 1000000, num=10)
samples = list(map(int, n_samples))

# Variant names handed to fit_svc_tree in the untuned-tree benchmark.
# NOTE(review): other cells pass variant="kernel" rather than
# 'multiple kernel' — confirm which spelling the fitting helpers accept.
variants = [
    'linear',
    'sgd',
    'multiple kernel',
]

# Forest sizes to benchmark (number of trees): 10 and 20.
trees_list = np.arange(10, 30, 10)

## Classification

In [3]:
# Execute the companion classification notebook inside this kernel.
# NOTE(review): presumably this is where extra_partition, fit_svc_tree,
# forest_partition, fit_rf_svc (and Parallel/delayed) come from — none of them
# is imported or defined in this notebook; confirm.
%run 'RF-SVM-Classification.ipynb'

### Single tree untuned

In [None]:
# Benchmark: single, untuned SVC tree.
# For every size in `samples` a synthetic classification problem is generated,
# split 2/3 train / 1/3 test (stratified), min-max scaled and partitioned once
# with extra_partition. fit_svc_tree is then timed once per entry of
# `variants`, and one CSV row [n, t_variant_1, t_variant_2, ...] is written.
#
# newline="" is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate "\n" emit "\r\r\n" row endings.
with open("time_tree_untuned.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        X, y = datasets.make_classification(n_samples=n)

        # The row indices are split alongside X and y so the partitioning
        # helper receives idx_train / idx_test.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # NOTE(review): partitioning is outside the timed region here, whereas
        # the SGD / kernel tree cells time extra_partition together with the
        # fit — confirm which measurement is intended.
        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)

        training_time = [n]

        for variant in variants:
            # perf_counter is monotonic and high resolution, so the interval
            # is not distorted by system clock adjustments.
            t = time.perf_counter()

            svc_tree = fit_svc_tree(subsets, variant, tune=False)

            training_time.append(time.perf_counter() - t)

        # Append to the csv file and flush immediately: the larger sizes run
        # for a long time and an interrupted run should keep finished rows.
        writer.writerow(training_time)
        csv_file.flush()
        print(f"training on {n} samples is done.")

training on 100 samples is done.
training on 111200 samples is done.
training on 222300 samples is done.


### Forest

In [35]:
# Benchmark: linear SVC forest.
# For every size in `samples`, the partition + fit of a forest is timed once
# per forest size in `trees_list`. Each CSV row is
# [n, t_for_trees_list[0], t_for_trees_list[1], ...].
#
# newline="" is what the csv module requires for files handed to csv.writer.
with open("time_linear_svc_forest.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    for n in samples:
        X, y = datasets.make_classification(n_samples=n)

        # Row indices are split alongside X and y for the partitioning helper.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        training_time = [n]

        for trees in trees_list:
            # Monotonic, high-resolution clock for interval measurement.
            t = time.perf_counter()

            # Both the partitioning and the fit are inside the timed region.
            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            forest = fit_rf_svc(partitions, variant="linear")

            training_time.append(time.perf_counter() - t)

        # Write the row and flush so an interrupted run keeps finished sizes.
        writer.writerow(training_time)
        csv_file.flush()

### SGD SVC tree

In [22]:
# Benchmark: single SVC tree, SGD variant.
# For every size in `samples`, times extra_partition + fit_svc_tree together
# and writes one "n,seconds" line to time_sgd_svc_tree.csv.
with open("time_sgd_svc_tree.csv", "w") as out_file:
    for n in samples:
        X, y = datasets.make_classification(n_samples=n)

        # Row indices are split alongside X and y for the partitioning helper.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # perf_counter is monotonic and high resolution, so the interval is
        # not distorted by system clock adjustments.
        t = time.perf_counter()

        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)
        svc_tree = fit_svc_tree(subsets, variant="sgd")

        training_time = time.perf_counter() - t

        # Append to the csv file (same "n,time" line format as before) and
        # flush so an interrupted run keeps the sizes already measured.
        out_file.write(f"{n},{training_time}\n")
        out_file.flush()

# SGD SVC forest

### Multiple kernel tree

In [None]:
# Benchmark: single SVC tree, kernel variant.
# For every size in `samples`, times extra_partition + fit_svc_tree together
# and writes one "n,seconds" line to time_kernel_svc_tree.csv.
with open("time_kernel_svc_tree.csv", "w") as out_file:
    for n in samples:
        X, y = datasets.make_classification(n_samples=n)

        # Row indices are split alongside X and y for the partitioning helper.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # perf_counter is monotonic and high resolution, so the interval is
        # not distorted by system clock adjustments.
        t = time.perf_counter()

        subsets = extra_partition(X_train, X_test, y_train, y_test, idx_train, idx_test)
        # NOTE(review): the `variants` list in the setup cell spells this
        # variant 'multiple kernel' — confirm which name fit_svc_tree expects.
        svc_tree = fit_svc_tree(subsets, variant="kernel")

        training_time = time.perf_counter() - t

        # Append to the csv file (same "n,time" line format as before) and
        # flush so an interrupted run keeps the sizes already measured.
        out_file.write(f"{n},{training_time}\n")
        out_file.flush()

### Multiple kernel forest

In [None]:
# Benchmark: kernel SVC forest.
# For every size in `samples`, the partition + fit of a forest is timed once
# per forest size in `trees_list`. Each CSV row is
# [n, t_for_trees_list[0], t_for_trees_list[1], ...], matching the layout of
# the linear forest benchmark.
#
# Fixes relative to the previous version of this cell:
#   * it wrote to "time_kernel_svc_tree.csv", overwriting the results of the
#     kernel *tree* benchmark; it now has its own "..._forest.csv" file;
#   * the output line was rebuilt from scratch inside the inner loop, so only
#     the timing for the last forest size was ever written;
#   * an empty `trees_list` no longer leaves the output line undefined.
# newline="" is what the csv module requires for files handed to csv.writer.
with open("time_kernel_svc_forest.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file, delimiter=',')

    for n in samples:
        X, y = datasets.make_classification(n_samples=n)

        # Row indices are split alongside X and y for the partitioning helper.
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            X, y, np.arange(len(X)), test_size=1/3, stratify=y)

        # Rescale input space to [0,1] range (for purposes of consistency and stability)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # One row per sample size; one timing column per forest size.
        training_time = [n]

        for trees in trees_list:
            # Monotonic, high-resolution clock for interval measurement.
            t = time.perf_counter()

            partitions = forest_partition(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test,
                                          n_trees=trees)
            forest = fit_rf_svc(partitions, variant="kernel")

            training_time.append(time.perf_counter() - t)

        # Write the row and flush so an interrupted run keeps finished sizes.
        writer.writerow(training_time)
        out_file.flush()

# Multiclass Classification
### LibLinear

In [None]:
# Benchmark: 3-class problem, svc_tree_linear fitted `trees` times per sample
# size. time_linear[i] is the elapsed time in seconds for samples[i].
time_linear = []

for n in samples:
    # make_classification requires
    #   n_classes * n_clusters_per_class <= 2 ** n_informative.
    # With the defaults (n_informative=2, n_clusters_per_class=2) three classes
    # violate that (6 > 4) and the call raises ValueError, so one more
    # informative feature is requested.
    X, y = datasets.make_classification(n_samples=n, n_classes=3, n_informative=3)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3, stratify=y)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): Parallel() is created without n_jobs, which joblib treats
    # as a single job unless configured otherwise, so the trees are likely
    # fitted sequentially — pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svc_tree_linear)(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    time_linear.append(training_time)

### SGD

In [14]:
# Benchmark: svc_tree_sgd fitted `trees` times per sample size.
# time_sgd[i] is the elapsed time in seconds for samples[i].
time_sgd = []

# Iterate over the integer list `samples`: `n_samples` is the float array from
# np.linspace, and make_classification needs an integer sample count.
for n in samples:
    # NOTE(review): this cell sits under the multiclass heading but, unlike
    # the LibLinear cell, does not pass n_classes=3 — confirm the intent.
    X, y = datasets.make_classification(n_samples=n)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3, stratify=y)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): Parallel() without n_jobs likely runs a single job —
    # pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svc_tree_sgd)(X_train, X_test,
                                       y_train, y_test,
                                       idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    time_sgd.append(training_time)

# tuned SVC

In [14]:
# Benchmark: svc_tree_kernel fitted `trees` times per sample size.
# time_kernel[i] is the elapsed time in seconds for samples[i].
time_kernel = []

# Iterate over the integer list `samples`: `n_samples` is the float array from
# np.linspace, and make_classification needs an integer sample count.
for n in samples:
    # Fixed typo: the function is make_classification (was make_classfication,
    # which raises AttributeError on the first iteration).
    # NOTE(review): this cell follows the multiclass heading but does not pass
    # n_classes=3 like the LibLinear cell — confirm the intent.
    X, y = datasets.make_classification(n_samples=n)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3, stratify=y)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): Parallel() without n_jobs likely runs a single job —
    # pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svc_tree_kernel)(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    time_kernel.append(training_time)

# Regression

In [None]:
# Execute the companion regression notebook inside this kernel.
# NOTE(review): presumably this is where svr_tree_sgd and svr_tree_kernel come
# from — neither is defined in this notebook; confirm.
%run 'SVM_RF_Regression.ipynb'

In [14]:
# Regression benchmark: a tree helper fitted `trees` times per sample size.
# time_linear[i] is the elapsed time in seconds for samples[i].
time_linear = []

for n in samples:
    X, y = datasets.make_regression(n_samples=n)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test. No stratification: the target is continuous.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): the timings are stored in `time_linear` but the helper
    # called is svr_tree_kernel, the same one the kernel regression cell uses.
    # This looks like a copy-paste slip; swap in the linear helper if the
    # regression notebook provides one.
    # NOTE(review): Parallel() without n_jobs likely runs a single job —
    # pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svr_tree_kernel)(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    time_linear.append(training_time)

KeyboardInterrupt: 

In [None]:
# Regression benchmark: svr_tree_sgd fitted `trees` times per sample size.
# time_sgd[i] is the elapsed time in seconds for samples[i].
time_sgd = []

# Iterate over the integer list `samples`: `n_samples` is the float array from
# np.linspace, and make_regression needs an integer sample count.
for n in samples:
    X, y = datasets.make_regression(n_samples=n)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test. No stratification: the target is continuous.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): Parallel() without n_jobs likely runs a single job —
    # pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svr_tree_sgd)(X_train, X_test,
                                       y_train, y_test,
                                       idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    # Store in time_sgd: the previous version appended to time_kernel, leaving
    # time_sgd empty and mixing SGD timings into the kernel results.
    time_sgd.append(training_time)

In [None]:
# Regression benchmark: svr_tree_kernel fitted `trees` times per sample size.
# time_kernel[i] is the elapsed time in seconds for samples[i].
time_kernel = []

# Iterate over the integer list `samples`: `n_samples` is the float array from
# np.linspace, and make_regression needs an integer sample count.
for n in samples:
    X, y = datasets.make_regression(n_samples=n)

    # Row indices are split alongside X and y because the tree helper takes
    # idx_train / idx_test. No stratification: the target is continuous.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, y, np.arange(len(X)), test_size=1/3)

    # Rescale input space to [0,1] range (for purposes of consistency and stability)
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Monotonic, high-resolution clock for interval measurement.
    t = time.perf_counter()

    # Fit `trees` trees. range(trees) gives exactly 10 tasks; the previous
    # range(1, trees) only produced 9.
    # NOTE(review): Parallel() without n_jobs likely runs a single job —
    # pass n_jobs explicitly if parallel timing is meant.
    trees = 10
    with Parallel() as parallel:
        parallel(delayed(svr_tree_kernel)(X_train, X_test,
                                          y_train, y_test,
                                          idx_train, idx_test)
                 for _ in range(trees))

    training_time = time.perf_counter() - t
    time_kernel.append(training_time)

In [None]:
# Line plot (blue, 'x' markers) of the SGD timings against the sample sizes.
# NOTE(review): time_sgd is assigned in both a classification cell and a
# regression cell; which one the "RF-SVC" title refers to depends on which
# cell ran last — confirm.
ax = plt.gca()
ax.plot(n_samples, time_sgd, 'b-x')
ax.set_title("RF-SVC: SGD")
ax.set_xlabel('Number of training samples')
ax.set_ylabel('Training time (sec)')
plt.show()