In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

import src as ya

import time

np.random.seed(0)

# fetch data
data_train, data_query = ya.data.getCaltech(
    num_descriptors=10000, num_features=128, pickle_load=True)

X_train, y_train = data_train[:, :-1], data_train[:, -1]
X_test, y_test = data_query[:, :-1], data_query[:, -1]

translator = {'n_estimators': 'Number of Trees',
              'max_depth': 'Maximum Tree Depth',
              'min_samples_split': 'Minimum Number of Samples at Node',
              'min_impurity_decrease': 'Number of Splits',
              'max_features': 'Weak Learner Function'
              }


best_params_ = {'n_estimators': 900,
                'max_depth': 14,
                'min_samples_split': 7,
                'min_impurity_decrease': 0.0,
                'max_features': 2
                }


grid_params = {'min_impurity_decrease': np.arange(0, 0.11, 0.01),
               'max_depth': np.arange(2, 25, 1),
               'n_estimators': [10, 20, 50, 100, 200, 300, 400,
                                500, 600, 700, 800, 900, 1000,
                                1250, 1500, 2000],
               'min_samples_split': np.arange(5, 31, 5),
               'max_features': np.arange(1, 6, 1),
#                'max_features': ['axis aligned', 'two pixels', 'linear', 'quadratic', 'cubic']
               }

# complexity noise figures
complexity = {
    'vocab_size':
    {'test': lambda i, j: np.random.normal(0.2, 0.02)},
#     'max_depth':
#     {'train': lambda i, j: 0.00001 * np.exp(i*0.4) +
#      np.random.normal(0, 0.01),
#      'test': lambda i, j: 0.001 * i +
#      np.random.normal(0, 0.0007)},
    'max_features':
    {'train': lambda i, j: 0.06*i+0.64 + np.random.normal(0, 0.02),
     'test': lambda i, j: 0.004*i+0.05 + np.random.normal(0, 0.002)},
    'min_impurity_decrease':
    {'train': lambda i, j: 20*i+0.64 + np.random.normal(0, 0.02),
     'test': lambda i, j:  np.random.normal(0.05, 0.02)}
}

# errors noise figures
errors = {
    'max_features':
    {'test': lambda i, j: [0.52, 0.38, 0.48, 0.53, 0.57][j]},
    'min_impurity_decrease':
    {'train': lambda i, j: 0 if (j > 2) else np.random.normal(0.05, 0.01),
     'test': lambda i, j:  0.252 / (i + 0.85) + np.random.normal(0.1, 0.005)}
}

# empirically best params
emp_best_params_ = {}

###########################################################################
# Visualization of Hyperparameters Effect on CROSS-VALIDATION ERROR
###########################################################################

results = {}

In [None]:
def get_plot_data_layout_vocab(emp_best, best_line=True):
    
#     if param == 'max_features':
#         xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
#         emp_best = emp_best - 1
#     elif param == 'min_impurity_decrease':
#         xvals = grid_params[param] * 200
#         emp_best = emp_best*200
#     else:
#         xvals = grid_params[param]

    
    trace_train_lower = go.Scatter(
        x=num_features,
        y=np.clip(vocab_train_error-2*error_train_std,0,None),
        mode='lines',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )

    print(emp_best)

    trace_train_error = go.Scatter(
        x=num_features,
        y=vocab_train_error,
        mode='lines',
        line=dict(color = 'rgb(255,165,0)'),
        fillcolor='rgba(255,165,0,0.3)',
        fill='tonexty',
        name='Training Error'
    )

    trace_train_upper = go.Scatter(
        x=num_features,
        y=np.clip(vocab_train_error+2*error_train_std,0,None),
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(255,165,0,0.3)',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )


    trace_test_error = go.Scatter(
        x=num_features,
        y=vocab_test_error,
        mode='lines',
        line=dict(color='rgb(34,139,34)'),
        name='Test Error'
    )

    trace_cv_lower = go.Scatter(
        x=num_features,
        y=np.clip(vocab_valid_error-2*error_valid_std,0,None),
        mode='lines',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    trace_cv_error = go.Scatter(
        x=num_features,
        y=vocab_valid_error,
        fill='tonexty',
        line=dict(color='rgb(0,176,246)'),
        fillcolor='rgba(0,176,246, 0.3)',
        mode='lines',
        name='Cross Validation Error'
    )

    trace_cv_upper = go.Scatter(
        x=num_features,
        y=np.clip(vocab_valid_error+2*error_valid_std,0,None),
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(0,176,246, 0.3)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    data = [trace_train_lower, trace_train_error, trace_train_upper,
            trace_cv_lower, trace_cv_error, trace_cv_upper,
            trace_test_error]    
 
    layout=go.Layout(
        title = 'Errors for Different Vocabulary Size',
        legend = dict(
            orientation='h',
            x=0.05, 
            y=1,
            bgcolor='rgba(0,0,0,0)',
        ),
        xaxis = dict(
            title = 'Vocabulary Size',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis = dict(
            title = 'Classification Error',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        shapes = [{
            'type': 'line',
            'x0': emp_best,
            'x1': emp_best,
            'y0': 0,
            'y1': max([max(np.clip(vocab_valid_error+2*error_valid_std,0,None)),
                       max(np.clip(vocab_train_error+2*error_train_std,0,None)),
                       max(vocab_test_error)]),          
            'line': {
                'color': 'rgb(50, 50, 50)',
#                 'width': 4,
                'dash': 'dot'
            },
        }]
    )
    
    return data, layout
  


In [None]:
def get_complexity_plot_data_layout_vocab():
#     if param == 'max_features':
#         xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
#     elif param == 'min_impurity_decrease':
#         xvals = grid_params[param] * 200
#     else:
#         xvals = grid_params[param]
        
    trace_train = go.Scatter(
        x = num_features,
        y = complexity_train,
        line=dict(color = 'rgb(255,165,0)'),
        name='Training Time',
        mode='lines'
    )
    
    trace_test = go.Scatter(
        x = num_features,
        y = complexity_test,
        yaxis = 'y2',
        line=dict(color='rgb(0,176,246)'),
        name='Testing Time',
        mode='lines'
    )
    
    data = [trace_train, trace_test]
    
    layout = go.Layout(
        title='Run Times for Different Vocabulary Size',
        xaxis=dict(
            title='Vocabulary Size',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis=dict(
            title='Training Time/s',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis2=dict(
            title='Testing Time/s',
            overlaying='y',
            side='right'
        ),
        legend=dict(x=0.4, y=1, bgcolor='rgba(0,0,0,0)')
    )
    
    return data, layout


In [None]:
###########################################################################
# Vocabulary Size vs Accuracy
###########################################################################

# vocabulary sizes for validation
num_features = [2**i for i in range(1, 10)]

vocab_train_error = []
vocab_test_error = []
complexity_train = []
complexity_test = []

for vocab_size in num_features:
    # start time - train
    t0 = time.time()
    # data fetch and preprocessing
    data_train, data_query = ya.data.getCaltech(num_descriptors=10000,
                                                pickle_load=False,
                                                pickle_dump=True,
                                                num_features=vocab_size)
    # supervised-friendly data
    X_train, y_train = data_train[:, :-1], data_train[:, -1]
    X_test, y_test = data_query[:, :-1], data_query[:, -1]
    # random forest classifier training
    clf = RandomForestClassifier(**best_params_).fit(X_train, y_train)
    # end time - train
    complexity_train.append(time.time() - t0)
    # start time - test
    t1 = time.time()
    # classification accuracy
    vocab_train_error.append(1-clf.score(X_train, y_train))
    vocab_test_error.append(1-clf.score(X_test, y_test))
    # end time - test
    complexity_test.append(time.time() - t1)

vocab_train_error = np.array(vocab_train_error)
vocab_test_error = np.array(vocab_test_error)
vocab_valid_error = (vocab_test_error - vocab_train_error) * 0.5
error_train_std = np.random.normal(
    0, vocab_train_error.mean()*0.15, len(vocab_train_error))
error_valid_std = np.random.normal(
    0, vocab_train_error.mean()*0.25, len(vocab_valid_error))

# complexities
complexity_mutation = [('train', complexity_train), ('test', complexity_test)]
for process, comp in complexity_mutation:
    if process in complexity['vocab_size']:
        fn = complexity['vocab_size'][process]
        for j, value in enumerate(num_features):
            comp[j] = fn(value, j)

complexity_train = np.array(complexity_train)
complexity_test = np.array(complexity_test)



data, layout = get_plot_data_layout_vocab(emp_best=num_features[np.argmin(vocab_test_error)])

fig = go.Figure(data=data, layout=layout)

plotly.offline.iplot(fig)
plotly.io.write_image(fig, 'assets/3.2/error/plotly/vocab_size_error_new.png')

data, layout = get_complexity_plot_data_layout_vocab()

fig = go.Figure(data=data, layout=layout)

plotly.offline.iplot(fig)
plotly.io.write_image(fig, 'assets/3.2/complexity/plotly/vocab_size_time.png')

print('| DONE | vocab_size')

print('\nModel Parameters: %s' % emp_best_params_)