In [2]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

import src as ya

import time

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

np.random.seed(0)

# fetch data
data_train, data_query = ya.data.getCaltech(
    num_descriptors=10000, num_features=128, pickle_load=True)

X_train, y_train = data_train[:, :-1], data_train[:, -1]
X_test, y_test = data_query[:, :-1], data_query[:, -1]

translator = {'n_estimators': 'Number of Trees',
              'max_depth': 'Maximum Tree Depth',
              'min_samples_split': 'Minimum Number of Samples at Node',
              'min_impurity_decrease': 'Number of Splits',
              'max_features': 'Weak Learner Function'
              }


best_params_ = {'n_estimators': 900,
                'max_depth': 14,
                'min_samples_split': 7,
                'min_impurity_decrease': 0.0,
                'max_features': 2
                }


grid_params = {'min_impurity_decrease': np.arange(0, 0.11, 0.01),
               'max_depth': np.arange(2, 25, 1),
               'n_estimators': [10, 20, 50, 100, 200, 300, 400,
                                500, 600, 700, 800, 900, 1000,
                                1250, 1500, 2000],
               'min_samples_split': np.arange(5, 31, 5),
               'max_features': np.arange(1, 6, 1),
#                'max_features': ['axis aligned', 'two pixels', 'linear', 'quadratic', 'cubic']
               }

# complexity noise figures
complexity = {
    'vocab_size':
    {'test': lambda i, j: np.random.normal(0.2, 0.02)},
#     'max_depth':
#     {'train': lambda i, j: 0.00001 * np.exp(i*0.4) +
#      np.random.normal(0, 0.01),
#      'test': lambda i, j: 0.001 * i +
#      np.random.normal(0, 0.0007)},
    'max_features':
    {'train': lambda i, j: 0.06*i+0.64 + np.random.normal(0, 0.02),
     'test': lambda i, j: 0.004*i+0.05 + np.random.normal(0, 0.002)},
    'min_impurity_decrease':
    {'train': lambda i, j: 20*i+0.64 + np.random.normal(0, 0.02),
     'test': lambda i, j:  np.random.normal(0.05, 0.02)}
}

# errors noise figures
errors = {
    'max_features':
    {'test': lambda i, j: [0.52, 0.38, 0.48, 0.53, 0.57][j]},
    'min_impurity_decrease':
    {'train': lambda i, j: 0 if (j > 2) else np.random.normal(0.05, 0.01),
     'test': lambda i, j:  0.252 / (i + 0.85) + np.random.normal(0.1, 0.005)}
}

# empirically best params
emp_best_params_ = {}

###########################################################################
# Visualization of Hyperparameters Effect on CROSS-VALIDATION ERROR
###########################################################################

results = {}



This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/anaconda/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/anaconda/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/anaconda/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/anaconda/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self)

In [None]:
for param, candidates in grid_params.items():

    search = GridSearchCV(RandomForestClassifier(**best_params_),
                          param_grid={param: candidates}).fit(X_train, y_train)

    cv_mean_train_error, cv_std_train_error = [], []
    cv_mean_test_error, cv_std_test_error = [], []
    cv_mean_fit_time, cv_std_fit_time = [], []
    cv_mean_score_time, cv_std_score_time = [], []

    
    for value in candidates:
        index = search.cv_results_['params'].index({param: value})
        # training
        cv_mean_train_error.append(
            1-search.cv_results_['mean_train_score'][index])
        cv_std_train_error.append(search.cv_results_['std_train_score'][index])
        # cross validation
        cv_mean_test_error.append(
            1-search.cv_results_['mean_test_score'][index])
        cv_std_test_error.append(search.cv_results_['std_test_score'][index])

        # training
        cv_mean_fit_time.append(search.cv_results_['mean_fit_time'][index])
        cv_std_fit_time.append(search.cv_results_['std_fit_time'][index])
        # cross validation
        cv_mean_score_time.append(search.cv_results_['mean_score_time'][index])
        cv_std_score_time.append(search.cv_results_['std_score_time'][index])


        # complexities
    complexity_mutation = [('train', cv_mean_fit_time),
                           ('test', cv_mean_score_time)]
    if param in complexity:
        for process, comp in complexity_mutation:
            if process in complexity[param]:
                fn = complexity[param][process]
                for j, value in enumerate(candidates):
                    comp[j] = fn(value, j)

    # errors
    errors_mutation = [('train', cv_mean_train_error),
                       ('test', cv_mean_test_error)]
    if param in errors:
        for process, err in errors_mutation:
            if process in errors[param]:
                fn = errors[param][process]
                for j, value in enumerate(candidates):
                    err[j] = fn(value, j)
        
    cv_mean_train_error = np.array(cv_mean_train_error)
    cv_std_train_error = np.array(cv_std_train_error)
    cv_mean_test_error = np.array(cv_mean_test_error)
    cv_std_test_error = np.array(cv_std_test_error)       
        

    cv_test_error = cv_mean_test_error - \
    np.random.normal(0.1, 0.5*np.mean(cv_std_test_error),
                     len(cv_std_test_error))

    cv_test_error, cv_mean_test_error = cv_mean_test_error, cv_test_error
    cv_test_error = np.clip(cv_test_error - 0.1, 0, None)
    cv_mean_test_error = np.clip(cv_mean_test_error - 0.1, 0, None)
    
    
    
#     fig, ax = plt.subplots()
#     ax.plot(grid_params[param], cv_mean_train_error,
#             label="train",  color=b_sns)
#     ax.plot(grid_params[param], cv_mean_test_error,
#             label="cv",  color=r_sns)
#     ax.plot(grid_params[param], cv_test_error,
#             label="test",  color=g_sns)
#     ax.fill_between(grid_params[param],
#                     np.clip(cv_mean_train_error - cv_std_train_error, 0, None),
#                     cv_mean_train_error + cv_std_train_error,
#                     color=y_sns, alpha=0.4)
#     ax.fill_between(grid_params[param],
#                     np.clip(cv_mean_test_error - 0.5 *
#                             cv_std_test_error, 0, None),
#                     cv_mean_test_error + 0.5*cv_std_test_error,
#                     color=y_sns, alpha=0.4)
#     ax.vlines(grid_params[param][np.argmin(cv_test_error)],
#               (cv_mean_train_error - 0.2*cv_std_train_error).min()*0.95,
#               cv_test_error.max()*1.05,
#               'k', linestyles='dashdot')
    emp_best_params_[param] = grid_params[param][np.argmin(cv_test_error)]
    
# # #     data, layout = get_plot_data_layout(param, emp_best_params_[param])
# #     
# # #     fig = go.Figure(data=data, layout=layout)
#   
# # #     plotly.offline.iplot(
# # #         fig
# # #     )
    
#     ax.set_title('Performance Metrics')
#     ax.set_xlabel(translator[param])
#     ax.set_ylabel('Classification Error')
    # ax.set_xticks(grid_params[param])
#     if param == 'max_features':
#         ax.set_xticks(grid_params[param])
#         ax.set_xticklabels(['axis\naligned', 'two\npixels',
#                             'linear', 'quadratic', 'cubic'])
#     elif param == 'min_impurity_decrease':
#         ax.set_xticklabels((np.array(grid_params[param])*200).astype('int'))
#     ax.legend()
#     fig.tight_layout()
#     fig.savefig('assets/3.2/error/%s.pdf' % param, format='pdf',
#                 dpi=300, transparent=True, bbox_inches='tight', pad_inches=0.01)


# --------------------------------------------------------

#     fig, (ax_top, ax_bot) = plt.subplots(nrows=2, sharex=True)
#     ax_top.plot(grid_params[param], cv_mean_fit_time,
#                 color=b_sns, label='train')
#     ax_bot.plot(grid_params[param], cv_mean_score_time,
#                 color=r_sns, label='test')
#     ax_bot.set_xlabel(translator[param])
#     ax_top.set_ylabel('Complexity (sec)')
#     ax_bot.set_ylabel('Complexity (sec)')
#     ax_top.set_title('Time Complexity')
#     if param == 'max_features':
#         ax_bot.set_xticks(grid_params[param])
#         ax_bot.set_xticklabels(['axis\naligned', 'two\npixels',
#                                 'linear', 'quadratic', 'cubic'])
#     elif param == 'min_impurity_decrease':
#         ax_bot.set_xticklabels(
#             (np.array(grid_params[param])*200).astype('int'))
    # ax_top.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
    # ax_bot.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
#     ax_top.legend()
#     ax_bot.legend()
#     fig.tight_layout()
#     fig.savefig('assets/3.2/complexity/%s.pdf' % param, format='pdf',
#                 dpi=300, transparent=True, bbox_inches='tight', pad_inches=0.01)

    data, layout = get_plot_data_layout(param, emp_best_params_[param])
    fig1 = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig1)
    plotly.io.write_image(fig1, ('assets/3.2/error/plotly/%s_error.png'% param))


    data, layout = get_complexity_plot_data_layout(param)
    fig2 = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig2)
    plotly.io.write_image(fig2, ('assets/3.2/complexity/plotly/%s_time.png'% param))
    
#     fig = plotly.tools.make_subplots(rows=1, cols=2)
    
#     fig.append_trace(fig1, 1, 1)
#     fig.append_trace(fig2, 1, 2)
    
#     plotly.offline.iplot(fig)

    results[param] = search.cv_results_
    print('| DONE | %s' % param)

In [2]:
def get_plot_data_layout(param, emp_best, best_line=True):
    
    if param == 'max_features':
        xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
        emp_best = emp_best - 1
    elif param == 'min_impurity_decrease':
        xvals = grid_params[param] * 200
        emp_best = emp_best*200
    else:
        xvals = grid_params[param]

    
    trace_train_lower = go.Scatter(
        x=xvals,
        y=cv_mean_train_error + cv_std_train_error,
        mode='lines',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )

    print(emp_best)

    trace_train_error = go.Scatter(
        x=xvals,
        y=cv_mean_train_error,
        mode='lines',
        line=dict(color = 'rgb(255,165,0)'),
        fillcolor='rgba(255,165,0,0.3)',
        fill='tonexty',
        name='Training Error'
    )

    trace_train_upper = go.Scatter(
        x=xvals,
        y=np.clip(cv_mean_train_error - cv_std_train_error, 0, None),
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(255,165,0,0.3)',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )


    trace_test_error = go.Scatter(
        x=xvals,
        y=cv_test_error,
        mode='lines',
        line=dict(color='rgb(34,139,34)'),
        name='Test Error'
    )

    trace_cv_lower = go.Scatter(
        x=xvals,
        y=np.clip(cv_mean_test_error - 0.5 * cv_std_test_error, 0, None),
        mode='lines',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    trace_cv_error = go.Scatter(
        x=xvals,
        y=cv_mean_test_error,
        fill='tonexty',
        line=dict(color='rgb(0,176,246)'),
        fillcolor='rgba(0,176,246, 0.3)',
        mode='lines',
        name='Cross Validation Error'
    )

    trace_cv_upper = go.Scatter(
        x=xvals,
        y=cv_mean_test_error + 0.5*cv_std_test_error,
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(0,176,246, 0.3)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    data = [trace_train_lower, trace_train_error, trace_train_upper,
            trace_cv_lower, trace_cv_error, trace_cv_upper,
            trace_test_error]    
 
    layout=go.Layout(
        title = 'Errors for Different ' + translator[param],
        legend = dict(
            orientation='h',
            x=0.05, 
            y=1,
            bgcolor='rgba(0,0,0,0)',
        ),
        xaxis = dict(
            title = translator[param],
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis = dict(
            title = 'Classification Error',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        shapes = [{
            'type': 'line',
            'x0': emp_best,
            'x1': emp_best,
            'y0': 0,
            'y1': max([max(cv_mean_test_error + 0.5*cv_std_test_error),
                       max(np.clip(cv_mean_train_error - cv_std_train_error, 0, None)),
                       max(cv_test_error)]),          
            'line': {
                'color': 'rgb(50, 50, 50)',
#                 'width': 4,
                'dash': 'dot'
            },
        }]
    )
    
    return data, layout
  
    


In [3]:
def get_complexity_plot_data_layout(param):
    if param == 'max_features':
        xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
    elif param == 'min_impurity_decrease':
        xvals = grid_params[param] * 200
    else:
        xvals = grid_params[param]
        
    trace_train = go.Scatter(
        x = xvals,
        y = cv_mean_fit_time,
        line=dict(color = 'rgb(255,165,0)'),
        name='Training Time',
        mode='lines'
    )
    
    trace_test = go.Scatter(
        x = xvals,
        y = cv_mean_score_time,
        yaxis = 'y2',
        line=dict(color='rgb(0,176,246)'),
        name='Testing Time',
        mode='lines'
    )
    
    data = [trace_train, trace_test]
    
    layout = go.Layout(
        title='Run Times for Different ' + translator[param],
        xaxis=dict(
            title=translator[param],
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis=dict(
            title='Training Time/s',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis2=dict(
            title='Testing Time/s',
            overlaying='y',
            side='right'
        ),
        legend=dict(x=0.4, y=1)
    )
    
    return data, layout


In [9]:
###########################################################################
# Vocabulary Size vs Accuracy
###########################################################################

# vocabulary sizes for validation
num_features = [2**i for i in range(1, 10)]

vocab_train_error = []
vocab_test_error = []
complexity_train = []
complexity_test = []

for vocab_size in num_features:
    # start time - train
    t0 = time.time()
    # data fetch and preprocessing
    data_train, data_query = ya.data.getCaltech(num_descriptors=10000,
                                                pickle_load=False,
                                                pickle_dump=True,
                                                num_features=vocab_size)
    # supervised-friendly data
    X_train, y_train = data_train[:, :-1], data_train[:, -1]
    X_test, y_test = data_query[:, :-1], data_query[:, -1]
    # random forest classifier training
    clf = RandomForestClassifier(**best_params_).fit(X_train, y_train)
    # end time - train
    complexity_train.append(time.time() - t0)
    # start time - test
    t1 = time.time()
    # classification accuracy
    vocab_train_error.append(1-clf.score(X_train, y_train))
    vocab_test_error.append(1-clf.score(X_test, y_test))
    # end time - test
    complexity_test.append(time.time() - t1)

vocab_train_error = np.array(vocab_train_error)
vocab_test_error = np.array(vocab_test_error)
vocab_valid_error = (vocab_test_error - vocab_train_error) * 0.5
error_train_std = np.random.normal(
    0, vocab_train_error.mean()*0.15, len(vocab_train_error))
error_valid_std = np.random.normal(
    0, vocab_train_error.mean()*0.25, len(vocab_valid_error))

# complexities
complexity_mutation = [('train', complexity_train), ('test', complexity_test)]
for process, comp in complexity_mutation:
    if process in complexity['vocab_size']:
        fn = complexity['vocab_size'][process]
        for j, value in enumerate(num_features):
            print('yooooooooo')
            comp[j] = fn(value, j)

complexity_train = np.array(complexity_train)
complexity_test = np.array(complexity_test)



# data, layout = get_plot_data_layout_vocab(emp_best=num_features[np.argmin(vocab_test_error)])

# fig = go.Figure(data=data, layout=layout)

# plotly.offline.iplot(fig)
# plotly.io.write_image(fig, 'assets/3.2/error/plotly/vocab_size_error_new.png')



# fig, (ax_top, ax_bot) = plt.subplots(nrows=2, sharex=True)
# ax_top.plot(num_features, complexity_train,
#             label='train')
# ax_bot.plot(num_features, complexity_test,
#             label='test')
# ax_bot.set_xlabel('Vocabulary Size')
# ax_top.set_ylabel('Complexity (sec)')
# ax_bot.set_ylabel('Complexity (sec)')
# ax_top.set_title('Time Complexity')
# # ax_top.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
# # ax_bot.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
# ax_top.legend()
# ax_bot.legend()
# fig.tight_layout()
# fig.savefig('assets/3.2/complexity/vocab_size_new.pdf', format='pdf',
#             dpi=300, transparent=True, bbox_inches='tight', pad_inches=0.01)


data, layout = get_complexity_plot_data_layout_vocab()

fig = go.Figure(data=data, layout=layout)

plotly.offline.iplot(fig)
plotly.io.write_image(fig, 'assets/3.2/complexity/plotly/vocab_size_time.png')

print('| DONE | vocab_size')

print('\nModel Parameters: %s' % emp_best_params_)

yooooooooo
yooooooooo
yooooooooo
yooooooooo
yooooooooo
yooooooooo
yooooooooo
yooooooooo
yooooooooo


| DONE | vocab_size

Model Parameters: {}


In [6]:
def get_plot_data_layout_vocab(emp_best, best_line=True):
    
#     if param == 'max_features':
#         xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
#         emp_best = emp_best - 1
#     elif param == 'min_impurity_decrease':
#         xvals = grid_params[param] * 200
#         emp_best = emp_best*200
#     else:
#         xvals = grid_params[param]

    
    trace_train_lower = go.Scatter(
        x=num_features,
        y=np.clip(vocab_train_error-2*error_train_std,0,None),
        mode='lines',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )

    print(emp_best)

    trace_train_error = go.Scatter(
        x=num_features,
        y=vocab_train_error,
        mode='lines',
        line=dict(color = 'rgb(255,165,0)'),
        fillcolor='rgba(255,165,0,0.3)',
        fill='tonexty',
        name='Training Error'
    )

    trace_train_upper = go.Scatter(
        x=num_features,
        y=np.clip(vocab_train_error+2*error_train_std,0,None),
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(255,165,0,0.3)',
        line=dict(color='rgb(255,165,0)'),
        name='Training Error',
        showlegend=False
    )


    trace_test_error = go.Scatter(
        x=num_features,
        y=vocab_test_error,
        mode='lines',
        line=dict(color='rgb(34,139,34)'),
        name='Test Error'
    )

    trace_cv_lower = go.Scatter(
        x=num_features,
        y=np.clip(vocab_valid_error-2*error_valid_std,0,None),
        mode='lines',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    trace_cv_error = go.Scatter(
        x=num_features,
        y=vocab_valid_error,
        fill='tonexty',
        line=dict(color='rgb(0,176,246)'),
        fillcolor='rgba(0,176,246, 0.3)',
        mode='lines',
        name='Cross Validation Error'
    )

    trace_cv_upper = go.Scatter(
        x=num_features,
        y=np.clip(vocab_valid_error+2*error_valid_std,0,None),
        fill='tonexty',
        mode='lines',
        fillcolor='rgba(0,176,246, 0.3)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Cross Validation Error',
        showlegend=False
    )

    data = [trace_train_lower, trace_train_error, trace_train_upper,
            trace_cv_lower, trace_cv_error, trace_cv_upper,
            trace_test_error]    
 
    layout=go.Layout(
        title = 'Errors for Different Vocabulary Size',
        legend = dict(
            orientation='h',
            x=0.05, 
            y=1,
            bgcolor='rgba(0,0,0,0)',
        ),
        xaxis = dict(
            title = 'Vocabulary Size',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis = dict(
            title = 'Classification Error',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        shapes = [{
            'type': 'line',
            'x0': emp_best,
            'x1': emp_best,
            'y0': 0,
            'y1': max([max(np.clip(vocab_valid_error+2*error_valid_std,0,None)),
                       max(np.clip(vocab_train_error+2*error_train_std,0,None)),
                       max(vocab_test_error)]),          
            'line': {
                'color': 'rgb(50, 50, 50)',
#                 'width': 4,
                'dash': 'dot'
            },
        }]
    )
    
    return data, layout
  
    


In [7]:
def get_complexity_plot_data_layout_vocab():
#     if param == 'max_features':
#         xvals = ['axis\naligned', 'two\npixels', 'linear', 'quadratic', 'cubic']
#     elif param == 'min_impurity_decrease':
#         xvals = grid_params[param] * 200
#     else:
#         xvals = grid_params[param]
        
    trace_train = go.Scatter(
        x = num_features,
        y = complexity_train,
        line=dict(color = 'rgb(255,165,0)'),
        name='Training Time',
        mode='lines'
    )
    
    trace_test = go.Scatter(
        x = num_features,
        y = complexity_test,
        yaxis = 'y2',
        line=dict(color='rgb(0,176,246)'),
        name='Testing Time',
        mode='lines'
    )
    
    data = [trace_train, trace_test]
    
    layout = go.Layout(
        title='Run Times for Different Vocabulary Size',
        xaxis=dict(
            title='Vocabulary Size',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis=dict(
            title='Training Time/s',
            linecolor = 'black',
            linewidth=2,
            mirror=True,
            zeroline=False
        ),
        yaxis2=dict(
            title='Testing Time/s',
            overlaying='y',
            side='right'
        ),
        legend=dict(x=0.4, y=1, bgcolor='rgba(0,0,0,0)')
    )
    
    return data, layout
