### Libraries

In [1]:
import re
import os
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import pyplot as plt
pd.options.plotting.backend = "matplotlib"

### Macros

#### File-Handling Macros

In [2]:
''' Input/Output file related macros '''

# Configuration versions being compared (used as result sub-directory and file base names)
CONFIG_VERSION_1 = 'hivenas_a'
CONFIG_VERSION_2 = 'hivenas_aug_a'
# Root directory that holds one sub-directory of archived results per configuration
RESULTS_PATH = './res/archived results/'
# Directory the figures of this notebook are exported to
ANALYSIS_RESULTS_PATH = './analysis/images/'

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
os.makedirs(ANALYSIS_RESULTS_PATH, exist_ok=True)
    
    
def get_file_path(config, file_extension):
    ''' Builds the relative path <RESULTS_PATH>/<config>/<config>.<file_extension> '''

    file_name = config + f'.{file_extension}'

    return os.path.join(RESULTS_PATH, config, file_name)


def get_all_paths(config, directory, endswith='.pickle'):
    ''' Recursively collects (full_path, filename) tuples for every file under
        <RESULTS_PATH>/<config>/<directory> whose name ends with `endswith` '''

    search_root = os.path.join(RESULTS_PATH, config, directory)

    # os.walk descends into sub-directories; a missing root simply yields nothing
    return [
        (os.path.join(current_dir, name), name)
        for current_dir, _, names in os.walk(search_root)
        for name in names
        if name.endswith(endswith)
    ]

#### Data Analysis Macros

In [21]:
''' Data analysis and processing related macros '''
def reject_outliers(data, m=1.5):
    ''' Prunes data points that are not strictly within m standard deviations of the mean

    Args:
        data: 1-D numeric data (numpy array or pandas Series; lists/tuples are
            converted to a numpy array)
        m: width of the accepted band, in standard deviations

    Returns:
        The retained data points, selected by boolean mask from `data`
    '''

    # Plain sequences do not support boolean-mask indexing
    if isinstance(data, (list, tuple)):
        data = np.asarray(data, dtype=float)

    # Nothing to prune (also avoids the mean-of-empty NaN warning)
    if len(data) == 0:
        return data

    deviation = abs(data - np.mean(data))
    spread = np.std(data)

    # Zero spread means every point equals the mean: there are no outliers, yet
    # the strict comparison below (0 < 0) would discard every point
    if spread == 0:
        return data[deviation == 0]

    return data[deviation < m * spread]

# RGBA colour strings shared by the plotly figures in this notebook.
# Trace 1: full-opacity line colour and its half-opacity variant (used for the dotted mean line)
TRACE_1_COLOR = 'rgba(230,126,34,1.0)'
TRACE_1_DOTTED_COLOR = 'rgba(230,126,34,0.5)'
# Trace 2: full-opacity line colour and its half-opacity variant (used for the dotted mean line)
TRACE_2_COLOR = 'rgba(52,152,219,1.0)'
TRACE_2_DOTTED_COLOR = 'rgba(52,152,219,0.5)'
# Axis styling: grid lines, zero line and axis text
GRID_LINE_COLOR = 'rgba(189,195,199,0.75)'
GRID_ZERO_LINE_COLOR = 'rgba(189,195,199,1.0)'
PLOT_TEXT_COLOR = 'rgba(44,62,80,1.0)'

### Data Structuring and Manipulation

#### Log File

In [4]:
def get_log_df(config):
    ''' Loads and structures a log file (raw) into a DF

    Args:
        config: configuration version; the log is read from the path returned
            by get_file_path(config, 'log')

    Returns:
        DataFrame with one row per (evaluation, epoch) and the columns
        arch, epoch, val_acc, val_loss, train_acc, train_loss, eval_idx,
        type, bee_id, itr. A log without evaluation entries yields an empty
        DataFrame with the same columns.

    Raises:
        AttributeError: an evaluation entry has no "Candidate (...)" tag
        IndexError: an entry has fewer val_loss / training matches than
            validation-accuracy matches
    '''

    keys_int = ['itr', 'bee_id', 'eval_idx', 'epoch']
    keys_float = ['val_acc', 'val_loss', 'train_acc', 'train_loss']
    columns = ['arch', 'epoch', 'val_acc', 'val_loss', 'train_acc',
               'train_loss', 'eval_idx', 'type', 'bee_id', 'itr']

    # Get raw file str
    with open(get_file_path(config, 'log')) as f:
        raw_file = f.read()

    # Extract evaluation logs by tag (text before the first tag is not an evaluation)
    evals = raw_file.split('EVALUATION LOG:')[1:]

    # Collect plain records and build the DataFrame once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call
    records = []
    itr = 1
    for eval_idx, e in enumerate(evals):
        arch = re.search(r'(?<=Candidate \().+?(?=\))', e).group(0)
        val_acc = re.findall(r'(?<=val_sparse_categorical_accuracy: )[0-9]+\.[0-9]+', e)
        val_loss = re.findall(r'(?<=val_loss: )[0-9]+\.[0-9]+', e)
        # The leading space keeps the "val_"-prefixed metrics out of the training matches
        train_acc = re.findall(r'(?<= sparse_categorical_accuracy: )[0-9]+\.[0-9]+', e)
        train_loss = re.findall(r'(?<= loss: )[0-9]+\.[0-9]+', e)

        if val_acc:
            # Entry header tokens: index 1 is the bee type, index 3 the bee id wrapped in parentheses
            tokens = e.split(' ')
            bee_type = tokens[1]
            bee_id = int(tokens[3].strip('()'))

            for i in range(len(val_acc)):
                records.append({
                    'arch': arch,
                    'epoch': i + 1,
                    'val_acc': val_acc[i],
                    'val_loss': val_loss[i],
                    'train_acc': train_acc[i],
                    'train_loss': train_loss[i],
                    'eval_idx': eval_idx,
                    'type': bee_type,
                    'bee_id': bee_id,
                    'itr': itr
                })

        # An 'itr: ' marker inside an entry advances the iteration for the following entries
        if 'itr: ' in e:
            itr += 1

    # Passing the columns explicitly keeps the schema stable for an empty log
    log_df = pd.DataFrame(records, columns=columns)

    # Standardize types
    for key in keys_int:
        log_df[key] = log_df[key].astype(int)

    for key in keys_float:
        log_df[key] = log_df[key].astype(float)

    return log_df

# Parse the raw log of each configuration (baseline first, augmented second)
log_df_a, log_df_b = (
    get_log_df(cfg) for cfg in (CONFIG_VERSION_1, CONFIG_VERSION_2)
)

print(log_df_a, '\n\n')
print(log_df_b)

                                                  arch  epoch  val_acc  \
0    input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...      1   0.5441   
1    input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...      2   0.6463   
2    input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...      3   0.6679   
3    input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...      4   0.7018   
4    input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...      5   0.7029   
..                                                 ...    ...      ...   
310  input|sep5x5_32|sep3x3_32|sep3x3_64|dropout|ou...      1   0.5197   
311  input|sep5x5_32|sep3x3_32|sep3x3_64|dropout|ou...      2   0.6027   
312  input|sep5x5_32|sep3x3_32|sep3x3_64|dropout|ou...      3   0.6623   
313  input|sep5x5_32|sep3x3_32|sep3x3_64|dropout|ou...      4   0.6771   
314  input|sep5x5_32|sep3x3_32|sep3x3_64|dropout|ou...      5   0.6753   

     val_loss  train_acc  train_loss  eval_idx         type  bee_id  itr  
0      1.2490     0.4450      1.5248

#### Main Results File

In [5]:
# Main results CSV of each configuration; the first column is used as the index
main_df_a, main_df_b = (
    pd.read_csv(get_file_path(cfg, 'csv'), header=0, index_col=0)
    for cfg in (CONFIG_VERSION_1, CONFIG_VERSION_2)
)

print(main_df_a, main_df_b, sep=' \n\n ')

       bee_type  bee_id bee_parent  itr  \
0   EmployeeBee     0.0          -  0.0   
1   EmployeeBee     1.0          -  0.0   
2   EmployeeBee     2.0          -  0.0   
3   OnlookerBee     0.0          0  0.0   
4   OnlookerBee     1.0          0  0.0   
..          ...     ...        ...  ...   
58  EmployeeBee     2.0          -  8.0   
59  OnlookerBee     0.0          1  8.0   
60  OnlookerBee     1.0          2  8.0   
61  OnlookerBee     2.0          2  8.0   
62  OnlookerBee     3.0          0  8.0   

                                            candidate  fitness  \
0   input|sep5x5_128|sep3x3_32|avg_pool3x3|sep5x5_...   0.7100   
1   input|dropout|sep5x5_128|sep5x5_128|sep3x3_128...   0.6957   
2   input|sep5x5_32|sep3x3_32|sep5x5_128|sep5x5_12...   0.6742   
3   input|sep3x3_64|sep3x3_32|avg_pool3x3|sep5x5_1...   0.7034   
4   input|sep5x5_128|sep3x3_32|avg_pool3x3|avg_poo...   0.7125   
..                                                ...      ...   
58  input|sep5x5_128|

#### Training History Files

In [6]:
def get_pickle_df(config, main_df):
    ''' Loads and structures training history pickle files

    Args:
        config: configuration version whose 'training_history' directory is scanned
        main_df: main results DataFrame; must provide the 'weights_filename'
            and 'candidate' columns

    Returns:
        DataFrame with the rows of every history file stacked (fresh 0..n-1
        index), plus the columns 'epoch' (1-based), 'filename' (file name up
        to its first dot) and 'arch' (the matching candidate from main_df).
        Empty DataFrame when no history file is found.

    Raises:
        IndexError: a history file has no row in main_df whose
            weights_filename is '<filename>.h5'
    '''

    frames = []

    for path, filename in get_all_paths(config, 'training_history'):
        # NOTE: pickle.load can execute arbitrary code; only load trusted result files
        with open(path, 'rb') as f:
            history = pickle.load(f)

        stem = filename.split('.')[0]

        # Map the history file back to its architecture through the weights file name
        candidates = main_df.loc[main_df['weights_filename'] == stem + '.h5', 'candidate'].values
        if len(candidates) == 0:
            raise IndexError(f'no row in main_df with weights_filename "{stem}.h5" (history file: {path})')

        # presumably a dict of per-epoch metric lists, so the row index is the 0-based epoch - TODO confirm
        history_df = pd.DataFrame(history).reset_index().rename(columns={'index': 'epoch'})
        history_df['filename'] = stem
        history_df['epoch'] = history_df['epoch'] + 1
        history_df['arch'] = candidates[0]

        frames.append(history_df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file
    return pd.concat(frames).reset_index(drop=True)
        
# print(main_df_a.loc[main_df_a['weights_filename'] == 'b0f90cbfb47656ed8b26f980b761c416b3e88b41.h5']['candidate'])
# Training histories of each configuration, labelled with the architecture from its main results
hist_df_a, hist_df_b = (
    get_pickle_df(cfg, main_df)
    for cfg, main_df in ((CONFIG_VERSION_1, main_df_a), (CONFIG_VERSION_2, main_df_b))
)

print(hist_df_a, hist_df_b, sep=' \n\n ')

     epoch      loss  sparse_categorical_accuracy  val_loss  \
0        1  1.475623                     0.460225  1.460599   
1        2  1.059200                     0.617950  0.991109   
2        3  0.829793                     0.706475  0.877863   
3        4  0.630564                     0.777000  0.873035   
4        5  0.410721                     0.855125  0.940298   
..     ...       ...                          ...       ...   
310      1  2.302844                     0.098825  2.302775   
311      2  2.302657                     0.099825  2.302778   
312      3  2.302686                     0.098175  2.302868   
313      4  2.302672                     0.101575  2.302859   
314      5  2.302673                     0.100250  2.302806   

     val_sparse_categorical_accuracy  \
0                             0.4674   
1                             0.6435   
2                             0.6876   
3                             0.7061   
4                             0.7030   
.. 

### Data Analysis

#### Augmentation Improvement

In [32]:
# Per-epoch gap between validation and training accuracy
hist_df_a['train_test_gap'] = hist_df_a['val_sparse_categorical_accuracy'].sub(hist_df_a['sparse_categorical_accuracy'])
hist_df_b['train_test_gap'] = hist_df_b['val_sparse_categorical_accuracy'].sub(hist_df_b['sparse_categorical_accuracy'])

# Mean gap per architecture, with outliers pruned
train_test_a = reject_outliers(hist_df_a.groupby(['arch'])['train_test_gap'].mean().to_numpy())
train_test_b = reject_outliers(hist_df_b.groupby(['arch'])['train_test_gap'].mean().to_numpy())

# Truncate data length to match (for aesthetic purposes)
shared_len = min(len(train_test_a), len(train_test_b))
train_test_a = train_test_a[:shared_len]
train_test_b = train_test_b[:shared_len]

# Both series together drive the y-axis limits
all_gaps = np.concatenate((train_test_a, train_test_b), axis=0)

# Styling common to both axes
axis_style = dict(color=PLOT_TEXT_COLOR,
                  zeroline=True,
                  zerolinewidth=1,
                  zerolinecolor=GRID_ZERO_LINE_COLOR,
                  gridcolor=GRID_LINE_COLOR)

# Figure customization
fig = go.Figure()
fig.update_layout(
    title='Validation / Training Percent Difference',
    xaxis_title='Neural Architecture Index',
    yaxis_title='Generalization Improvement',
    plot_bgcolor='white',
    autosize=False,
    width=800,
    height=500
)
fig.update_yaxes(tickformat=',.0%',
                 range=[all_gaps.min() * 1.1, all_gaps.max() * 1.225],
                 **axis_style)
fig.update_xaxes(**axis_style)

# Traces: one solid line per configuration followed by its dotted mean line
for trace_name, gap_series, line_color, mean_color in (
        ('HiveNAS', train_test_a, TRACE_1_COLOR, TRACE_1_DOTTED_COLOR),
        ('HiveNAS+Aug', train_test_b, TRACE_2_COLOR, TRACE_2_DOTTED_COLOR)):
    arch_index = list(range(len(gap_series)))
    fig.add_trace(go.Scatter(x=arch_index, y=gap_series,
                             mode='lines',
                             name=trace_name,
                             line=dict(color=line_color)))
    fig.add_trace(go.Scatter(x=arch_index, y=[gap_series.mean()] * len(gap_series),
                             mode='lines',
                             name=f'{trace_name} (mean)',
                             line=dict(color=mean_color, dash='dot')))

fig.write_image(f'{ANALYSIS_RESULTS_PATH}augmentation_improvement.eps')

fig.show()

#### Augmentation Performance Impact

In [33]:
# Values of the 'time' column of each main results file
p_time_a = main_df_a['time'].to_numpy()
p_time_b = main_df_b['time'].to_numpy()

# Truncate data length to match (for aesthetic purposes)
shared_len = min(len(p_time_a), len(p_time_b))
p_time_a = p_time_a[:shared_len]
p_time_b = p_time_b[:shared_len]

# Instantiate processing times df
performance_times = {'HiveNAS': p_time_a,
                     'HiveNAS+Aug': p_time_b}

performance_df = pd.DataFrame({label: pd.Series(times) for label, times in performance_times.items()})

# Styling common to both axes
axis_style = dict(color=PLOT_TEXT_COLOR,
                  zeroline=True,
                  zerolinewidth=1,
                  zerolinecolor=GRID_ZERO_LINE_COLOR,
                  gridcolor=GRID_LINE_COLOR)

# Figure customization
fig = go.Figure()
fig.update_layout(
    title='Augmentation Performance Impact',
    xaxis_title='Neural Architecture Index',
    yaxis_title='Processing Time (s)',
    plot_bgcolor='white',
    autosize=False,
    width=800,
    height=500
)
fig.update_yaxes(**axis_style)
fig.update_xaxes(**axis_style)

# Traces: one solid line per configuration followed by its dotted mean line
for trace_name, time_series, line_color, mean_color in (
        ('HiveNAS', p_time_a, TRACE_1_COLOR, TRACE_1_DOTTED_COLOR),
        ('HiveNAS+Aug', p_time_b, TRACE_2_COLOR, TRACE_2_DOTTED_COLOR)):
    arch_index = list(range(len(time_series)))
    fig.add_trace(go.Scatter(x=arch_index, y=time_series,
                             mode='lines',
                             name=trace_name,
                             line=dict(color=line_color)))
    fig.add_trace(go.Scatter(x=arch_index, y=[time_series.mean()] * len(time_series),
                             mode='lines',
                             name=f'{trace_name} (mean)',
                             line=dict(color=mean_color, dash='dot')))

fig.write_image(f'{ANALYSIS_RESULTS_PATH}augmentation_impact.eps')

fig.show()


#### Convergence Profile

In [34]:
# Data Prep.
def group_avg_roc(tpl):
    ''' Absolute change between the last and first value, divided by the number of values '''
    first, last = tpl[0], tpl[-1]
    n_points = len(tpl)
    return abs((last - first) / n_points)

# History column whose rate of change is profiled.
# Options: 'loss', 'val_loss', 'sparse_categorical_accuracy', 'val_sparse_categorical_accuracy'
# Set to 'val_loss' because that is the column this cell has always grouped on; the
# constant was previously declared as 'loss' but never used.
CONVERGENCE_METRIC = 'val_loss'

# Mean rate of change of the metric per architecture
avg_roc_a = [group_avg_roc(metric.values) for _, metric in hist_df_a.groupby('arch')[CONVERGENCE_METRIC]]
avg_roc_b = [group_avg_roc(metric.values) for _, metric in hist_df_b.groupby('arch')[CONVERGENCE_METRIC]]

avg_roc_a = reject_outliers(np.array(avg_roc_a), m=2)
avg_roc_b = reject_outliers(np.array(avg_roc_b), m=2)

# Truncate data length to match (for aesthetic purposes)
avg_roc_a = avg_roc_a[:min(len(avg_roc_a), len(avg_roc_b))]
avg_roc_b = avg_roc_b[:min(len(avg_roc_a), len(avg_roc_b))]

# Figure customization
fig = go.Figure()
fig.update_layout(
    title='Convergence Profile',
    xaxis_title='Neural Architecture Index', 
    yaxis_title='Mean Loss Rate of Change',
    plot_bgcolor='white',
    autosize=False,
    width=800,
    height=500
)

fig.update_yaxes(color=PLOT_TEXT_COLOR,
                 zeroline=True,
                 zerolinewidth=1,
                 zerolinecolor=GRID_ZERO_LINE_COLOR,
                 gridcolor=GRID_LINE_COLOR)

fig.update_xaxes(color=PLOT_TEXT_COLOR,
                 zeroline=True,
                 zerolinewidth=1,
                 zerolinecolor=GRID_ZERO_LINE_COLOR,
                 gridcolor=GRID_LINE_COLOR)

# Traces: solid line per configuration, each followed by its dotted mean line
fig.add_trace(go.Scatter(x=list(range(len(avg_roc_a))), y=avg_roc_a,
                         mode='lines',
                         name='HiveNAS',
                         line=dict(color=TRACE_1_COLOR)))
fig.add_trace(go.Scatter(x=list(range(len(avg_roc_a))), y=[avg_roc_a.mean() for _ in range(len(avg_roc_a))],
                         mode='lines',
                         name='HiveNAS (mean)',
                         line=dict(color=TRACE_1_DOTTED_COLOR, dash='dot')))


fig.add_trace(go.Scatter(x=list(range(len(avg_roc_b))), y=avg_roc_b,
                         mode='lines',
                         name='HiveNAS+Aug',
                         line=dict(color=TRACE_2_COLOR)))
fig.add_trace(go.Scatter(x=list(range(len(avg_roc_b))), y=[avg_roc_b.mean() for _ in range(len(avg_roc_b))],
                         mode='lines',
                         name='HiveNAS+Aug (mean)',
                         line=dict(color=TRACE_2_DOTTED_COLOR, dash='dot')))

fig.write_image(ANALYSIS_RESULTS_PATH + 'convergence_profile.eps')

fig.show()

# print([i[0] for i in hist_df_a.groupby('arch')['val_loss']])
# print(hist_df_a.groupby('arch')['val_loss'].get_group('input|avg_pool3x3|sep3x3_128|avg_pool3x3|sep5x5_32|output'),'\n\n')
# print(hist_df_a.groupby('arch')['val_loss'].get_group('input|batch_norm|sep3x3_128|avg_pool3x3|sep5x5_32|output'))

In [37]:
# Spread (standard deviation) of the per-architecture rate of change for each configuration
roc_std_report = (
    f'HiveNAS Mean Loss Rate of Change (standard deviation):     {avg_roc_a.std()} --> Jittery convergence',
    f'\nHiveNAS+Aug Mean Loss Rate of Change (standard deviation): {avg_roc_b.std()}  --> More stable convergence',
)
print(*roc_std_report)

HiveNAS Mean Loss Rate of Change (standard deviation):     0.029585244956550065 --> Jittery convergence 
HiveNAS+Aug Mean Loss Rate of Change (standard deviation): 0.01954693346978766  --> More stable convergence
