In [1]:
from tqdm import tqdm
import wandb
api = wandb.Api(timeout=180)
import os
import pandas as pd
import wandb
import yaml
from pathlib import Path
from copy import deepcopy
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import itertools
import  matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import itertools
import time
import matplotlib as mpl

In [2]:
# Weights & Biases entity and project that the runs are pulled from.
USER='wilderlavington'
PROJECT='FunctionalStochasticOptimization'
# File name of the cached run summary, stored under logs/wandb_data/.
SUMMARY_FILE='sharan_report_0906.csv'
# Look-back window handed to smooth() when curves are post-processed.
K=1
# NOTE(review): the figure functions further down also read a module-level
# EPISODES that is not defined in any visible cell -- TODO define it here.
# NOTE(review): the figure functions save into plots/0902/, not the
# directory created below -- confirm which one is intended.
# make plots dir; exist_ok keeps the cell safe to re-run
os.makedirs("plots/0906/", exist_ok=True)

File already exists


In [3]:
def download_wandb_summary(sweeps=['ijsmzrdx', 'l9x8gji5', 'hk7t69nt', 'ircrqll7']):
    """
    Download a summary of all runs on the wandb project

    Writes one row per selected run (name, id, config values, summary
    values) to logs/wandb_data/<SUMMARY_FILE>.

    Args:
        sweeps: sweep ids to keep; runs that belong to no sweep or to a
            sweep outside this collection are skipped. Pass None to keep
            every run of the project.

    Raises:
        AssertionError: if the project has no runs at all.
    """
    # Materialise the listing once so the emptiness check and the loop
    # below share a single pass over the API results.
    runs = list(api.runs(USER+'/'+PROJECT, per_page=10000000))
    assert len(runs), 'no runs found for '+USER+'/'+PROJECT
    summary_list = []
    config_list = []
    name_list = []
    id_list = []
    for run in tqdm(runs):
        if sweeps is not None and (run.sweep is None or run.sweep.id not in sweeps):
            continue
        summary_list.append(run.summary._json_dict)
        # The run is fetched again by id before its config and name are read.
        # NOTE(review): presumably needed to get the complete config -- confirm,
        # since it costs one extra request per run.
        run = api.run(USER+'/'+PROJECT+"/"+run.id)
        config_list.append({k: v for k, v in run.config.items()})
        name_list.append(run.name)
        id_list.append(run.id)
    summary_df = pd.DataFrame.from_records(summary_list)
    config_df = pd.DataFrame.from_records(config_list)
    name_df = pd.DataFrame({"name": name_list, "id": id_list})
    # The three frames are built in the same run order, so a column-wise
    # concat lines them up row by row.
    all_df = pd.concat([name_df, config_df, summary_df], axis=1)
    Path('logs/wandb_data/').mkdir(parents=True, exist_ok=True)
    all_df.to_csv('logs/wandb_data/'+SUMMARY_FILE)

def download_wandb_records():
    """
    Download data for all runs in summary file

    Reads logs/wandb_data/<SUMMARY_FILE>, fetches the logged history of
    every run listed there, and repeats the run's summary-file fields on
    each history row. The combined table is written to
    logs/wandb_data/__full__<SUMMARY_FILE>.

    Returns:
        pandas.DataFrame with one row per history row per run.

    Raises:
        ValueError: from pd.concat when the summary file lists no runs.
    """
    # load it all in and clean it up
    # (`squeeze` only matters for single-column files and is rejected by
    # pandas >= 2.0, so it is not passed)
    runs_df = pd.read_csv('logs/wandb_data/'+SUMMARY_FILE, header=0)
    runs_df = runs_df.loc[:,~runs_df.columns.duplicated()]
    # set which columns we will store for visualization
    columns_of_interest = ['avg_loss', 'optim_steps', 'grad_norm', 'time_elapsed', \
             'grad_evals', 'function_evals']
    list_of_dataframes = []
    # iterate through all runs to create individual databases
    for ex in tqdm(range(len(runs_df)), leave=False):
        # Positional lookup: does not rely on the saved index column
        # matching the frame's row labels.
        base_info = runs_df.iloc[ex].to_dict()
        # get the associated run
        run = api.run(USER+'/'+PROJECT+'/'+base_info['id'])
        run_df = []
        # iterate through all rows in online database
        # NOTE(review): history() is called with its default arguments --
        # confirm that the default sampling of logged steps is acceptable.
        for _, row in run.history().iterrows():
            row_info = dict(base_info)
            row_info.update({key:row[key] for key in columns_of_interest if key in row.keys()})
            run_df.append(row_info)
        # convert format to dataframe and add to our list
        list_of_dataframes.append(pd.DataFrame(run_df))
    # combine and then store
    wandb_records = pd.concat(list_of_dataframes)
    wandb_records.to_csv('logs/wandb_data/__full__'+SUMMARY_FILE)
    # return single data frame for visualization
    return wandb_records

In [None]:
# Refresh the cached summary CSV, then pull every listed run's history.
# The progress bar captured for this cell estimates more than two hours
# for the download, so re-run it only when the cache must be rebuilt.
download_wandb_summary()
wandb_records = download_wandb_records()
wandb_records

  3%|█▎                                   | 714/20922 [06:27<2:12:00,  2.55it/s]

In [None]:
def smooth(array, k):
    """
    Trailing moving average that leaves NaN entries in place.

    Entry i of the result is the mean of the non-NaN values among
    array[max(0, i-k) : i+1], i.e. the current sample and up to k samples
    before it. Entries that are NaN in the input stay NaN.

    Args:
        array: 1-D sequence of numbers (list, ndarray, pandas Series, ...).
        k: number of preceding samples included in each average.

    Returns:
        numpy.ndarray of floats with the same length as array.
    """
    # Work in floating point: copying an integer input would make the
    # assignments below silently truncate every average.
    values = np.asarray(array, dtype=float)
    smoothed = values.copy()
    for i in range(len(values)):
        if np.isnan(values[i]):
            continue
        window = values[max(0, i - k):i + 1]
        valid = window[~np.isnan(window)]
        # values[i] itself is valid, so the window is never empty here
        smoothed[i] = valid.sum() / len(valid)
    return smoothed

In [None]:
def format_dataframe(records, id_subfields={}, avg_subfields=['seed'],
            max_subfields=['log_eta', 'eta_schedule', 'c'],
            x_col='optim_steps', y_col='avg_loss'):
    """
    Reduce raw run records to one smoothed curve for the best configuration.

    Steps:
      1. keep rows whose columns equal every key/value in id_subfields;
      2. drop rows where y_col is NaN;
      3. among rows at the largest 'optim_steps', find the lowest y_col and
         take the max_subfields values of that row as the best setting
         (the minimum is over individual rows; avg_subfields columns are
         simply dropped before this step);
      4. for that setting, aggregate y_col per x_col value into mean,
         0.25 quantile and 0.75 quantile;
      5. sort by x_col and pass the three series through smooth(..., K).

    Args:
        records: DataFrame of per-step records; must contain
            'function_evals', 'grad_evals', 'optim_steps' and every column
            named by the other arguments.
        id_subfields: mapping column -> required value.
        avg_subfields: columns removed before aggregation.
        max_subfields: columns that identify a configuration; entries that
            also appear in id_subfields are ignored.
        x_col: column used as the curve's x axis.
        y_col: column used as the curve's y axis.

    Returns:
        DataFrame with the configuration columns, x_col, y_col (mean),
        y_col+'25' and y_col+'75'; or None when no row survives the
        filters. The caller's frame is not modified.

    Side effects:
        Sets the pandas option 'display.max_columns' to None.
    """
    pd.set_option('display.max_columns', None)
    max_subfields = [m for m in max_subfields if m not in id_subfields.keys()]

    for key in id_subfields:
        records = records.loc[records[key] == id_subfields[key]]
    # assign() builds a new frame, so neither a filtered slice nor the
    # caller's frame is written to.
    records = records.assign(**{
        'function_evals+grad_evals': records['function_evals']+records['grad_evals']})
    if not len(records):
        return None
    # remove nans
    records = records[records[y_col].notna()]
    if not len(records):
        # every y value was NaN: report "no data" the same way as above
        # instead of handing back an empty frame
        return None
    important_cols = list(set(avg_subfields+max_subfields+\
        list(id_subfields.keys())+[x_col, y_col, 'optim_steps']))
    # remove redundant information, then discard the averaging fields
    records = records[important_cols]
    records = records.drop(avg_subfields, axis=1)
    # only look at final optim steps
    last_records = records.loc[records['optim_steps'] == records['optim_steps'].max()]
    # get the best record
    best_record = last_records[last_records[y_col] == last_records[y_col].min()]
    # columns that describe the configuration of the best record
    axis_cols = {'optim_steps', x_col, y_col}
    candidate_cols = dict.fromkeys(max_subfields+list(id_subfields.keys()))
    merge_on = [col for col in candidate_cols
                if col not in axis_cols and col in best_record.columns.values]
    # Several rows can tie for the minimum; without de-duplication the
    # merge would repeat every record once per tie and distort the quantiles.
    best_keys = best_record[merge_on].drop_duplicates()
    best_records = pd.merge(best_keys, records, on=merge_on, how='left')
    # NOTE(review): groupby drops rows whose key columns are NaN by default;
    # confirm the configuration columns never hold missing values.
    grouped = best_records.groupby(merge_on+[x_col], as_index=False)[y_col]
    final_records = grouped.mean()
    final_records[y_col+'25'] = grouped.quantile(0.25)[y_col]
    final_records[y_col+'75'] = grouped.quantile(0.75)[y_col]
    final_records = final_records.sort_values(x_col, axis=0, ascending=True,
        inplace=False, kind='quicksort', na_position='last')
    # smooth outputs
    final_records[y_col+'75'] = smooth(final_records[y_col+'75'],K)
    final_records[y_col+'25'] = smooth(final_records[y_col+'25'],K)
    final_records[y_col] = smooth(final_records[y_col],K)
    return final_records

In [None]:
def generate_plot(proc_df, x, y, ax, label, linestyle='solid', color=None, x_max=100000):
    """
    Draw one curve with its inter-quantile band on ax.

    Only points whose x value is strictly below x_max are drawn.

    Args:
        proc_df: frame with columns x, y, y+'75' and y+'25'.
        x: name of the x column.
        y: name of the y column; the band uses y+'75' and y+'25'.
        ax: axes-like object providing plot() and fill_between().
        label: legend label for the line; a falsy value hides the line
            from the legend.
        linestyle: line style passed to both artists.
        color: colour passed to both artists.
        x_max: exclusive upper bound on the plotted x values.

    Returns:
        The ax that was passed in.
    """
    xs = np.asarray(proc_df[x].values)
    keep = xs < x_max
    # '_nolegend_' is the label used to keep an artist out of the legend
    ax.plot(xs[keep], np.asarray(proc_df[y].values)[keep],
            label=label if label else '_nolegend_',
            linestyle=linestyle, color=color, linewidth=4)
    ax.fill_between(xs[keep],
            np.asarray(proc_df[y+'75'].values)[keep],
            np.asarray(proc_df[y+'25'].values)[keep],
            alpha = 0.5, label='_nolegend_', linestyle=linestyle, color=color)
    return ax

In [None]:
def generate_A1_figure(loss, dataset_name, wandb_records):
    """
    Plot gradient norm against gradient evaluations for SGD, SLS and FuncOpt.

    Builds a grid with one row per step-size schedule and one column per
    batch size, draws one curve per algorithm in every panel, saves the
    figure to plots/0902/<loss>_<dataset_name>.pdf, shows it and closes it.

    Args:
        loss: value matched against the 'loss' column of wandb_records.
        dataset_name: value matched against the 'dataset_name' column.
        wandb_records: DataFrame of per-step run records, passed to
            format_dataframe for every curve.

    NOTE(review): reads the module-level name EPISODES, which is not defined
    in any visible cell -- it must exist before this function is called.
    """
    # base info
    schedules = ['constant', 'stochastic', 'exponential']
    batch_sizes = [5, 25, 125, 625]
    m = [1, 2, 5, 10, 20]
    x = 'grad_evals'
    y = 'grad_norm'

    # init plots
    fig, axs = plt.subplots(len(schedules), len(batch_sizes), figsize=(21, 21), sharey=True)
    colors = mpl.cm.Set1.colors   # Qualitative colormap
    colormap = {'SGD':colors[0], 'SLS':colors[1]}
    colormap.update({'FuncOpt'+str(m_):colors[idx+2] for idx, m_ in enumerate(m)})
    algorithms = ['SGD', 'SLS'] + ['FuncOpt'+str(m_) for m_ in m]
    # x is the gradient-evaluation count, so label it as such
    label_map = {x:'Gradient-Evaluations', y:'Gradient-Norm'}

    def _plot_curve(ax, label, ids, **fmt_kwargs):
        """Draw one algorithm on ax; return its largest x value, or None if it has no data."""
        proc_df = format_dataframe(wandb_records, id_subfields=ids,
                                   x_col=x, y_col=y, **fmt_kwargs)
        if proc_df is None or not len(proc_df):
            return None
        generate_plot(proc_df, x, y, ax, label=label,
                      linestyle='solid', color=colormap[label])
        return proc_df[x].values.max()

    # now add in the lines to each of the plots
    for row, schedule in enumerate(schedules):
        for col, batch_size in enumerate(batch_sizes):
            ax = axs[row][col]
            x_max = 0
            # filters shared by every algorithm in this panel
            base_ids = {'batch_size': batch_size, 'episodes': EPISODES,
                        'use_optimal_stepsize': 1, 'loss': loss,
                        'eta_schedule': schedule, 'dataset_name': dataset_name}

            # SLS
            curve_max = _plot_curve(ax, 'SLS', dict(base_ids, algo='LSOpt'))
            if curve_max is None:
                print('missing SLS  ', schedule, batch_size)
            else:
                x_max = max(curve_max, x_max)

            # SGD
            curve_max = _plot_curve(ax, 'SGD', dict(base_ids, algo='SGD'))
            if curve_max is None:
                print('missing SGD  ', schedule, batch_size)
            else:
                x_max = max(curve_max, x_max)

            # FMDopt theoretical
            for m_ in m:
                curve_max = _plot_curve(ax, 'FuncOpt'+str(m_),
                                        dict(base_ids, algo='SGD_FMDOpt', m=m_),
                                        avg_subfields=['seed'], max_subfields=['c'])
                if curve_max is None:
                    print('missing FMDopt  ', m_, schedule, batch_size)
                else:
                    x_max = max(curve_max, x_max)

            # leave the automatic limits alone when nothing was drawn
            if x_max > 0:
                ax.set_xlim(0, x_max)
            ax.grid()
            ax.set_title('schedule: '+schedule+', batch_size: '+str(batch_size), fontsize=18)
            ax.set_xlabel(label_map[x], fontsize=16)
            ax.set_ylabel(label_map[y], fontsize=16)
            ax.tick_params(axis='both', which='major', labelsize=14)
            ax.tick_params(axis='both', which='minor', labelsize=14)

    # remaining format stuff
    handles = [mpatches.Patch(color=colormap[algo], label=algo) for algo in algorithms]
    fig.legend(handles=handles,
           loc="lower center",   # Position of legend
           borderaxespad=1.65,    # Small spacing around legend box
           fontsize=18,
           ncol=7,
           bbox_to_anchor=(0.5, -0.05),
           )
    for ax in axs.flat:
        ax.set_yscale("log")
    fig.subplots_adjust(hspace=1.5)
    plt.rcParams['figure.dpi'] = 400
    fig.suptitle('Comparison of SGD/SLS/FuncOpt: Loss: '+loss+', Dataset: '+dataset_name, fontsize=28)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

    # show / save
    out_path = 'plots/0902/'+loss+'_'+dataset_name+'.pdf'
    # NOTE(review): the config cell creates plots/0906/, not plots/0902/ --
    # confirm the intended folder. Created here so that saving cannot fail.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight')
    plt.show()
    # release the figure; this function is called in a loop
    plt.close(fig)

In [None]:
data_sets = ['mushrooms', 'ijcnn', 'rcv1']
losses = ['MSELoss', 'BCEWithLogitsLoss']
# Load the cached per-step records. `squeeze` is not passed: it only
# matters for single-column files and pandas >= 2.0 rejects it.
wandb_records = pd.read_csv('logs/wandb_data/__full__'+SUMMARY_FILE, header=0)

# one figure per (dataset, loss) pair
for data_set in data_sets:
    for loss in losses:
        print('generating SGD plot for ', data_set, loss)
        generate_A1_figure(loss, data_set, wandb_records)

In [None]:
def generate_A2_figure(loss, dataset_name, wandb_records):
    """
    Plot gradient norm against gradient evaluations for Adagrad and Ada-FuncOpt.

    Builds a single row of panels (constant schedule, one column per batch
    size), draws one curve per algorithm in every panel, saves the figure
    to plots/0902/Adaptive_<loss>_<dataset_name>.pdf, shows it and closes it.

    Args:
        loss: value matched against the 'loss' column of wandb_records.
        dataset_name: value matched against the 'dataset_name' column.
        wandb_records: DataFrame of per-step run records, passed to
            format_dataframe for every curve.

    NOTE(review): reads the module-level name EPISODES, which is not defined
    in any visible cell -- it must exist before this function is called.
    """
    # base info
    schedules = ['constant']
    batch_sizes = [5, 25, 125, 625]
    m = [1, 2, 5, 10, 20]
    x = 'grad_evals'
    y = 'grad_norm'

    # init plots; squeeze=False keeps axs two-dimensional with a single row
    fig, axs = plt.subplots(len(schedules), len(batch_sizes), figsize=(21, 7),
                            sharey=True, squeeze=False)
    colors = mpl.cm.Set1.colors   # Qualitative colormap
    colormap = {'Adagrad':colors[0]}
    colormap.update({'AdaFuncOpt'+str(m_):colors[idx+2] for idx, m_ in enumerate(m)})
    # x is the gradient-evaluation count, so label it as such
    label_map = {x:'Gradient-Evaluations', y:'Gradient-Norm'}
    algorithms = ['Adagrad'] + ['AdaFuncOpt'+str(m_) for m_ in m]

    def _plot_curve(ax, label, color_key, ids, **fmt_kwargs):
        """Draw one algorithm on ax; return its largest x value, or None if it has no data."""
        proc_df = format_dataframe(wandb_records, id_subfields=ids,
                                   x_col=x, y_col=y, **fmt_kwargs)
        if proc_df is None or not len(proc_df):
            return None
        generate_plot(proc_df, x, y, ax, label=label,
                      linestyle='solid', color=colormap[color_key])
        return proc_df[x].values.max()

    # now add in the lines to each of the plots
    for row, schedule in enumerate(schedules):
        for col, batch_size in enumerate(batch_sizes):
            ax = axs[row][col]
            x_max = 0
            # filters shared by every algorithm in this panel
            base_ids = {'batch_size': batch_size, 'episodes': EPISODES,
                        'loss': loss, 'eta_schedule': schedule,
                        'dataset_name': dataset_name}

            # adagrad
            curve_max = _plot_curve(ax, 'Sadagrad', 'Adagrad',
                                    dict(base_ids, use_optimal_stepsize=1, algo='Sadagrad'))
            if curve_max is None:
                print('missing sadagrad  ', schedule, batch_size)
            else:
                x_max = max(curve_max, x_max)

            # AdaFMDopt theoretical
            for m_ in m:
                name = 'AdaFuncOpt'+str(m_)
                curve_max = _plot_curve(ax, name, name,
                                        dict(base_ids, use_optimal_stepsize=0, log_eta=-3,
                                             algo='Ada_FMDOpt', m=m_),
                                        avg_subfields=['seed'], max_subfields=['c'])
                if curve_max is None:
                    print('missing AdaFMDopt  ', m_, schedule, batch_size)
                else:
                    # these curves count towards the x range too, as in
                    # generate_A1_figure
                    x_max = max(curve_max, x_max)

            # leave the automatic limits alone when nothing was drawn
            if x_max > 0:
                ax.set_xlim(0, x_max)
            ax.grid()
            ax.set_title('schedule: '+schedule+', batch_size: '+str(batch_size), fontsize=20)
            ax.set_xlabel(label_map[x], fontsize=16)
            ax.set_ylabel(label_map[y], fontsize=16)
            ax.tick_params(axis='both', which='major', labelsize=14)
            ax.tick_params(axis='both', which='minor', labelsize=14)

    # remaining format stuff
    handles = [mpatches.Patch(color=colormap[algo], label=algo) for algo in algorithms]
    fig.legend(handles=handles,
           loc="lower center",   # Position of legend
           borderaxespad=1.65,    # Small spacing around legend box
           fontsize=18,
           ncol=6,
           bbox_to_anchor=(0.5, -0.15),
           )
    for ax in axs.flat:
        ax.set_yscale("log")
    fig.subplots_adjust(hspace=1.25)
    plt.rcParams['figure.dpi'] = 400
    fig.suptitle('Comparison of Adagrad/Ada-FuncOpt: Loss: '+loss+', Dataset: '+dataset_name, fontsize=28)
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)

    # show / save
    out_path = 'plots/0902/Adaptive_'+loss+'_'+dataset_name+'.pdf'
    # NOTE(review): the config cell creates plots/0906/, not plots/0902/ --
    # confirm the intended folder. Created here so that saving cannot fail.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight')
    plt.show()
    # release the figure; this function is called in a loop
    plt.close(fig)

In [None]:
data_sets = ['mushrooms', 'ijcnn', 'rcv1']
losses = ['MSELoss', 'BCEWithLogitsLoss']
# Load the cached per-step records. `squeeze` is not passed: it only
# matters for single-column files and pandas >= 2.0 rejects it.
wandb_records = pd.read_csv('logs/wandb_data/__full__'+SUMMARY_FILE, header=0)

# one figure per (dataset, loss) pair
for data_set in data_sets:
    for loss in losses:
        print('generating Adagrad plot for ', data_set, loss)
        generate_A2_figure(loss, data_set, wandb_records)