# Data Pre-processing

The first part of this script is dedicated to data pre-processing. We will be reading a set of files and storing them in the appropriate Pandas.DataFrame structure so that we can more easily parse and analyse them.

In [1]:
import pandas as pd
import numpy as np

These are the __constants__ that will be used throughout the script; feel free to modify them in the future in order to obtain similar functionality. The main configurations are:
- Base folders (pointing towards the folders with the files to be processed) 
- Algorithms (different categories of derivative-free optimization algorithms, following the classification of Wortmann)
- Headers (names of the columns to process in each file)
- File configurations

In [2]:
# -----------------------------------------
# Base folders
# -----------------------------------------
# Paths are built by plain string concatenation, so every folder keeps its trailing '/'.
base_dir = 'analysis/'
base_ph1 = base_dir + "phase1/"

# -----------------------------------------
# Algorithms
# -----------------------------------------
# Metaheuristics algorithms (global)
metaheuristics = ['CRS2', 'ESCH', 'ISRES']
# Direct-search algorithms (2 global: DIRECT, DIRECTL; 3 local: PRAXIS, SUBPLEX, NMS,
# according to the global/local split defined below)
direct_search = ['DIRECT', 'DIRECTL', 'PRAXIS', 'SUBPLEX', 'NMS']
# Model-based algorithms (w/ sampling) (3 global, 2 local)
model_based = ['GPR','RBFCC', 'RBFCL', 'BOBYQA', 'COBYLA']

# Split by local algorithms
local_algorithms = ['BOBYQA', 'COBYLA', 'PRAXIS', 'SUBPLEX', 'NMS']
global_algorithms = ['CRS2', 'ESCH', 'ISRES', 'DIRECT', 'GPR', 'RBFCC', 'RBFCL', 'DIRECTL']
# Order matters: the loaded runs are later sliced per algorithm by position in this list.
all_algorithms = global_algorithms + local_algorithms 

In [3]:
# -----------------------------------------
# Headers
# -----------------------------------------
# Objective column and the derived "running best" column added during feature extraction.
obj = 'sUDI'
best_obj = 'Best sUDI'
# Design-variable columns.
vars_cols = ['min-c', 'max-c', 'd-c', 'max-d-stripes']
# Full header applied to every file: bookkeeping columns, then the variables, then the objective.
cols = ['Algorithm', 'Timestamp', 'Elapsed Time', 'Curr Run', 'Curr Eval'] + vars_cols + [obj]

In [4]:
# -----------------------------------------
# File configurations
# -----------------------------------------
# Candidate column separators; read_csv tries them in this order.
seps = [',', ';']
# Number of evaluations per run: read_csv keeps only the first `nevals` rows of each file.
nevals = 60

### Data preprocessing functions
This section provides the basic functions for reading a file and assigning it the aforementioned `cols` as its header.

In [5]:
def filename(base_dir, algorithm, run, filetype='csv'):
    """Build the path of the evaluations file of one run of one algorithm.

    The result has the form ``<base_dir>evals_<algorithm>_r<run>.<filetype>``.
    ``base_dir`` is used verbatim, so it must already end with a path separator.
    """
    pieces = [base_dir, "evals_", algorithm, "_r", str(run), ".", filetype]
    return "".join(pieces)

In [6]:
# Sanity check: print the expected paths of the three PRAXIS runs of phase 1.
# NOTE(review): the saved output of this cell shows an 'analises/' prefix, which does not
# match base_dir = 'analysis/'; re-run the cell to refresh it.
for sanity_run in (1, 2, 3):
    print(filename(base_ph1, 'PRAXIS', sanity_run))

analises/phase1/evals_PRAXIS_r1.csv
analises/phase1/evals_PRAXIS_r2.csv
analises/phase1/evals_PRAXIS_r3.csv


In [7]:
def read_csv(filepath, header=cols, seps=seps):
    """Read one evaluations file, trying each candidate separator in turn.

    Parameters
    ----------
    filepath : path of the CSV file to read.
    header : column names applied to the file; the file's own first row is
        treated as a header and replaced by these names.
    seps : a single separator or a list of separators, tried in order.

    Returns
    -------
    A DataFrame holding at most the first ``nevals`` rows of the file, read
    with the first separator that parses without error, or ``None`` when
    every separator fails (including when the file cannot be opened).
    """
    candidates = seps if isinstance(seps, list) else [seps]
    for sep in candidates:
        try:
            frame = pd.read_csv(filepath, header=0, names=header, sep=sep, index_col=False)
        except Exception as err:
            # Only ordinary errors are treated as "wrong separator"; KeyboardInterrupt and
            # SystemExit propagate. The reason is printed so real problems stay visible.
            print("Failed to read file", filepath, 'with separator "', sep, '"', ':', repr(err))
            continue
        return frame.iloc[0:nevals]
    return None

In [8]:
# Sanity check
print(read_csv(filename(base_ph1, 'PRAXIS', 1)).shape)
print(read_csv(filename(base_ph1, 'PRAXIS', 2)).shape)
read_csv(filename(base_ph1, 'PRAXIS', 3))

Failed to read file analises/phase1/evals_PRAXIS_r1.csv with separator " , "
(60, 10)
(60, 10)


Unnamed: 0,Algorithm,Timestamp,Elapsed Time,Curr Run,Curr Eval,min-c,max-c,d-c,max-d-stripes,sUDI
0,PRAXIS,2019-03-20 05:07:34.911286,266.904056,2,1,0.476509,0.960294,0.1364,0.900784,49.0
1,PRAXIS,2019-03-20 05:12:02.336288,534.329058,2,2,0.476681,0.960294,0.1364,0.900784,53.0
2,PRAXIS,2019-03-20 05:16:26.623143,798.615913,2,3,0.476853,0.960294,0.1364,0.900784,52.0
3,PRAXIS,2019-03-20 05:20:53.466932,1065.459702,2,4,0.476733,0.960294,0.1364,0.900784,50.0
4,PRAXIS,2019-03-20 05:25:21.312253,1333.305023,2,5,0.476681,0.960466,0.1364,0.900784,50.0
5,PRAXIS,2019-03-20 05:29:45.511150,1597.50392,2,6,0.476681,0.960122,0.1364,0.900784,53.0
6,PRAXIS,2019-03-20 05:34:13.146061,1865.138831,2,7,0.476681,0.960208,0.1364,0.900784,56.0
7,PRAXIS,2019-03-20 05:38:40.344262,2132.337032,2,8,0.476681,0.960208,0.136572,0.900784,56.0
8,PRAXIS,2019-03-20 05:43:08.397114,2400.389884,2,9,0.476681,0.960208,0.136744,0.900784,56.0
9,PRAXIS,2019-03-20 05:47:35.096957,2667.089727,2,10,0.476681,0.960208,0.118782,0.900784,56.0


In [9]:
def read_csvs(algorithms, parent_folder=base_ph1, nruns=3):
    """Load every run of every algorithm into a flat list of DataFrames.

    Parameters
    ----------
    algorithms : iterable of algorithm names.
    parent_folder : folder prefix handed to ``filename``.
    nruns : number of runs per algorithm; runs are numbered 1..nruns.

    Returns
    -------
    A list with ``nruns`` consecutive DataFrames per algorithm, in the order
    of ``algorithms``. The 'Curr Run' column of each frame is overwritten with
    the run number taken from the file name.

    Raises
    ------
    ValueError
        If a file could not be read with any separator.
    """
    data = []
    for algorithm in algorithms:
        for nrun in range(1, nruns + 1):
            filepath = filename(parent_folder, algorithm, nrun)
            datum = read_csv(filepath)
            # Check for failure before touching the frame; otherwise a None result
            # surfaces as an unrelated TypeError on the assignment below.
            if datum is None:
                raise ValueError('Unexpected error with ' + str(filepath))
            # read_csv hands back a slice of the parsed frame; work on an independent
            # copy so the column assignment does not target a view.
            datum = datum.copy()
            datum['Curr Run'] = nrun
            print('Read', datum.shape[0], 'rows and', datum.shape[1], 'columns from', filepath)
            data.append(datum)
    print('Expected', len(algorithms) * nruns, 'algorithms. Read', len(data), 'algorithms.')
    return data

In [10]:
# Sanity check
# Load every run of every algorithm; the list follows the order of all_algorithms,
# with the runs of one algorithm adjacent to each other.
results = read_csvs(all_algorithms)

Failed to read file analises/phase1/evals_CRS2_r1.csv with separator " , "
Read 60 rows and 10 columns from analises/phase1/evals_CRS2_r1.csv
Read 60 rows and 10 columns from analises/phase1/evals_CRS2_r2.csv
Read 60 rows and 10 columns from analises/phase1/evals_CRS2_r3.csv
Failed to read file analises/phase1/evals_ESCH_r1.csv with separator " , "
Read 60 rows and 10 columns from analises/phase1/evals_ESCH_r1.csv
Read 60 rows and 10 columns from analises/phase1/evals_ESCH_r2.csv
Read 60 rows and 10 columns from analises/phase1/evals_ESCH_r3.csv
Failed to read file analises/phase1/evals_ISRES_r1.csv with separator " , "
Read 60 rows and 10 columns from analises/phase1/evals_ISRES_r1.csv
Read 60 rows and 10 columns from analises/phase1/evals_ISRES_r2.csv
Read 60 rows and 10 columns from analises/phase1/evals_ISRES_r3.csv
Read 60 rows and 10 columns from analises/phase1/evals_DIRECT_r1.csv
Read 60 rows and 10 columns from analises/phase1/evals_DIRECT_r2.csv
Read 60 rows and 10 columns fr

### Feature extraction

This next section will be dedicated to the definition of functions for adding the max result per iteration


In [11]:
def add_best(df, name=best_obj):
    """Add a column holding the best (largest) objective value seen so far.

    Parameters
    ----------
    df : DataFrame with an ``obj`` column, one row per evaluation in
        chronological order. It is modified in place.
    name : name of the column to create or overwrite.

    Returns
    -------
    The same ``df`` object, with the extra column.

    Notes
    -----
    The running best starts at 0, so values below 0 are reported as 0, and
    NaN evaluations leave the running best unchanged.
    """
    values = np.asarray(df[obj], dtype=float)
    # Seed the running maximum with 0.0; fmax ignores NaN operands, so a missing
    # evaluation simply repeats the previous best.
    running_best = np.fmax.accumulate(np.concatenate(([0.0], values)))[1:]
    # Positional assignment: the result does not depend on the index labels of df.
    df[name] = running_best
    return df

In [12]:
# Add best sUDI to every result
# add_best writes the column into each frame and returns that same frame,
# so the list still holds the same DataFrame objects.
results = [add_best(result) for result in results]

In [13]:
# Map each algorithm name to its three consecutive runs in `results`
# (results holds 3 runs per algorithm, in all_algorithms order).
algs_results = {
    alg: results[3 * alg_ix:3 * alg_ix + 3]
    for alg_ix, alg in enumerate(all_algorithms)
}

# Data post-processing
In this section, we process the data to obtain the mean best result and its standard deviation for each algorithm.

In [14]:
# Destination paths for the exported statistics (relative to the working directory).
# NOTE(review): nothing in the visible cells creates 'outputs/' — confirm it exists before exporting.
output_folder = 'outputs/'
output_file_ph1 = output_folder + 'phase1_stats.csv'
# NOTE(review): not written by any cell visible here.
output_file_ph2 = output_folder + 'phase2_stats.csv'

In [15]:
import statistics

In [16]:
# Aggregate per algorithm
def get_stats_per_alg(data, nruns, obj_col=None, best_col=None):
    """Summarise the best result of each run of one algorithm.

    Parameters
    ----------
    data : list of run DataFrames (one per run) containing the objective
        column, the running-best column and 'Curr Eval'.
    nruns : unused; kept for backward compatibility with existing callers.
    obj_col : objective column name; defaults to the notebook-level ``obj``.
    best_col : running-best column name; defaults to the notebook-level
        ``best_obj``.

    Returns
    -------
    ``[mean best, stdev best, mean eval, stdev eval]`` as floats, where
    "eval" is the 'Curr Eval' value of the row at which each run first
    reaches its maximum objective.

    Raises
    ------
    statistics.StatisticsError
        If ``data`` has fewer than two runs (sample stdev is undefined).
    """
    # Resolve the defaults lazily so the notebook-level names are used unless overridden.
    obj_col = obj if obj_col is None else obj_col
    best_col = best_obj if best_col is None else best_col

    def get_best(datum, col):
        # Convert to a built-in float: the statistics module converts its result back to
        # the input type, which truncates the mean when given NumPy integer scalars.
        return float(datum.loc[datum[obj_col].idxmax(), col])

    best_res = [get_best(d, best_col) for d in data]
    best_eval = [get_best(d, 'Curr Eval') for d in data]
    return [
        statistics.mean(best_res),
        statistics.stdev(best_res),
        statistics.mean(best_eval),
        statistics.stdev(best_eval),
    ]
    

In [17]:
# Statistics
# Column names for the four values returned by get_stats_per_alg, in the same order.
stats_cols = ['Mean Best sUDI', 'Stdev Best sUDI', 'Mean Eval', 'Stdev Eval']

In [18]:
# One row per algorithm (in all_algorithms order), one column per entry of stats_cols.
stats = np.zeros((len(all_algorithms), len(stats_cols)))
for alg_i, alg in enumerate(all_algorithms):
    # 3 is passed as nruns, matching the three runs grouped per algorithm in algs_results.
    stats[alg_i] = get_stats_per_alg(algs_results[alg], 3)

In [19]:
# Wrap the statistics in a DataFrame, rounded to two decimals, and preview the first rows.
stats_results = pd.DataFrame(data=stats, columns=stats_cols).round(2)
stats_results.head()

Unnamed: 0,Mean Best sUDI,Stdev Best sUDI,Mean Eval,Stdev Eval
0,85.0,11.79,33.0,21.73
1,78.67,4.93,54.0,5.29
2,87.67,6.81,35.0,1.0
3,98.0,0.0,60.0,0.0
4,99.67,0.58,31.0,14.11


In [20]:
# Label each row with its algorithm; the rows were filled in all_algorithms order.
# Note: this adds the column to stats_results in place.
stats_results['Algorithm'] = np.array(all_algorithms)

In [21]:
# Display the labelled statistics table.
stats_results

Unnamed: 0,Mean Best sUDI,Stdev Best sUDI,Mean Eval,Stdev Eval,Algorithm
0,85.0,11.79,33.0,21.73,CRS2
1,78.67,4.93,54.0,5.29,ESCH
2,87.67,6.81,35.0,1.0,ISRES
3,98.0,0.0,60.0,0.0,DIRECT
4,99.67,0.58,31.0,14.11,GPR
5,99.67,0.58,30.0,9.27,RBFCC
6,99.67,0.58,31.0,11.0,RBFCL
7,79.0,0.0,57.0,0.0,DIRECTL
8,68.67,6.03,29.0,10.39,BOBYQA
9,71.33,13.2,48.0,15.62,COBYLA


In [22]:
# Export the phase 1 statistics sorted by algorithm name, without the index column.
stats_results.sort_values(by='Algorithm').to_csv(output_file_ph1, index=False)

## Get graphs values 
(Mean best sUDI / eval)


In [23]:
# Aggregate per algorithm
def get_per_algorithm(algorithm_name, data):
    """Average the running-best objective over the runs of one algorithm.

    ``data`` is the list of run DataFrames of the algorithm. For each of the
    first ``nevals`` row labels (0..nevals-1), the ``best_obj`` values of all
    runs are averaged. Returns a single-column DataFrame named
    ``algorithm_name`` with ``nevals`` rows.
    """
    per_eval_means = [
        statistics.mean([run.loc[row][best_obj] for run in data])
        for row in range(nevals)
    ]
    column = np.array(per_eval_means, dtype=float).reshape(nevals, 1)
    return pd.DataFrame(column, columns=[algorithm_name])

In [24]:
# One single-column frame of mean best values per algorithm, in all_algorithms order.
dfs = [get_per_algorithm(algorithm, algs_results[algorithm]) for algorithm in all_algorithms]

In [25]:
# Place the per-algorithm columns side by side: one column per algorithm, one row per evaluation.
data_graph = pd.concat(dfs, axis=1)

In [26]:
# Aggregate per class
def get_mean_values(data, name, nevals=nevals, obj=best_obj):
    """Average one column over a collection of runs, evaluation by evaluation.

    Parameters
    ----------
    data : list of run DataFrames to average together.
    name : name of the single column of the returned frame.
    nevals : number of row labels (0..nevals-1) to aggregate.
    obj : column to average; defaults to the running-best column.

    Returns
    -------
    A DataFrame with ``nevals`` rows and one column called ``name``.
    """
    averages = [
        statistics.mean([run.loc[row][obj] for run in data])
        for row in range(nevals)
    ]
    column = np.array(averages, dtype=float).reshape(nevals, 1)
    return pd.DataFrame(column, columns=[name])

In [27]:
# Display the mean best value per evaluation (rows) and algorithm (columns).
data_graph

Unnamed: 0,CRS2,ESCH,ISRES,DIRECT,GPR,RBFCC,RBFCL,DIRECTL,BOBYQA,COBYLA,PRAXIS,SUBPLEX,NMS
0,39.0,55.333333,41.666667,57.0,53.333333,55.333333,60.0,57.0,50.0,48.666667,59.333333,43.666667,49.0
1,55.0,60.333333,66.333333,59.0,71.666667,58.666667,66.0,59.0,50.0,48.666667,60.666667,49.0,49.333333
2,60.0,60.333333,66.333333,59.0,71.666667,67.666667,66.666667,59.0,50.333333,48.666667,60.666667,51.0,49.333333
3,64.666667,70.0,70.666667,59.0,74.666667,68.0,72.333333,59.0,51.0,48.666667,60.666667,55.666667,50.0
4,64.666667,70.0,70.666667,59.0,75.0,73.333333,72.333333,59.0,55.666667,59.0,62.333333,55.666667,59.666667
5,64.666667,72.0,70.666667,62.0,75.666667,74.333333,72.333333,62.0,56.333333,65.0,63.333333,55.666667,60.666667
6,66.333333,72.0,70.666667,64.0,75.666667,77.0,72.333333,64.0,58.333333,65.0,64.333333,55.666667,66.333333
7,67.333333,72.666667,74.0,64.0,75.666667,77.0,72.333333,64.0,58.333333,65.0,64.333333,57.0,71.666667
8,67.333333,72.666667,74.0,66.0,75.666667,77.0,73.0,66.0,58.333333,67.666667,64.333333,57.0,71.666667
9,67.333333,72.666667,74.0,68.0,75.666667,77.0,73.0,66.0,58.333333,68.0,64.333333,57.0,71.666667


In [28]:
def flat(a):
    """Flatten one level of nesting: concatenate the iterables in ``a`` into a new list."""
    return [item for inner in a for item in inner]

In [78]:
# Each *_data tuple pairs the pooled runs of a class of algorithms with the class label
# used as column name in the per-class graph.
local_model = ['COBYLA', 'BOBYQA']
local_model_data = (flat([algs_results[name] for name in local_model]), 'L_Model-based')

global_model = ['GPR', 'RBFCC', 'RBFCL']
global_model_data = (flat([algs_results[name] for name in global_model]), 'G_Model-based')

global_metaheuristics = ['CRS2', 'ESCH', 'ISRES']
global_metaheuristics_data = (flat([algs_results[name] for name in global_metaheuristics]), 'Metaheuristics')

global_direct = ['DIRECT', 'DIRECTL']
global_direct_data = (flat([algs_results[name] for name in global_direct]), 'G_Direct-search')

local_direct = ['NMS', 'PRAXIS', 'SUBPLEX']
local_direct_data = (flat([algs_results[name] for name in local_direct]), 'L_Direct-search')

In [79]:
# Mean running-best curve of every class, one single-column frame per class.
data_dfs_class = [
    get_mean_values(class_runs, class_label)
    for class_runs, class_label in [
        local_model_data,
        global_model_data,
        global_metaheuristics_data,
        global_direct_data,
        local_direct_data,
    ]
]

In [80]:
# Place the per-class columns side by side: one column per class, one row per evaluation.
data_graph_class = pd.concat(data_dfs_class, axis=1)

In [81]:
# Display the mean best value per evaluation (rows) and algorithm class (columns).
data_graph_class

Unnamed: 0,L_Model-based,G_Model-based,Metaheuristics,G_Direct-search,L_Direct-search
0,49.333333,56.222222,45.333333,57.0,50.666667
1,49.333333,65.444444,60.555556,59.0,53.0
2,49.5,68.666667,62.222222,59.0,53.666667
3,49.833333,71.666667,68.444444,59.0,55.444444
4,57.333333,73.555556,68.444444,59.0,59.222222
5,60.666667,74.111111,69.111111,62.0,59.888889
6,61.666667,75.0,69.666667,64.0,62.111111
7,61.666667,75.0,71.333333,64.0,64.333333
8,63.0,75.222222,71.333333,66.0,64.333333
9,63.166667,75.222222,71.333333,67.0,64.333333


# Data visualization

In [82]:
# Visualization Framework
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

# Print plotly's version
import os

print(plotly.__version__)

# Credentials are read from the environment instead of being stored in the notebook.
# Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this cell; a KeyError is raised
# if either is missing. The key that used to be hard-coded here should be treated as
# compromised and regenerated.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

In [86]:
# Shared figure layout for the phase 1 plots (default layout of simple_scatter).
layout = go.Layout(
    # title = "Solutions of the minimization of two goals of an arc-shaped space frame",
    font=dict(size=9),
    showlegend=True,
    # showlegend=False,
    # Horizontal legend, centred at x=0.5 with a negative y offset.
    legend=dict(orientation="h", 
                xanchor="center", 
                yanchor="bottom",
                x= 0.5,
                y=-0.30,
                font=dict(size=13)
               ),
    xaxis=dict(
        title="Function Evaluations",
        titlefont=dict(
            # family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
        # Hard-coded to the same span as nevals (60); keep the two in sync.
        range=[1, 60],
        # autorange=True,
        showgrid=True,
        zeroline=False,
        showline=True,
        ticks='',
        showticklabels=True,tickfont=dict(
            # family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
    ),
    yaxis=dict(
        title="Mean Best sUDI value [%]",
        titlefont=dict(
            # family='Old Standard TT, serif',
            size=16,
            color='black'
        ),
        # Fixed window: values below 44 fall outside this range.
        range=[44, 100],
        # autorange=True,
        showgrid=True,
        zeroline=False,
        showline=True,
        ticks='',
        showticklabels=True,
        tickfont=dict(
            # family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
    )
    )

In [87]:
# Plot every column of a DataFrame as its own line
def simple_scatter(data, layout=layout, nrows=nevals):
    """Draw one 'lines+markers' trace per column of ``data``.

    Parameters
    ----------
    data : DataFrame whose columns are the series to plot; the column name
        becomes the trace name.
    layout : plotly layout applied to the figure.
    nrows : number of points on the x axis, which runs from 1 to ``nrows``.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure, plotted under the file name
    'basic-scatter'.
    """
    evaluations = np.arange(1, nrows+1)
    traces = [
        go.Scatter(
            x=evaluations,
            y=data[column],
            name=column,
            mode='lines+markers',
            marker=dict(size=2),  # marker size
        )
        for column in data.columns
    ]
    figure = go.Figure(data=traces, layout=layout)
    return py.iplot(figure, filename='basic-scatter')

In [88]:
# Plot the mean best-sUDI curve of every algorithm class.
# NOTE(review): every simple_scatter call passes filename='basic-scatter' to py.iplot;
# presumably later plots replace earlier ones under that name — confirm.
simple_scatter(data_graph_class)

In [77]:
# Plot the mean best-sUDI curve of every individual algorithm.
simple_scatter(data_graph)

# Phase 2

The second phase consists of testing the suitability of the local algorithms when started from different initial points.

In [None]:
# Phase 2 inputs live in two sub-folders of phase2/: hotstart/ and badstart/.
base_ph2 = base_dir + "phase2/"
base_ph2_hs = base_ph2 + "hotstart/"
base_ph2_bs = base_ph2 + "badstart/"

# Number of evaluations per run that are averaged and plotted in phase 2.
local_evals = 15

In [None]:
# Load the hotstart runs of the local algorithms, then add the running-best column to each run.
local_results_hs = read_csvs(local_algorithms, parent_folder=base_ph2_hs)
local_results_hs = [add_best(result) for result in local_results_hs]

In [None]:
# Load the badstart runs of the local algorithms, then add the running-best column to each run.
local_results_bs = read_csvs(local_algorithms, parent_folder=base_ph2_bs)
local_results_bs = [add_best(result) for result in local_results_bs]

In [None]:
def group_per_algs(data, algs, nruns=3):
    """Split a flat list of runs into per-algorithm groups.

    Parameters
    ----------
    data : flat sequence holding ``nruns`` consecutive entries per algorithm,
        in the same order as ``algs``.
    algs : algorithm names.
    nruns : number of consecutive entries belonging to each algorithm
        (3 by default, the value previously hard-coded here).

    Returns
    -------
    dict mapping each algorithm name to its slice of ``data``. Slices past the
    end of ``data`` are shorter or empty rather than an error.
    """
    return {
        alg: data[alg_ix * nruns:(alg_ix + 1) * nruns]
        for alg_ix, alg in enumerate(algs)
    }

In [None]:
# Group the flat phase 2 run lists by algorithm name (three runs each).
local_algs_hs = group_per_algs(local_results_hs, local_algorithms)
local_algs_bs = group_per_algs(local_results_bs, local_algorithms)

In [None]:
# Mean running-best curve over the first local_evals evaluations, per local algorithm.
# Columns are suffixed "_HS" (hotstart) and "_BS" (badstart).
data_dfs_hs = [
    get_mean_values(local_algs_hs[alg], alg + "_HS", local_evals)
    for alg in local_algorithms
]

data_dfs_bs = [
    get_mean_values(local_algs_bs[alg], alg + "_BS", local_evals)
    for alg in local_algorithms
]

In [None]:
# All hotstart columns first, followed by all badstart columns.
local_data = list(data_dfs_hs) + list(data_dfs_bs)

In [None]:
# Sanity check: one frame per local algorithm and starting scenario is expected.
len(local_data)

In [None]:
# Place the phase 2 columns side by side: one "<algorithm>_HS" / "<algorithm>_BS" column each.
data_ph2_graph = pd.concat(local_data, axis=1)

In [None]:
# Display the phase 2 mean best values per evaluation.
data_ph2_graph

In [None]:
# Figure layout for the phase 2 plots (default layout of grouped_scatter).
local_layout = go.Layout(
    font=dict(size=9),
    showlegend=True,
    # showlegend=False,
    # Horizontal legend, centred at x=0.5 with a negative y offset.
    legend=dict(orientation="h", 
                xanchor="center", 
                yanchor="bottom",
                x= 0.5,
                y=-0.35),
    xaxis=dict(
        title="Function Evaluations",
        # Hard-coded to the same span as local_evals (15); keep the two in sync.
        range=[1, 15],
        showgrid=True,
        zeroline=False,
        showline=True,
        ticks='',
        showticklabels=True
    ),
    yaxis=dict(
        title="Mean Best sUDI value [%]",
        range=[0, 100],
        # autorange=True,
        showgrid=True,
        zeroline=False,
        showline=True,
        ticks='',
        showticklabels=True
    )
    )

In [None]:
# Colour pairs for grouped_scatter: the i-th algorithm takes entries 2*i (its "_HS" trace)
# and 2*i+1 (its "_BS" trace), so each hue appears as a darker and a lighter shade.
colors = [
            # Purple
           'rgb(107, 52, 128)', 'rgb(184, 133, 204)',
            # Blue 
          'rgb(7, 80, 133)', 'rgb(133, 174, 204)',
            # Green
          'rgb(75, 133, 29)', 'rgb(165, 204, 133)',
            # Red
          'rgb(133, 23, 16)', 'rgb(205, 91, 69)',
            # Pink
          # 'rgb(225, 0, 215)', 'rgb(205, 101, 235)'
            # Oranges
    'rgb(255, 107, 0)', 'rgb(255, 179, 78)'
]

In [None]:
# Plot the hotstart and badstart curves of each algorithm as a colour-matched pair
def grouped_scatter(data, algorithms, colors=colors, layout=local_layout, nrows=local_evals):
    """Draw two 'lines+markers' traces per algorithm: "<alg>_HS" and "<alg>_BS".

    Parameters
    ----------
    data : DataFrame containing the columns "<alg>_HS" and "<alg>_BS" for
        every algorithm in ``algorithms``.
    algorithms : algorithm names; both traces of an algorithm share its name
        as legend group.
    colors : palette; the i-th algorithm uses entries 2*i and 2*i+1. The
        palette wraps around when there are more algorithms than colour pairs.
    layout : plotly layout applied to the figure.
    nrows : number of points on the x axis, which runs from 1 to ``nrows``.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure, plotted under the file name
    'basic-scatter'.

    Raises
    ------
    IndexError
        If ``colors`` is empty.
    """
    if len(colors) == 0:
        raise IndexError('colors must contain at least one entry')

    x = np.arange(1, nrows+1)

    def _trace(column, group, color):
        # One line for a single column, tied to its algorithm's legend group.
        return go.Scatter(
            x = x,
            y = data[column],
            name = column,
            legendgroup = group,
            mode = 'lines+markers',
            marker = dict(
                # Markers size
                size = 2,
                color = color
            )
        )

    traces = []
    for i, alg in enumerate(algorithms):
        # Modulo keeps the lookup inside the palette instead of raising IndexError
        # once the algorithms outnumber the available colour pairs.
        traces.append(_trace(alg + "_HS", alg, colors[(2*i) % len(colors)]))
        traces.append(_trace(alg + "_BS", alg, colors[(2*i+1) % len(colors)]))

    fig = go.Figure(data=traces, layout=layout)
    return py.iplot(fig, filename='basic-scatter')

In [None]:
# Plot the hotstart vs badstart curves of every local algorithm.
grouped_scatter(data_ph2_graph, local_algorithms)