In [1]:
# Data processing
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
print("Plotly version:", plotly.__version__)

# Credentials to visualize data online in Plotly.
# The API key is a secret, so it is read from the environment instead of being
# written in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting
# the kernel. When either is missing nothing is written here, so any credentials
# file already present on the machine is left untouched.
# NOTE: a key that was ever committed with this notebook should be regenerated.
import os

_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

Plotly version: 3.7.1


In [2]:
def get_data_from_csv(filename, base_folder='final/outputs/', sep=',', header='infer'):
    """Read a CSV file and return its content as a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file, appended as-is to `base_folder`.
    base_folder : str
        Prefix of the path; it must end with a path separator because the
        two strings are simply concatenated.
    sep : str
        Field delimiter forwarded to `pd.read_csv`.
    header : int, list of int, None or 'infer'
        Forwarded to `pd.read_csv`; use None when the file has no header row.

    Returns
    -------
    pandas.DataFrame
    """
    filepath = base_folder + filename
    # `sep` is passed by keyword: recent pandas versions only accept the path
    # positionally and raise a TypeError for any other positional argument.
    return pd.read_csv(filepath, sep=sep, header=header)

def write_data_to_csv(data, filename, base_folder='final/outputs/'):
    """Write `data` as CSV to the path obtained by concatenating `base_folder` and `filename`."""
    destination = base_folder + filename
    data.to_csv(destination)

In [3]:
# Name of the solutions file; it is resolved against the default base folder of get_data_from_csv
sols_file = 'all_solutions_20190310.csv'
# Sanity check test 1: load the file without a header row and preview its first rows
get_data_from_csv(sols_file, header=None).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,27.9,27.519,0.244,0.665985,21.417838,-0.23779,31.026032,-1.327788,0.261611,1.307729,72.461375
1,9.435,9.435,0.0,0.310227,10.939239,-0.592896,10.580658,-0.811962,3.633277,1.292872,28.926759
2,10.35,10.35,0.0,0.769788,7.761587,-1.183812,19.867838,-1.424898,9.505861,1.109352,49.047421
3,9.483,9.483,0.0,1.536313,30.916909,-0.696372,14.878207,-1.283161,14.654795,0.969101,55.451661
4,10.18,10.18,0.0,1.144987,6.288066,0.206822,25.155869,0.491872,9.079051,1.349896,44.244605


In [4]:
# Sanity check test 2: write the first rows of the loaded file to 'test2.csv' in the current folder
write_data_to_csv(get_data_from_csv(sols_file, header=None).head(), 'test2.csv', base_folder='./')

# Problem

This is an architectural design optimization problem that aims to optimize both the structural capability of an arc-shaped space frame and its elegance/interestingness. To achieve that, we define two objectives: the maximum displacement of the space frame, which we ought to minimize, and an ad-hoc measure of its irregularity, which we also try to minimize in order to produce more elegant solutions. 

Here is a picture of some of the obtained solutions in the Pareto Front of previous runs. 

![Three non-dominated solutions of the addressed optimization problem](truss-kat-dark.png)

## Define the constants of the problem


- **data**: structure holding the data from the file to be analysed
- **time_cols**: the columns in the __data__ that enclose the time taken by the evaluation of each data record
- **vars_cols**: the columns in the __data__ that enclose the values of the variables
- **objs_cols**: the columns in the __data__ that enclose the values of the objectives

In [12]:
# Load the solutions file; header=None because the first row is read as data, not as column names
data = get_data_from_csv(sols_file, header=None)
# Optimization Results Files Header (column positions)
time_cols = np.arange(0, 3)   # positions 0-2: evaluation time columns
vars_cols = np.arange(3, 9)   # positions 3-8: variable columns
objs_cols = np.arange(9, 11)  # positions 9-10: objective columns

In [13]:
# time_cols / vars_cols / objs_cols hold column *positions*. When the frame has
# integer column labels they are used directly as labels; otherwise they are
# translated into the actual labels found at those positions.
data_columns = data.columns
# The dtype of the labels is tested instead of the index class: `pd.Int64Index`
# does not exist anymore in recent pandas versions.
if not pd.api.types.is_integer_dtype(data_columns):
    time_cols = data_columns[time_cols]
    vars_cols = data_columns[vars_cols]
    objs_cols = data_columns[objs_cols]

### 1. Data Analysis Overview 

Firstly, we observe the quality of the information collected from the file in order to verify that the data was correctly processed. 

In [14]:
data.info() # In the run recorded below: 24514 records, 11 float64 columns, no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 11 columns):
0     24514 non-null float64
1     24514 non-null float64
2     24514 non-null float64
3     24514 non-null float64
4     24514 non-null float64
5     24514 non-null float64
6     24514 non-null float64
7     24514 non-null float64
8     24514 non-null float64
9     24514 non-null float64
10    24514 non-null float64
dtypes: float64(11)
memory usage: 2.1 MB


Secondly, we observe the description of the overall data. Note the minimum and maximum ranges of the objective values. 

In [15]:
# The timing columns are not used in the analysis. The result is rebound instead
# of dropping in place, and labels that are already gone are ignored, so that
# re-running this cell does not raise a KeyError.
data = data.drop(labels=time_cols, axis=1, errors='ignore')
data.describe()

Unnamed: 0,3,4,5,6,7,8,9,10
count,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0
mean,0.053193,14.954274,0.025563,16.800471,0.404427,16.80077,1.150966,39.81099
std,1.169126,8.752282,1.093927,8.129443,1.055486,8.416493,0.155919,17.767717
min,-1.570796,0.0,-1.570796,0.0,-1.570796,0.0,0.543703,0.0
25%,-1.247604,6.97011,-1.062693,10.920053,-0.558991,9.871483,1.056924,26.964389
50%,0.116491,14.719265,0.0,17.077492,0.761007,17.397337,1.156206,40.238719
75%,1.277199,22.563129,1.152847,23.120909,1.395803,23.930227,1.253621,53.143111
max,1.570796,31.415927,1.570796,31.415927,1.570796,31.415927,2.001658,88.900953


In [16]:
# Discard rows that are exact duplicates over all the remaining columns (modifies `data` in place)
data.drop_duplicates(inplace=True)

In [17]:
data.describe()

Unnamed: 0,3,4,5,6,7,8,9,10
count,18838.0,18838.0,18838.0,18838.0,18838.0,18838.0,18838.0,18838.0
mean,0.118915,15.470296,0.111515,16.868432,0.28754,16.494583,1.161435,38.856503
std,1.13621,9.014712,1.059554,8.483679,1.055366,8.547042,0.16406,18.212667
min,-1.570796,0.0,-1.570796,0.0,-1.570796,0.0,0.543703,0.0
25%,-1.080826,7.171384,-0.897941,10.659445,-0.685219,9.628061,1.076982,25.794844
50%,0.242658,15.506316,0.119617,17.077492,0.548548,16.9492,1.173865,39.407851
75%,1.280463,23.210043,1.166362,23.727462,1.335933,22.924835,1.268917,51.351283
max,1.570796,31.415927,1.570796,31.415927,1.570796,31.415927,2.001658,88.900953


In [18]:
# Report the smallest value, the largest value and their difference for every objective column
for o in objs_cols:
    o_min, o_max = data[o].describe()[['min', 'max']]

    report = ["Objective", str(o), ": \n\tMin val:", str(o_min),
              "\tMax val:", str(o_max), "\tRange val:", str(o_max-o_min)]
    print(" ".join(report))

Objective 9 : 
	Min val: 0.54370260494 	Max val: 2.00165774896 	Range val: 1.45795514402
Objective 10 : 
	Min val: 0.0 	Max val: 88.9009529384 	Range val: 88.9009529384


The scales of each objective are considerably different. Also, the second objective seems to have a huge skew towards higher values. Let's plot the distribution of each objective separately.

In [19]:
# Because we want to create multiple violins plotting the distribution of each column, we must create a trace for each column
def create_violin_plots(data=data, cols=objs_cols, title="", filename='violin-multiple'):
    """Draw one violin per column of `data` and return the result of `py.iplot`.

    Parameters
    ----------
    data : DataFrame
        Frame holding the values to plot. The default is the object bound to
        `data` at the moment this definition is executed.
    cols : iterable of column labels
        Columns to plot, one violin each. The default is the value of
        `objs_cols` at the moment this definition is executed.
    title : str
        Title placed in the layout of the figure (empty by default).
    filename : str
        Name under which the figure is sent to Plotly. Calls sharing the same
        name target the same online plot, so pass a distinct name to keep
        several plots side by side.
    """
    traces = [
        {
            "type": 'violin',
            # The identification of the column (to place in the X axis)
            # NOTE(review): a scalar is passed here - confirm that plotly uses it for positioning
            "x": c,
            # The actual values of the column (to compute the distribution)
            "y": data[c],
            # The name to put in the legend of the plot
            "name": "Column_" + str(c),
            # Plot a box plot (making visible the min, 25%, median (50%), 75%, max)
            "box": {
                "visible": True
            },
            # Mean
            "meanline": {
                "visible": True
            }
        }
        for c in cols
    ]

    # Create the Figure Object
    fig = {
        # The data to be presented in the plot
        "data": traces,
        "layout" : {
            "title": title,
            "yaxis": {
                "zeroline": False,
            }
        }
    }

    return py.iplot(fig, filename=filename, validate = False)


In [20]:
# Violin plot per objective, using the function defaults (`data` and `objs_cols` as bound when the function was defined)
create_violin_plots()

Well, this does not provide a very good overview of the results. This is a consequence of the disparity in the scales, so let us scale both objectives. To achieve this, one could use the [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) from the Scikit Learn library, but Pandas enables the creation of a scaling function that will be applied to every element independently. Therefore, we only need to create the function that, given the max and min values of a column, computes the scaled value. Fortunately, we have computed the max, min and range values before, so we can reuse that logic.

```python3
from sklearn.preprocessing import MinMaxScaler
# Example
scaler = MinMaxScaler()
scaler.fit(data[9].values.reshape(-1, 1))
```

In [21]:
def scale(x, mn, mx):
    """Min-max scale `x` so that `mn` maps to 0 and `mx` maps to 1.

    Parameters
    ----------
    x : number or array-like
        Value(s) to scale.
    mn, mx : number
        Minimum and maximum of the range `x` belongs to.

    Returns
    -------
    `(x - mn) / (mx - mn)`. When `mx` equals `mn` (a constant column) the range
    is empty; the result is then 0 instead of a division by zero.
    """
    span = mx - mn
    # Only scalar ranges are guarded; array-like ranges keep plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0
    return (x - mn) / span

def scale_data(data, cols=objs_cols):
    """Return a copy of `data` in which every column in `cols` is min-max scaled.

    Parameters
    ----------
    data : DataFrame
        Source frame; it is not modified.
    cols : iterable of column labels
        Columns to scale. The default is the value of `objs_cols` at the
        moment this definition is executed.

    Returns
    -------
    DataFrame
        Copy of `data` where each selected column was passed through `scale`
        with that column's own minimum and maximum.
    """
    scaled_data = data.copy()

    for c in cols:
        column = data[c]
        # Scale the whole column in one vectorised call instead of element by element
        scaled_data[c] = scale(column, column.min(), column.max())
    return scaled_data

In [22]:
# Scale every variable and objective column. The labels come from vars_cols and
# objs_cols rather than from a hard-coded range, so this keeps working when the
# columns are not labelled with integers.
scaled_data = scale_data(data, cols=np.concatenate([vars_cols, objs_cols]))

Let's try again, to plot their distribution!

In [23]:
# Same violin plots on the min-max scaled frame (columns default to the objectives)
create_violin_plots(scaled_data)

This gave no information, and if we plot the unscaled versions of these objectives the distribution will be the same...  What about the **distribution of the variables**?

- the **odd** variables represent the **angles of the attractors** and take values in the range of [-pi/2, pi/2]
- the **even** variables represent the **position of the attractors** in the space-frame and take values in the range of [0, 10pi]

In [24]:
# Plot the distribution of the angles values
create_violin_plots(cols=vars_cols[0::2])

We can observe that there's an overall decrease in the values explored for each angle. This may be due to the fact that only Evolutionary Algorithms and Particle Swarm algorithms are being used. One should **generate 225** design solutions through different **sampling methods** (e.g., Latin Hypercube Sampling, K-factorial?) to try to obtain a more **uniform** distribution. 

In [25]:
# Plot the distribution of the position values
create_violin_plots(cols=vars_cols[1::2])

Regarding the position of the attractors, we can also see that there's a generalized difference in the values. It would be interesting to actually plot the points next to the violins, to validate the existence of overlapping solutions (this is capping the outliers)

In [26]:
# Because we want to create multiple box plots showing the distribution of each column, we must create a trace for each column
def create_box_plot(data=data, cols=objs_cols, title="", filename='box-multiple', boxpoints='all'):
    """Draw one box plot per column of `data` and return the result of `py.iplot`.

    Parameters
    ----------
    data : DataFrame
        Frame holding the values to plot. The default is the object bound to
        `data` at the moment this definition is executed.
    cols : sequence of column labels
        Columns to plot, one box each (any sized sequence: list, array, Index).
        The default is the value of `objs_cols` at definition time.
    title : str
        Title placed in the layout of the figure (empty by default).
    filename : str
        Name under which the figure is sent to Plotly. Calls sharing the same
        name target the same online plot.
    boxpoints : str or bool
        Forwarded to the `boxpoints` attribute of every trace. The default
        'all' draws every sample next to its box.
    """
    # Two hues per column: an even one for the box and an odd one for its outliers.
    # endpoint=False keeps the last hue away from 360, which is the same colour as hue 0.
    n_hues = len(cols)*2
    colors = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, n_hues, endpoint=False)]
    traces = []
    for i, c in enumerate(cols):
        trace = {
                "type": 'box',
                # The actual values of the column (to compute the distribution)
                "y": data[c],
                # The name to put in the legend of the plot
                "name": "Column_" + str(c),
                "boxpoints": boxpoints,
                "jitter": 0.3,
                "pointpos": -1.8, 
                "boxmean": 'sd',
                # NOTE(review): confirm that a top-level "color" is a box trace attribute
                "color": colors[2*i],
                # "boxmean": True,
                "marker": {
                    "color": colors[2*i],
                    "outliercolor": colors[2*i+1],
                    "line": {
                        "outliercolor": colors[2*i+1],
                        "outlierwidth": 2} 
                },
            
                "whiskerwidth": 0.2,
        }
        traces.append(trace)


    # Create the Figure Object
    fig = {
        # The data to be presented in the plot
        "data": traces,
        "layout" : {
            "title": title,
            "yaxis": {
                "zeroline": False,
            }
        }
    }

    return py.iplot(fig, filename=filename, validate = False)


In [27]:
# Box plot per objective, using the function defaults (`data` and `objs_cols` as bound when the function was defined)
create_box_plot()

The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow



In [28]:
# Box plots of the scaled variable columns
create_box_plot(scaled_data, cols=vars_cols)

The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



In [29]:
# Box plots of every scaled column: the variables followed by the objectives.
# The labels come from vars_cols and objs_cols instead of a hard-coded range.
create_box_plot(scaled_data, cols=np.concatenate([vars_cols, objs_cols]))

The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



## 2. Pareto Front Analysis

Let us examine the objective space distribution. First, we want an overview of the spread of all the points... Just to get a picture

In [30]:
# Scatter plot of one column against another
def create_scatter(data, x=9, y=10):
    """Plot column `y` of `data` against column `x` as markers and return the result of `py.iplot`."""
    points = go.Scatter(mode='markers', x=data[x], y=data[y])
    figure = go.Figure(data=[points])
    return py.iplot(figure, filename='line-mode', validate=False)

In [36]:
# Objective space of the unscaled data: column 9 on the X axis, column 10 on the Y axis (function defaults)
create_scatter(data)

In [31]:
# Same scatter for the min-max scaled data
create_scatter(scaled_data)

In [32]:
vars_cols

array([3, 4, 5, 6, 7, 8])

In [37]:
# 3D scatter: two fixed columns on X and Y, one trace per column given in `zs`
def create_3d_scatter(data, x=9, y=10, zs=vars_cols):
    """Draw one 3D marker trace per column in `zs` and return the result of `py.iplot`.

    Parameters
    ----------
    data : DataFrame
        Frame holding the values to plot.
    x, y : column labels
        Columns placed on the X and Y axes of every trace.
    zs : sequence of column labels
        Columns placed on the Z axis, one trace each (any sized sequence).
        The default is the value of `vars_cols` at the moment this definition
        is executed.
    """
    # One hue per trace. endpoint=False keeps the last hue away from 360, which is
    # the same colour as hue 0 and would make the first and last traces look alike.
    hues = np.linspace(0, 360, len(zs), endpoint=False)
    colors = ['hsl('+str(h)+',50%'+',50%)' for h in hues]

    traces = []
    for i, z in enumerate(zs):
        trace = go.Scatter3d(
            x = data[x],
            y = data[y],
            z = data[z],
            # Traces are named after the position of the column in `zs`, not after its label
            name = "Var_" + str(i),
            mode = 'markers',
            marker=dict(
                color=colors[i],
                size=3,
                symbol='circle',
                line=dict(
                    color=colors[i],
                    width=1
                ),
                opacity=1
            )
        )
        traces.append(trace)

    fig = go.Figure(data=traces)
    return py.iplot(fig, filename='scatter3d-mode', validate=False)

In [34]:
# 3D scatter on the scaled data: columns 9 and 10 on X/Y plus one trace per variable column (function defaults)
create_3d_scatter(scaled_data)

The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



In [35]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Create the model: a multi-layer perceptron regressor with three hidden layers of 50, 100 and 50 units.
# random_state is fixed so that the weight initialisation, the shuffling and the
# validation split used for early stopping are the same on every run.
mlpr = MLPRegressor(hidden_layer_sizes=(50, 100, 50), early_stopping=True,
                    shuffle=True, max_iter=200, verbose=True, validation_fraction=0.05,
                    random_state=0)

In [None]:
def create_surface_plot(model, data, x=objs_cols, y=vars_cols):
    """Fit `model` to predict the `y` columns from the `x` columns and evaluate it on a regular grid.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    data : DataFrame
        Training data. The grid spans [0, 1] on both axes.
        NOTE(review): this presumably expects min-max scaled data - confirm.
    x : sequence of exactly two column labels
        Input columns of the model; they define the two axes of the grid.
        The default is the value of `objs_cols` at definition time.
    y : sequence of column labels
        Target columns of the model. The default is the value of `vars_cols`
        at definition time.

    Returns
    -------
    (X, Y, Z)
        `X` and `Y` are the 100x100 mesh coordinates of the first and second
        input column; `Z` holds the predictions with shape (100, 100, n_targets).

    Raises
    ------
    ValueError
        If `x` does not contain exactly two columns.
    """
    if len(x) != 2:
        raise ValueError("a surface needs exactly two input columns, got " + str(len(x)))

    X_train = data[x]
    y_train = data[y]
    model.fit(X_train, y_train)

    # Create the data for the surface
    x_to_mesh = np.linspace(0, 1, 100)
    y_to_mesh = np.linspace(0, 1, 100)
    X, Y = np.meshgrid(x_to_mesh, y_to_mesh)

    # One row per grid node, with the same column labels the model was fitted on
    grid_points = pd.DataFrame(np.column_stack([X.ravel(), Y.ravel()]), columns=X_train.columns)
    Z = np.asarray(model.predict(grid_points)).reshape(X.shape + (-1,))

    # FIXME - the surface is not drawn yet; only the grid and the predictions are returned
    return X, Y, Z

In [None]:
# Two identical axes of 100 evenly spaced samples in [0, 1]
x1_to_mesh = np.linspace(0, 1, 100)
x2_to_mesh = np.linspace(0, 1, 100)

# NOTE(review): np.meshgrid returns its grids in argument order, so X2 receives the grid
# built from x1_to_mesh and X1 the one built from x2_to_mesh. This is harmless while both
# axes are identical, but the names look swapped - confirm the intent.
X2, X1 = np.meshgrid(x1_to_mesh, x2_to_mesh)

In [None]:
# Shape of the mesh grid
X2.shape

In [None]:
X1.shape

In [None]:
# np.array([1, 1]).reshape((1, 2))
# Z1 = np.cos(X) * np.sin(Y)

In [None]:
scaled_data.describe()

In [None]:
# Column labels of the two objectives (the same 9 and 10 that create_scatter uses as defaults)
o1 = 9 
o2 = 10

In [None]:
# Keep only the rows whose second objective lies strictly between 0.39 and 0.41 in the scaled data
fixed = scaled_data[(scaled_data[o2] < 0.41) & (scaled_data[o2] > 0.39)]
# Values of each objective for the rows in that slice
fixed_o1 = fixed[o1]
fixed_o2 = fixed[o2]

In [None]:
# Scatter of the slice: second objective on the X axis, first objective on the Y axis
trace = go.Scatter(mode='markers', x=fixed_o2, y=fixed_o1)
fig = go.Figure(data=[trace])
py.iplot(fig, filename='line-mode', validate=False)