In [93]:
import sys; sys.path.insert(0, '..') #Add upper folder to path
from src.preprocess import Preprocess
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import yaml

In [94]:
with open("../src/config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

features_cols = config['features']

max_runtime = 300000
algorithms = config['algorithms']
runtime_cols = [algorithm + ' Runtime' for algorithm in algorithms]
runtime_cols.append('Y Runtime')
success_cols = [algorithm + ' Success' for algorithm in algorithms]
data_path = '../src/AllData-labelled.csv'
df = pd.read_csv(data_path)

# drop_maps = None
drop_maps = ['warehouse']

if drop_maps is not None:
    for maptype in drop_maps:
        df = df[~df.GridName.str.contains(maptype)]
    df = df.reset_index(drop=True)

In [95]:
success_cols

['icts Success', 'epea Success', 'cbsh Success', 'sat Success']

### How many problems solved by only one solver? ###

In [96]:
df['SolvesFinishedCount'] = df[success_cols].sum(axis=1)

In [97]:
print("{n} of the problems solved by only one solver!".format(n=len(df[df['SolvesFinishedCount']==1])/len(df)))

0.1976237265930293 of the problems solved by only one solver!


### What is the maximum number of agents solved for each map? ###

Note: A map is a combination of an instance and the grid.

In [98]:
max_agents_for_each_map_row_idxs =df.groupby(['GridName','InstanceId'])['NumOfAgents'].idxmax()

In [99]:
winners_df = df.iloc[max_agents_for_each_map_row_idxs][['NumOfAgents','Y','GridName','InstanceId']]
winners_df.sort_values(['NumOfAgents'], ascending=False).head(20)

Unnamed: 0,NumOfAgents,Y,GridName,InstanceId
32507,1280,sat Runtime,orz900d,4
37187,1274,sat Runtime,orz900d,20
40300,1000,sat Runtime,orz900d,10
251,200,cbsh Runtime,Berlin_1_256,2
43947,198,epea Runtime,empty-48-48,7
20820,195,cbsh Runtime,Paris_1_256,25
5500,195,cbsh Runtime,Berlin_1_256,16
13797,187,cbsh Runtime,Paris_1_256,20
436,186,cbsh Runtime,Berlin_1_256,3
18432,186,cbsh Runtime,Paris_1_256,22


### How many maps each solver solved alone? (i.e. gain in coverage) ###

In [100]:
df[df['SolvesFinishedCount']==1]['Y'].value_counts()

sat Runtime      5855
cbsh Runtime     1948
epea Runtime      585
icts Runtime      275
astar Runtime      58
macbs Runtime      18
cbs Runtime        10
Name: Y, dtype: int64

Let's try to find patterns regarding the large amount of maps sat solved alone

### For each solver, how distributed are his "single solver" maps? ###

In other words - we want to check if there is a solver that wins by a large margin on a specific map (and therefore gains coverage).

In [101]:
df[(df['SolvesFinishedCount']==1)&(df.Y=='sat Runtime')].GridName.value_counts()

orz900d                3370
random-64-64-10         442
empty-48-48             269
Berlin_1_256            260
ht_chantry              141
empty-32-32             140
maze-32-32-4            131
maze-32-32-2            124
den312d                 111
random-64-64-20          89
lt_gallowstemplar_n      87
random-32-32-10          85
room-32-32-4             77
maze-128-128-2           68
maze-128-128-10          63
room-64-64-8             50
ost003d                  49
ht_mansion_n             42
Paris_1_256              37
random-32-32-20          36
den520d                  36
room-64-64-16            33
Boston_0_256             30
lak303d                  29
w_woundedcoast           24
empty-16-16              16
brc202d                  12
empty-8-8                 4
Name: GridName, dtype: int64

### We can see that orz900d is highly dominated by sat. What will happen when we train the solver without it? ###
Although surprising, it didn't had impact on the results.


In [102]:
df[(df['SolvesFinishedCount']==1)&(df.Y=='sat Runtime')&(df.GridName=='orz900d')].InstanceId.value_counts()

4     1212
20    1208
10     950
Name: InstanceId, dtype: int64

### Which maps have the highest std in terms of "single solvers" across different configurations? ###

In [103]:
s = df[(df['SolvesFinishedCount']==1)].groupby(['GridName'])['Y'].value_counts()
s.std(level='GridName').sort_values(ascending=False)

GridName
orz900d                1483.782767
random-64-64-10         206.560726
Paris_1_256             188.536999
empty-48-48             131.981059
Berlin_1_256            127.101141
Boston_0_256             73.225679
ht_chantry               72.794231
empty-32-32              62.215754
empty-16-16              53.275229
maze-128-128-2           47.376154
room-32-32-4             42.003968
lt_gallowstemplar_n      41.476901
den312d                  38.183766
empty-8-8                38.183766
brc202d                  36.715120
random-32-32-20          36.528528
maze-128-128-10          35.510562
random-32-32-10          32.733775
random-64-64-20          32.500000
ht_mansion_n             32.176078
w_woundedcoast           30.605555
room-64-64-16            27.403771
den520d                  25.358759
room-64-64-8             23.907809
lak303d                  16.653328
ost003d                  11.532563
maze-32-32-2                   NaN
maze-32-32-4                   NaN
Name: Y, dt

### Interesting! We see that sat exceed in specific configurations of orz only, but when it does - it does so in an outstanding way and wins by a large margin over the others. 

1. What's so special about those specific configurations of orz?
2. How can we use that knowledge during the training of the AS model? Maybe it emposes a bias?

### What is the margin for each map between the largest number of agents solved and the 2nd best solver? ###

In [104]:
max_num_of_agents_for_two_solvers_per_map = df[df['SolvesFinishedCount']>=2].groupby(['GridName','InstanceId'])['NumOfAgents'].max()


In [105]:
max_num_of_agents_for_single_solver_per_map = df[df['SolvesFinishedCount']==1].groupby(['GridName','InstanceId'])['NumOfAgents'].max()

In [106]:
margin_from_single_solver = max_num_of_agents_for_single_solver_per_map - max_num_of_agents_for_two_solvers_per_map
margin_from_single_solver.sort_values(ascending=False).head(10)

GridName      InstanceId
orz900d       4             1212.0
              20            1208.0
              10             950.0
Paris_1_256   25              99.0
Berlin_1_256  16              84.0
Paris_1_256   3               68.0
              19              65.0
Berlin_1_256  3               64.0
Paris_1_256   22              62.0
Berlin_1_256  4               60.0
Name: NumOfAgents, dtype: float64

### Let's try to cluster the data given the features and try to see if those outliers (orz900d of instances 4,20,10) seems "strange" ###

In [122]:
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupShuffleSplit

groups = df['InstanceId']  # len of scenarios
gkf = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

for index, (tr_ind, test_ind) in enumerate(gkf.split(df, df['Y'], groups)):
    X_train, X_test, y_train, y_test = df.iloc[tr_ind].copy(), df.iloc[test_ind].copy(), \
                                           df['Y'].iloc[tr_ind].copy(), df['Y'].iloc[test_ind].copy()

    kmeans = KMeans(n_clusters=len(success_cols)).fit(X_train[features_cols])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(arr, axis, dtype, out, keepdims)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [123]:
X_train[features_cols]

Unnamed: 0,GridRows,GridColumns,NumOfAgents,NumOfObstacles,BranchingFactor,ObstacleDensity,AvgDistanceToGoal,MaxDistanceToGoal,MinDistanceToGoal,AvgStartDistances,AvgGoalDistances,PointsAtSPRatio,Sparsity
252,256.0,256.0,2.0,17996.0,2.500000e+01,0.274597,243.000000,470.0,16.0,108.66670,53.33333,0.007446,0.000042
253,256.0,256.0,3.0,17996.0,1.250000e+02,0.274597,195.666667,470.0,16.0,109.00000,96.33334,0.009003,0.000063
254,256.0,256.0,4.0,17996.0,6.250000e+02,0.274597,193.750000,470.0,16.0,106.10000,112.40000,0.011520,0.000084
255,256.0,256.0,5.0,17996.0,3.125000e+03,0.274597,224.600000,470.0,16.0,110.80000,124.93330,0.016769,0.000105
256,256.0,256.0,6.0,17996.0,1.562500e+04,0.274597,233.000000,470.0,16.0,112.61900,117.85710,0.018158,0.000126
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44215,32.0,32.0,74.0,102.0,5.293956e+51,0.099609,23.986486,50.0,2.0,22.17153,22.21586,0.687500,0.080260
44216,32.0,32.0,75.0,102.0,2.646978e+52,0.099609,23.960000,50.0,2.0,22.20632,22.17474,0.687500,0.081345
44217,32.0,32.0,76.0,102.0,1.323489e+53,0.099609,24.184211,50.0,2.0,22.34347,22.24983,0.696289,0.082430
44218,32.0,32.0,77.0,102.0,6.617445e+53,0.099609,24.038961,50.0,2.0,22.49950,22.27173,0.701172,0.083514
