In [1]:
import sys; sys.path.insert(0, '..') #Add upper folder to path
from src.preprocess import Preprocess
from src.metrics import coverage_score, cumsum_score, runtime_adjusted_coverage_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import yaml
import numpy as np
import random

In [2]:
# Load the experiment configuration (feature names, algorithm list, time limits).
with open("../src/config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

# Seed the stdlib RNG: the labelling step below draws fallback labels with
# random.choice, so without a seed the generated columns differ between runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

features_cols = config['features']
algorithms = config['algorithms']
max_runtime = config['max_runtime']
max_sub_runtime = config['max_sub_runtime']

# Column names derived from the configured algorithm names.
alg_runtime_cols = [algorithm + ' Runtime' for algorithm in algorithms]
runtime_cols = alg_runtime_cols + ['Y Runtime']
success_cols = [algorithm + ' Success' for algorithm in algorithms]

data_path = '../src/AllData-labelled.csv'
df = pd.read_csv(data_path)

# Substrings of GridName to exclude; set to None to keep every map.
drop_maps = ['warehouse']

if drop_maps is not None:
    for maptype in drop_maps:
        # regex=False: each entry is a literal substring, not a regular expression.
        df = df[~df.GridName.str.contains(maptype, regex=False)]
    df = df.reset_index(drop=True)

In [3]:
# Display the per-algorithm runtime column names (the list copied before 'Y Runtime' was appended).
alg_runtime_cols

['icts Runtime', 'epea Runtime', 'cbsh-c Runtime']

In [4]:
def add_partial_problem_label(row, p, algorithms, alg_max_runtimes, data=None):
    """Attach the label and solving cost of the p-fraction sub-problem to ``row``.

    The sub-problem is the record sharing ``row``'s ``GridName`` and
    ``InstanceId`` whose ``NumOfAgents`` equals ``max(int(p * NumOfAgents), 2)``.
    If that record is missing, the one with one more agent is used instead.

    Parameters
    ----------
    row : pandas.Series
        One problem record; must provide ``NumOfAgents``, ``GridName`` and
        ``InstanceId``.
    p : float
        Fraction of the agents kept in the sub-problem. ``str(p)`` prefixes
        every column written to ``row``.
    algorithms : list of str
        Names of the per-algorithm runtime columns.
    alg_max_runtimes : iterable of numbers
        Per-algorithm time caps to evaluate.
    data : pandas.DataFrame, optional
        Table searched for the sub-problem. Defaults to the notebook-level
        ``df`` so existing four-argument calls keep working.

    Returns
    -------
    pandas.Series
        ``row`` (modified in place) with ``<p>Y``, ``<p>Y Runtime`` and one
        ``<p>maxtime_<cap>calctime`` entry per cap.

    Raises
    ------
    IndexError
        If neither the sub-problem nor the one-more-agent fallback exists.
    """
    problems = df if data is None else data
    n_agents = max(int(p * row['NumOfAgents']), 2)  # a MAPF problem can't have fewer than 2 agents

    same_instance = (problems.GridName == row.GridName) & (problems.InstanceId == row.InstanceId)
    res = problems[same_instance & (problems.NumOfAgents == n_agents)]
    if len(res) == 0:
        print("No such problem! n_agents: {n}, instance: {i}, grid: {g}".format(n= n_agents,i=row.InstanceId, g=row.GridName))
        # A missing sub-problem is a (very rare) gap in the dataset;
        # fall back to the record with p*N+1 agents.
        res = problems[same_instance & (problems.NumOfAgents == n_agents + 1)]
    if len(res) == 0:
        raise IndexError(
            "No sub-problem with {n} or {m} agents for instance {i} on grid {g}".format(
                n=n_agents, m=n_agents + 1, i=row.InstanceId, g=row.GridName))

    label = res['Y'].values[0]
    label_runtime = res['Y Runtime'].values[0]
    runtimes = res[algorithms]
    for alg_max_runtime in alg_max_runtimes:
        subproblem_prefix = str(p) + 'maxtime_' + str(alg_max_runtime)
        # Every algorithm is charged at most the cap; the sum is the cost of
        # running all of them on the sub-problem.
        capped_runtimes = runtimes.where(runtimes < alg_max_runtime, alg_max_runtime)
        calctime = capped_runtimes.sum(1).values[0]
        row[subproblem_prefix + 'calctime'] = calctime
        if calctime == len(algorithms) * alg_max_runtime:
            # No algorithm solved the sub-problem within this cap: pick one at random.
            # NOTE(review): the random label replaces the sub-problem label for the
            # single <p>Y column and persists for the remaining caps - confirm intended.
            label = random.choice(algorithms)
            label_runtime = res[label].values[0]

    # Key the output columns on the parameter `p`, not on a notebook-level loop variable.
    row[str(p) + 'Y'] = label
    row[str(p) + 'Y Runtime'] = label_runtime
    return row

In [5]:
# Fractions of the original agent count used to build the sub-problems.
percentage_bins = [
    0.1,
    0.25,
    0.5,
    0.75,
    0.9,
    0.95,
]
# Per-algorithm time caps tried on each sub-problem, largest first.
subproblem_runtimes = [60_000, 30_000, 10_000, 5_000, 1_000]
# Quick-run alternative: percentage_bins = [0.9]; subproblem_runtimes = [5000]

In [6]:
# Names of the label column plus every per-fraction / per-cap column,
# starting with the full-problem label 'Y'.
subproblem_columns = ['Y']
for percentage in percentage_bins:
    bin_tag = str(percentage)
    subproblem_columns += [bin_tag + 'Y', bin_tag + 'NumOfAgents']

    for subproblem_runtime in subproblem_runtimes:
        subproblem_prefix = bin_tag + 'maxtime_' + str(subproblem_runtime)
        subproblem_columns.extend(
            subproblem_prefix + suffix
            for suffix in ('Y Runtime', 'Y Success', 'calctime')
        )
# Preview once the columns exist: df[subproblem_columns]

In [7]:
# df = pd.read_csv('AllData-labelled-partial_problems.csv')

In [8]:
# For every agent fraction: record the sub-problem size, attach the sub-problem's
# label columns to each row, then derive the per-cap runtime/success columns.
for percentage in percentage_bins:
    # Same agent count that add_partial_problem_label looks up: truncate p*N and
    # floor it at 2, so the column never reports a 0- or 1-agent sub-problem.
    df[str(percentage)+'NumOfAgents'] = (df.NumOfAgents*percentage).astype(int).clip(lower=2)
    print("percentage:",percentage)
    # Adds the 0.xY, 0.xY Runtime and 0.xmaxtime_<cap>calctime columns.
    df = df.apply(lambda x: add_partial_problem_label(x, percentage,alg_runtime_cols,subproblem_runtimes),axis=1)
    for subproblem_runtime in subproblem_runtimes:
        print("maxtime:",subproblem_runtime)

        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
        df[subproblem_prefix+'Y Runtime'] = df[str(percentage)+'Y Runtime'].copy()
        # Success is decided on the uncapped runtime (vectorized; no row-wise apply needed).
        df[subproblem_prefix+'Y Success'] = df[subproblem_prefix+'Y Runtime'] < subproblem_runtime
        # Runtimes above the cap are charged exactly the cap.
        df.loc[df[subproblem_prefix+'Y Runtime'] > subproblem_runtime, subproblem_prefix+'Y Runtime'] = subproblem_runtime

percentage: 0.1
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
maxtime: 1000
percentage: 0.25
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
maxtime: 1000
percentage: 0.5
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
maxtime: 1000
percentage: 0.75
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
maxtime: 1000
percentage: 0.9
No such problem! n_agents: 34, instance: 1, grid: ht_chantry
No such problem! n_agents: 36, instance: 1, grid: lt_gallowstemplar_n
No such problem! n_agents: 16, instance: 2, grid: maze-128-128-2
No such problem! n_agents: 18, instance: 3, grid: maze-128-128-2
No such problem! n_agents: 20, instance: 3, grid: maze-32-32-4
No such problem! n_agents: 16, instance: 17, grid: maze-128-128-2
No such problem! n_agents: 53, instance: 18, grid: Boston_0_256
No such problem! n_agents: 15, instance: 18, grid: maze-128-128-2
No such problem! n_agents: 22, instance: 19, grid: maze-32-32-2
No such problem! n_agents: 16, instance: 1

### How many problems do we "lose" because of the time spent computing the sub-problem?

In [17]:
# Rows where not every algorithm's Success flag is set. Computed here so this cell
# does not depend on a `tmp_df` variable created by a cell further down the notebook.
not_all_solved_df = df[df[success_cols].sum(axis=1) < len(success_cols)]

# Collect one record per (fraction, cap) and build the frame once;
# DataFrame.append no longer exists in pandas >= 2.0.
lost_records = []
for percentage in percentage_bins:
    for subproblem_runtime in subproblem_runtimes:
        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
        # "Lost": sub-problem computation time plus the labelled algorithm's runtime
        # on the full problem reaches the overall time budget.
        lost = df[df[subproblem_prefix+'calctime']+df['Y Runtime']>=max_runtime]
        oracle_coverage = 1 - len(lost)/len(df)
        subprob_coverage = (df[subproblem_prefix+'Y Success']==1).sum()/len(df)
        # Baseline: predict the sub-problem's label, with the budget reduced by the
        # time already spent on the sub-problem.
        baseline_coverage = runtime_adjusted_coverage_score(
            not_all_solved_df,
            not_all_solved_df[str(percentage)+'Y'],
            max_runtime_arr=(max_runtime - not_all_solved_df[subproblem_prefix+'calctime']),
        )
        lost_records.append({
            'p': percentage,
            'max_time': subproblem_runtime,
            'oracle_coverage': oracle_coverage,
            'baseline_coverage': baseline_coverage,
            'subproblem_coverage': subprob_coverage,
            'num_problems': len(lost),
        })

lost_problems = pd.DataFrame(
    lost_records,
    columns=['p','max_time','oracle_coverage','baseline_coverage','num_problems','subproblem_coverage'],
)

lost_problems.sort_values(by=['p','max_time'], ascending=False)

Unnamed: 0,p,max_time,oracle_coverage,baseline_coverage,num_problems,subproblem_coverage
25,0.95,60000.0,0.956025,0.392375,2013.0,0.71662
26,0.95,30000.0,0.98047,0.411046,894.0,0.694425
27,0.95,10000.0,0.991065,0.419437,409.0,0.656829
28,0.95,5000.0,0.993468,0.422157,299.0,0.637124
29,0.95,1000.0,0.994997,0.424093,229.0,0.584389
20,0.9,60000.0,0.959717,0.402702,1844.0,0.743599
21,0.9,30000.0,0.981409,0.418653,851.0,0.724069
22,0.9,10000.0,0.991174,0.426721,404.0,0.687434
23,0.9,5000.0,0.993512,0.42898,297.0,0.666943
24,0.9,1000.0,0.994997,0.430732,229.0,0.615388


In [18]:
# Persist the summary, ordered by descending fraction and then descending time cap.
lost_problems_sorted = lost_problems.sort_values(by=['p', 'max_time'], ascending=False)
lost_problems_sorted.to_csv('lost_problems.csv')

In [10]:
# df.to_csv('AllData-labelled-partial_problems_withoutSAT.csv',index=False)

In [15]:
# Per row, sum the per-algorithm Success columns.
finished_per_problem = df[success_cols].sum(axis=1)
df['SolvesFinishedCount'] = finished_per_problem

# Keep only the rows where that sum is below the number of algorithms.
not_all_finished = df['SolvesFinishedCount'] < len(success_cols)
tmp_df = df[not_all_finished]
len(tmp_df)

21691

### Let's compute metrics for each of those baselines (i.e., predicting the label of the p*N-agent sub-problem for the N-agent problem) ###

In [16]:
# Print the runtime-adjusted coverage of every (fraction, time cap) baseline on tmp_df.
score_message = 'Score {s} for percentage {p} and subproblem runtime {t}'
for percentage in percentage_bins:
    for subproblem_runtime in subproblem_runtimes:
        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
        # Budget left for the full problem after the sub-problem computation.
        remaining_budget = max_runtime - tmp_df[subproblem_prefix+'calctime']
        score = runtime_adjusted_coverage_score(
            tmp_df,
            tmp_df[str(percentage)+'Y'],
            max_runtime_arr=remaining_budget,
        )
        print(score_message.format(s=score, p=percentage, t=subproblem_runtime))

Score 0.19376700013830622 for percentage 0.1 and subproblem runtime 60000
Score 0.19390530634825504 for percentage 0.1 and subproblem runtime 30000
Score 0.1948734498178968 for percentage 0.1 and subproblem runtime 10000
Score 0.19491955188787977 for percentage 0.1 and subproblem runtime 5000
Score 0.1949656539578627 for percentage 0.1 and subproblem runtime 1000
Score 0.3474713014614356 for percentage 0.25 and subproblem runtime 60000
Score 0.347932322161265 for percentage 0.25 and subproblem runtime 30000
Score 0.34940758840071917 for percentage 0.25 and subproblem runtime 10000
Score 0.34959199668065094 for percentage 0.25 and subproblem runtime 5000
Score 0.3498686091005486 for percentage 0.25 and subproblem runtime 1000
Score 0.4193905306348255 for percentage 0.5 and subproblem runtime 60000
Score 0.4235397169332903 for percentage 0.5 and subproblem runtime 30000
Score 0.4267207597621133 for percentage 0.5 and subproblem runtime 10000
Score 0.4278733115116869 for percentage 0.5 an

In [38]:
# Spot check: label of the 10-agent problem on grid Berlin_1_256, instance 1.
spot_check_mask = (
    (df.NumOfAgents == 10)
    & (df.GridName == 'Berlin_1_256')
    & (df.InstanceId == 1)
)
df.loc[spot_check_mask, 'Y'].values[0]

'macbs Runtime'