In [146]:
import sys; sys.path.insert(0, '..') #Add upper folder to path
from src.preprocess import Preprocess
from src.metrics import coverage_score, cumsum_score, runtime_adjusted_coverage_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import yaml
import numpy as np
import random

In [147]:
# Read the experiment configuration (feature list, algorithm names, runtime limits).
with open("../src/config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

features_cols = config['features']
algorithms = config['algorithms']
max_runtime = config['max_runtime']
max_sub_runtime = config['max_sub_runtime']

# Column names derived from the algorithm list.
alg_runtime_cols = [alg + ' Runtime' for alg in algorithms]
runtime_cols = alg_runtime_cols + ['Y Runtime']
success_cols = [alg + ' Success' for alg in algorithms]

# Load the labelled dataset.
data_path = '../src/AllData-labelled.csv'
df = pd.read_csv(data_path)

# Map types to exclude from the analysis; set to None to keep every map.
drop_maps = ['warehouse']

if drop_maps is not None:
    for maptype in drop_maps:
        # Keep only the rows whose grid name does not match this map type.
        df = df.loc[~df['GridName'].str.contains(maptype)]
    df = df.reset_index(drop=True)

In [148]:
# Display the per-algorithm runtime column names built from the config.
alg_runtime_cols

['icts Runtime', 'epea Runtime', 'cbsh-c Runtime']

In [149]:
def add_partial_problem_label(row, p, algorithms, alg_max_runtimes, data=None):
    """Attach the label and solving cost of a smaller sub-problem to ``row``.

    The sub-problem is the record that shares ``GridName`` and ``InstanceId``
    with ``row`` but has ``max(int(p * NumOfAgents), 2)`` agents. If that record
    is missing, the one with a single additional agent is used instead.

    Parameters
    ----------
    row : pandas.Series
        One problem; must provide ``NumOfAgents``, ``GridName`` and ``InstanceId``.
    p : float
        Fraction of the agents kept in the sub-problem. It is also used (via
        ``str(p)``) as the prefix of the entries written to ``row``.
    algorithms : list of str
        Names of the per-algorithm runtime columns.
    alg_max_runtimes : iterable of numbers
        Runtime limits. For each limit, every algorithm runtime is capped at the
        limit and the capped runtimes are summed.
    data : pandas.DataFrame, optional
        Frame searched for the sub-problem. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.Series
        ``row`` with these entries added: ``'<p>maxtime_<limit>calctime'`` for
        every limit, ``'<p>Y'`` and ``'<p>Y Runtime'``.

    Raises
    ------
    IndexError
        If neither the sub-problem nor its one-agent-larger fallback exists.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame

    # A MAPF problem can't have fewer than 2 agents.
    n_agents = max(int(p * row['NumOfAgents']), 2)
    same_instance = (data.GridName == row.GridName) & (data.InstanceId == row.InstanceId)

    res = data[same_instance & (data.NumOfAgents == n_agents)]
    if len(res) == 0:
        print("No such problem! n_agents: {n}, instance: {i}, grid: {g}".format(n= n_agents,i=row.InstanceId, g=row.GridName))
        # A missing sub-problem is a (very rare) gap in the dataset; fall back
        # to the record with one more agent, which is expected to exist.
        res = data[same_instance & (data.NumOfAgents == n_agents + 1)]
    if len(res) == 0:
        raise IndexError(
            "No sub-problem with {n} or {m} agents for instance {i} on grid {g}".format(
                n=n_agents, m=n_agents + 1, i=row.InstanceId, g=row.GridName))

    label = res['Y'].values[0]
    label_runtime = res['Y Runtime'].values[0]
    runtimes = res[algorithms]
    for alg_max_runtime in alg_max_runtimes:
        subproblem_prefix = str(p) + 'maxtime_' + str(alg_max_runtime)
        # Cap every algorithm's runtime at the limit, then add them up.
        capped_time = runtimes.where(runtimes < alg_max_runtime, alg_max_runtime)
        calctime = capped_time.sum(axis=1).values[0]
        row[subproblem_prefix + 'calctime'] = calctime

        if calctime == len(algorithms) * alg_max_runtime:
            # No algorithm solved the sub-problem within the limit: pick a random one.
            # NOTE(review): the label is shared by all limits, so a random pick made
            # for one limit is also what gets stored for the others — confirm intended.
            label = random.choice(algorithms)
            label_runtime = res[label].values[0]

    # Use the function's own parameter for the column prefix, not a notebook global.
    row[str(p) + 'Y'] = label
    row[str(p) + 'Y Runtime'] = label_runtime
    return row

In [111]:
# Fractions of the original agent count used to define the sub-problems.
percentage_bins = [0.1,0.25,0.5,0.75,0.9,0.95]
# Runtime limits applied to the sub-problem runs
# (presumably in the same unit as the dataset's runtime columns — TODO confirm).
subproblem_runtimes = [60000, 30000,10000,5000]
# Reduced settings for a quick run:
# percentage_bins = [0.9]
# subproblem_runtimes = [5000]

In [112]:
# def p_feature_time(row, p, algorithms, alg_max_runtimes):
#     time = df[(df.NumOfAgents==int(p*row['NumOfAgents']))&(df.GridName==row.GridName)&(df.InstanceId==row.InstanceId)][algorithms]    
#     for alg_max_runtime in alg_max_runtimes:
#         subproblem_prefix = str(p)+'maxtime_'+str(alg_max_runtime)
#         capped_time = time.where(time < alg_max_runtime, alg_max_runtime)
        
#         if len(capped_time)==0:
#             if int(p*row.NumOfAgents)>1:
#                 print("No such problem!", int(p*row.NumOfAgents))
#             row[subproblem_prefix+'calctime'] = row['Y Runtime']
#         else:
#             row[subproblem_prefix+'calctime'] = capped_time.sum(1).values[0]
#     return row

In [113]:
# Collect the names of the label / sub-problem columns, starting with the
# full-problem label 'Y'.
subproblem_columns = ['Y']
for percentage in percentage_bins:
    subproblem_columns += [str(percentage) + 'Y', str(percentage) + 'NumOfAgents']

    # NOTE(review): this list also contains 1000, which is not in
    # subproblem_runtimes — confirm that this is intended.
    for subproblem_runtime in [60000, 30000, 10000, 5000, 1000]:
        subproblem_prefix = str(percentage) + 'maxtime_' + str(subproblem_runtime)
        subproblem_columns.extend(
            subproblem_prefix + suffix
            for suffix in ('Y Runtime', 'Y Success', 'calctime')
        )
# df[subproblem_columns]

In [114]:
# df = pd.read_csv('AllData-labelled-partial_problems.csv')

In [116]:
# add_partial_problem_label draws a random label for unsolved sub-problems,
# so seed the RNG to make the generated columns reproducible across runs.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

for percentage in percentage_bins:
    # NOTE(review): this count is not floored at 2, unlike the agent count used
    # inside add_partial_problem_label — confirm which one downstream code expects.
    df[str(percentage)+'NumOfAgents'] = (df.NumOfAgents*percentage).astype(int)
    print("percentage:",percentage)
    # Adds the '<p>Y', '<p>Y Runtime' and '<p>maxtime_<t>calctime' columns.
    df = df.apply(lambda x: add_partial_problem_label(x, percentage,alg_runtime_cols,subproblem_runtimes),axis=1)
    for subproblem_runtime in subproblem_runtimes:
        print("maxtime:",subproblem_runtime)

        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
        label_runtime = df[str(percentage)+'Y Runtime']
        # Success means the labelled algorithm finished strictly below the limit
        # (vectorized comparison instead of a row-wise apply).
        df[subproblem_prefix+'Y Success'] = label_runtime < subproblem_runtime
        # Runtimes above the limit are capped at the limit.
        df[subproblem_prefix+'Y Runtime'] = label_runtime.mask(label_runtime > subproblem_runtime, subproblem_runtime)

percentage: 0.1
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
percentage: 0.25
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
percentage: 0.5
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
percentage: 0.75
maxtime: 60000
maxtime: 30000
maxtime: 10000
maxtime: 5000
percentage: 0.9
No such problem! n_agents: 34, instance: 1, grid: ht_chantry
No such problem! n_agents: 36, instance: 1, grid: lt_gallowstemplar_n
No such problem! n_agents: 16, instance: 2, grid: maze-128-128-2
No such problem! n_agents: 18, instance: 3, grid: maze-128-128-2
No such problem! n_agents: 20, instance: 3, grid: maze-32-32-4
No such problem! n_agents: 16, instance: 17, grid: maze-128-128-2
No such problem! n_agents: 53, instance: 18, grid: Boston_0_256
No such problem! n_agents: 15, instance: 18, grid: maze-128-128-2
No such problem! n_agents: 22, instance: 19, grid: maze-32-32-2
No such problem! n_agents: 16, instance: 19, grid: maze-32-32-4
No such problem! n_agents: 18, ins

### How many problems do we "lose" due to the computation of the sub-problem?

In [119]:
# For every (fraction, runtime limit) pair, count the problems whose sub-problem
# computation time plus their own 'Y Runtime' reaches the overall budget.
# Records are collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
lost_records = []
for percentage in percentage_bins:
    for subproblem_runtime in subproblem_runtimes:
        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
        lost = df[df[subproblem_prefix+'calctime']+df['Y Runtime']>=max_runtime]
        lost_records.append({
            'p': percentage,
            'max_time': subproblem_runtime,
            # Share of problems that still fit into the budget.
            'oracle_coverage': 1 - len(lost)/len(df),
            'num_problems': len(lost),
            # Share of problems whose sub-problem label was found within the limit.
            'subproblem_coverage': (df[subproblem_prefix+'Y Success']==1).sum()/len(df),
        })

lost_problems = pd.DataFrame(
    lost_records,
    columns=['p','max_time','oracle_coverage','num_problems','subproblem_coverage'],
)

lost_problems.sort_values(by=['p','max_time'], ascending=False).to_csv('lost_problems.csv')

In [145]:
# Write the current frame to CSV without the index column.
df.to_csv('AllData-labelled-partial_problems.csv',index=False)

### Let's compute metrics for each of those baselines (i.e., predicting the label of the p*N-agent sub-problem for the N-agent problem) ###

In [144]:
# Score each baseline: the '<p>Y' label is used as the prediction, and the
# runtime budget passed to the metric is reduced by the sub-problem computation time.
for percentage in percentage_bins:
    for subproblem_runtime in subproblem_runtimes:
        subproblem_prefix = str(percentage)+'maxtime_'+str(subproblem_runtime)
#         partially_solved_only_df = df[df[subproblem_prefix+'Y Success']==1].copy()
        remaining_runtime = max_runtime - df[subproblem_prefix+'calctime']
        score = runtime_adjusted_coverage_score(
            df,
            df[str(percentage)+'Y'],
            max_runtime_arr=remaining_runtime,
        )
        print('Score {s} for percentage {p} and subproblem runtime {t}'.format(s=score, p=percentage, t=subproblem_runtime))

Score 0.5841270534778049 for percentage 0.1 and subproblem runtime 60000
Score 0.5842799720377491 for percentage 0.1 and subproblem runtime 30000
Score 0.5847168822090179 for percentage 0.1 and subproblem runtime 10000
Score 0.5847387277175813 for percentage 0.1 and subproblem runtime 5000
Score 0.6931361412093674 for percentage 0.25 and subproblem runtime 60000
Score 0.6935948968891996 for percentage 0.25 and subproblem runtime 30000
Score 0.694315798671793 for percentage 0.25 and subproblem runtime 10000
Score 0.6946216357916812 for percentage 0.25 and subproblem runtime 5000
Score 0.7615999650471863 for percentage 0.5 and subproblem runtime 60000
Score 0.7661438308283817 for percentage 0.5 and subproblem runtime 30000
Score 0.7692895840615169 for percentage 0.5 and subproblem runtime 10000
Score 0.7701415588954911 for percentage 0.5 and subproblem runtime 5000
Score 0.7973392170569731 for percentage 0.75 and subproblem runtime 60000
Score 0.8075410695560993 for percentage 0.75 and s

In [38]:
# Spot check: label 'Y' of the 10-agent problem, instance 1, on grid 'Berlin_1_256'.
df[(df.NumOfAgents==10)&(df.GridName=='Berlin_1_256')&(df.InstanceId==1)]['Y'].values[0]

'macbs Runtime'