In [1]:
import glob
import os

import pandas as pd

In [2]:
# Root directory of the experiment CSVs; the cells below glob it for input files
# and also write the merged AllData.csv into it.
data_dir = 'data/from-vpn'

In [3]:
# Names of the per-instance feature columns (grid size, agent/obstacle counts,
# distance statistics, ...) found in the experiment CSVs.
features_cols = [
    'GridRows',
    'GridColumns',
    'NumOfAgents',
    'NumOfObstacles',
    'BranchingFactor',
    'ObstacleDensity',
    'AvgDistanceToGoal',
    'MaxDistanceToGoal',
    'MinDistanceToGoal',
    'AvgStartDistances',
    'AvgGoalDistances',
    'PointsAtSPRatio',
]

In [4]:
# Load every raw experiment CSV that sits directly under data_dir.
# - sorted(): glob returns files in a filesystem-dependent order; sorting keeps the
#   row order of the concatenated frame stable from run to run.
# - AllData.csv is skipped: the export cell writes it into this same directory, and
#   reading it back with skiprows=1 would treat its first data row as the header and
#   feed this notebook's own output back in as if it were an experiment.
# - skiprows=1 drops the first line of each file, so the second line becomes the header.
experiments = [
    pd.read_csv(exp, skiprows=1)
    for exp in sorted(glob.glob(data_dir+'/*.csv'))
    if os.path.basename(exp) != 'AllData.csv'
]

In [5]:
# Stack all loaded experiment frames into one. ignore_index is not set, so every file
# keeps its own row labels and the labels repeat across files in the result.
alldata = pd.concat(experiments)

### ADD CBSH files (under CBSH dir) ###

In [6]:
# Read every CBSH result CSV and stack the frames into a single table.
cbsh_data_dir = 'data/from-vpn/CBSH/CBSH'
cbsh_experiments = [
    pd.read_csv(csv_path, skiprows=1)  # skiprows is needed only for the nathan experiments
    for csv_path in glob.glob(cbsh_data_dir+'/*.csv')
]
cbsh_alldata = pd.concat(cbsh_experiments)


In [7]:
# Row counts as a tuple: (combined total, base experiments, CBSH experiments).
len(cbsh_alldata) + len(alldata), len(alldata), len(cbsh_alldata),

(68175, 35950, 32225)

### ADD SAT files (under SAT dir)

In [36]:
# Read every SAT result CSV and stack the frames into a single table.
# No skiprows here: that is needed only for the nathan experiments.
sat_data_dir = 'data/from-vpn/SAT'
sat_experiments = [pd.read_csv(csv_path) for csv_path in glob.glob(sat_data_dir+'/*.csv')]
sat_alldata = pd.concat(sat_experiments)

# IMPORTANT: errors='coerce' turns every BranchingFactor value that cannot be parsed
# as a number into NaN. Those NaNs still need to be replaced with a large number;
# that replacement is not performed in this cell.
sat_alldata['BranchingFactor'] = pd.to_numeric(sat_alldata['BranchingFactor'], errors='coerce')


In [35]:
# Row counts as a tuple: (combined total, base experiments, SAT experiments).
len(sat_alldata) + len(alldata), len(alldata), len(sat_alldata),

(92672, 35950, 56722)

In [15]:
# Outer-join the base results with the CBSH results. No `on=` is passed, so pandas
# joins on every column name the two frames share and keeps rows found in only one
# of them. A column-on-column merge ignores the row index, so no reset_index is needed.
merged = pd.merge(alldata, cbsh_alldata, how='outer')


In [37]:
# Print the dtype of each column that sat_alldata and alldata have in common, side by
# side. A merge without `on=` joins on exactly these columns, so their dtypes must agree.
shared_columns = [name for name in sat_alldata.columns if name in alldata]
for name in shared_columns:
    print(name, sat_alldata[name].dtype, alldata[name].dtype)

GridName object object
GridRows int64 int64
GridColumns int64 int64
NumOfAgents int64 int64
NumOfObstacles int64 int64
InstanceId int64 int64
BranchingFactor float64 float64
ObstacleDensity float64 float64
AvgDistanceToGoal float64 float64
MaxDistanceToGoal int64 int64
MinDistanceToGoal int64 int64
AvgStartDistances float64 float64
AvgGoalDistances float64 float64
PointsAtSPRatio float64 float64


In [29]:
# Build the combined table from all three sources in a single expression, so the result
# does not depend on what an earlier cell left in `merged` and re-running this cell is
# safe. Merging `alldata` with `sat_alldata` alone would overwrite `merged` and drop the
# CBSH columns that the cells below select by name (CBSH_header, CBSH_success_header).
#
# No `on=` is passed: each outer merge joins on all column names the two frames share
# and keeps rows found in only one of them. Those shared columns must have matching
# dtypes, otherwise pandas raises
# "ValueError: You are trying to merge on float64 and object columns".
merged = (
    alldata
    .merge(cbsh_alldata, how='outer')
    .merge(sat_alldata, how='outer')
)
len(merged)

ValueError: You are trying to merge on float64 and object columns. If you wish to proceed you should use pd.concat

In [26]:
# Labels of all columns in `merged` whose name ends with "Success".
success_cols = merged.filter(regex="Success$").columns.tolist()
success_cols

['A*+OD+ID Success',
 'Basic-CBS/(A*/SIC)+ID Success',
 'EPEA*+ID Success',
 'ICTS 3E +ID Success',
 'MA-CBS-Global-10/(EPEA*/SIC) choosing the first conflict in CBS nodes Success',
 'CBS/(A*/SIC) + BP + PC without smart tie breaking using Dynamic Lazy Open List with Heuristic MVC of Cardinal Conflict Graph Heuristic Success']

In [27]:
# Full column labels of the CBSH solver's runtime and success columns.
CBSH_header = 'CBS/(A*/SIC) + BP + PC without smart tie breaking using Dynamic Lazy Open List with Heuristic MVC of Cardinal Conflict Graph Heuristic Runtime'
CBSH_success_header = 'CBS/(A*/SIC) + BP + PC without smart tie breaking using Dynamic Lazy Open List with Heuristic MVC of Cardinal Conflict Graph Heuristic Success'

# Labels of all columns in `merged` whose name contains "Runtime".
runtime_cols = merged.filter(like="Runtime").columns.tolist()

In [None]:
# Replace every NaN in `merged` (all columns, in place) with the sentinel string
# 'irrelevant'. The next cell swaps the sentinel for numeric defaults in the Runtime and
# Success columns; in every other column the string stays as is.
# NOTE(review): this also fills NaNs outside the runtime/success columns (e.g. a coerced
# BranchingFactor) with a string — confirm that is intended.
merged.fillna('irrelevant',inplace=True)

In [None]:
# Swap the 'irrelevant' sentinel for a numeric default: 300000 in every runtime column,
# then 0 in every success column. Each column name is printed as it is processed.
# NOTE(review): 300000 is presumably the timeout/penalty runtime — confirm the unit.
for columns, default in ((runtime_cols, 300000), (success_cols, 0)):
    for name in columns:
        print(name)
        merged[name] = merged[name].apply(
            lambda cell, default=default: default if cell == 'irrelevant' else cell
        )


In [11]:
# Write the merged table to AllData.csv without the row index.
# NOTE: the file lands in data_dir itself, the same directory the raw experiment CSVs
# are globbed from, so a later run of the loading cell will see this file too unless it
# is excluded there.
merged.to_csv(data_dir+'/AllData.csv', index=False)

In [12]:
# Inspect the row labels of the concatenated frame. Because pd.concat was called without
# ignore_index, the labels restart for every source file and are therefore not unique.
alldata.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043],
           dtype='int64', length=7491)