In [1]:
# NOTE: despite the original "only permit vertex cover" label, the loading cells below skip vertex_cover runs.

# Display order for datasets in the output tables.
canonical_order = ['BarabasiAlbert', 'ErdosRenyi', 'PowerlawCluster', 'WattsStrogatz',  'MUTAG', 'ENZYMES', 'PROTEINS',   'IMDB-BINARY', 'COLLAB']


def reorder(df, canonical_order=canonical_order, by='dataset', extras=['dataset','gen_n', 'gen_n_max'], secondary='gen_n', columns=None):
    """Return a copy of ``df`` with rows sorted by canonical dataset order.

    The input frame is never modified: the temporary ranking column lives only
    on an internal copy, so passing a slice of another frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort. Must contain the column named by ``by`` (and
        ``secondary`` when it is not None).
    canonical_order : list of str
        Dataset names in the desired order. Names absent from this list get a
        missing rank and are placed after all ranked names.
    by : str
        Column holding the dataset name.
    extras : list of str
        Unused; kept so existing call signatures keep working.
    secondary : str or None
        Optional second sort key applied within each dataset.
    columns : list of str or None
        When truthy, only these columns are returned (in this order).

    Returns
    -------
    pandas.DataFrame
        The sorted frame, without the temporary ranking column.
    """
    rank_by_name = {name: position for position, name in enumerate(canonical_order)}

    # assign() builds a new frame, so the caller's object is left untouched.
    keyed = df.assign(dataset_name_order=df[by].map(rank_by_name))

    sort_keys = ['dataset_name_order']
    if secondary is not None:
        sort_keys.append(secondary)

    ordered = keyed.sort_values(by=sort_keys).drop(columns='dataset_name_order')

    if columns:
        return ordered[columns]
    return ordered

In [2]:
# construct gurobi 8.0 score baseline

import os
import json
import pandas as pd
import numpy as np
import sys
from pathlib import Path

root_folder = Path('/home/bcjexu/maxcut-80/bespoke-gnn4do/')
sys.path.insert(0, str(root_folder))

from utils.tabulate import load_baseline_outputs

baseline_folders = ['baseline_runs/230928_gurobi']

# Maps (dataset, gen_n) -> list of scores read from results.jsonl, in file order.
# gen_n is a tuple when params.txt stores a list, otherwise "".
# If two runs share the same key, the one listed last overwrites the earlier one.
gurobi_8_dict = {}

for baseline_folder in baseline_folders:
    baseline_dir = Path(root_folder) / baseline_folder

    for model_folder in os.listdir(baseline_dir):
        run_dir = baseline_dir / model_folder
        with open(run_dir / 'params.txt', 'r') as f:
            model_args = json.load(f)

        # Skip vertex-cover runs and any run whose Gurobi timeout is not 8.
        if model_args['problem_type'] == 'vertex_cover':
            continue
        if model_args["gurobi_timeout"] != 8:
            continue

        scores = []
        with open(run_dir / 'results.jsonl', 'r') as f:
            for line in f:
                # Tolerate blank lines instead of failing in json.loads.
                if not line.strip():
                    continue
                res = json.loads(line)
                scores.append(res['score'])

        gen_n = tuple(model_args['gen_n']) if isinstance(model_args['gen_n'], list) else ""
        gurobi_8_dict[(model_args['dataset'], gen_n)] = scores
        

In [3]:
import os
# Exploratory: list the entries directly under the training_runs folder.
os.listdir('/home/bcjexu/maxcut-80/bespoke-gnn4do/training_runs')

['legacy',
 'Testing',
 '230928_snapshot',
 'LiftMP_runs',
 '230924_hparam2',
 '230927_snapshot',
 '230928_runs',
 '230924_hparam',
 '230924_hparam_TU_multiarch',
 '230924_hparam_TU',
 '230926_finetune_ER_runs']

In [4]:
# Exploratory: print every entry of the 230927 snapshot, prefixed with the snapshot folder name.
print([os.path.join('230927_snapshot', x) for x in os.listdir('/home/bcjexu/maxcut-80/bespoke-gnn4do/training_runs/230927_snapshot')])

['230927_snapshot/230925_TUsmall_GAT_VC', '230927_snapshot/230925_TUsmall_GIN_VC', '230927_snapshot/230925_generated_preset_cut', '230927_snapshot/230925_TUlarge_all_cut', '230927_snapshot/230925_TUsmall_GCNN_VC', '230927_snapshot/230925_generated_liftMP_VC', '230927_snapshot/230925_TUlarge_all_VC', '230927_snapshot/230925_generated_preset_VC', '230927_snapshot/230925_TUsmall_liftMP_VC', '230927_snapshot/230925_TUsmall_GatedGCNN_VC', '230927_snapshot/230925_generated_liftMP_cut']


In [5]:
# Run groups (relative to training_runs) that are loaded below.
# Earlier versions of this cell also assigned lists for 'LiftMP_runs' /
# '230927_snapshot' and for '230928_snapshot'; both were overwritten before use,
# so only the effective '230928_runs' list is kept.
run_folders = [
    f'230928_runs/{name}'
    for name in (
        '230925_TUsmall_GAT_VC',
        '230925_TUsmall_GIN_cut',
        '230925_generated_preset_cut',
        '230925_TUsmall_GAT_cut',
        '230925_TUsmall_liftMP_cut',
        '230925_TUsmall_GCNN_VC',
        '230925_TUsmall_GCNN_cut',
        '230925_generated_liftMP_VC',
        '230925_generated_preset_VC',
        '230925_TUsmall_liftMP_VC',
        '230925_TUsmall_GatedGCNN_VC',
        '230925_TUsmall_VC_32',
        '230925_TUlarge_liftMP_cut',
        '230925_TUlarge_liftMP_VC',
        '230925_generated_liftMP_cut',
        '230925_TUsmall_GatedGCNN_cut',
    )
]


In [6]:
import json
import pandas as pd
import numpy as np

# Build one record per trained-model folder: its params.txt contents plus
# validation/test summaries and, when available, scores normalised by the
# Gurobi baseline collected in gurobi_8_dict.
folder_path = '/home/bcjexu/maxcut-80/bespoke-gnn4do/training_runs'
# Every entry found inside each run folder is treated as a model folder.
model_list = [os.path.join(folder_path, run_folder, x) for run_folder in run_folders for x in os.listdir(os.path.join(folder_path, run_folder))  ]


# Records accumulated here; a later cell appends baseline rows to the same list.
rows = []

# Folders whose processing raised; numtimes counts models that had a retest file.
errored = []
numtimes = 0
for model_folder in model_list:
    try:
        with open(os.path.join(model_folder, 'params.txt'), 'r') as f:
            model_args = json.load(f)
        # Vertex-cover runs are excluded from this analysis.
        if model_args['problem_type'] == 'vertex_cover':
            continue
        
        losses = np.load(os.path.join(model_folder, 'valid_scores.npy'))
        test_losses = np.load(os.path.join(model_folder, 'test_scores.npy'))

        # modeldict is the same dict object as model_args (no copy is made).
        modeldict = model_args #{x: model_args[x] for x in params}
        modeldict['max_valid_score'] = max(losses)
        modeldict['max_valid_epoch'] = np.argmax(losses)
        # Default score: the test score at the index of the best validation score.
        modeldict['scores'] = test_losses[np.argmax(losses)]
        modeldict['baseline'] = False


        # Files whose name starts with "retest_best"; if several exist, the first
        # one in os.listdir order is used.
        scorefile = [x for x in os.listdir(model_folder) if x.startswith("retest_best")]
        #assert(len(scorefile) <=1)
        if len(scorefile) >= 1:
            # assumes the file unpacks into exactly two sequences (times, scores) -- TODO confirm
            times, scores = np.load(os.path.join(model_folder, scorefile[0]))
            #modeldict['stdev'] = np.std(scores)
            modeldict['full_scores'] = scores
            numtimes+=1
            # Same key convention as gurobi_8_dict: tuple for list-valued gen_n, else "".
            gen_n = tuple(model_args['gen_n']) if isinstance(model_args['gen_n'], list) else ""
            # A missing key raises KeyError, which the except below records as an error.
            gurobi_scores = gurobi_8_dict[(model_args['dataset'], gen_n)]
            
            # Element-wise ratio to the Gurobi score, paired by position.
            # NOTE(review): zip stops at the shorter sequence, so a length mismatch
            # between scores and gurobi_scores is silently ignored -- confirm they align.
            norms = [x/y for x, y in zip(scores, gurobi_scores)]
            #print(model_args['dataset'])
            #print(list(zip(scores, gurobi_scores)))
            #print(np.average(scores)/np.average(gurobi_scores), norms)
            # Overwrites the default score with the mean ratio; stdev is over the ratios.
            modeldict['scores'] = np.average(norms)
            modeldict['stdev'] = np.std(norms)
            #print(len(scores), len(gurobi_scores))

            
        else: 
            # load a dummy
            #times, scores = np.load('/home/bcjexu/maxcut-80/bespoke-gnn4do/training_runs/230924_hparam/paramhash:0a0656a369a5b8e4a4be27e0d04fb3b8c161e7b630caf99b8eaeedcddd6a2b18/time_and_score@@test_results_2023-09-28_01:23:33.np.npy')
            # No retest file: 'scores' keeps the default above and stdev is missing.
            modeldict['stdev'] = np.nan
        

        rows.append(modeldict)
        
    # Any failure is printed and the folder recorded; that model gets no row.
    except Exception as e:
        print(f'{e} is wrong w/ {model_folder}')
        errored.append(model_folder)

In [7]:
# Read the baseline score table: one row per baseline (named in 'Unnamed: 0'),
# one column per dataset, optionally suffixed with '@@<n>'.
baselines = pd.read_csv('vc_baseline_scores.csv')

# Integer found after '@@' in a column name -> two-element list stored as gen_n.
gen_n_dict = {50: [50, 100], 100: [100, 200], 400: [400, 500]}

# Flatten the wide table into one record per (baseline, column) and append
# each record to the shared `rows` list.
for i, baseline in baselines.iterrows():
    for col in baselines.keys():
        if col != 'Unnamed: 0':
            if '@@' in col:
                ds, gen_n = col.split('@@')
                gen_n = gen_n_dict[int(gen_n)]
            else:
                # Columns without a suffix carry no gen_n.
                ds, gen_n = col, np.nan

            row = dict(
                dataset=ds,
                gen_n=gen_n,
                model_type=baseline['Unnamed: 0'],
                scores=baseline[col],
                baseline=True,
            )
            rows.append(row)


In [8]:
# Number of model folders recorded in `errored` by the loading loop.
len(errored)

0

In [9]:
# One frame holding both the trained-model records and the baseline records.
df = pd.DataFrame.from_records(rows)

from collections import Counter
# Not the cell's last expression, so this tally is computed and then discarded.
Counter(df.loc[df.gen_n == 400, 'model_type'])

# Debug output: print every gen_n value that is neither a list nor equal to 100.
for x in df.gen_n:
    if isinstance(x, list) or x == 100:
        continue
    print(x)

        

In [10]:
# Split list-valued gen_n into two scalar columns: gen_n (first element) and
# gen_n_max (second element). Non-list values are passed through unchanged.
#
# The raw values are captured in gen_n_list only once. Previously, re-running
# this cell copied the already-reduced gen_n into gen_n_list, which silently
# turned gen_n_max into the first element. Both columns are now always derived
# from the preserved raw values, so the cell can be re-run safely.
if 'gen_n_list' not in df.columns:
    df['gen_n_list'] = df.gen_n
df['gen_n_max'] = df['gen_n_list'].apply(lambda x: int(x[1]) if isinstance(x, list) else x)
df['gen_n'] = df['gen_n_list'].apply(lambda x: int(x[0]) if isinstance(x, list) else x)


from collections import Counter

# Not the cell's last expression, so this tally is computed and then discarded.
Counter(df.gen_n_max)

# Debug output: one "hi" per row whose gen_n and gen_n_max compare unequal
# (this includes rows where both are NaN, since NaN != NaN).
for x, y in zip(df.gen_n, df.gen_n_max):
    if x != y:
        print("hi")

In [11]:
# df.keys() is evaluated but not shown: the cell ends with an assignment.
df.keys()
# 'problem_type', 'seed',  'prefix', 'RB_n', 'RB_k', 'log_dir',
# Column names of interest; this list is not referenced by the other visible cells.
relevant_keys = [ 'model_type', 'num_layers',
       'repeat_lift_layers', 'num_layers_project', 'rank', 'vc_penalty', 'gen_n', 'gen_n_max',
       'dataset', 'infinite',  'positional_encoding', 'pe_dimension',
       'max_valid_score', 'max_valid_epoch',
       'scores']

In [12]:
# Replace missing gen_n / gen_n_max with "" so those rows keep a usable key:
# the table cells group on these columns, and pandas groupby drops NaN keys by default.
df['gen_n'] = df['gen_n'].fillna("")
df['gen_n_max'] = df['gen_n_max'].fillna("")

In [13]:
# Accumulator for (log_dir, dataset, gen_n) tuples of models selected for
# testing; the table-building cells append to it.

models_for_test = []

In [14]:
# Build a table with one row per dataset (plus gen_n for the names in `dss`) and one
# column per model type, each cell formatted as "<scores> +/- <stdev>".
dataset_by_arc = pd.DataFrame()
# Dataset names whose row label includes gen_n.
dss = ['BarabasiAlbert', 'ErdosRenyi', 'PowerlawCluster', 'WattsStrogatz']
# Model types skipped in this table.
mts = ['SDP proj', 'gurobi_2.0', 'gurobi_4.0', 'gurobi_8.0', 'vertex count']



# Only rows with infinite == False, or any ErdosRenyi row, take part.
for (mt, ds, gen_n), group in df[(df.infinite == False) | (df.dataset == 'ErdosRenyi')].groupby(['model_type', 'dataset', 'gen_n']):
    if mt in mts:
        continue
    # Groups without any validation score cannot be ranked; skip them.
    if all(group['max_valid_score'].isna()):
        continue
    if ds not in dss:
        gen_n = ""

    #print(mt, ds, gen_n)
    # The row with the highest validation score supplies the reported numbers.
    bestidx = group['max_valid_score'].idxmax()

    score_writeout = f'{df.loc[bestidx]["scores"]:0.2f} +/- {df.loc[bestidx]["stdev"]:0.2f}'

    if ds in dss:
        dataset_by_arc.at[f'{ds}, {gen_n}', mt] = score_writeout
        dataset_by_arc.at[f'{ds}, {gen_n}', 'gen_n'] = gen_n
        dataset_by_arc.at[f'{ds}, {gen_n}', 'dataset'] = ds
    else: 
        dataset_by_arc.at[f'{ds}', 'gen_n'] = gen_n
        dataset_by_arc.at[f'{ds}', mt] = score_writeout
        dataset_by_arc.at[f'{ds}', 'dataset'] = ds

    # NOTE(review): this picks the row with the highest 'scores', whereas the table
    # entry above uses the row with the highest 'max_valid_score' -- confirm intended.
    # Any failure here is only printed.
    try:
        models_for_test.append((df.loc[group['scores'].idxmax()]['log_dir'], df.loc[group['scores'].idxmax()]['dataset'], df.loc[group['scores'].idxmax()]['gen_n'])) 
    except Exception as e:
        print(f'{e}')




#dataset_by_arc['dataset'] = dataset_by_arc.index
# Sort into canonical dataset order, keep the listed columns, and write the CSV without the index.
# NOTE(review): the following cell writes to the same 'Table1_MC_std.csv' path, so
# running the notebook top to bottom replaces this file -- confirm the intended filename.
reorder(dataset_by_arc[[k for k in dataset_by_arc.keys() if k not in ['SDP lift', 'edge count']]], by='dataset', columns = ['dataset', 'gen_n', 'GAT', 
                                                                                                                            'GCNN', 'GIN', 'GatedGCNN', 'LiftMP']).to_csv('Table1_MC_std.csv', index=False) #.style.highlight_max(color = 'green', axis = 1)

In [15]:
# Build a second table keyed by dataset (and gen_n for the names in `dss`), with
# Type / Nmin / Nmax columns and one column per remaining model type.
dataset_by_arc = pd.DataFrame()
# Model types excluded from this table (filtered out of df before grouping).
mts = ['GAT', 'GCNN', 'GIN', 'GatedGCNN']
# Dataset names whose row key includes gen_n.
dss = ['BarabasiAlbert', 'ErdosRenyi', 'PowerlawCluster', 'WattsStrogatz']

for (mt, ds, gen_n, gen_nmax), group in df[~df.model_type.isin(mts)].groupby(['model_type', 'dataset', 'gen_n', 'gen_n_max']):

    #if mt =='gurobi_4.0': print(ds) # and ds =='MUTAG': print("hi")

    # NOTE(review): rows appended from the baseline CSV carry no 'max_valid_score',
    # so a group made only of those rows is skipped here and never reaches the
    # `-1*group['scores'].max()` branch below -- confirm this is intended.
    if all(group['max_valid_score'].isna()):
        continue
    if ds not in dss:
        gen_n = ""
        gen_nmax = ""
    #print(mt, ds, gen_n)
    if ds in dss:
        k = f'{ds}_{gen_n}'
    else:
        k = ds
    dataset_by_arc.at[k, 'Type'] = ds
    dataset_by_arc.at[k, 'Nmin'] = gen_n
    dataset_by_arc.at[k, 'Nmax'] = gen_nmax

    # The row with the highest validation score supplies the formatted entry.
    bestidx = group['max_valid_score'].idxmax()
    score_writeout = f'{df.loc[bestidx]["scores"]:0.2f} +/- {df.loc[bestidx]["stdev"]:0.2f}'
    
    # df was already filtered to exclude `mts`, so this test reduces to mt != 'LiftMP'.
    # Those model types get the negated maximum of 'scores'; LiftMP gets the formatted string.
    if mt not in  ['LiftMP'] + mts:
        dataset_by_arc.at[k, mt] = -1*group['scores'].max()
    else:
        dataset_by_arc.at[k, mt] = score_writeout
    # Selection here is by highest 'scores' (not 'max_valid_score'); failures are only printed.
    try:
        models_for_test.append((df.loc[group['scores'].idxmax()]['log_dir'], df.loc[group['scores'].idxmax()]['dataset'], df.loc[group['scores'].idxmax()]['gen_n'])) 
    except Exception as e:
        print(f'{e}')

#dataset_by_arc.rename(columns={'Nikos': 'CustomLiftProject'}, inplace=True)
# Sort by Type then Nmin, round numeric columns to 2 decimals, and write the CSV with the index.
# NOTE(review): this uses the same 'Table1_MC_std.csv' path as the preceding table cell
# and therefore overwrites that cell's output -- confirm the intended filename.
reorder(dataset_by_arc[[k for k in dataset_by_arc.keys() if k not in ['SDP lift', 'vertex count']]], by='Type', secondary='Nmin').round(2).to_csv('Table1_MC_std.csv') #.style.highlight_max(color = 'green', axis = 1)