# Browse experiment results

Load the pickled experiment results from `./experiment_logs/` and summarize them in a single table.

In [1]:
import torch
import pickle
import glob
import os.path
import pandas as pd
from train_engine import __training_cfg, __exp_res_meta
import numpy as np
# from core.utils_ipynb import read_exp_result_files
import os

def read_exp_result_files(path):
    """Load every pickled experiment result stored directly in ``path``.

    All ``*.pickle`` files in ``path`` are unpickled, except a file named
    ``errors.pickle``, which is skipped. Each loaded result is expected to be
    a dict with an ``['exp_cfg']['model']`` sub-dict; if that sub-dict has no
    ``'set_node_degree_uninformative'`` entry, it is added with the value
    ``False``.

    Args:
        path: Directory that contains the ``*.pickle`` result files.

    Returns:
        list: The loaded result objects, in the order ``glob.glob`` returned
        the files (not sorted). Empty if no matching file exists.

    Raises:
        KeyError: If a loaded result has no ``['exp_cfg']['model']`` entry.
    """
    res = []
    for f in glob.glob(os.path.join(path, "*.pickle")):
        if os.path.basename(f) == 'errors.pickle':
            continue

        # NOTE(review): unpickling can execute arbitrary code -- only point
        # this function at result directories you trust.
        # The context manager closes the handle right after loading; the
        # previous `pickle.load(open(...))` left it to the garbage collector.
        with open(f, 'rb') as fid:
            r = pickle.load(fid)

        # older cfgs have no 'set_node_degree_uninformative' ...
        if 'set_node_degree_uninformative' not in r['exp_cfg']['model']:
            r['exp_cfg']['model']['set_node_degree_uninformative'] = False

        res.append(r)
    return res


def get_keychain_value_iter(d, key_chain=None):
    """Walk a nested dict and yield ``(key_chain, value)`` for every leaf.

    A leaf is anything that is not a ``dict``. The yielded key chain is the
    tuple of keys leading from the root to that leaf, prefixed by the
    ``key_chain`` passed in. Leaves are visited depth-first in dict order;
    an empty dict yields nothing.

    Args:
        d: The (possibly nested) dict, or a leaf value.
        key_chain: Optional iterable of keys to prepend to every yielded
            chain. It is copied, never modified.

    Yields:
        tuple: ``(tuple_of_keys, leaf_value)``.
    """
    prefix = list(key_chain) if key_chain is not None else []

    if isinstance(d, dict):
        for key, child in d.items():
            yield from get_keychain_value_iter(child, prefix + [key])
        return

    yield tuple(prefix), d
            
def get_keychain_value(d, key_chain):
    """Return the value reached by indexing ``d`` with each key in turn.

    ``get_keychain_value(d, ('a', 'b'))`` is ``d['a']['b']``; an empty key
    chain returns ``d`` itself.

    Args:
        d: The nested container to look into.
        key_chain: Iterable of keys, applied from left to right.

    Returns:
        The value found at the end of the key chain.

    Raises:
        KeyError: If any step of the lookup fails, whatever the underlying
            exception was (missing key, non-subscriptable value, ...). The
            error carries ``key_chain`` as its argument and the original
            exception as ``__cause__``.
    """
    try:
        for k in key_chain:
            d = d[k]
    except Exception as ex:
        # Every lookup failure is normalised to KeyError so callers need only
        # one except clause; attach the chain so the error says what failed.
        raise KeyError(key_chain) from ex

    return d

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


In [2]:
# Map every key chain found in __exp_res_meta to its last key.
kc = {chain: chain[-1] for chain, _ in get_keychain_value_iter(__exp_res_meta)}
kc

{('exp_cfg', 'dataset_name'): 'dataset_name',
 ('exp_cfg', 'training', 'lr'): 'lr',
 ('exp_cfg', 'training', 'lr_drop_fact'): 'lr_drop_fact',
 ('exp_cfg', 'training', 'num_epochs'): 'num_epochs',
 ('exp_cfg', 'training', 'epoch_step'): 'epoch_step',
 ('exp_cfg', 'training', 'batch_size'): 'batch_size',
 ('exp_cfg', 'training', 'weight_decay'): 'weight_decay',
 ('exp_cfg', 'training', 'validation_ratio'): 'validation_ratio',
 ('exp_cfg', 'model', 'model_type'): 'model_type',
 ('exp_cfg',
  'model',
  'use_super_level_set_filtration'): 'use_super_level_set_filtration',
 ('exp_cfg', 'model', 'use_node_degree'): 'use_node_degree',
 ('exp_cfg',
  'model',
  'set_node_degree_uninformative'): 'set_node_degree_uninformative',
 ('exp_cfg', 'model', 'pooling_strategy'): 'pooling_strategy',
 ('exp_cfg', 'model', 'use_node_label'): 'use_node_label',
 ('exp_cfg', 'model', 'gin_number'): 'gin_number',
 ('exp_cfg', 'model', 'gin_dimension'): 'gin_dimension',
 ('exp_cfg', 'model', 'gin_mlp_type'): 'gi

In [3]:
# Columns to copy from each experiment result into the summary table.
# Each key is a key chain (tuple of nested dict keys) looked up in a result
# with get_keychain_value; each value is the column name used by pd_frame.
# Commented-out entries are further key chains that are currently not shown;
# uncomment one to add that column.
COL_NAMES = {
    ('exp_cfg', 'dataset_name'): 'dataset_name',
    #('exp_cfg', 'tag'): 'tag', 
#     ('exp_cfg', 'training', 'lr'): 'lr',
#     ('exp_cfg', 'training', 'lr_drop_fact'): 'lr_drop_fact',
#     ('exp_cfg', 'training', 'num_epochs'): 'num_epochs',
#     ('exp_cfg', 'training', 'epoch_step'): 'epoch_step',
    # ('exp_cfg', 'training', 'batch_size'): 'batch_size',
#     ('exp_cfg', 'training', 'weight_decay'): 'weight_decay',
#     ('exp_cfg', 'training', 'validation_ratio'): 'validation_ratio',
    ('exp_cfg', 'model', 'model_type'): 'model_type',
    # ('exp_cfg', 'model', 'use_super_level_set_filtration'): 'use_super_level_set_filtration',
    # ('exp_cfg', 'model', 'use_node_degree'): 'use_node_degree',
    # ('exp_cfg', 'model', 'use_node_label'): 'use_node_label',
    # ('exp_cfg', 'model', 'gin_number'): 'gin_number',
    # ('exp_cfg', 'model', 'cls_hidden_dimension'): 'cls_hidden_dimension',
    #('exp_cfg', 'model', 'gin_mlp_type'): 'gin_mlp_type',
    # ('exp_cfg', 'model', 'set_node_degree_uninformative'): 'set_node_degree_uninformative',
    # ('exp_cfg', 'model', 'num_struct_elements'): 'num_struct_elements',
    # ('exp_cfg', 'model', 'drop_out'): 'drop_out',
    # ('exp_cfg', 'model', 'pooling_strategy'): 'pooling_strategy',
    # The 'concat_fp' setting is shown under the shorter column name 'fp'.
    ('exp_cfg', 'model', 'concat_fp'): 'fp',
    # ('cv_test_acc',): 'cv_test_acc',
#     ('cv_val_acc',): 'cv_val_acc',
#     ('cv_indices_trn_tst_val',): 'cv_indices_trn_tst_val',
#     ('cv_epoch_loss',): 'cv_epoch_loss',
#     ('start_time',): 'start_time',
#     ('id',): 'id',
    ('finished_training',): 'finished_training'
}

In [4]:
def _mean_std(values):
    """Return ``(mean, std)`` of ``values``, or ``(nan, nan)`` if it is empty."""
    if len(values) == 0:
        # np.mean / np.std of an empty list also give nan, but with a
        # RuntimeWarning; short-circuit to keep the output clean.
        return np.nan, np.nan
    return np.mean(values), np.std(values)


def _validated_scores(test_curves, val_curves, num_epochs):
    """Per fold, pick the test score at the epoch with the best validation score.

    Folds whose test curve does not have exactly ``num_epochs`` entries are
    skipped.
    """
    picked = []
    for test, val in zip(test_curves, val_curves):
        if len(test) != num_epochs:
            continue
        # argmax returns the first epoch with the maximal validation score.
        i_max = int(np.argmax(val))
        # Index the original sequence rather than a float32 tensor copy, so
        # the reported score is not rounded to single precision.
        picked.append(float(test[i_max]))
    return picked


def pd_frame(path):
    """Summarise all experiment results in ``path`` as one DataFrame.

    One row is produced per result file loaded by ``read_exp_result_files``.
    Each row contains:

    * ``acc_last_mean`` / ``acc_last_std``: mean and standard deviation, over
      the cross-validation folds, of the last entry of each non-empty
      ``cv_test_acc`` curve.
    * ``roc_last_mean`` / ``roc_last_std``: the same for ``cv_test_roc``
      (``nan`` if the result has no ``cv_test_roc``).
    * ``acc_val_mean`` / ``acc_val_std``: mean and standard deviation of the
      test accuracy taken at the epoch where ``cv_val_acc`` is maximal. Only
      folds with exactly ``num_epochs`` test entries are used.
    * ``roc_val_mean`` / ``roc_val_std``: the same for ``cv_test_roc`` /
      ``cv_val_roc`` (``nan`` if the result has no ``cv_test_roc``).
    * ``cv_folds_available``: number of folds with exactly ``num_epochs``
      test-accuracy entries.
    * ``start_time`` and every column listed in ``COL_NAMES`` that is present
      in the result.

    Args:
        path: Directory containing the pickled experiment results.

    Returns:
        pandas.DataFrame: One row per result, indexed by its position in the
        list returned by ``read_exp_result_files``.

    Raises:
        ValueError: If ``path`` contains no experiment result files.
    """
    results = read_exp_result_files(path)
    if len(results) == 0:
        # pd.concat would raise the less helpful "No objects to concatenate".
        raise ValueError('No experiment result files (*.pickle) found in {!r}'.format(path))

    data_frames = []
    for i, res in enumerate(results):
        num_epochs = res['exp_cfg']['training']['num_epochs']
        has_roc = 'cv_test_roc' in res

        row = {}

        # Scores after the last recorded epoch of every fold that has any.
        cv_acc_last = [x[-1] for x in res['cv_test_acc'] if len(x) > 0]
        cv_roc_last = [x[-1] for x in res['cv_test_roc'] if len(x) > 0] if has_roc else []

        row['acc_last_mean'], row['acc_last_std'] = _mean_std(cv_acc_last)
        row['roc_last_mean'], row['roc_last_std'] = _mean_std(cv_roc_last)
        row['start_time'] = res['start_time']

        # Scores at the epoch selected by the validation curve.
        cv_acc_validated = _validated_scores(res['cv_test_acc'], res['cv_val_acc'], num_epochs)
        row['acc_val_mean'], row['acc_val_std'] = _mean_std(cv_acc_validated)

        cv_roc_validated = (
            _validated_scores(res['cv_test_roc'], res['cv_val_roc'], num_epochs)
            if has_roc else []
        )
        row['roc_val_mean'], row['roc_val_std'] = _mean_std(cv_roc_validated)

        row['cv_folds_available'] = sum(1 for cv in res['cv_test_acc'] if len(cv) == num_epochs)

        for k, v in COL_NAMES.items():
            try:
                row[v] = get_keychain_value(res, k)
            except KeyError:
                # Deliberately best-effort: a result that lacks this entry
                # simply gets no value (NaN after concatenation) in the column.
                pass

        data_frames.append(pd.DataFrame(row, index=[i]))

    return pd.concat(data_frames, sort=True)

In [6]:
path = './experiment_logs/'

# Summarise every result file, then order the rows descending by start time
# (ties broken by dataset name and model type, also descending).
RES = pd_frame(path).sort_values(
    by=['start_time', 'dataset_name', 'model_type'],
    ascending=False,
)

# Optional filters -- uncomment one to narrow the table down.
# RES = RES[RES['dataset_name'] == 'EGFR']
# RES = RES[(RES['model_type'] == 'GIN_MPML') & RES['dataset_name']]
# RES = RES[(RES['model_type'] == 'SimpleNNBaseline')]
# RES = RES[RES['fp'] == 'morgan3']
# RES = RES[RES['dataset_name'].str.contains('CHEMBL4722')]
# RES = RES[RES['dataset_name'].str.contains('COX2')]
# RES[RES['dataset_name'].str.contains('REDDIT') & (RES['gin_number'] == 3)]
# RES = RES[(RES['cv_folds_available'] == 5) & (RES['model_type'] == 'GIN_MPML')]
RES

Unnamed: 0,acc_last_mean,acc_last_std,acc_val_mean,acc_val_std,cv_folds_available,dataset_name,finished_training,fp,model_type,roc_last_mean,roc_last_std,roc_val_mean,roc_val_std,start_time
69,78.160604,0.409089,78.160603,0.409087,5,COX2,True,,GIN_MPML,50.000000,0.000000,50.267449,0.534899,2024-05-08 18:50:47.131480
40,49.800000,4.523273,58.900000,1.907878,5,IMDB-BINARY,True,,GIN_MPML,49.800000,4.523273,58.900000,1.907878,2024-05-08 18:00:53.839918
11,60.978564,0.102823,60.978563,0.102824,5,DHFR,True,,GCN_MPML,50.000000,0.000000,50.000000,0.000000,2024-05-06 12:42:53.845368
21,57.407587,6.933607,67.740475,3.566763,5,PROTEINS,True,,GAT_MPML,54.501481,5.563718,65.134541,3.921838,2024-05-06 01:52:33.115779
2,67.921464,4.321550,68.011958,4.408858,5,PROTEINS,True,,GIN_MPML,64.041999,5.271562,62.573441,4.503903,2024-05-05 00:23:47.325658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,59.568537,0.089062,59.568537,0.089060,5,PROTEINS,True,,GIN_MPML,50.000000,0.000000,50.000000,0.000000,2024-02-19 19:42:58.523739
49,86.842105,0.000000,84.210526,2.631577,2,MUTAG,False,,GIN_MPML,87.230769,0.923077,87.230769,0.923077,2024-02-19 17:53:22.626422
13,65.789474,0.000000,65.789474,0.000000,2,MUTAG,False,,GIN_MPML,50.000000,0.000000,50.000000,0.000000,2024-02-19 17:19:44.936623
36,74.452347,7.641406,76.031294,8.120202,5,MUTAG,True,,GIN_MPML,67.107692,15.766161,71.692308,17.760537,2024-02-19 16:48:48.109684


In [7]:
# Number of experiments (rows) in the summary table.
print(RES.shape[0])
# RES.to_csv('results_fp_gril.csv')

14


The following cells contain some utilities for manipulating the results, e.g., deleting them.

In [8]:
# num_completed = len(RES[(RES['model_type'] == 'GIN') & (RES['dataset_name'].str.contains('CHEMBL')) & (RES['fp'].str.contains('morgan2'))])
# num_completed