In [1]:
import os
import sys
sys.path.append('../src')

import numpy as np
from texttable import Texttable
import latextable

from metrics import print_and_analysis_performance_mean_std, print_ablation_performance_mean_std, print_overall_performance_mean_std

In [2]:
# --- Global experiment configuration -------------------------------------
# `dropout`, `alpha` and `seeds` are embedded into result file names by
# generate_save_name.
dropout = 0.5
alpha = 1
seeds = list(range(10, 60, 10))

# Metrics are addressed by position throughout the notebook:
# 0 = kendall tau, 1 = upset simple, 2 = upset ratio.
selected_metrics = [
    'kendall tau',
    'upset simple',
    'upset ratio',
]
METRICS_NUM = len(selected_metrics)

# Non-GNN baseline methods, in the order used to build baseline file names.
baselines = [
    'SpringRank',
    'syncRank',
    'serialRank',
    'btl',
    'davidScore',
    'eigenvectorCentrality',
    'PageRank',
    'rankCentrality',
    'SVD_RS',
    'SVD_NRS',
]

# Names of the hyperparameters that are searched over for the GNNs.
GNN_selection_choices = [
    'lr',
    'train with',
    'pretrain with',
    'upset margin coeff',
    'imbalance coeff',
    'unnormalized L',
    'trainable alpha',
    'upset ratio coeff',
    'Fiedler layer num',
    'pretrain epochs',
]
GNN_selection_choices_curr = [
    'lr',
    'train with',
    'pretrain with',
    'upset margin coeff',
    'trainable alpha',
    'upset ratio coeff',
    'Fiedler layer num',
    'pretrain epochs',
]
GNN_CHOICES_NUM = len(GNN_selection_choices)
GNN_CHOICES_NUM_CURR = len(GNN_selection_choices_curr)

mvr = ['mvr']
all_GNNs = ['DIGRAC', 'ib']
desirable_list = [1, 2, 5]
train_with_list = [
    'anchor_dist',
    'anchor_innerproduct',
    'emb_dist',
    'emb_innerproduct',
    'emb_baseline',
]
imbalance_list = [0] # [0, 1]

# Positions in `baselines` of SpringRank, syncRank, btl,
# eigenvectorCentrality, PageRank and SVD_NRS.
selected_baseline_indices = [0, 1, 3, 5, 6, 9]

# Every GNN is evaluated once per variant; 2 GNNs x 6 variants = 12 rows.
GNN_variant_names = ['clustering', *train_with_list]
GNN_NUM = len(all_GNNs) * len(GNN_variant_names)

def generate_method_str_and_compare_names_all(all_methods=baselines, normalizations=['plain'], thresholds=['sort']):
    """Build the directory tag and the per-row method names for a set of methods.

    Args:
        all_methods: method names; 'DIGRAC' and 'ib' are treated as GNNs.
        normalizations: normalization tags, only used when a GNN is present.
        thresholds: threshold tags, only used when a GNN is present.

    Returns:
        (method_str, compare_names_all): ``method_str`` is all method names
        concatenated, followed by the normalization/threshold tags when a GNN
        is present. ``compare_names_all`` lists non-GNN methods as-is and
        expands every GNN into one ``method_normalization_threshold_variant``
        entry per element of ``GNN_variant_names``.
    """
    gnn_methods = ('DIGRAC', 'ib')

    tokens = list(all_methods)
    if any(gnn in all_methods for gnn in gnn_methods):
        tokens += ['normalizations_', *normalizations, 'thresholds_', *thresholds]
    method_str = ''.join(tokens)

    compare_names_all = []
    for method_name in all_methods:
        if method_name in gnn_methods:
            compare_names_all.extend(
                '_'.join((method_name, normalization, threshold, GNN_type))
                for normalization in normalizations
                for threshold in thresholds
                for GNN_type in GNN_variant_names
            )
        else:
            compare_names_all.append(method_name)
    return method_str, compare_names_all

# Baselines that are compared against (and used as cluster_rank_baseline).
methods_of_interest = [
    'SpringRank',
    'syncRank',
    'btl',
    'eigenvectorCentrality',
    'PageRank',
    'SVD_NRS',
]

# Row names of the GNN result arrays: every GNN crossed with every variant.
GNN_names = [
    method_name + '_plain_sort_' + GNN_type
    for method_name in ['DIGRAC', 'ib']
    for GNN_type in GNN_variant_names
]

non_proximal_ind = [1, 2, 7, 8] # removed 0 and 6 for "clustering" variant
proximal_ind = [3, 4, 5, 9, 10, 11]
NON_PROXIMAL_GNN_NUM = len(non_proximal_ind)
PROXIMAL_GNN_NUM = len(proximal_ind)

# Boolean row masks over the GNN_NUM result rows.
non_proximal_bool = np.zeros(GNN_NUM, dtype=bool)
non_proximal_bool[non_proximal_ind] = True
proximal_bool = np.zeros(GNN_NUM, dtype=bool)
proximal_bool[proximal_ind] = True

# Names of the rows selected by the two masks, in mask order.
GNN_names_non_proximal = [GNN_names[row] for row in non_proximal_ind]
GNN_names_proximal = [GNN_names[row] for row in proximal_ind]


compare_names_all = methods_of_interest
METHODS_NUM = len(compare_names_all)

In [3]:
# Internal result-row names (keys) and the labels used when printing (values).
# The two lists are position-aligned and zipped into `name_mapping_dict`.
keys = [
    'DIGRAC_plain_sort_clustering',
    'DIGRAC_plain_sort_anchor_dist',
    'DIGRAC_plain_sort_anchor_innerproduct',
    'ib_plain_sort_clustering',
    'ib_plain_sort_anchor_dist',
    'ib_plain_sort_anchor_innerproduct',
    'DIGRAC_plain_sort_emb_dist',
    'DIGRAC_plain_sort_emb_innerproduct',
    'DIGRAC_plain_sort_emb_baseline',
    'ib_plain_sort_emb_dist',
    'ib_plain_sort_emb_innerproduct',
    'ib_plain_sort_emb_baseline',
]
values = [
    'DIGRAC clustering',
    'DIGRAC dist',
    'DIGRAC innerproduct',
    'ib clustering',
    'ib dist',
    'ib innerproduct',
    'DIGRAC proximal dist',
    'DIGRAC proximal innerproduct',
    'DIGRAC proximal baseline',
    'ib proximal dist',
    'ib proximal innerproduct',
    'ib proximal baseline',
]

# Labels for the bare `train with` options ...
keys.extend(train_with_list)
values.extend(['dist', 'innerproduct', 'proximal dist', 'proximal innerproduct', 'proximal baseline'])

# ... and for the `pretrain with` options.
keys.extend(['dist', 'innerproduct', 'serial_similarity'])
values.extend(['dist', 'innerproduct', 'SerialRank similarity'])

name_mapping_dict = dict(zip(keys, values))
print(name_mapping_dict)

{'DIGRAC_plain_sort_clustering': 'DIGRAC clustering', 'DIGRAC_plain_sort_anchor_dist': 'DIGRAC dist', 'DIGRAC_plain_sort_anchor_innerproduct': 'DIGRAC innerproduct', 'ib_plain_sort_clustering': 'ib clustering', 'ib_plain_sort_anchor_dist': 'ib dist', 'ib_plain_sort_anchor_innerproduct': 'ib innerproduct', 'DIGRAC_plain_sort_emb_dist': 'DIGRAC proximal dist', 'DIGRAC_plain_sort_emb_innerproduct': 'DIGRAC proximal innerproduct', 'DIGRAC_plain_sort_emb_baseline': 'DIGRAC proximal baseline', 'ib_plain_sort_emb_dist': 'ib proximal dist', 'ib_plain_sort_emb_innerproduct': 'ib proximal innerproduct', 'ib_plain_sort_emb_baseline': 'ib proximal baseline', 'anchor_dist': 'dist', 'anchor_innerproduct': 'innerproduct', 'emb_dist': 'proximal dist', 'emb_innerproduct': 'proximal innerproduct', 'emb_baseline': 'proximal baseline', 'dist': 'dist', 'innerproduct': 'innerproduct', 'serial_similarity': 'SerialRank similarity'}


In [4]:
# Per-dataset parameter grids that analysis_results iterates over.
season_list_dict = {
    'basketball': np.arange(1985, 2015),
    'finer_basketball': np.arange(1985, 2015),
    'football': np.arange(2009, 2015),
    'finer_football': np.arange(2009, 2015),
}
p_list_dict = {}
eta_list_dict = {}
ERO_style_list_dict = {}
K_list_dict = {}
size_ratio_list_dict = {}

# Datasets without seasons get a single placeholder season.
for dataset in ['finance','animal', 'faculty_business', 'faculty_cs', 'faculty_history', 'HeadToHead', 'DSBM', 'ERO']:
    season_list_dict[dataset] = [2009]

# Real-world datasets have exactly one setting for every generative parameter.
# NOTE: the loop variable `dataset` stays bound to 'HeadToHead' after this loop;
# GNN_load_results captures it as the default of its `dataset` argument.
for dataset in ['basketball', 'finer_basketball', 'football', 'finer_football', 'finance','animal', 'faculty_business', 'faculty_cs', 'faculty_history', 'HeadToHead']:
    p_list_dict[dataset] = [0.05]
    eta_list_dict[dataset] = [0]
    K_list_dict[dataset] = [5]
    size_ratio_list_dict[dataset] = [1.5]
    ERO_style_list_dict[dataset] = ['uniform']

# Synthetic models are swept over several settings.
p_list_dict.update(DSBM=[0.05], ERO=[0.05, 1])
eta_list_dict.update(DSBM=[0, 0.1], ERO=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
K_list_dict.update(DSBM=[5, 10, 20], ERO=[5])
size_ratio_list_dict.update(DSBM=[1, 1.5, 2], ERO=[1])
ERO_style_list_dict.update(DSBM=['uniform'], ERO=['uniform', 'gamma'])

# `emb*` training modes are searched over three pretraining targets,
# every other mode only over 'dist'.
pretrain_with_list_dict = {
    train_with: (['dist', 'innerproduct', 'serial_similarity'] if train_with[:3] == 'emb' else ['dist'])
    for train_with in train_with_list
}

In [5]:
def generate_save_name(dataset='HeadToHead', all_methods=all_GNNs, K=5, train_with='anchor_dist', imbalance_coeff=1, upset_ratio_coeff=1.0, upset_margin_coeff=0, upset_margin=0.01,
                           trainable_alpha=False, lr=0.01, hidden=32, num_trials=10, train_ratio=1, test_ratio=1,  AllTrain=True, cluster_rank_baseline='SpringRank', sigma=1.0, 
                           Fiedler_layer_num=5, pretrain_epochs=50, pretrain_with='dist'):
    """Return the file-name stem under which a result array is stored.

    The stem is a plain concatenation of ``tag + value`` pairs. The GNN
    hyperparameter pairs are included only when ``all_methods`` contains
    'DIGRAC' or 'ib'. The trial/split pairs are always included. The seed list
    is appended only for datasets whose name starts with 'DSBM' or 'ERO'.
    Fractional values are written as ``int(100*value)`` (``int(1000*lr)`` for
    the learning rate). ``dropout``, ``alpha`` and ``seeds`` are read from the
    notebook-level configuration.
    """
    def percent(value):
        # Same truncating conversion the stored file names were created with.
        return str(int(100*value))

    tokens = []
    if 'DIGRAC' in all_methods or 'ib' in all_methods:
        tokens += ['K', str(K), 'dropout', percent(dropout)]
        tokens += ['imb_coe', percent(imbalance_coeff),
                   'ratio_coe', percent(upset_ratio_coeff),
                   'margin_coe', percent(upset_margin_coeff)]
        if upset_margin_coeff > 0:
            tokens += ['margin', percent(upset_margin)]
        tokens += ['with', str(train_with), 'Fiedler', str(Fiedler_layer_num), 'sigma', percent(sigma)]
        tokens += ['alpha', percent(alpha), 'train_alpha', str(trainable_alpha),
                   'hid', str(hidden), 'lr', str(int(1000*lr))]
        tokens += ['use', str(cluster_rank_baseline)]
        if pretrain_epochs > 0 and train_with[:3] == 'emb':
            tokens += ['pre', str(pretrain_with), str(int(pretrain_epochs))]

    tokens += ['trials', str(num_trials), 'train_r', percent(train_ratio),
               'test_r', percent(test_ratio), 'All', str(AllTrain)]
    if dataset[:4] == 'DSBM' or dataset[:3] == 'ERO':
        tokens += ['seeds', '_'.join(str(value) for value in np.array(seeds).flatten())]
    return ''.join(tokens)

In [6]:
def dataset_and_K(dataset, season=2009, K=5, p=0.05, size_ratio=1.5, ERO_style='uniform', eta=0.1, N=350):
    """Resolve a short dataset name into its print label, storage path, K and hidden size.

    The name is matched by prefix: 'DSBM' and 'ERO' case-sensitively, all
    other names case-insensitively.

    Args:
        dataset: short dataset name, e.g. 'animal', 'football', 'DSBM'.
        season: season used by the basketball/football datasets.
        K: value returned unchanged for 'DSBM'; overridden by a fixed
            per-dataset value for every other dataset.
        p, size_ratio, eta, N: parameters encoded in the synthetic
            ('DSBM'/'ERO') dataset path and label.
        ERO_style: style tag encoded in the 'ERO' dataset path and label.

    Returns:
        (label, dataset_path, K, hidden) where ``label`` is the printable
        name wrapped in a LaTeX italic group.

    Raises:
        ValueError: if ``dataset`` matches none of the known prefixes.
    """
    F_style = 'path'
    sp_style = 'random'
    ambient = 0
    if not dataset.endswith('/'):
        dataset += '/'

    if dataset[:4] == 'DSBM':
        hidden = 8
        default_name_base = F_style + '_' + sp_style
        default_name_base += 'p' + str(int(100*p)) + 'K' + str(K) + 'N' + str(N) + 'size_r' + str(int(100*size_ratio))
        default_name_base += 'eta' + str(int(100*eta)) + 'ambient' + str(ambient)
        dataset = 'DSBM/' + default_name_base
        # Backslashes are escaped explicitly: '\e' is an invalid escape sequence.
        dataset_print = 'DSBM(p={}, K={}, size ratio={},$\\eta$={})'.format(p, K, size_ratio, eta)
    elif dataset[:3] == 'ERO':
        hidden = 8
        K = 5
        default_name_base = 'p' + str(int(100*p)) + 'K' + str(K) + 'N' + str(N)
        default_name_base += 'eta' + str(int(100*eta)) + 'style' + str(ERO_style)
        dataset = 'ERO/' + default_name_base
        dataset_print = 'ERO(p={}, style={},$\\eta$={})'.format(p, ERO_style, eta)
    elif dataset[:10].lower() == 'basketball':
        hidden = 8
        K = 20
        dataset = 'Basketball_temporal/' + str(season)
        dataset_print = 'Basketball({})'.format(season)
    elif dataset[:16].lower() == 'finer_basketball':
        hidden = 8
        K = 20
        dataset = 'Basketball_temporal/finer' + str(season)
        dataset_print = 'Basketball finer({})'.format(season)
    elif dataset[:6].lower() == 'animal':
        hidden = 4
        K = 3
        dataset = 'Dryad_animal_society/'
        dataset_print = 'Animal'
    elif dataset[:7].lower() == 'finance':
        # The storage path of this dataset is the (slash-terminated) input name.
        hidden = 32
        K = 20
        dataset_print = 'Finance'
    elif dataset[:10].lower() == 'headtohead':
        hidden = 16
        K = 48
        dataset = 'Halo2BetaData/HeadToHead'
        dataset_print = 'HeadToHead'
    elif dataset[:16].lower() == 'faculty_business':
        hidden = 8
        K = 5
        dataset = 'FacultyHiringNetworks/Business/Business_FM_Full_'
        dataset_print = 'Faculty: Business'
    elif dataset[:10].lower() == 'faculty_cs':
        hidden = 8
        K = 9
        dataset = 'FacultyHiringNetworks/ComputerScience/ComputerScience_FM_Full_'
        dataset_print = 'Faculty: CS'
    elif dataset[:15].lower() == 'faculty_history':
        hidden = 8
        K = 12
        dataset = 'FacultyHiringNetworks/History/History_FM_Full_'
        dataset_print = 'Faculty: History'
    elif dataset[:8].lower() == 'football':
        hidden = 4
        K = 9
        dataset = 'Football_data_England_Premier_League/England_' + str(season) + '_' + str(season+1)
        dataset_print = 'Football({})'.format(season)
    elif dataset[:14].lower() == 'finer_football':
        hidden = 4
        K = 9
        dataset = 'Football_data_England_Premier_League/finerEngland_' + str(season) + '_' + str(season+1)
        dataset_print = 'Football finer({})'.format(season)
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return.
        raise ValueError('Unknown dataset: {!r}'.format(dataset))
    return '{\\it ' + dataset_print + '}', dataset, K, hidden

In [7]:
# Hyperparameter grids searched for the GNNs.
lr_list = [0.01, 0.05, 0.005]
upset_margin_coeff_list = [0, 1]
imbalance_coeff_list = [0] # [0, 1]
unnormalized_L_list = [True] # [True, False]
trainable_alpha_list = [False, True]
# cluster_rank_baseline_list = ['SpringRank', 'btl']
upset_ratio_coeff_list = [0, 1]
Fiedler_layer_num_list = [3, 5, 7]
pretrain_epochs_list = [0, 50]


def _build_ind_correspondence(methods_num):
    """Map a flat result column to the grid indices that produced it.

    Columns are numbered in the order of the nested search loops: learning
    rate, `train with`, `pretrain with`, margin coefficient, imbalance
    coefficient, unnormalized L, trainable alpha, ratio coefficient, Fiedler
    layer number, pretrain epochs and finally the method row (fastest).

    Args:
        methods_num: number of method rows stored per configuration.

    Returns:
        dict from flat column index to the list of the 11 indices above.
    """
    # Shape of the index block that does not depend on `train with`;
    # np.ndindex walks it with the last axis varying fastest.
    inner_shape = (len(upset_margin_coeff_list), len(imbalance_coeff_list),
                   len(unnormalized_L_list), len(trainable_alpha_list),
                   len(upset_ratio_coeff_list), len(Fiedler_layer_num_list),
                   len(pretrain_epochs_list), methods_num)
    correspondence = {}
    flat_ind = 0
    for lr_ind in range(len(lr_list)):
        for train_ind, train_with in enumerate(train_with_list):
            # The pretraining options depend on the training mode.
            for pretrain_ind in range(len(pretrain_with_list_dict[train_with])):
                for inner_inds in np.ndindex(*inner_shape):
                    correspondence[flat_ind] = [lr_ind, train_ind, pretrain_ind, *inner_inds]
                    flat_ind += 1
    return correspondence


non_proximal_ind_correspondence_dict = _build_ind_correspondence(NON_PROXIMAL_GNN_NUM)
proximal_ind_correspondence_dict = _build_ind_correspondence(PROXIMAL_GNN_NUM)
non_proximal_cases_num = len(non_proximal_ind_correspondence_dict)
proximal_cases_num = len(proximal_ind_correspondence_dict)
print(non_proximal_cases_num, proximal_cases_num)

6336 9504


In [8]:
def GNN_load_results(dataset='HeadToHead', all_methods=all_GNNs, K=5, train_with='anchor_dist', imbalance_coeff=0, 
                                   upset_ratio_coeff=1, upset_margin_coeff=0,  upset_margin=0.01,
                                    trainable_alpha=False, lr=0.01, hidden=32, num_trials=10, 
                                   train_ratio=0.8, test_ratio=0.1,  AllTrain=True, cluster_rank_baseline='SpringRank', 
                                   sigma=1, Fiedler_layer_num=5, pretrain_epochs=50, pretrain_with='dist', unnormalized_L=False):
    """Load the stored GNN result arrays of one hyperparameter configuration.

    Args:
        dataset: dataset path below the result directory. The default is the
            literal value that the former ``dataset=dataset`` default captured
            from a leaked loop variable.
        all_methods: methods whose results are stored together; it determines
            both the sub-directory and the file name.
        unnormalized_L: must be True; only those results are stored in
            '../result_arrays0107/'.
        Remaining arguments are forwarded to ``generate_save_name``.

    Returns:
        (kendalltau, upset_simple, upset_ratio). ``kendalltau`` is None unless
        the dataset name starts with 'ERO' or 'DSBM'. The two upset arrays are
        the slices ``[:, :, 0]`` and ``[:, :, 1]`` of the stored upset array.

    Raises:
        AssertionError: if ``unnormalized_L`` is not True (callers catch this
            to skip the configuration).
        FileNotFoundError: if a result file does not exist.
    """
    normalizations = ['plain']
    thresholds = ['sort']
    # `all_methods` is forwarded here; it used to be ignored in favour of the
    # global `all_GNNs`.
    save_name = generate_save_name(dataset=dataset, all_methods=all_methods, K=K, train_with=train_with, imbalance_coeff=imbalance_coeff, 
                                   upset_ratio_coeff=upset_ratio_coeff, upset_margin_coeff=upset_margin_coeff, upset_margin=upset_margin,
                                    trainable_alpha=trainable_alpha, lr=lr, hidden=hidden, num_trials=num_trials, 
                                   train_ratio=train_ratio, test_ratio=test_ratio,  AllTrain=AllTrain, cluster_rank_baseline=cluster_rank_baseline, 
                                   sigma=sigma, Fiedler_layer_num=Fiedler_layer_num, pretrain_epochs=pretrain_epochs, pretrain_with=pretrain_with)
    method_str, _ = generate_method_str_and_compare_names_all(all_methods, normalizations, thresholds)
    # Raised explicitly so the check also holds when assertions are disabled (-O).
    if not unnormalized_L == True:
        raise AssertionError('Only results with unnormalized_L=True are available.')
    dir_name = '../result_arrays0107/'+dataset

    kendalltau_res = None
    if dataset[:3] == 'ERO' or dataset[:4] == 'DSBM':
        kendalltau_res = np.load(os.path.join(dir_name,'kendalltau',method_str,save_name) + '.npy')[:, :, 2, 0]
    final_upset = np.load(os.path.join(dir_name,'upset',method_str,save_name) + '.npy')
    return kendalltau_res, final_upset[:, :, 0], final_upset[:, :, 1]

In [9]:
def GNN_selection_with_fix_dim(dataset, K, train_ratio, test_ratio, AllTrain, hidden, num_trials, cluster_rank_baseline):
    """Pick, per metric, the best proximal GNN configuration among the stored results.

    Every configuration of the hyperparameter grid is loaded with
    ``GNN_load_results``; configurations without stored results are skipped.
    For each metric (0 = kendall tau, maximised; 1 = upset simple and
    2 = upset ratio, minimised) the column with the best mean over trials is
    selected and its per-trial results are reloaded. Kendall tau (metric 0) is
    only considered for datasets whose name starts with 'DSBM' or 'ERO'.

    Returns:
        (kendalltau_res, final_upset, selected_indices) with shapes
        (METRICS_NUM, 1, 10), (METRICS_NUM, 1, 10, 2) and
        (METRICS_NUM, 1, GNN_CHOICES_NUM+1). Entries of skipped metrics stay
        NaN (zero for ``selected_indices``).

    Raises:
        FileNotFoundError: if no configuration has stored results.
    """
    upset_margin = 0.01
    sigma = 1
    is_synthetic = dataset[:4] == 'DSBM' or dataset[:3] == 'ERO'

    # One column per (configuration, proximal variant) pair. proximal_cases_num
    # already counts the variants, so the buffer must not be multiplied by
    # PROXIMAL_GNN_NUM again: the extra columns kept their sentinel value and
    # could be picked by argmax, which has no entry in
    # proximal_ind_correspondence_dict.
    # Sentinels: 0 for kendall tau (maximised), 1000 for the upset metrics.
    full_results_proximal = 1000*np.ones((METRICS_NUM, proximal_cases_num))
    full_results_proximal[0] = 0
    final_ind_proximal = 0
    has_result = False
    # The loop order must match the one used to build proximal_ind_correspondence_dict.
    for lr in lr_list:
        for train_with in train_with_list:
            for pretrain_with in pretrain_with_list_dict[train_with]:
                for upset_margin_coeff in upset_margin_coeff_list:
                    for imbalance_coeff in imbalance_coeff_list:
                        for unnormalized_L in unnormalized_L_list:
                            for trainable_alpha in trainable_alpha_list:
                                for upset_ratio_coeff in upset_ratio_coeff_list:
                                    for Fiedler_layer_num in Fiedler_layer_num_list:
                                        for pretrain_epochs in pretrain_epochs_list:
                                            columns = slice(final_ind_proximal, final_ind_proximal + PROXIMAL_GNN_NUM)
                                            try:
                                                kendalltau, upset_simple, upset_ratio = GNN_load_results(dataset=dataset, all_methods=all_GNNs, K=K, train_with=train_with, imbalance_coeff=imbalance_coeff, 
                                                       upset_ratio_coeff=upset_ratio_coeff, upset_margin_coeff=upset_margin_coeff, upset_margin=upset_margin,
                                                        trainable_alpha=trainable_alpha, lr=lr, hidden=hidden, num_trials=num_trials, 
                                                       train_ratio=train_ratio, test_ratio=test_ratio,  AllTrain=AllTrain, cluster_rank_baseline=cluster_rank_baseline, 
                                                       sigma=sigma, Fiedler_layer_num=Fiedler_layer_num, pretrain_epochs=pretrain_epochs, pretrain_with=pretrain_with, unnormalized_L=unnormalized_L)
                                                if kendalltau is not None:
                                                    full_results_proximal[0, columns] = np.nanmean(kendalltau[proximal_bool], axis=1)
                                                mean_upset_simple = upset_simple[proximal_bool].mean(axis=1)
                                                mean_upset_ratio = upset_ratio[proximal_bool].mean(axis=1)
                                                full_results_proximal[1, columns] = mean_upset_simple
                                                full_results_proximal[2, columns] = mean_upset_ratio
                                                has_result = True
                                            except (FileNotFoundError, AssertionError):
                                                # No stored result for this configuration: keep the sentinels.
                                                pass
                                            final_ind_proximal += PROXIMAL_GNN_NUM
    if not has_result:
        raise FileNotFoundError('No stored GNN results for dataset {} with cluster_rank_baseline={}.'.format(dataset, cluster_rank_baseline))

    # Integer dtype so the entries can be used directly as dictionary keys.
    best_ind = np.zeros(METRICS_NUM, dtype=int)
    if is_synthetic:
        full_results_proximal[0] = np.nan_to_num(full_results_proximal[0], nan=0)
        best_ind[0] = full_results_proximal[0].argmax()
    full_results_proximal[1] = np.nan_to_num(full_results_proximal[1], nan=1000)
    full_results_proximal[2] = np.nan_to_num(full_results_proximal[2], nan=1000)
    best_ind[1] = full_results_proximal[1].argmin()
    best_ind[2] = full_results_proximal[2].argmin()

    selected_indices = np.zeros((METRICS_NUM, 1, GNN_CHOICES_NUM+1))
    # NOTE(review): the trailing 10 is hard-coded; it presumably equals the number
    # of stored runs per method (num_trials times the number of seeds) - confirm.
    kendalltau_res = np.full((METRICS_NUM, 1, 10), np.nan)
    final_upset = np.full((METRICS_NUM, 1, 10, 2), np.nan) # the first "1" means proximal
    for i in range(METRICS_NUM):
        if i == 0 and not is_synthetic:
            continue
        j = 0
        choice = proximal_ind_correspondence_dict[int(best_ind[i])]
        selected_indices[i, j] = choice
        (lr_ind, train_ind, pretrain_ind, margin_coeff_ind, imb_coeff_ind,
         unnormalized_L_ind, trainable_alpha_ind, ratio_coeff_ind,
         Fiedler_layer_num_ind, pretrain_epochs_ind, sel_ind) = choice

        lr = lr_list[lr_ind]
        train_with = train_with_list[train_ind]
        pretrain_with = pretrain_with_list_dict[train_with][pretrain_ind]
        upset_margin_coeff = upset_margin_coeff_list[margin_coeff_ind]
        imbalance_coeff = imbalance_coeff_list[imb_coeff_ind]
        unnormalized_L = unnormalized_L_list[unnormalized_L_ind]
        trainable_alpha = trainable_alpha_list[trainable_alpha_ind]
        upset_ratio_coeff = upset_ratio_coeff_list[ratio_coeff_ind]
        Fiedler_layer_num = Fiedler_layer_num_list[Fiedler_layer_num_ind]
        pretrain_epochs = pretrain_epochs_list[pretrain_epochs_ind]

        # Reload the winning configuration to get its per-trial results.
        kendalltau, upset_simple, upset_ratio = GNN_load_results(dataset=dataset, all_methods=all_GNNs, K=K, train_with=train_with, imbalance_coeff=imbalance_coeff, 
                                   upset_ratio_coeff=upset_ratio_coeff, upset_margin_coeff=upset_margin_coeff, upset_margin=upset_margin,
                                    trainable_alpha=trainable_alpha, lr=lr, hidden=hidden, num_trials=num_trials, 
                                   train_ratio=train_ratio, test_ratio=test_ratio,  AllTrain=AllTrain, cluster_rank_baseline=cluster_rank_baseline, 
                                   sigma=sigma, Fiedler_layer_num=Fiedler_layer_num, pretrain_epochs=pretrain_epochs, pretrain_with=pretrain_with, unnormalized_L=unnormalized_L)
        if kendalltau is not None:
            kendalltau_res[i, j] = (kendalltau[proximal_bool])[sel_ind]
        upset_simple_res = (upset_simple[proximal_bool])[sel_ind]
        upset_ratio_res = (upset_ratio[proximal_bool])[sel_ind]
        final_upset[i, j] = np.array([upset_simple_res, upset_ratio_res]).swapaxes(0,1)
    return kendalltau_res, final_upset, selected_indices

In [10]:
def extract_results(dataset, season=2009, K=5, upset_ratio_coeff=1.0, upset_margin=0.01, p=0.1, 
                       AllTrain=True, size_ratio=1.5, eta=0.1, lr=0.05, hidden=32, normalizations=['plain'], thresholds=['sort'],
                        N=350, ERO_style='uniform', train_ratio = 0.8, test_ratio = 0.1, dropout=0.5, sigma=1.0, 
                           methods_of_interest=methods_of_interest, print_latex=True):
    """Compare the best GNN results with the baseline each GNN was built on.

    For every method in ``methods_of_interest`` the best proximal GNN
    configuration is selected with that method as ``cluster_rank_baseline``,
    and the stored result of the method itself is subtracted from it.

    Args:
        dataset: short dataset name understood by ``dataset_and_K``.
        season, K, p, size_ratio, ERO_style, eta, N: forwarded to
            ``dataset_and_K``.
        AllTrain, train_ratio, test_ratio: split settings; overridden for
            datasets other than 'DSBM'/'ERO'.
        methods_of_interest: baselines to compare against; each must be an
            element of ``baselines``.
        The remaining arguments are accepted for compatibility and not used.

    Returns:
        (dataset_print, kendalltau_res_all, final_upset_all, selected_indices).
        ``kendalltau_res_all`` holds ``baseline - GNN`` and ``final_upset_all``
        holds ``GNN - baseline``; both have the selection metric on the first
        axis and the method on the second. Rows of skipped metrics are NaN.

    Raises:
        FileNotFoundError: if GNN or baseline results are missing.
        ValueError: if a method of interest is not in ``baselines``.
    """
    num_trials = 2
    run_seeds = [10, 20, 30, 40, 50]

    dataset_print, dataset, K, hidden = dataset_and_K(dataset, season, K, p, size_ratio, ERO_style, eta, N=N)
    is_synthetic = dataset[:4] == 'DSBM' or dataset[:3] == 'ERO'

    if not is_synthetic:
        num_trials = 10
        AllTrain = True
        train_ratio = 1
        test_ratio = 1
        run_seeds = [10]
    runs_num = num_trials*len(run_seeds)
    methods_num = len(methods_of_interest)
    # Rows of the baseline arrays that belong to the methods of interest
    # (for the default list this is [0, 1, 3, 5, 6, 9]).
    baseline_rows = [baselines.index(method_name) for method_name in methods_of_interest]

    kendalltau_res_full_list = []
    final_upset_full_list = []
    selected_indices_list = []
    for cluster_rank_baseline in methods_of_interest:
        kendalltau_res_full, final_upset_full, selected_indices = GNN_selection_with_fix_dim(dataset, K, train_ratio, test_ratio, AllTrain, hidden, num_trials, cluster_rank_baseline=cluster_rank_baseline)
        kendalltau_res_full_list.append(kendalltau_res_full)
        final_upset_full_list.append(final_upset_full)
        selected_indices_list.append(selected_indices)

    # The baseline results do not depend on the metric, so they are loaded once.
    dir_name = '../result_arrays/'+dataset
    save_name = generate_save_name(dataset=dataset, all_methods=baselines, K=K, num_trials=num_trials, 
                                   train_ratio=train_ratio, test_ratio=test_ratio,  AllTrain=AllTrain)
    method_str, _ = generate_method_str_and_compare_names_all(baselines, normalizations, thresholds)
    baseline_kendalltau_res = np.load(os.path.join(dir_name,'kendalltau',method_str,save_name) + '.npy')[:, :, 2, 0][baseline_rows]
    baseline_final_upset = np.load(os.path.join(dir_name,'upset',method_str,save_name) + '.npy')[baseline_rows]

    kendalltau_res_all = np.full((METRICS_NUM, methods_num, runs_num), np.nan)
    final_upset_all = np.full((METRICS_NUM, methods_num, runs_num, 2), np.nan)
    for i in range(METRICS_NUM):
        # Kendall tau (metric 0) is only used for selection on synthetic data.
        if i == 0 and not is_synthetic:
            continue
        # Each term has a leading axis of length one, so concatenating over
        # the methods yields one row per method.
        kendalltau_res_all[i] = np.concatenate(
            [baseline_kendalltau_res[j] - kendalltau_res_full_list[j][i] for j in range(methods_num)], axis=0)
        final_upset_all[i] = np.concatenate(
            [final_upset_full_list[j][i] - baseline_final_upset[j] for j in range(methods_num)], axis=0)

    return dataset_print, kendalltau_res_all, final_upset_all, np.array(selected_indices_list).swapaxes(0,1)

In [11]:
def analysis_results(dataset_list=['HeadToHead', 'finance', 'animal', 'faculty_business', 'faculty_cs', 'faculty_history', 'football', 'finer_football', 'basketball', 'finer_basketball']):
    """Collect the results of all dataset settings and print the comparison tables.

    For every dataset in ``dataset_list`` each combination of its parameter
    grids (p, K, eta, ERO style, season, size ratio) is passed to
    ``extract_results``; settings without stored results are reported and
    skipped. One table is printed for every pair of reported metric and
    selection metric ('upset simple' or 'upset ratio') that has any non-NaN
    value.

    Args:
        dataset_list: short dataset names; each must be a key of the
            per-dataset grid dictionaries.

    Returns:
        (dataset_name_full, final_upset_all_full): the printable names of the
        settings that had results and their upset difference arrays. Both are
        empty lists when no setting had results.
    """
    from itertools import product

    dataset_name_full = []
    kendalltau_res_all_full = []
    final_upset_all_full = []
    selected_indices_full = []
    for dataset in dataset_list:
        # product() iterates with the rightmost grid varying fastest.
        settings = product(p_list_dict[dataset], K_list_dict[dataset], eta_list_dict[dataset],
                           ERO_style_list_dict[dataset], season_list_dict[dataset],
                           size_ratio_list_dict[dataset])
        for p, K, eta, ERO_style, season, size_ratio in settings:
            try:
                dataset_long, kendalltau_res_all, final_upset_all, selected_indices = extract_results(dataset=dataset, 
                                season=season, p=p, K=K, eta=eta, size_ratio=size_ratio, 
                                ERO_style=ERO_style)
                dataset_name_full.append(dataset_long)
                kendalltau_res_all_full.append(kendalltau_res_all)
                final_upset_all_full.append(final_upset_all)
                selected_indices_full.append(selected_indices)
            except FileNotFoundError:
                print('No result yet for {}, season {}, p={}, K={}, size ratio = {}, eta={}, ERO style = {}.'.format(dataset,
                    season, p, K, size_ratio, eta, ERO_style))

    if not dataset_name_full:
        # Without any result the arrays below cannot be stacked.
        print('No results available for the requested datasets.')
        return dataset_name_full, final_upset_all_full

    # Last axis: 0 = kendall tau, 1 = upset simple, 2 = upset ratio.
    full_results = np.concatenate((np.expand_dims(np.array(kendalltau_res_all_full), axis=-1), np.array(final_upset_all_full)), axis=-1)
    for i in range(METRICS_NUM):
        # j is the metric the configuration was selected with.
        for j in range(1, METRICS_NUM):
            results_to_print = full_results[:,j,:,:,i].swapaxes(0,2)
            if not np.isnan(results_to_print).all():
                title_name = selected_metrics[i] + ' with best ' + selected_metrics[j]
                print_overall_performance_mean_std(title_name, results_to_print, 
                                compare_names_all, dataset_name_full, True)
    return dataset_name_full, final_upset_all_full

In [12]:
# Run the analysis on the default dataset list: this prints the comparison
# tables and keeps the setting names and upset difference arrays for later cells.
dataset_name_full, final_upset_all_full = analysis_results()

 upset simple       SpringRank         syncRank             btl         eigenvectorCen      PageRank         SVD_NRS    
with best upset                                                            trality                                      
simpleData/Meth                                                                                                         
      od                                                                                                                
{\it              -0.01$\pm$0.00    \red{-0.98$\pm$   -0.13$\pm$0.01    \blue{-0.48$\p   -0.37$\pm$0.00   -0.42$\pm$0.03
HeadToHead}                         0.00}                               m$0.00}                                         
{\it Finance}     -0.63$\pm$0.00    \red{-0.98$\pm$   \blue{-0.78$\pm   -0.74$\pm$0.00   -0.75$\pm$0.00   -0.64$\pm$0.00
                                    0.00}             $0.01}                                                            
{\it Animal}      \blue{-0.09$\p

In [13]:
# Stack the per-setting upset differences (GNN minus baseline) and keep the
# 'upset simple' values of the configurations selected by 'upset simple'.
test = np.array(final_upset_all_full)
tmp = test[:, 1, :, :, 0]
# Average over the dataset settings first, then over the runs: one value per method.
np.mean(np.mean(tmp, axis=0), axis=1)

array([-0.03178992, -0.94972278, -0.23900918, -0.17122803, -0.16370831,
       -0.06428139])