In [None]:
import os
import sys
sys.path.append('../src')

import numpy as np
import scipy.sparse as sp
from texttable import Texttable

from metrics import print_and_analysis_performance_mean_std, print_overall_performance_mean_std

In [None]:
# Global settings read by generate_save_name when it builds result-file names.
dropout = 0.5
alpha = 1
seeds = [10, 20, 30, 40, 50]

# Evaluation metrics: Kendall tau followed by the three upset variants.
upset_choices = [
    'upset simple',
    'upset ratio',
    'upset naive',
]
selected_metrics = ['kendall tau', *upset_choices]
NUM_UPSET_CHOICES = len(upset_choices)
METRICS_NUM = len(selected_metrics)

# Non-GNN ranking methods that the GNN results are compared against.
baselines = [
    'SpringRank', 'syncRank', 'serialRank', 'btl', 'davidScore',
    'eigenvectorCentrality', 'PageRank', 'rankCentrality', 'SVD_RS', 'SVD_NRS',
]

# Hyper-parameter names used for GNN model selection (full list, then the
# subset currently in use).
GNN_selection_choices = [
    'lr', 'train with', 'pretrain with', 'upset margin coeff',
    'trainable alpha', 'baseline',
    'upset ratio coeff', 'Fiedler layer num', 'pretrain epochs',
]
GNN_selection_choices_curr = [
    'lr', 'train with', 'pretrain with', 'upset margin coeff',
    'baseline', 'upset ratio coeff',
]
GNN_CHOICES_NUM = len(GNN_selection_choices)
GNN_CHOICES_NUM_CURR = len(GNN_selection_choices_curr)

mvr = ['mvr']
all_GNNs = ['DIGRAC', 'ib']
desirable_list = [1, 2, 5]
train_with_list = [
    'dist',
    'innerproduct',
    'proximal_dist',
    'proximal_innerproduct',
    'proximal_baseline',
]

# Each GNN in all_GNNs is expanded into one variant per training objective.
GNN_variant_names = train_with_list
GNN_NUM = 10

def generate_method_str_and_compare_names_all(all_methods=baselines):
    """Build the directory tag and the per-row method labels for a method list.

    Parameters
    ----------
    all_methods : list of str
        Method names; defaults to the module-level ``baselines`` list.

    Returns
    -------
    method_str : str
        All method names concatenated without a separator.
    compare_names_all : list of str
        One label per method. 'DIGRAC' and 'ib' are expanded into one
        ``<method>_<variant>`` label per entry of the module-level
        ``GNN_variant_names``; every other method keeps its own name.
    """
    method_str = ''.join(all_methods)

    compare_names_all = []
    for name in all_methods:
        if name in ('DIGRAC', 'ib'):
            # GNN methods contribute one row per training variant.
            compare_names_all.extend(name + '_' + variant for variant in GNN_variant_names)
        else:
            compare_names_all.append(name)
    return method_str, compare_names_all

# Labels of all GNN variants, ordered method-major (all DIGRAC variants, then
# all ib variants).
GNN_names = []
for method_name in ['DIGRAC', 'ib']:
    for GNN_type in GNN_variant_names:
        # Join with '_' so the labels match the 'DIGRAC_dist'-style names
        # produced by generate_method_str_and_compare_names_all and used as
        # lookup keys elsewhere in this notebook; without the separator,
        # GNN_names.index('ib_proximal_baseline') raises ValueError.
        GNN_names.append(method_name + '_' + GNN_type)
GNN_NUM = len(GNN_names)

# Split the variants into proximal / non-proximal groups. Everything below is
# derived from GNN_names so the masks, index lists and name lists cannot drift
# apart when the variant list changes.
proximal_bool = np.array(['proximal' in name for name in GNN_names], dtype=bool)
non_proximal_bool = ~proximal_bool
non_proximal_ind = np.flatnonzero(non_proximal_bool).tolist()
proximal_ind = np.flatnonzero(proximal_bool).tolist()
NON_PROXIMAL_GNN_NUM = len(non_proximal_ind)
PROXIMAL_GNN_NUM = len(proximal_ind)

GNN_names_non_proximal = [GNN_names[i] for i in non_proximal_ind]
GNN_names_proximal = [GNN_names[i] for i in proximal_ind]

# Default row labels: baselines, then mvr, then the two compared GNN runs.
methods_of_interest = ['inductive', 'original']
compare_names_all = baselines + mvr + methods_of_interest
METHODS_NUM = len(compare_names_all)

In [None]:
# Mapping from internal identifiers to the labels printed in tables.
# `keys` and `values` are parallel lists; position i of one maps to position i
# of the other.
keys = [
    'DIGRAC_dist', 'DIGRAC_innerproduct',
    'ib_dist', 'ib_innerproduct',
    'DIGRAC_proximal_dist', 'DIGRAC_proximal_innerproduct', 'DIGRAC_proximal_baseline',
    'ib_proximal_dist', 'ib_proximal_innerproduct', 'ib_proximal_baseline',
]
values = [
    'DIGRAC dist', 'DIGRAC innerproduct',
    'ib dist', 'ib innerproduct',
    'DIGRAC proximal dist', 'DIGRAC proximal innerproduct', 'DIGRAC proximal baseline',
    'ib proximal dist', 'ib proximal innerproduct', 'ib proximal baseline',
]

# Training objectives, pretraining objectives and dataset averages.
# 'dist' / 'innerproduct' appear twice; both occurrences map to the same label.
keys += train_with_list + ['dist', 'innerproduct', 'serial_similarity'] + \
    ['avg_football', 'avg_finer_football', 'avg_basketball', 'avg_finer_basketball']
# The backslash in the LaTeX labels is escaped explicitly: '\i' is not a valid
# Python escape sequence and triggers a warning on recent interpreters.
values += ['dist', 'innerproduct', 'proximal dist', 'proximal innerproduct', 'proximal baseline'] + \
    ['dist', 'innerproduct', 'SerialRank similarity'] + \
    ['{\\it Football (avg)}', '{\\it Football finer (avg)}', '{\\it Basketball (avg)}', '{\\it Basketball finer (avg)}']

# zip() silently truncates to the shorter list, which would misalign labels.
assert len(keys) == len(values), 'keys and values must stay parallel'
name_mapping_dict = dict(zip(keys, values))
print(name_mapping_dict)

In [None]:
def generate_save_name(dataset='HeadToHead', all_methods=all_GNNs, K=5, train_with='dist',
                       upset_ratio_coeff=1.0, upset_margin_coeff=0, upset_margin=0.01,
                       trainable_alpha=False, lr=0.01, hidden=32, num_trials=10,
                       train_ratio=1, test_ratio=1, AllTrain=True, rank_baseline='SpringRank',
                       sigma=1.0, Fiedler_layer_num=5, pretrain_epochs=50, pretrain_with='dist'):
    """Assemble the file-name stem under which a result array is stored.

    The GNN hyper-parameter section is only emitted when 'DIGRAC' or 'ib' is in
    ``all_methods``; otherwise the name consists of the trial/split suffix
    alone. Besides its arguments the function reads the module-level
    ``dropout``, ``alpha`` and ``seeds``.

    Fractional settings are encoded as ``int(100 * value)`` (``int(1000 * lr)``
    for the learning rate). ``int`` truncates, and that exact encoding is kept
    because the resulting string has to match existing file names.

    Returns
    -------
    str
        The name stem, without directory or extension.
    """
    parts = []
    if 'DIGRAC' in all_methods or 'ib' in all_methods:
        parts += ['K', str(K), 'dropout', str(int(100*dropout))]
        parts += ['ratio_coe', str(int(100*upset_ratio_coeff)),
                  'margin_coe', str(int(100*upset_margin_coeff))]
        # The margin value is only part of the name when the margin loss is active.
        if upset_margin_coeff > 0:
            parts += ['margin', str(int(100*upset_margin))]
        parts += ['with', str(train_with), 'Fiedler', str(Fiedler_layer_num),
                  'sigma', str(int(100*sigma))]
        parts += ['alpha', str(int(100*alpha))]
        # Proximal variants carry extra fields (trainable alpha, pretraining).
        uses_proximal = train_with[:8] == 'proximal'
        if uses_proximal:
            parts += ['train_alpha', str(trainable_alpha)]
        parts += ['hid', str(hidden), 'lr', str(int(1000*lr))]
        parts += ['use', str(rank_baseline)]
        if pretrain_epochs > 0 and uses_proximal:
            parts += ['pre', str(pretrain_with), str(int(pretrain_epochs))]

    # Trial / split suffix, present for every method list.
    parts += ['trials', str(num_trials),
              'train_r', str(int(100*train_ratio)),
              'test_r', str(int(100*test_ratio)),
              'All', str(AllTrain)]
    # Synthetic ERO runs additionally record the random seeds.
    if dataset[:3] == 'ERO':
        parts.append('seeds' + '_'.join(str(value) for value in np.array(seeds).flatten()))
    return ''.join(parts)

In [None]:
def dataset_and_K(dataset, season=2009, K=5, p=0.05, ERO_style='uniform', eta=0.1, N=350):
    """Resolve a dataset identifier to its label, result sub-path, K and hidden size.

    Parameters
    ----------
    dataset : str
        Dataset identifier; a trailing '/' is optional. 'ERO' is matched
        case-sensitively on the first three characters, all other prefixes
        case-insensitively.
    season : int
        Season used to build the path and label of the basketball and football
        datasets; ignored by the other datasets.
    K : int
        Kept for backward compatibility only: every dataset overrides it with
        its own fixed value.
    p, ERO_style, eta, N
        Only used by the ERO dataset, where they are encoded into the returned
        path and label.

    Returns
    -------
    tuple
        ``(dataset_print, dataset, K, hidden)``: display label, path relative
        to the result directory, the dataset's K and its hidden size. For
        'finance' no dedicated path exists, so the normalised input (with the
        trailing '/') is returned as the path.

    Raises
    ------
    ValueError
        If ``dataset`` matches none of the known prefixes.
    """
    if not dataset.endswith('/'):
        dataset += '/'
    requested = dataset

    def has_prefix(prefix):
        # Case-insensitive comparison of the leading characters only.
        return requested[:len(prefix)].lower() == prefix

    if requested[:3] == 'ERO':
        hidden, K = 8, 5
        default_name_base = 'p' + str(int(100*p)) + 'K' + str(K) + 'N' + str(N)
        default_name_base += 'eta' + str(int(100*eta)) + 'style' + str(ERO_style)
        dataset = 'ERO/' + default_name_base
        # Backslash escaped explicitly: '\e' is not a valid escape sequence.
        dataset_print = 'ERO(p={}, style={},$\\eta$={})'.format(p, ERO_style, eta)
    elif has_prefix('basketball'):
        hidden, K = 8, 20
        dataset = 'Basketball_temporal/' + str(season)
        dataset_print = 'Basketball({})'.format(season)
    elif has_prefix('finer_basketball'):
        hidden, K = 8, 20
        dataset = 'Basketball_temporal/finer' + str(season)
        dataset_print = 'Basketball finer({})'.format(season)
    elif has_prefix('animal'):
        hidden, K = 4, 3
        dataset = 'Dryad_animal_society/'
        dataset_print = 'Animal'
    elif has_prefix('finance'):
        # No path rewrite for finance: the normalised input is returned as is.
        hidden, K = 32, 20
        dataset_print = 'Finance'
    elif has_prefix('headtohead'):
        hidden, K = 16, 48
        dataset = 'Halo2BetaData/HeadToHead'
        dataset_print = 'HeadToHead'
    elif has_prefix('faculty_business'):
        hidden, K = 8, 5
        dataset = 'FacultyHiringNetworks/Business/Business_FM_Full_'
        dataset_print = 'Faculty: Business'
    elif has_prefix('faculty_cs'):
        hidden, K = 8, 9
        dataset = 'FacultyHiringNetworks/ComputerScience/ComputerScience_FM_Full_'
        dataset_print = 'Faculty: CS'
    elif has_prefix('faculty_history'):
        hidden, K = 8, 12
        dataset = 'FacultyHiringNetworks/History/History_FM_Full_'
        dataset_print = 'Faculty: History'
    elif has_prefix('football'):
        hidden, K = 4, 9
        dataset = 'Football_data_England_Premier_League/England_' + str(season) + '_' + str(season+1)
        dataset_print = 'Football({})'.format(season)
    elif has_prefix('finer_football'):
        hidden, K = 4, 9
        dataset = 'Football_data_England_Premier_League/finerEngland_' + str(season) + '_' + str(season+1)
        dataset_print = 'Football finer({})'.format(season)
    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError('Unknown dataset: {!r}'.format(requested))
    return dataset_print, dataset, K, hidden

In [None]:
# --- Experiment configuration -------------------------------------------------
# These values select which saved result files are loaded; most of them are
# encoded into the file name by generate_save_name.
upset_ratio_coeff = 1.0
upset_margin_coeff = 1.0
upset_margin = 0.01
p = 0.1
AllTrain = True
eta = 0.1
lr = 0.01
N = 350
ERO_style = 'uniform'
dropout = 0.5
rank_baseline = 'syncRank'
sigma = 1.0
pretrain_epochs = 50
dataset = 'finer_basketball'
train_with = 'proximal_baseline'
trainable_alpha = True
pretrain_with = 'serial_similarity'
method_of_interest = 'ib_proximal_baseline'
aggregation = 'ib'
F_style = 'path'
sp_style = 'random'
num_trials = 2
seed = 31
fill_val = 0.5
ambient = 0
Fiedler_layer_num = 5
alpha = 1
seeds = [10, 20, 30, 40, 50]
# `dataset` is overwritten inside the loop, so remember the requested name.
in_dataset = dataset

if dataset in ['basketball', 'finer_basketball']:
    season_range = np.arange(1985, 2015)
else:
    season_range = np.arange(2009, 2015)

# One (methods x 10 x upset metrics) slab per season, NaN until filled.
# NOTE(review): 12 and 10 are hard-coded and must equal the number of stacked
# method rows and the second axis of the loaded arrays (presumably trials);
# the per-season assignment below fails otherwise.
final_upset_all = np.full((len(season_range), 12, 10, NUM_UPSET_CHOICES), np.nan)

# Row of the method of interest inside the array saved for all_GNNs. The index
# is looked up in the row labels generated for that same method list, so it
# follows the array's row order by construction.
_, gnn_row_names = generate_method_str_and_compare_names_all(all_GNNs)
original_ind = gnn_row_names.index(method_of_interest)
# NOTE(review): the inductive array is loaded for [aggregation] only, which
# yields len(GNN_variant_names) labels per method; confirm that a modulus of 6
# (rather than the number of variants) selects the intended row.
new_ind = original_ind % 6

dataset_name_print = []
for i, season in enumerate(season_range):
    dataset_print, dataset, K, hidden = dataset_and_K(in_dataset, season)
    if dataset[:3] != 'ERO':
        num_trials = 10
        AllTrain = True
        train_ratio = 1
        test_ratio = 1
        seeds = [10]
    # NOTE(review): train_ratio / test_ratio are only assigned in the branch
    # above, so an ERO dataset needs them defined before this cell runs.

    # Arguments shared by every generate_save_name call for this season; only
    # the method list differs between the calls.
    save_name_kwargs = dict(
        dataset=dataset, K=K, train_with=train_with,
        upset_ratio_coeff=upset_ratio_coeff, upset_margin_coeff=upset_margin_coeff,
        upset_margin=upset_margin, trainable_alpha=trainable_alpha, lr=lr, hidden=hidden,
        num_trials=num_trials, train_ratio=train_ratio, test_ratio=test_ratio,
        AllTrain=AllTrain, rank_baseline=rank_baseline, sigma=sigma,
        pretrain_epochs=pretrain_epochs, pretrain_with=pretrain_with)

    # original GNN results
    save_name = generate_save_name(all_methods=all_GNNs, **save_name_kwargs)
    method_str, _ = generate_method_str_and_compare_names_all(all_GNNs)
    dir_name = '../result_arrays/' + dataset
    final_upset = np.load(os.path.join(dir_name, 'upset', method_str, save_name) + '.npy')[original_ind:original_ind+1]

    # inductive GNN results, stacked in front of the original ones
    save_name = generate_save_name(all_methods=[aggregation], **save_name_kwargs)
    method_str, _ = generate_method_str_and_compare_names_all([aggregation])
    dir_name = '../result_arrays_inductive/' + dataset
    inductive_final_upset = np.load(os.path.join(dir_name, 'upset', method_str, save_name) + '.npy')[new_ind:new_ind+1]
    final_upset = np.concatenate((inductive_final_upset, final_upset), axis=0)
    compare_names_all = ['inductive', 'original']

    # try to include mvr results
    # NOTE(review): here and for the baselines the existence check looks at the
    # 'kendalltau' file while the array is loaded from 'upset'; confirm the two
    # are always written together.
    dir_name = '../result_arrays/' + dataset
    save_name = generate_save_name(all_methods=mvr, **save_name_kwargs)
    method_str, _ = generate_method_str_and_compare_names_all(mvr)
    if os.path.exists(os.path.join(dir_name, 'kendalltau', method_str, save_name) + '.npy'):
        final_upset = np.concatenate((np.load(os.path.join(dir_name, 'upset', method_str, save_name) + '.npy'), final_upset), axis=0)
        compare_names_all = ['mvr'] + compare_names_all

    # include baseline results
    save_name = generate_save_name(all_methods=baselines, **save_name_kwargs)
    method_str, compare_names_baselines = generate_method_str_and_compare_names_all(baselines)
    if os.path.exists(os.path.join(dir_name, 'kendalltau', method_str, save_name) + '.npy'):
        final_upset = np.concatenate((np.load(os.path.join(dir_name, 'upset', method_str, save_name) + '.npy'), final_upset), axis=0)
        compare_names_all = compare_names_baselines + compare_names_all

    final_upset_all[i] = final_upset
    dataset_name_print.append(dataset_print)

# One table per upset metric. The row labels are those of the last season
# processed above.
for i, metric in enumerate(upset_choices):
    title_name = metric
    # Reorder (season, method, trial-like axis) -> (trial-like axis, method, season).
    results_to_print = final_upset_all[:, :, :, i].swapaxes(0, 2)
    print_overall_performance_mean_std(title_name, results_to_print,
                                       compare_names_all, dataset_name_print, True)

In [None]:
# Summary for the last method row (labelled 'original' in compare_names_all):
# average over seasons (axis 0), then report mean and std over the remaining
# axis (presumably trials -- TODO confirm).
upset_simple = final_upset_all[:, -1, :, 0].mean(axis=0)
upset_naive = final_upset_all[:, -1, :, 2].mean(axis=0)
upset_ratio = final_upset_all[:, -1, :, 1].mean(axis=0)
# Printed in the order simple, naive, ratio. The backslash of the LaTeX '\pm'
# is escaped explicitly because '\p' is not a valid Python escape sequence.
for season_avg in (upset_simple, upset_naive, upset_ratio):
    print('{:.4f}\\pm {:.4f}'.format(season_avg.mean(), season_avg.std()))  # new trained

In [None]:
# Summary for the second-to-last method row (labelled 'inductive' in
# compare_names_all): average over seasons (axis 0), then report mean and std
# over the remaining axis (presumably trials -- TODO confirm).
upset_simple = final_upset_all[:, -2, :, 0].mean(axis=0)
upset_naive = final_upset_all[:, -2, :, 2].mean(axis=0)
upset_ratio = final_upset_all[:, -2, :, 1].mean(axis=0)
# Printed in the order simple, naive, ratio. The backslash of the LaTeX '\pm'
# is escaped explicitly because '\p' is not a valid Python escape sequence.
for season_avg in (upset_simple, upset_naive, upset_ratio):
    print('{:.4f}\\pm {:.4f}'.format(season_avg.mean(), season_avg.std()))  # directly apply