In [20]:
import os
import re
import json
import pandas as pd
from visualization import *
# Locations of the tuning logs, their rendered plots, and the parameter log.
path = '../tuning'
img_path = '../tuning_img'
param_log = '../params_log.txt'

# Metrics of interest. Every metric except the first (PPL) also gets a
# "<metric>_max" and a "<metric>_max_epoch" name derived from it.
measure_cols = ['PPL', 'topic_recon', 'tc_drug_drug', 'tc_disease_disease', 'tc_drug_disease']
tracked_max = measure_cols[1:]
max_measures = [f"{name}_max" for name in tracked_max]
max_epoches = [f"{name}_max_epoch" for name in tracked_max]

# List the directory once so names and paths are guaranteed to line up.
# NOTE(review): the first two sorted entries are skipped -- presumably they are
# not experiment logs; confirm what they are and consider filtering by name.
file_names = sorted(os.listdir(path))[2:]
files = [os.path.join(path, name) for name in file_names]

In [2]:
# File-to-parameter mapping, read from the parameter log.
f2param = get_f2p(param_log)

# One summary entry per log file, computed over every metric except PPL.
rows = [
    get_row(log_path, log_name, f2param, measure_cols[1:])
    for log_path, log_name in zip(files, file_names)
]

## To generate the plots for the csv files, run:
# for log_path, log_name in zip(files, file_names):
#     plt_csv(log_path, log_name, f2param, showfig=False)

In [3]:
# Raw data: one row per experiment, one column per field returned for a run.
df = pd.concat(rows, axis=1).T

# Convert the metrics to numbers BEFORE sorting. If the transposed frame holds
# text values (the reason pd.to_numeric is needed at all), sorting first would
# order PPL lexicographically ("99.5" after "113.6") instead of numerically.
df[measure_cols] = df[measure_cols].apply(pd.to_numeric)
df = df.sort_values(by="PPL")
df[measure_cols] = df[measure_cols].round(3)

df[["Epoch","best_epoch"]] = df[["Epoch","best_epoch"]].astype(int)

# Same treatment as measure_cols: DataFrame.round leaves non-numeric (object)
# columns untouched, so convert first or the rounding silently does nothing.
df[max_measures] = df[max_measures].apply(pd.to_numeric).round(3)
df[max_epoches] = df[max_epoches].astype(int)

# Metric-only view used by the aggregation cells.
df_measure = df[measure_cols]

In [4]:
s2p_map = get_s2p(list(df.index))

# For every seed, average the metrics of all its "-baseline" runs into a single
# "-avgbaseline <seed>" row and register that row name in s2p_map.
avg_baseline_rows = []
for seed, params in s2p_map.items():
    baselines = [p for p in params if "-baseline" in p]
    index_name = f"-avgbaseline {seed}"
    avg_baseline_rows.append(df_measure.loc[baselines].mean(axis=0).rename(index_name))
    s2p_map[seed].append(index_name)

# Build the frame once instead of growing it with concat inside the loop
# (each Series becomes one row after the transpose).
if avg_baseline_rows:
    df_avg_baselines = pd.concat(avg_baseline_rows, axis=1).T
else:
    df_avg_baselines = pd.DataFrame()

# Merge the averaged baselines with the raw data. Rows left over from a previous
# run of this cell are dropped first, so re-running it cannot duplicate the
# "-avgbaseline" labels (duplicates would skew the later .loc[...] mean/sem).
df_measure = pd.concat([
    df_measure.drop(index=df_avg_baselines.index, errors="ignore"),
    df_avg_baselines,
])

In [5]:
# compare_params is keyed by seed; share_params[i] labels the i-th setting.
compare_params, share_params = same_params(s2p_map)
first_seed_params = list(compare_params.values())[0]
param_num = len(first_seed_params)  # number of parameters

stderr_list = []
avg_measure_list = []
for idx in range(param_num):
    # Rows of the idx-th setting, one per seed.
    per_seed_rows = df_measure.loc[[seed_params[idx] for seed_params in compare_params.values()]]
    label = str(share_params[idx])
    # Standard error across seeds (avgbaseline, -lr 0.05, ...).
    stderr_list.append(per_seed_rows.sem(axis=0).rename(label))
    # Mean across seeds for the same setting.
    avg_measure_list.append(per_seed_rows.mean(axis=0).rename(label))

# One row per setting, one column per metric.
df_stderr = pd.concat(stderr_list, axis=1).T
df_avg_measure = pd.concat(avg_measure_list, axis=1).T

In [6]:
def highlight(s):
    """Styler callback: mark the best cell(s) of a column in green.

    The best value is the minimum when the column name contains 'PPL' and the
    maximum otherwise. Returns one CSS string per cell, '' for cells that are
    not the best.
    """
    target = s.min() if 'PPL' in s.name else s.max()
    return ['background: green' if value == target else '' for value in s]
def highlight_base(x):
    """Styler callback (row-wise): paint the '-avgbaseline' row dark blue.

    Returns one CSS string per cell of the row; '' for every other row.
    """
    css = 'background: darkblue' if x.name in ['-avgbaseline'] else ''
    return [css for _ in x]

In [7]:
# Order the standard-error table by ascending PPL.
df_stderr = df_stderr.sort_values("PPL", ascending=True)

# Optional styled preview:
# df_stderr.style.apply(highlight).apply(highlight_base,axis=1).format(precision=3)

In [8]:
# Order the averaged-metric table by ascending PPL.
df_avg_measure = df_avg_measure.sort_values("PPL", ascending=True)

# Optional styled preview:
# df_avg_measure.style.apply(highlight).apply(highlight_base,axis=1).format(precision=3)

In [9]:
pm_sign = " \u00B1 "
# Column renames: "<metric>" -> "<metric> ± stderr" header.
cols2pm_map = {col: f"{col} {pm_sign} stderr" for col in measure_cols}

# Text versions (rounded to 3 decimals) of the means and their standard errors.
avg = df_avg_measure[measure_cols].round(3).astype(str)
stderr = df_stderr[measure_cols].round(3).astype(str)

# Pair every mean with its standard error cell by cell, join the pair with the
# plus-minus sign, then reshape back to a setting x metric table.
paired = pd.concat([avg.stack(), stderr.stack()], axis=1)
df_result = paired.apply(pm_sign.join, axis=1).unstack()

df_result = df_result.rename(columns=cols2pm_map)

In [10]:
def highlight_pm(s,sign):
    """Styler callback for "avg <sign> stderr" columns: mark the best cell(s) green.

    Delegates to compute_min when the column name contains 'PPL' and to
    compute_max otherwise; each truthy entry of the helper's result gets the
    green background (presumably a per-cell boolean mask -- confirm in the
    helper's definition).
    """
    pick_best = compute_min if 'PPL' in s.name else compute_max
    return ['background: green' if flag else '' for flag in pick_best(s, sign)]
def highlight_base_pm(x):
    """Styler callback (row-wise): paint the '-avgbaseline' row dark blue.

    Returns one CSS string per cell of the row; '' for every other row.
    """
    css = 'background: darkblue' if x.name in ['-avgbaseline'] else ''
    return [css for _ in x]


# Render the combined table: best PPL / TC of each column in green,
# the avgbaseline row in dark blue.
(
    df_result.style
    .apply(lambda column: highlight_pm(column, pm_sign))
    .apply(highlight_base_pm, axis=1)
)

Unnamed: 0,PPL ± stderr,tc_disease_disease ± stderr,tc_drug_disease ± stderr,tc_drug_drug ± stderr,topic_recon ± stderr
-avgbaseline,114.396 ± 0.176,0.02 ± 0.002,0.036 ± 0.008,0.126 ± 0.004,1.437 ± 0.102
-batch 256,116.026 ± 0.513,0.023 ± 0.004,0.031 ± 0.009,0.141 ± 0.013,1.222 ± 0.107
-batch 32,113.636 ± 0.243,0.02 ± 0.002,0.029 ± 0.008,0.124 ± 0.007,1.338 ± 0.084
-batch 64,113.818 ± 0.202,0.02 ± 0.002,0.032 ± 0.007,0.131 ± 0.013,1.621 ± 0.12
-embed_dim 128,119.307 ± 0.221,0.028 ± 0.002,0.059 ± 0.01,0.18 ± 0.01,1.594 ± 0.134
-embed_dim 64,127.744 ± 0.332,0.027 ± 0.004,0.048 ± 0.008,0.243 ± 0.009,1.431 ± 0.128
-gcn_dim 256,113.751 ± 0.286,0.023 ± 0.003,0.026 ± 0.007,0.124 ± 0.006,2.241 ± 0.202
-gcn_dim 64,116.038 ± 0.258,0.027 ± 0.002,0.031 ± 0.008,0.14 ± 0.006,0.873 ± 0.056
-gcn_drop 0.15,115.072 ± 0.256,0.022 ± 0.002,0.027 ± 0.009,0.131 ± 0.008,1.641 ± 0.105
-gcn_drop 0.18,114.934 ± 0.268,0.023 ± 0.002,0.025 ± 0.007,0.131 ± 0.007,1.541 ± 0.087


In [11]:
compare_params.keys() # the seeds used in these experiments: compare_params is keyed by "-seed N" strings

dict_keys(['-seed 41504', '-seed 47486', '-seed 14557', '-seed 43690', '-seed 24602', '-seed 20284', '-seed 31388', '-seed 33273', '-seed 26668', '-seed 21897'])

In [12]:
# Collect the distinct parameter settings listed in the parameter pool.
# Each non-blank line contributes one name: '-avgbaseline' when the line ends
# with 'CCS', otherwise its last two tokens ("<flag> <value>").
all_indexes = set()
with open("../params/params_pool.txt") as pool_file:
    for line_no, line in enumerate(pool_file, start=1):
        # split() without arguments also copes with trailing spaces, repeated
        # spaces and '\r\n' line endings, which split(" ") turned into bogus tokens.
        tokens = line.split()
        if not tokens:
            continue  # blank line: nothing to register
        if tokens[-1] == 'CCS':
            param = '-avgbaseline'
        elif len(tokens) >= 2:
            param = f"{tokens[-2]} {tokens[-1]}"
        else:
            raise ValueError(
                f"params_pool.txt line {line_no}: expected '<flag> <value>', got {line!r}"
            )
        all_indexes.add(param)

In [13]:
# Check how many experiments are still running: a log whose last row has
# split == "valid" is reported as running, together with its parameters.
running_exps = {}
for f,f_name in zip(files,file_names):
    # Deliberately NOT named `df`: that name holds the results table built in the
    # raw-data cell, and rebinding it here silently broke any later re-run of the
    # cells that read df.index.
    log_df = pd.read_csv(f)
    status = log_df['split'].iloc[-1]
    if status == "valid":
        running_exps[f_name] = f2param[f_name]
if running_exps:
    print(running_exps)

In [21]:
# Find, per seed, the settings from the parameter pool that have no result row.
# A setting listed here is NOT in the running schedule; a setting that does not
# show up here is either done or still running.
missing = {}
baseline = "run.py -data subbkg_dd_an -gpu 4 -name subbkg_dd_nll -nodedoc -diffa -within_type -anchor CCS"
for seed, seed_params in s2p_map.items():
    # Strip the " -seed ..." tail so the names are comparable with all_indexes.
    # re.sub leaves a name without that tail unchanged, whereas
    # re.search(...).group() raised AttributeError on such a name.
    seen = {re.sub(r" -seed.*", "", param) for param in seed_params}
    diff_set = all_indexes.difference(seen)
    if diff_set:
        # sorted() gives a deterministic command order in the JSON file
        # (iteration order of a set of strings changes between interpreter runs).
        missing[seed] = [f"{baseline} {param} {seed}" for param in sorted(diff_set)]
with open("missing_exp.json", "w") as f:
    json.dump(missing,f)