In [1]:
import os
import re
import pandas as pd
from visualization import *
path = './tuning'
img_path = './tuning_img'

param_log = 'params_log.txt'
measure_cols = ['PPL', 'topic_recon', 'tc_drug_drug', 'tc_disease_disease', 'tc_drug_disease']
max_measures = [s+"_max" for s in measure_cols[1:]] 
max_epoches = [s+"_max_epoch" for s in measure_cols[1:]]
files = [os.path.join(path,f) for f in sorted(os.listdir(path))][2:]
file_names = [f for f in sorted(os.listdir(path))][2:]

In [2]:
# file-to-param mapping
f2param = get_f2p(param_log)
rows = []
for f,f_name in zip(files,file_names):
    best_row = get_row(f,f_name,f2param,measure_cols[1:])
    rows.append(best_row)
    ## uncomment this line to generate the plts for csv files
    # plt_csv(f,f_name,f2param,showfig=False)

In [3]:
# raw data 
df = pd.concat(rows,axis=1).T
df.sort_values(by="PPL",inplace=True)
df[measure_cols] = df[measure_cols].apply(pd.to_numeric).round(3)
df[["Epoch","best_epoch"]] = df[["Epoch","best_epoch"]].astype(int)
df[max_measures] = df[max_measures].round(3)
df[max_epoches] = df[max_epoches].astype(int)
df_measure = df[measure_cols]

In [4]:
s2p_map = get_s2p(list(df.index))
# excluding the '-seed 47486'
del s2p_map['-seed 47486']
df_avg_baselines = pd.DataFrame()
# calculate a avg baseline and add them to to s2p_map
for seed, params in s2p_map.items():
    baselines = []
    for l in params:
        if "-baseline" in l:
            baselines.append(l)
    index_name = f"-avgbaseline {seed}"
    avg_baseline = df_measure.loc[baselines].mean(axis=0).rename(index_name)
    # add a series to dataframe: needs to convert series to dataframe, and then Transpose it
    df_avg_baselines = pd.concat([df_avg_baselines,avg_baseline.to_frame().T],axis=0)
    s2p_map[seed].append(index_name)
# merge avgbaselines with original raw data
df_measure = pd.concat([df_measure,df_avg_baselines])

In [5]:
# seeds = s2p_map.keys()
compare_params,share_params = same_params(s2p_map)
param_num = len(list(compare_params.values())[0]) # number of parameters
stderr_list = []
avg_measure_list = []
for i in range(param_num):
    display_list = [compare_params[seed][i] for seed in compare_params.keys()]
    # compute the standard error based on some params tuned: std of avgbaselines, avg of -lr 0.05...
    stderr_series = df_measure.loc[display_list].sem(axis=0) 
    stderr_series = stderr_series.rename(str(share_params[i])) # rename the series
    stderr_list.append(stderr_series)
    # compute the avg of all params: avg of avgbaselines, avg of -lr 0.05, ...
    avg_series = df_measure.loc[display_list].mean(axis=0) 
    avg_series = avg_series.rename(str(share_params[i]))
    avg_measure_list.append(avg_series)
df_stderr = pd.concat(stderr_list,axis=1).T
df_avg_measure = pd.concat(avg_measure_list,axis=1).T

In [6]:
def highlight(s): # highlight best PPL and TCs in green
    is_best = s == s.min() if 'PPL' in s.name else s == s.max()
    return ['background: green' if cell else '' for cell in is_best]
highlight_base = lambda x: ['background: darkblue' if x.name in ['-avgbaseline'] else '' for i in x] # high light the avgbaseline in blue

In [7]:
df_stderr = df_stderr.sort_values(by="PPL")

# df_stderr.style.apply(highlight).apply(highlight_base,axis=1).format(precision=3)

In [8]:
df_avg_measure = df_avg_measure.sort_values(by="PPL")

# df_avg_measure.style.apply(highlight).apply(highlight_base,axis=1).format(precision=3)

In [9]:
pm_sign = " \u00B1 "
cols2pm_map = {}
for col in measure_cols:
    cols2pm_map[col] = f"{col} {pm_sign} stderr"
stderr = pd.DataFrame()
avg = pd.DataFrame()
stderr[measure_cols] = df_stderr[measure_cols].round(3).astype(str)
avg[measure_cols] = df_avg_measure[measure_cols].round(3).astype(str)

df_result = pd.concat([avg.stack(),stderr.stack()],axis=1) \
    .apply(lambda x: pm_sign.join(x), axis=1) \
    .unstack()

df_result.rename(columns=cols2pm_map, inplace=True)

In [10]:
def highlight_pm(s,sign): # highlight best PPL and TCs in green
    is_best = compute_min(s,sign) if 'PPL' in s.name else compute_max(s,sign)
    return ['background: green' if cell else '' for cell in is_best]
highlight_base_pm = lambda x: ['background: darkblue' if x.name in ['-avgbaseline'] else '' for i in x] # high light the avgbaseline in blue


# highlight the rows: avgbaseline in blue, best PPL & TCs in green
# .apply(highlight_base_pm,axis=1) 
df_result.style.apply(lambda s: highlight_pm(s,pm_sign)).apply(highlight_base_pm,axis=1) 

Unnamed: 0,PPL ± stderr,tc_disease_disease ± stderr,tc_drug_disease ± stderr,tc_drug_drug ± stderr,topic_recon ± stderr
-avgbaseline,114.304 ± 0.279,0.021 ± 0.002,0.036 ± 0.014,0.13 ± 0.005,1.524 ± 0.13
-batch 256,115.732 ± 0.321,0.027 ± 0.006,0.021 ± 0.011,0.111 ± 0.011,1.181 ± 0.191
-batch 32,113.634 ± 0.391,0.018 ± 0.002,0.017 ± 0.005,0.125 ± 0.012,1.406 ± 0.127
-batch 64,113.83 ± 0.235,0.024 ± 0.002,0.022 ± 0.008,0.134 ± 0.02,1.587 ± 0.22
-embed_dim 128,118.867 ± 0.235,0.024 ± 0.003,0.056 ± 0.014,0.186 ± 0.018,1.497 ± 0.093
-embed_dim 64,128.08 ± 0.485,0.025 ± 0.004,0.047 ± 0.011,0.239 ± 0.013,1.185 ± 0.176
-gcn_dim 256,113.604 ± 0.39,0.026 ± 0.005,0.012 ± 0.005,0.124 ± 0.009,1.993 ± 0.257
-gcn_dim 64,115.961 ± 0.394,0.029 ± 0.003,0.024 ± 0.007,0.144 ± 0.008,0.923 ± 0.058
-gcn_drop 0.15,115.538 ± 0.28,0.025 ± 0.004,0.015 ± 0.006,0.124 ± 0.013,1.624 ± 0.154
-gcn_drop 0.2,114.672 ± 0.415,0.021 ± 0.002,0.034 ± 0.011,0.125 ± 0.013,1.25 ± 0.24


In [13]:
len(df_result.index)

20

In [11]:
compare_params.keys() # the seeds we used in these experiments

dict_keys(['-seed 41504', '-seed 43690', '-seed 24602', '-seed 33273', '-seed 21897'])

In [15]:
cur_indexes = set(list(df_result.index))
cur_indexes

{'-avgbaseline',
 '-batch 256',
 '-batch 32',
 '-batch 64',
 '-embed_dim 128',
 '-embed_dim 64',
 '-gcn_dim 256',
 '-gcn_dim 64',
 '-gcn_drop 0.15',
 '-gcn_drop 0.2',
 '-gcn_drop 0.3',
 '-hid_drop 0.1',
 '-hid_drop 0.2',
 '-hid_drop 0.4',
 '-init_dim 256',
 '-init_dim 512',
 '-init_dim 64',
 '-lr 0.0001',
 '-lr 0.0005',
 '-lr 0.005'}

In [19]:
all_indexes = []
with open("./params/params_pool.txt") as f:
    for l in f.readlines():
        tokens = l.strip("\n").split(" ")
        token1 = tokens[-1]
        if token1 == 'CCS':
            param = '-avgbaseline'
        else:
            token2 = tokens[-2]
            param = f"{token2} {token1}"
        all_indexes.append(param)
all_indexes = set(all_indexes)

In [20]:
all_indexes.difference(cur_indexes)

{'-gcn_drop 0.18', '-gcn_drop 0.4'}

In [24]:
# find the missing params from experiments (identified with seed)
for k,v in s2p_map.items():
    cur_v = []
    for param in v:
        pattern = re.search(r" -seed.*",param).group()
        cur_v.append(param.replace(pattern,""))
    newv = set(cur_v)
    print(k, all_indexes.difference(newv))

-seed 41504 set()
-seed 43690 set()
-seed 24602 set()
-seed 33273 {'-gcn_drop 0.18'}
-seed 21897 {'-gcn_drop 0.4'}


In [9]:
import re
words = ["-word1","-word2","-word3","-word4"]
params_str = " ".join(words)


tokens = params_str.split("-")
tokens = [param.strip(" ") for param in tokens[1:]]
params = [f"-{token}" for token in tokens]
param_num = len(params)

result = []
for i in range(param_num-1):
    base = params[i]
    for j in range(i, param_num-1):
        for k in range(j+1, param_num):
            curl = f"{base} {params[k]}"
            result.append(curl)
        base = f"{base} {params[j+1]}"
        
result

['-word1 -word2',
 '-word1 -word3',
 '-word1 -word4',
 '-word1 -word2 -word3',
 '-word1 -word2 -word4',
 '-word1 -word2 -word3 -word4',
 '-word2 -word3',
 '-word2 -word4',
 '-word2 -word3 -word4',
 '-word3 -word4']

In [15]:
from subprocess import Popen, PIPE
import re
pipe = Popen(["screen","-ls"], stdout=PIPE)

text = pipe.communicate()[0].decode("utf-8")
running_screens = text.split("\n")[1:-2]
if running_screens != []:
    for l in running_screens:
        match = re.

SyntaxError: invalid syntax (1222724350.py, line 9)