In this notebook, we visualize some sweep results.
We create boxplots for each noise level and for both models to see:
- The performance of both models
- How performance relates to the number of iterations/layers

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import wandb

import numpy as np
import seaborn as sns

# Make the project root importable so that `src.utils...` resolves from this notebook.
import sys
# `_dh` is presumably IPython's directory-history list; its first entry is taken as the
# directory the kernel started in, and its parent as the project root — TODO confirm.
# The `.parent` access requires that entry to be a path-like object, not a plain string.
BASE_PATH = globals()['_dh'][0].parent.absolute()
# Inserted at position 1, i.e. right after the first entry of sys.path.
sys.path.insert(1, str(BASE_PATH))

from src.utils.wandb_analysis import get_sweep_info, get_clean_sweep_runs
# Authenticate with Weights & Biases before any sweep data is requested.
wandb.login()

True

In [4]:
def get_boxplot(sweep_id, noise_level):
    """Draw a box plot of test accuracy per number of iterations/layers for one sweep.

    Parameters
    ----------
    sweep_id : str
        Sweep identifier forwarded to ``get_clean_sweep_runs``.
    noise_level : float or str
        Used only to label the figure title; the runs are NOT filtered by it.

    Returns
    -------
    None
        The figure is created as a side effect and rendered by the notebook.
    """
    df = get_clean_sweep_runs(sweep_id, model_name=None)
    fig, ax = plt.subplots()
    bp = sns.boxplot(x='num_iter_layers',
            y="test_accuracy",
            data=df,
            width=0.5,
            flierprops={"marker": "x"},
            medianprops={"color": "red"},
            color="royalblue",
            ax=ax
            )
    bp.set(title="noise level: "+str(noise_level), xlabel='Number of iterations/layers', ylabel='Test accuracy')
    # There is no `hue` here, so seaborn attaches no legend; sns.move_legend raises
    # ValueError on an Axes without a legend, so only relocate one if it exists.
    if bp.get_legend() is not None:
        sns.move_legend(bp, loc=(1.02, 1))

In [35]:
# Sweep IDs of the first batch of sweeps, one list per model family.
# The trailing comment on each ID is the author's label (noise level, data setting).
usual_sweeps = [
"yb9nj1j5", # noise 0, 100 data
"s29ovdjm", # noise 1, 100 data
"io2roeki", # noise 2, 100 data
"ni1y0i2n", # noise 3, 100 data
"c1v5292z", # noise 4, 100 data
"f3kaci4y", # noise 5, 100 data
"91g554g5", # noise 6, 100 data
"zgyfs4la"  # noise 7, 100 data
]

# Unlike the other two lists, most noise levels here have both a "100 data" and a "200 data" sweep.
iterative_sweeps = [
"8b5samb8", # noise 0, 100 data
"4013a4eg", # noise 1, 100 data
"hewvhl9l", # noise 1, 200 data
"y4prhi94", # noise 2, 100 data
"yypt1ahv", # noise 2, 200 data
"el1u1dr4", # noise 3, 100 data
"n21z6vhb", # noise 3, 200 data
"4mlrggda", # noise 4, 100 data
"f5sg4zi7", # noise 4, 200 data
"x6nu6eip", # noise 4, 200 data -- NOTE(review): same label as the previous entry; confirm whether this is a rerun or a mislabel
"qspwm421", # noise 5, 100 data
"2e6jew01", # noise 5, 200 data
"36x0z5q4", # noise 6, 100 data
"eyb0xbt1", # noise 6, 200 data
"823rcbn7", # noise 7, 100 data
"7etnd43p"  # noise 7, 200 data
]

variant_sweeps = [
"nhj93ccd", # noise 0, 100 data
"z35d1w5v", # noise 1, 100 data
"3qpjo1d3", # noise 2, 100 data
"uhno0u30", # noise 3, 100 data
"nm41at8u", # noise 4, 100 data
"4fq47kn0", # noise 5, 100 data
"9w9xq0qg", # noise 6, 100 data
"kj2gmoex"  # noise 7, 100 data
]

# The following list contains newer sweeps that fix all hyperparameters except the
# number of iterations/layers.
# NOTE(review): the original comment ended mid-sentence ("... iterations/layers to");
# the intended remainder is unknown. Presumably the same description applies to
# usual_sweeps_new and variant_sweeps_new below — confirm.
iterative_sweeps_new = [
    "npqagdoe",
    "fbn61ca2",
    "9atukh0o",
    "3i2obz9l",
    "ic0hskuj",
    "g1jho213",
    "nnefje8j",
    "k8g0jcqg",
    "eypvgt5a",
    "rpdye1qv",
    "dpqr6d03",
    "kvx76htx",
    "sc74qok8",
    "98xetv9g",
    "hkdp23qh",
    "pr8zccth",
    "t5fm602l"
]

usual_sweeps_new = [
    "ajdn1ku3",
    "wpwawhmp",
    "2m3mssse",
    "3p8bp8cp",
    "fhyx3bbq",
    "mba8gfcz",
    "nr0x2ymu",
    "fue8e9t8",
    "ztmtmko3"
]

variant_sweeps_new = [
    "nlx35xhp",
    "tneqq6w2",
    "dxka4tsg",
    "9c28vxsu",
    "gvoklm8v",
    "ie77h8x0",
    "z5dtpgqd",
    "1yx7lryw",
    "p5z39rcd"
]
# Sweep IDs for the "et" and "la" model variants (original and "_new" batches).
# The per-sweep noise levels are not recorded here.
# NOTE(review): presumably these correspond to the 'et_iGCN' and 'l_iGCN' model names
# used with the *_fs lists elsewhere in this notebook — confirm.
et_sweeps = [
    "0raelo43",
    "4rg0uxpp",
    "94xcgi8e",
    "erzrkzbk",
    "fee9p58u",
    "hui3yyj4",
    "kocjz30g",
    "l7vskl73"
]
la_sweeps = [
    "8riai7xz",
    "9on84a2k",
    "asuy8soi",
    "jwfs6hs6",
    "lps302bu",
    "n5x4650c",
    "rae47h1a",
    "tc64owjo"
]

et_sweeps_new = [
    "0jtwamgf",
    "df8kwrcn",
    "dvdreyh1",
    "hn1uti5e",
    "rw1njstw",
    "w9m9pd9w",
    "z9rcd42e",
    "zvbdj6cq"
]
la_sweeps_new = [
    "25djetlf",
    "ivdib8fh",
    "k8u5qveo",
    "ls4ffjhb",
    "ovgs93gz",
    "wg2tfjx0",
    "xjjclkjl",
    "zandqytf"
]

# The "_fs" suffix below means "fixed seed".
# Each list holds eight sweep IDs; the per-sweep noise levels are not recorded here.
usual_sweeps_fs = [
    "wpgqndil",
    "snixszrp",
    "sg10twya",
    "fixk72p9",
    "chw33qjj",
    "85ww4r7f",
    "6gsnhfm2",
    "xyla47oz"
]
iterative_sweeps_fs = [
    "r8a00hlx",
    "hdbezpa8",
    "eed7mz8v",
    "3zpkny2k",
    "09xsc4jg",
    "5yj0hoq0",
    "tb7en5xo",
    "deq4nstg"
]
variant_sweeps_fs = [
    "z76p5hhw",
    "wrykoehh",
    "rdakindz",
    "katu4o1y",
    "jl8sb2r8",
    "d5h89j4k",
    "9nwot4q3",
    "6w2oje4q"
]
et_sweeps_fs = [
    "iko74hch",
    "7xo3sn45",
    "42bru6yl",
    "dhcgqwh9",
    "x2uurnl1",
    "r0qjjsad",
    "46hm8exn",
    "tv6fdghw"
]
# These sweeps are loaded in the "Learned smoothing factors" section with model_name='l_iGCN'.
la_sweeps_fs = [
    "z5u0dnci",
    "ux3ih27j",
    "tapto3iy",
    "jxpq7oip",
    "4iepihxt",
    "xbla22nc",
    "7ndaao6o",
    "yh51xjxf"
]
et_norm_sweeps_fs = [ # For these sweeps, the iteration is embedded by dividing by the total number of iterations
    "2wodzj6l",
    "bowne41k",
    "hfiwothe",
    "lr2ldaeo",
    "mn4dkjom",
    "qeplt441",
    "qn8hehck",
    "vnszomqr"
]
# Sweep IDs grouped by model prefix (gcn / igcn / igat / igcnv) and a two-letter tag
# (cs / pm / cr). NOTE(review): the tags presumably name the dataset — confirm.
# Trailing comments give the noise level of each sweep (0, 0.5 or 0.7).
gcn_cs=[
    "c51ajunx", # noise 0
    "4cdluy2h", # noise 0.5
    "f4o5r28e" # noise 0.7
]
igcn_cs = [
    "nw6tl195", # noise 0
    "of4scxy5", # noise 0.5
    "uku89c8n" # noise 0.7
]




# Fixed-seed ("_fs") counterparts.
gcn_cs_fs = [
    "f9oinwkz", # noise 0
    "t7za0ph0", # noise 0.5
    "cuwdj19f" # noise 0.7
]

igcn_cs_fs = [
    "o170sjpu", # noise 0
    "d9p5cq09", # noise 0.5
    "nncry82a" # noise 0.7
]

gcn_pm_fs = [
    "nanuq30r", # noise 0
    "xkoexajw", # noise 0.5
    "3p3wmfvf" # noise 0.7
]

# Four sweeps per noise level in this list.
igcn_pm_fs = [
    "w2r5pqsl", # noise 0
    "3lfmpfyr", # noise 0
    "it6yivlp", # noise 0
    "5my7tp9h", # noise 0
    "wiohqb26", # noise 0.5
    "lv444ihi", # noise 0.5
    "0v9to5o9", # noise 0.5
    "omustd51", # noise 0.5
    "n0avvfij", # noise 0.7
    "gekwibpk", # noise 0.7
    "enc22u9x", # noise 0.7
    "anx32dnu" # noise 0.7
]

igat_cr_fs = [
    "iphwe5ng", # noise 0
    "ldtwbmgf", # noise 0.5
    "6ospmpw2" # noise 0.7
]

# Two sweeps per noise level in the two lists below.
igcnv_cs_fs = [
    "31pg9a6s", # noise 0
    "lb2f13h4", # noise 0
    "an0m2893", # noise 0.5
    "uxdeh47d", # noise 0.5
    "vey0ipk3", # noise 0.7
    "wuxu9xyn" # noise 0.7
]

igcnv_pm_fs = [
    "836w6wfe", # noise 0
    "b7f9h1ha", # noise 0
    "7evp63z4", # noise 0.5
    "me29hh89", # noise 0.5
    "vq5tihts", # noise 0.7
    "xitugzm8" # noise 0.7
]



# Newer fixed-seed sweeps, three per list.
# NOTE(review): the trailing #0 / #5 / #7 comments presumably denote noise 0 / 0.5 / 0.7,
# matching the labels used for the earlier *_fs lists — confirm.
gcn_cs_fs_new = [
    "zmgyk5ib", #0
    "945jmsbj", #5
    "c3sf0gte" #7
]
gcn_pm_fs_new = [
    "qff3i62o", #0
    "3k4cumdc", #5
    "wdpc3vu0" #7
]
# Loaded in the experiment section with model_name='GCN'.
gcn_cr_fs_new=[
    "bja0do1o", #0
    "6tu3jk7a",#5
    "6kh2f2p4"#7
]

igcn_cs_fs_new = [
    "bqs2r4gb", #0
    "anjdfn1j", #5
    "sjvvlvko" #7
]

# Loaded in the experiment section with model_name='iterativeGCN'.
igcn_cr_fs_new = [
    "ixlq5rts", #0
    "zeay7g0u", #5
    "nu5i0axx" #7
]
igcnv_cs_fs_new=[
    "ktn26iet", #0
    "imxwvb6u",#5
    "ejmhxqb4"#7
]
# Loaded in the experiment section with model_name='iterativeGCNvariant'.
igcnv_cr_fs_new=[
    "32u3g3de",#0
    "ycyroewa", #5
    "75l1f5zx"#7
]

In [None]:
# Summarise the igcnv_cs_fs sweeps, ordered by noise level.
igcnv_cs_info = get_sweep_info(igcnv_cs_fs)
igcnv_cs_info.sort_values(by=["noise_percent"])

In [None]:
# Summarise the igcn_cs_fs sweeps, ordered by noise level.
igcn_cs_info = get_sweep_info(igcn_cs_fs)
igcn_cs_info.sort_values(by=["noise_percent"])

# Learned smoothing factors

In [None]:
# Fetch the cleaned runs of every fixed-seed "la" sweep and stack them into one frame.
la_df_list = [
    get_clean_sweep_runs(sweep_id=sweep_id, model_name='l_iGCN')
    for sweep_id in la_sweeps_fs
]
la_df = pd.concat(la_df_list)

In [None]:
# Inspect the runs at one (noise level, number of iterations) combination.
noise_level = 0.1
num_iterations = 4
df = la_df.loc[la_df["noise_percent"] == noise_level]
dff = df.loc[df["num_iter_layers"] == num_iterations]
dff

In [None]:
# Average the learned smoothing factors, position by position, over all runs at one
# (noise level, number of iterations) combination.
noise_level = 0.5
df = la_df[la_df["noise_percent"] == noise_level]
num_iterations = 7
dff = df[df["num_iter_layers"] == num_iterations]
# Drop runs without a test accuracy by value rather than by index label: la_df comes
# from pd.concat without ignore_index, so index labels may repeat across sweeps and a
# label-based drop would also remove valid rows that share a label with a null row.
# Returning a new frame also avoids mutating a filtered slice in place.
dff = dff.dropna(subset=["test_accuracy"])
len_df = len(dff)

# Element-wise running sum; zip() limits each run's contribution to at most
# num_iterations entries.
running_sum = [0.0] * num_iterations
for item in dff["learned smoothing factors"]:
    running_sum = [a + b for (a, b) in zip(running_sum, item)]

if len_df:
    mean = [a / len_df for a in running_sum]
else:
    # No matching runs: report NaN instead of dividing by zero.
    mean = [float("nan")] * num_iterations
print(mean)

# Exp

In [36]:
from tqdm import tqdm

# (sweep list, model name) pairs to download. The commented entries are further model
# families that can be re-enabled.
sweep_groups = [
    (gcn_cr_fs_new, 'GCN'),
    (igcn_cr_fs_new, 'iterativeGCN'),
    (igcnv_cr_fs_new, 'iterativeGCNvariant'),
    # (et_sweeps_fs, 'et_iGCN'),
    # (la_sweeps_fs, 'l_iGCN'),
    # (et_norm_sweeps_fs, 'normed_et_iGCN'),
]

df_list = []
for sweep_ids, model_name in sweep_groups:
    # One progress bar per sweep list.
    for sweep_id in tqdm(sweep_ids):
        df_list.append(get_clean_sweep_runs(sweep_id=sweep_id, model_name=model_name))

df = pd.concat(df_list)
# Remove runs without a test accuracy by value, not by index label: the concatenated
# frame keeps each sweep's own index, so labels can repeat, and a label-based drop
# would also delete valid rows that share a label with a null row.
df = df.dropna(subset=["test_accuracy"])
# df.to_csv("df_full.csv")

100%|██████████| 3/3 [00:30<00:00, 10.02s/it]
100%|██████████| 3/3 [00:29<00:00,  9.87s/it]
100%|██████████| 3/3 [00:50<00:00, 16.95s/it]


# Start from here (load cached results from `df_full.csv`)

In [None]:
# Load previously saved results from disk; this replaces any `df` built in earlier cells.
# NOTE(review): presumably written by the commented-out `df.to_csv("df_full.csv")` call
# in the download cell — confirm the file is up to date before relying on it.
df = pd.read_csv("df_full.csv")

In [None]:
# Display the results frame (rich display of the cell's last expression).
df

In [None]:
# Apply the theme once, before any figure exists. Inside the loop it ran after
# plt.figure(), so on a fresh kernel the first figure missed the 8x6 size.
sns.set(style="darkgrid", rc={'figure.figsize': (8, 6)})

# One grouped box plot per noise level: test accuracy against the number of
# iterations/layers, with one box colour per model.
for noise_level in np.sort(pd.unique(df["noise_percent"])):
    dfn = df[df["noise_percent"] == noise_level]
    # Explicit size so every figure is 8x6 regardless of prior rc state.
    fig, ax = plt.subplots(figsize=(8, 6))

    bp = sns.boxplot(x='num_iter_layers',
            y="test_accuracy",
            data=dfn,
            width=0.5,
            hue="model name",
            flierprops={"marker": "x",
                        "markersize": 2},
            medianprops={"color": "red"},
            palette="Paired",
            ax=ax
            )
    bp.set(title="noise level: "+str(noise_level),
           xlabel='Number of iterations/layers',
           ylabel='Test accuracy')
    # Place the hue legend outside the plotting area, to the right of the axes.
    sns.move_legend(bp, loc=(1.02, 1))
    # File name used by the (currently disabled) export below.
    name = "noise_level_"+str(noise_level)+".png"

#     fig.savefig(name, bbox_inches='tight')
    

In [37]:
# Sizes used by the parameter-count estimate below.
# NOTE(review): presumably input feature dimension, hidden width and number of classes;
# confirm against the model definition.
IN_DIM, HIDDEN_DIM, OUT_DIM = 1433, 32, 7

# Build one summary row per (noise level, model, number of iterations/layers).
dict_list = []
for noise_level in np.sort(pd.unique(df["noise_percent"])):
    noise_df = df[df["noise_percent"] == noise_level]
    for model in pd.unique(noise_df["model name"]):
        model_df = noise_df[noise_df["model name"] == model]
        for iteration in np.sort(pd.unique(model_df["num_iter_layers"])):
            accuracies = model_df.loc[model_df["num_iter_layers"] == iteration, "test_accuracy"]

            # Only "GCN" scales the hidden-block term with the layer count; every other
            # model name counts that block once.
            eff_iter = iteration if model == "GCN" else 1

            # NOTE(review): the lone HIDDEN_DIM term after the hidden block is not scaled
            # by eff_iter — confirm this matches the actual model.
            num_param = (
                IN_DIM * HIDDEN_DIM + HIDDEN_DIM
                + HIDDEN_DIM * HIDDEN_DIM * eff_iter + HIDDEN_DIM
                + HIDDEN_DIM * OUT_DIM + OUT_DIM
            )

            dict_list.append({
                "noise percent": noise_level,
                "model name": model,
                "number of iterations/layers": iteration,
                "mean": np.mean(accuracies),
                "median": np.median(accuracies),
                "std": np.std(accuracies),
                "parameters": num_param,
                "number of runs": accuracies.count(),
            })

stats_df = pd.DataFrame(dict_list)

In [39]:
# Show every row of the summary for one noise level, rounded to three decimals.
pd.set_option('display.max_rows', None)
noise_level = 0.5
is_selected_noise = stats_df["noise percent"] == noise_level
stats_df.loc[is_selected_noise].round(3)

Unnamed: 0,noise percent,model name,number of iterations/layers,mean,median,std,parameters,number of runs
10,0.5,GCN,3,0.332,0.33,0.122,49223,98
11,0.5,iterativeGCN,7,0.415,0.434,0.089,47175,98
12,0.5,iterativeGCNvariant,2,0.193,0.139,0.087,47175,26
13,0.5,iterativeGCNvariant,3,0.162,0.156,0.045,47175,24
14,0.5,iterativeGCNvariant,4,0.157,0.137,0.069,47175,21
15,0.5,iterativeGCNvariant,5,0.157,0.136,0.067,47175,26
16,0.5,iterativeGCNvariant,6,0.149,0.134,0.047,47175,22
17,0.5,iterativeGCNvariant,7,0.152,0.147,0.043,47175,31
18,0.5,iterativeGCNvariant,8,0.192,0.154,0.074,47175,24
19,0.5,iterativeGCNvariant,9,0.145,0.144,0.03,47175,24


In [None]:
# Box plot for a single sweep; the second argument only labels the title.
get_boxplot(sweep_id="an0m2893", noise_level=0.5)