In [None]:
%matplotlib inline
import os
import pickle
from collections import defaultdict
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy.stats import variation

from neighbor_variance import neighbor_variance, get_file_list
from scale_to_latex import get_columnwidth, get_textwidth, get_figsize

In [None]:
# Sub-folder name of every dataset; the order has to line up with `datasets` below,
# because later cells index graphdirs / resultdirs / datasets with the same position.
_dataset_folders = ["Cora", "Facebook", "Protein", "BlogCatalog", "Wikipedia"]

# Root folder of the input graphs, and one graph folder per dataset
d = "../train/data/"
graphdirs = [f"{d}{folder}/" for folder in _dataset_folders]

# Root folder of the trained embeddings, and one result folder per dataset
r = "../train/results/"
resultdirs = [f"{r}{folder}/" for folder in _dataset_folders]

# Embedding algorithms to evaluate ("hope" is currently left out)
algorithms = ["graphsage", "line", "node2vec", "sdne"]

# Internal dataset identifiers, in the same order as the folder lists above
datasets = ["cora", "facebook", "protein", "blog", "wikipedia"]

# Internal identifier -> display name (algorithms first, then datasets)
replace_dict = {
    "line": "LINE",
    "hope": "HOPE",
    "sdne": "SDNE",
    "graphsage": "GraphSAGE",
    "node2vec": "node2vec",
    "facebook": "Facebook",
    "protein": "Protein",
    "blog": "BlogCatalog",
    "wikipedia": "Wikipedia",
    "cora": "Cora",
}

# Display name of an algorithm -> hex colour
color_dict = {
    "LINE": "#E30066",
    "HOPE": "#612158",
    "SDNE": "#F6A800",
    "GraphSAGE": "#00549F",
    "node2vec": "#57AB27",
}

# Seed handed to the experiment
seed = 87235

In [None]:
# Run the neighbor_variance experiment for all specified datasets and algorithms.
# For every dataset the node dict returned by one neighbor_variance call is handed
# to the next call (nodes_dict=nodes) and finally pickled to ./nb_var_nodes/.
#
# Produces:
#   df        - one row per (dataset, algorithm, node, neighbor_type) with the
#               variance, mean and mean absolute deviation (in degrees) of "similarity"
#   agg_list  - [algorithm, dataset index, aggregated_results] per run
#   min_nodes - smallest number of distinct nodes kept for any dataset
start_time = time()
agg_list = []
dataset_frames = []  # per-dataset frames, concatenated once at the end
k = 1600  # second positional argument of neighbor_variance; presumably the sample size - TODO confirm
min_nodes = 10000 # larger than sample size

# Create the output folder up front so the pickle dump cannot fail after a long run.
os.makedirs("./nb_var_nodes", exist_ok=True)

for i in range(len(datasets)):
    algo_frames = []
    nodes = None
    for algo in algorithms:
        embedding_list = get_file_list([algo, datasets[i]], resultdirs[i])
        aggregated_results, raw_results, nodes = neighbor_variance(graphdirs[i], k, resultdirs[i],
                                                                   embedding_list, datasets[i], nodes_dict=nodes, seed=seed)
        # preaggregate into variances
        # Group once instead of re-scanning raw_results with a boolean mask for every pair.
        similarity_by_pair = {
            pair: values
            for pair, values in raw_results.groupby(["node", "neighbor_type"], sort=False)["similarity"]
        }
        # Pairs without any rows still get a (NaN) entry, exactly like the mask-based selection did.
        no_rows = raw_results["similarity"].iloc[:0]
        neighbor_types = raw_results["neighbor_type"].unique()

        variance_dict = defaultdict(list)
        for node in raw_results["node"].unique():
            for j in neighbor_types:
                sel = similarity_by_pair.get((node, j), no_rows)
                # Similarities are converted to angles in degrees; values outside [-1, 1]
                # become NaN and are skipped by the pandas mean below.
                angles = np.degrees(np.arccos(sel))
                variance_dict["node"].append(node)
                variance_dict["neighbor_type"].append(j)
                variance_dict["algorithm"].append(algo)
                variance_dict["dataset"].append(datasets[i])
                variance_dict["variance"].append(np.var(sel))
                variance_dict["mean"].append(np.mean(sel))
                # Series.mad() was removed in pandas 2.0; this is the same mean absolute deviation.
                variance_dict["mean_abs_dev_deg"].append((angles - angles.mean()).abs().mean())
        algo_frames.append(pd.DataFrame(variance_dict))
        agg_list.append([algo, i, aggregated_results])

    # DataFrame.append was removed in pandas 2.0; concatenate the collected frames instead.
    tmp_df = pd.concat(algo_frames) if algo_frames else pd.DataFrame()

    with open(f"./nb_var_nodes/{datasets[i]}_nodes", "wb") as node_file:
        pickle.dump(nodes, node_file)
    # only keep those nodes that have 1-nb, 2-nb, distant(-nb)
    # (original note: the length will be a multiple of 90 - not verifiable from this cell)
    notnan_nodes = [node for node in nodes.keys() if None not in nodes[node]]
    tmp_df = tmp_df.loc[tmp_df.node.isin(notnan_nodes)]

    # combine dataframes
    dataset_frames.append(tmp_df)
    min_nodes = min(min_nodes, len(tmp_df.node.unique()))

df = pd.concat(dataset_frames) if dataset_frames else pd.DataFrame()
print(f"Computing the data frame took {time()-start_time} seconds.")

In [None]:
# Make sample size consistent: every dataset keeps at most `min_nodes` distinct nodes.
# The subsample is drawn with a generator seeded from `seed`, so re-running the
# notebook selects the same nodes.
save_df = df.copy()  # copy of the frame as it was before subsampling
rng = np.random.default_rng(seed)
balanced_frames = []
for dataset in datasets:
    tmp = df[df.dataset == dataset]
    sampled_nodes = tmp.node.unique()
    rng.shuffle(sampled_nodes)  # in-place shuffle of the array returned by unique()
    kept_nodes = sampled_nodes[:min_nodes]
    kept = tmp[tmp.node.isin(kept_nodes)].copy()
    balanced_frames.append(kept)
    print(dataset, len(kept.node.unique()))
print(min_nodes)
# DataFrame.append was removed in pandas 2.0; build the result with a single concat.
udf = pd.concat(balanced_frames) if balanced_frames else pd.DataFrame()
df = udf

In [None]:
# Swap internal identifiers for display names and prepare the columns used for plotting.
# Category order of the algorithms (put "HOPE" in front when that algorithm is enabled).
algorithm_order = ["LINE", "node2vec", "SDNE", "GraphSAGE"]
neighbor_labels = {0: "1-Hop Neighbor", 1: "2-Hop Neighbor", 2: "Distant Node"}

df = (
    df
    .replace(to_replace=replace_dict)
    .rename(columns={"algorithm": "Algorithm"})
)
df["Algorithm"] = pd.Categorical(df["Algorithm"], algorithm_order)
df["neighbor_type"] = df["neighbor_type"].replace(to_replace=neighbor_labels)
df.head()

In [None]:
# Bar plot of the mean absolute deviation of the angle (in degrees) per algorithm and
# neighbor type, one facet per dataset, saved to plots/nb_var_mad_all.pdf.
angle = 40  # rotation of the x tick labels
columnwidth = get_columnwidth()
textwidth = get_textwidth()
light_gray = ".8"
dark_gray = ".15"
sns.set(context="notebook", style="ticks", font_scale=1, #font="Bitstream Vera Sans",
        rc={"axes.edgecolor": light_gray, "xtick.color": dark_gray,
            "ytick.color": dark_gray, "xtick.bottom": True,
            "font.size": 8, "axes.titlesize": 6, "axes.labelsize": 6, "xtick.labelsize": 6, "legend.fontsize": 6,
            "ytick.labelsize": 6, "axes.linewidth": 1,
            "xtick.minor.width": 0.5, "xtick.major.width": 0.5,
            "ytick.minor.width": 0.5, "ytick.major.width": 0.5, "lines.linewidth": 0.7,
            "xtick.major.size": 3,
            "ytick.major.size": 3,
            "xtick.minor.size": 2,
            "ytick.minor.size": 2,
           })
width, height, aspect = get_figsize(textwidth, wf=1/5)

# Datasets shown as facets; narrow this list to plot only a subset.
plotted_datasets = ["BlogCatalog", "Cora", "Facebook", "Protein", "Wikipedia"]
plot_data = df.loc[df.dataset.isin(plotted_datasets)].sort_values(by=["dataset", "Algorithm"])

# NOTE(review): `errwidth` is reported as deprecated by recent seaborn releases in
# favour of `err_kws` - switch once the seaborn version is pinned; confirm against the docs.
g = sns.catplot(data=plot_data,
                x="Algorithm", y="mean_abs_dev_deg", errwidth=0, palette="tab10",
                hue="neighbor_type", kind="bar", col="dataset", estimator=np.mean, legend=False, height=width, aspect=1)
g.set_ylabels("Average Mean Absolute Deviation\nof Angle in Degrees")
g.set_titles("{col_name}")
# One shared legend, anchored below the fourth facet.
g.axes[0,3].legend(loc='upper center', bbox_to_anchor=(-0.8, -0.6), fancybox=False, shadow=False, ncol=5)
g.set_xlabels("")
g.set_xticklabels(["LINE", "node2vec", "SDNE", "GraphSAGE"])
for ax in g.axes[0, :]:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=angle, horizontalalignment='right')
    ax.set_yticks(np.arange(0, 9, 2))

# savefig raises FileNotFoundError when the target folder is missing, so create it first.
os.makedirs("plots", exist_ok=True)
g.savefig("plots/nb_var_mad_all.pdf", bbox_inches="tight")
