In [None]:
import os
from pathlib import Path
import importlib

# --- Notebook configuration: replace the placeholder paths before running ---

# Folder that contains the `hephaestus` package.
# It is resolved to an absolute path because this cell changes the working
# directory at the end: `SAVE_PATH` is created *before* the `os.chdir` but is
# used for saving figures *after* it, so a relative base path would otherwise
# point to two different locations.
BASE_MODULO_PATH = Path("path/of/the/folder/where/hephaestus/is/placed").resolve()

# Output directory for the figures saved by this notebook (created if missing).
SAVE_PATH = BASE_MODULO_PATH / "excluded" / "hephaestus_storage_unit" / "plots" / "stats_synth_dataset"
SAVE_PATH.mkdir(exist_ok=True, parents=True)

# Extraction folders of the two data archives. These are left exactly as
# written: if relative, they are interpreted relative to `BASE_MODULO_PATH`
# once the working directory has been changed below.
DETERMINISTIC_DATA_PATH = Path(
    "path/to/where/v2.tar.xz/was/extracted"
)
NON_DETERMINISTIC_DATA_PATH = Path(
    "path/to/where/v1_missing_parts.tar.xz/was/extracted"
)

# Folders holding the per-dataset label CSV files.
DETERMINISTIC_LABELS_PATH = DETERMINISTIC_DATA_PATH / "v2" / "labels"
NON_DETERMINISTIC_LABELS_PATH = NON_DETERMINISTIC_DATA_PATH / "v1_missing_parts" / "labels"

# Run the rest of the notebook from the hephaestus folder.
os.chdir(BASE_MODULO_PATH)

In [None]:
import networkx as nx

import pandas as pd

import torch
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

import hephaestus.utils.general_utils as hutils
import hephaestus.utils.load_general_config as hconf

# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

# Global seaborn look for the notebook: white grid background and the
# "paper" context with fonts scaled by 2.
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=2)

In [None]:
# NumPy random generator with a fixed seed (42). In the cells visible here it
# is only referenced by a commented-out sampling snippet.
rng = np.random.default_rng(42)


def reload_hephaestus_modules():
    """Reload the already-imported hephaestus utility module (`hutils`).

    Uses `importlib.reload`, so edits made to the module's source are picked
    up without restarting the kernel.
    """
    # NOTE(review): `hconf` is imported next to `hutils` but is not reloaded
    # here — confirm whether that is intentional.
    for module in (hutils,):
        importlib.reload(module)


def roundup(x, base=10):
    """Round ``x`` up to the nearest multiple of ``base``.

    Parameters
    ----------
    x : number or numpy array
        Value(s) to round. Values that already are a multiple of ``base``
        are returned unchanged.
    base : number, default 10
        Step to round up to. The default reproduces the original behaviour
        (round up to the next multiple of ten).

    Returns
    -------
    numpy float or numpy array
        ``ceil(x / base) * base``.

    Raises
    ------
    ValueError
        If ``base`` is zero.
    """
    if base == 0:
        raise ValueError("base must be non-zero")
    # True division by a float keeps integer inputs from being floor-divided.
    return np.ceil(x / float(base)) * base

In [None]:
# Hex colour codes used by the plots in this notebook. Entries are addressed
# by position further down (e.g. `p[0]`, `p[1]`, `p[2]`, `p[1:3]`), so the
# order of this list matters.
# Two entries ("#FB6467FF" and "#440154FF") are 8-digit codes, i.e. they
# include an explicit alpha channel.
p = [
    "#000000",
    "#E69F00",
    "#56B4E9",
    "#009E73",
    "#FB6467FF",
    "#808282",
    "#F0E442",
    "#440154FF",
    "#0072B2",
    "#D55E00",
    "#CC79A7",
    "#C2CD23",
    "#918BC3",
    "#FFFFFF",
]

In [None]:
# Build a seaborn palette from `p`; being the last expression of the cell,
# the palette object is shown as the cell output.
sns.color_palette(p)

## Nodes and Edges

In [None]:
# Per-graph statistics of the deterministic generators.
df_stats = pd.read_csv(DETERMINISTIC_DATA_PATH / "v2" / "stats_deterministic.csv")

# Dataset name of every row (computed once per distinct graph name).
dataset_name_by_graph = {
    graph_name: hutils.get_dataset_name(graph_name)
    for graph_name in df_stats["GraphNames"].unique()
}
row_dataset_names = df_stats["GraphNames"].map(dataset_name_by_graph)

# Rows are kept only when their dataset is listed in the deterministic
# generator configuration.
is_configured = row_dataset_names.isin(list(hconf.DETERMINISTIC_DATA.values()))

# Dataset names found in the CSV that are not part of that configuration.
gen_to_remove = set(row_dataset_names[~is_configured])

# Drop every unconfigured dataset (the previous loop ignored its loop variable
# and only ever filtered on the literal "HYPERCUBE"), then tag the remaining
# rows. The tag is set unconditionally: it used to be assigned inside the
# removal loop and was therefore missing whenever nothing had to be removed.
df_stats = df_stats[is_configured].reset_index(drop=True)
df_stats["GenType"] = "Deterministic"

In [None]:
# For these statistics files, rows whose graph name contains the given
# fragment are dropped.
excluded_fragment_by_file = {
    "NDeterministicGen_stats_1.csv": "REGULAR",
    "NDeterministicGen_stats_3.csv": "hephaestus",
}

nd_stats_dir = NON_DETERMINISTIC_DATA_PATH / "v1_missing_parts"

df_list = []
# `sorted` makes the concatenation order reproducible (`os.listdir` returns
# entries in arbitrary order); `endswith` only accepts real ".csv" files
# instead of any name that merely contains ".csv".
for f in sorted(os.listdir(nd_stats_dir)):
    if not f.endswith(".csv"):
        continue

    stat = pd.read_csv(nd_stats_dir / f)

    fragment = excluded_fragment_by_file.get(f)
    if fragment is not None:
        has_fragment = stat["GraphNames"].apply(lambda x: fragment in x)
        stat = stat[~has_fragment].reset_index(drop=True)

    stat["GenType"] = "Non Deterministic"
    df_list.append(stat)

# Row labels are not renumbered here (no `ignore_index`), as before.
df_stats_nd = pd.concat(df_list)

In [None]:
# Stack the non-deterministic and the deterministic statistics into one frame.
# `ignore_index` is not passed, so each input keeps its own row labels.
all_df_stats = pd.concat([df_stats_nd, df_stats])

In [None]:
# Derive display names and dataset names from the raw graph identifiers
# *before* the "GraphNames" column is overwritten with the display names.
raw_graph_names = all_df_stats["GraphNames"]
pretty_names = raw_graph_names.apply(hutils.get_pretty_graph_name)
dataset_names = raw_graph_names.apply(hutils.get_dataset_name)
all_df_stats["GraphNames"] = pretty_names

# Sorted distinct names, plus position -> name lookups.
dataset_names = sorted(dataset_names.unique())
pretty_names = sorted(pretty_names.unique())
gen_dataset_dict = dict(enumerate(dataset_names))
pretty_gen_graph_dict = dict(enumerate(pretty_names))

# Mean / max / min node and edge counts per graph name, each passed through
# `roundup`.
nodes_by_graph = all_df_stats.groupby("GraphNames")["NumberNodes"]
edges_by_graph = all_df_stats.groupby("GraphNames")["NumberOfEdges"]

mean_nodes = nodes_by_graph.mean().apply(roundup)
mean_edges = edges_by_graph.mean().apply(roundup)
max_nodes = nodes_by_graph.max().apply(roundup)
max_edges = edges_by_graph.max().apply(roundup)
min_nodes = nodes_by_graph.min().apply(roundup)
min_edges = edges_by_graph.min().apply(roundup)

# Broadcast the per-graph values back onto every row of that graph name.
summary_columns = {
    "MeanNodes": mean_nodes,
    "MeanEdges": mean_edges,
    "MaxNodes": max_nodes,
    "MaxEdges": max_edges,
    "MinNodes": min_nodes,
    "MinEdges": min_edges,
}
for column_name, per_graph_value in summary_columns.items():
    all_df_stats[column_name] = all_df_stats["GraphNames"].map(per_graph_value)

In [None]:
# Display names that are swapped for a different label; every other name is
# left untouched.
graph_name_overrides = {
    "Fast Gnp Random Graph": "Erdos-Renyi",
    "Grid Graph": "Square Lattice",
    "Random Limited 3Dgeo Dd Graph": "Geometric-3D DD Graph",
}
all_df_stats["GraphNames"] = all_df_stats["GraphNames"].map(
    lambda name: graph_name_overrides.get(name, name)
)

In [None]:
# Print min / mean / max of the per-graph Min*, Mean* and Max* columns,
# first for nodes and then for edges. A separator line precedes every
# section except the first one.
section_rule = "\n----------------------------------------------------------------"
sections = (("Minimum", "Min"), ("Mean", "Mean"), ("Max", "Max"))

for position, (section_title, column_prefix) in enumerate(sections):
    if position > 0:
        print(section_rule)
    print(f"{section_title} Stats")

    for quantity, leading in (("Nodes", ""), ("Edges", "\n")):
        values = all_df_stats[f"{column_prefix}{quantity}"]
        print(f"{leading}{quantity} (Used in stats for the real dataset)")
        print(values.min())
        print(values.mean(), " <------ Value used")
        print(values.max())

In [None]:
# Long format: one row per (graph, count type), where the count type is either
# "NumberNodes" or "NumberOfEdges" and "Value" holds the corresponding count.
all_df_stats_melted = all_df_stats.melt(
    id_vars=["GraphNames", "MeanNodes", "MeanEdges"],
    value_vars=["NumberNodes", "NumberOfEdges"],
    var_name="RawCountType",
    value_name="Value",
)

In [None]:
# Violin plots of node and edge counts, one facet per graph name, saved to
# `SAVE_PATH / 'node_edges.pdf'`.
# Note: this switches the global seaborn context to "poster" for every plot
# drawn afterwards, until the context is set again.
sns.set_context("poster", font_scale=1)
g = sns.catplot(
    data=all_df_stats_melted,
    x="RawCountType",
    y="Value",
    col="GraphNames",
    kind="violin",
    inner_kws=dict(box_width=15, whis_width=2),
    hue="RawCountType",
    col_wrap=4,
    aspect=28 / 20,
    # Each facet gets its own y-range.
    sharey=False,
    # Two colours from the notebook palette, one per count type.
    palette=p[1:3],
)

# No x-axis label; facet titles show only the graph name.
g.set_xlabels("")
g.set_titles(col_template="{col_name}", fontweight="bold")
g.despine(left=True)
plt.tight_layout()
plt.savefig(SAVE_PATH/'node_edges.pdf', dpi=1200)
# Close the figure after saving, so it is not kept open.
plt.close()

In [None]:
# Total number of nodes and edges over all graphs, with thousands separators.
for count_column, noun in (("NumberNodes", "nodes"), ("NumberOfEdges", "edges")):
    total = all_df_stats[count_column].sum()
    print(f"{total:,} {noun} present.")
    if noun == "nodes":
        # Blank-line-free output is kept identical to the two original prints.
        pass

---

## See Data - All together

In [None]:
# Suffix appended to the column name of subgraph indices 0-7; any further
# index keeps its plain "Subgraph<i>" name.
motif_labels = (
    "3-path",
    "3-clique",
    "4-path",
    "4-cycle",
    "4-star",
    "tri-pan",
    "bi-pan",
    "4-clique",
)

cols = []           # labelled column names, in subgraph order
new_col_names = {}  # raw column name -> labelled column name
for i in range(hconf.NUM_SUBGRAPHS):
    original = f"Subgraph{i}"
    if i < len(motif_labels):
        base = f"{original} ({motif_labels[i]})"
    else:
        base = original
    cols.append(base)
    new_col_names[original] = base

# Integer positions 0..len(cols)-1, used later as x tick positions/labels.
subgraph_names = range(len(cols))


In [None]:
dgen_data = hconf.DETERMINISTIC_DATA
ndgen_data = hconf.NDETERMINISTIC_DATA

# Rows with a missing value in any of these raw label columns are discarded.
required_label_columns = [f"Subgraph{k}" for k in range(8)]

# Label folders paired with the generator configuration whose values list the
# dataset names accepted from that folder. Deterministic labels are read first.
label_sources = (
    (DETERMINISTIC_LABELS_PATH, dgen_data),
    (NON_DETERMINISTIC_LABELS_PATH, ndgen_data),
)

# dataset name -> label frame with the subgraph columns renamed.
dfs = {}
for labels_dir, generator_config in label_sources:
    for d in os.listdir(labels_dir):
        dataset_name = hutils.get_dataset_name(d, has_file_identifier=True)
        accepted = "csv" in d and dataset_name in list(generator_config.values())
        if not accepted:
            continue
        print(d)
        data = pd.read_csv(os.path.join(labels_dir, d))
        data = data.dropna(subset=required_label_columns)
        data = data.rename(columns=new_col_names)
        dfs[dataset_name] = data

In [None]:
# Build two views over all datasets:
#   * `concatenated_df`            - long format, one row per (graph, subgraph type)
#   * `concatenated_df_not_melted` - wide format, one row per graph
# Both carry a "DatasetName" column holding the key of `dfs` they came from.
melted_frames = []
wide_frames = []
for gen, frame in dfs.items():
    melted = frame.melt(
        id_vars=["GraphName", "OldTarget"],
        value_vars=cols,
        var_name="Type",
        value_name="Norm_Z_Score",
    )
    melted["DatasetName"] = gen

    # Tag the wide frame in place. The column is only inserted when it is
    # missing, so re-running this cell is a no-op here, while any other
    # insertion error is raised instead of being silently swallowed.
    if "DatasetName" not in frame.columns:
        frame.insert(1, "DatasetName", gen)

    melted_frames.append(melted)
    wide_frames.append(frame)

concatenated_df = pd.concat(melted_frames, ignore_index=True)
concatenated_df_not_melted = pd.concat(wide_frames, ignore_index=True)

## Grid Figure - All Datasets with Custom Band for Z-score and Strip Plots

In [None]:
# Melt every dataset separately and stack the results in order of first
# appearance of the dataset. All rows of each dataset are kept (no
# subsampling is applied), and row labels are not renumbered.
dataset_column = concatenated_df_not_melted["DatasetName"]
sampled_conc_df = pd.concat(
    [
        concatenated_df_not_melted[dataset_column == name].melt(
            id_vars=["GraphName", "DatasetName"],
            value_vars=cols,
            var_name="Type",
            value_name="Norm_Z_Score",
        )
        for name in dataset_column.unique()
    ]
)

In [None]:
# Replace "DatasetName" with the display name derived from "GraphName",
# swapping three of the display names for a different label.
dataset_label_overrides = {
    "Fast Gnp Random Graph": "Erdos-Renyi",
    "Grid Graph": "Square Lattice",
    "Random Limited 3Dgeo Dd Graph": "Geometric-3D DD Graph",
}
sampled_conc_df["DatasetName"] = (
    sampled_conc_df["GraphName"]
    .apply(hutils.get_pretty_graph_name)
    .map(lambda label: dataset_label_overrides.get(label, label))
)

In [None]:
# Strip plots of the normalised z-score per dataset, one facet per subgraph
# type, for the leading rows of `sampled_conc_df`; saved as
# 'd_z-score_strip.png'.
sns.set_context("paper", font_scale=1)

g = sns.FacetGrid(
    # Only the first 8 * 12 * 3200 rows are plotted here.
    # NOTE(review): presumably 8 subgraph types x 12 deterministic datasets x
    # 3200 graphs each, i.e. the deterministic part — confirm against the data.
    sampled_conc_df.iloc[: (8 * 12 * 3200)],
    col="Type",
    col_wrap=3,
    sharey=True,
    sharex=True,
    aspect=19 / 11,
)

g.set_titles(col_template="{col_name}", fontweight="bold")
g.map_dataframe(
    sns.stripplot,
    x="Norm_Z_Score",
    y="DatasetName",
    # hue="DatasetName",
    color=p[1],
    alpha=0.3,
    jitter=True,
)
g.set_ylabels("")
g.set_xlabels("Norm Z-score")

g.despine(left=True)
plt.tight_layout()
plt.savefig(SAVE_PATH/'d_z-score_strip.png', dpi=600)
# Close the figure after saving, so it is not kept open.
plt.close()

In [None]:
# Strip plots of the normalised z-score per dataset, one facet per subgraph
# type, for the rows of `sampled_conc_df` that follow the first
# 8 * 12 * 3200 ones; saved as 'nd_z-score_strip.png'.
sns.set_context("paper", font_scale=1)

# Number of leading rows that belong to the other strip plot
# ('d_z-score_strip.png'), which uses `.iloc[: (8 * 12 * 3200)]`.
# NOTE(review): presumably 8 subgraph types x 12 deterministic datasets x
# 3200 graphs each — confirm against the data.
first_row = 8 * 12 * 3200

g = sns.FacetGrid(
    # Start exactly at `first_row`: the previous `+ 1` skipped one row that
    # neither of the two strip plots covered.
    sampled_conc_df.iloc[first_row:],
    col="Type",
    # NOTE(review): with `hue` set on the grid, seaborn may take the point
    # colour from the hue palette instead of `color=p[1]` below — confirm
    # which colouring is intended.
    hue="DatasetName",
    col_wrap=3,
    sharey=True,
    sharex=True,
    aspect=19 / 11,
)

g.set_titles(col_template="{col_name}", fontweight="bold")
g.map_dataframe(
    sns.stripplot,
    x="Norm_Z_Score",
    y="DatasetName",
    color=p[1],
    alpha=0.3,
    jitter=True,
)

g.set_ylabels("")
g.set_xlabels("Norm Z-score")

g.despine(left=True)
plt.tight_layout()
plt.savefig(SAVE_PATH/'nd_z-score_strip.png', dpi=600)
# Close the figure after saving, so it is not kept open.
plt.close()

In [None]:
# Mean normalised z-score per subgraph type with a percentile band, one facet
# per dataset; saved as "separated_percentiles_band.pdf".
sns.set_context("paper", font_scale=2)

g = sns.FacetGrid(
    sampled_conc_df,
    col="DatasetName",
    col_wrap=4,
    sharey=True,
    sharex=True,
    aspect=19 / 11,
)
g.set_titles(col_template="{col_name}")
# Label the x ticks with the subgraph positions (0, 1, 2, ...) rather than
# the long "Subgraph<i> (...)" column names.
g.set(xticks=subgraph_names)
g.set_xticklabels(subgraph_names)
g.tick_params(axis="x", which="major", pad=0)
g.map_dataframe(
    sns.lineplot,
    x="Type",
    y="Norm_Z_Score",
    # hue="GraphName",
    alpha=1,
    estimator="mean",
    color=p[2],
    markeredgecolor=p[0],
    # Percentile-interval band; the figure title below states its bounds.
    errorbar="pi",
    marker="o",
)
g.set_xlabels("Graph Type")
g.set_ylabels("Norm Z-Score")
# `FacetGrid.figure` replaces the deprecated `FacetGrid.fig` accessor.
g.figure.suptitle("Band of [2.5, 97.5] Percentiles")
g.figure.subplots_adjust(top=0.92)

sns.despine(left=True)
plt.tight_layout()
plt.savefig(SAVE_PATH/"separated_percentiles_band.pdf", dpi=1200)
# Close the figure after saving, so it is not kept open.
plt.close()

---