In [2]:
import os
import os.path as osp
from pathlib import Path

# Project root: the directory the kernel is currently running in.
BASE_MODULO_PATH = Path.cwd()

# Placeholder location of the input data; it must contain a "graphs" sub-folder
# holding the real-world edge-list files read further down.
DATA_PATH = Path("/path/to/a/folder/containing/another/folder/named/graphs/with/all/real/world/graphs/")
REAL_DATA_PATH = DATA_PATH.joinpath("graphs")

# Output folder for the figures; created (with parents) if it does not exist yet.
SAVE_PATH = BASE_MODULO_PATH.joinpath(
    "excluded",
    "hephaestus_storage_unit",
    "plots",
    "stats_real_dataset",
)
SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Keep the working directory pinned to the project root.
os.chdir(BASE_MODULO_PATH)

In [3]:
import networkx as nx

import pandas as pd

import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt


# Turn off pandas' chained-assignment warning (the pandas default is "warn").
pd.set_option("mode.chained_assignment", None)

# Notebook-wide seaborn defaults; the plotting cells below raise font_scale again.
sns.set_context(context="paper", font_scale=2)
sns.set_style(style="whitegrid")

In [4]:
# Colour palette as hex strings, addressed by index. The box plots in this
# notebook use the slice p[1:3]. Two entries are 8-digit #RRGGBBAA values with
# a fully opaque alpha channel ("FF").
p = [
    "#000000",    # 0: black
    "#E69F00",    # 1: orange
    "#56B4E9",    # 2: sky blue
    "#009E73",    # 3: green
    "#FB6467FF",  # 4: light red (with alpha)
    "#808282",    # 5: grey
    "#F0E442",    # 6: yellow
    "#440154FF",  # 7: dark purple (with alpha)
    "#0072B2",    # 8: blue
    "#D55E00",    # 9: vermillion
    "#CC79A7",    # 10: pink
    "#C2CD23",    # 11: lime
    "#918BC3",    # 12: lavender
    "#FFFFFF",    # 13: white
]

In [5]:
# Per-network statistics, collected as parallel lists (one entry per graph file).
edge_stats = []
node_stats = []
cluster_coef_stats = []  # stays empty while the clustering computation below is disabled

data_size = []
category = []
network_names = []

# File names are parsed as "<sreal|mlreal><category>@<network name>.csv".
# The listing is sorted because os.listdir returns entries in arbitrary order;
# sorting keeps the row order of the resulting table reproducible between runs.
for file_name in sorted(os.listdir(REAL_DATA_PATH)):
    if file_name.startswith("."):
        # Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) are not graph files.
        continue

    # Parse the name first so a malformed file name fails before any list is
    # appended to, keeping the parallel lists the same length.
    name_parts = file_name.split("@")
    size_and_category = name_parts[0]
    if "mlreal" in size_and_category:
        scale_label = "Medium-Large"
        graph_category = size_and_category.split("mlreal")[1]
    else:
        scale_label = "Small"
        graph_category = size_and_category.split("sreal")[1]
    network_name = name_parts[1].split(".csv")[0]

    # Space-separated edge list: the first two columns are the endpoints and
    # "%" starts a comment.
    edge_list = pd.read_csv(
        osp.join(REAL_DATA_PATH, file_name),
        sep=" ",
        header=None,
        usecols=[0, 1],
        names=["source", "target"],
        comment="%",
    )
    graph = nx.from_pandas_edgelist(edge_list)

    edge_stats.append(graph.number_of_edges())
    node_stats.append(graph.number_of_nodes())
    # cluster_coef_stats.append(nx.average_clustering(graph))

    data_size.append(scale_label)
    category.append(graph_category)
    network_names.append(network_name)

In [6]:
# Edges divided by nodes for every network. The column key below keeps its
# historical spelling ("node_edge_ration") because a later cell renames it.
edges_per_node = [
    n_edges / n_nodes for n_edges, n_nodes in zip(edge_stats, node_stats)
]

# One row per real-world network.
df = pd.DataFrame(
    {
        "nodes": node_stats,
        "edges": edge_stats,
        # "cluster_coef": cluster_coef_stats,
        "node_edge_ration": edges_per_node,
        "category": category,
        "network_names": network_names,
        "Scale Category": data_size,
    }
)

## Breakdown of nodes and edges for real dataset

In [11]:
sns.set_context("paper", font_scale=4)

fig, ax = plt.subplots(figsize=(29, 21))

# Whiskers: solid black. Outliers: semi-transparent black dots.
whisker_style = {"color": "black", "alpha": 1.0, "linewidth": 3}
outlier_style = {
    "marker": "o",
    "markerfacecolor": "black",
    "markeredgecolor": "black",
    "linewidth": 1,
    "alpha": 0.35,
}

# Horizontal box plot of node counts per category, split by scale category.
sns.boxplot(
    data=df,
    x="nodes",
    y="category",
    hue="Scale Category",
    orient="y",
    palette=p[1:3],
    saturation=0.95,
    width=0.75,
    linewidth=2,
    fliersize=10,
    whiskerprops=whisker_style,
    flierprops=outlier_style,
    ax=ax,
)
ax.set(xscale="log")
ax.set_xlabel("Node Count")
ax.set_ylabel("")
ax.legend(title="Scale Category")

# NOTE(review): the notebook does not say what these three node counts
# represent; presumably reference sizes from another dataset - TODO confirm.
for node_count in (641, 1577, 2993):
    ax.axvline(x=node_count, c="red", linestyle="-.", zorder=0, alpha=0.7, lw=3)

fig.tight_layout()
fig.savefig(SAVE_PATH / "nodes_real_set.pdf", dpi=1200, bbox_inches="tight")
plt.close(fig)

In [10]:
sns.set_context("paper", font_scale=4)

fig, ax = plt.subplots(figsize=(29, 21))

# Whiskers: solid black. Outliers: semi-transparent black dots.
whisker_style = {"color": "black", "alpha": 1.0, "linewidth": 3}
outlier_style = {
    "marker": "o",
    "markerfacecolor": "black",
    "markeredgecolor": "black",
    "linewidth": 1,
    "alpha": 0.35,
}

# Horizontal box plot of edge counts per category, split by scale category.
sns.boxplot(
    data=df,
    x="edges",
    y="category",
    hue="Scale Category",
    orient="y",
    palette=p[1:3],
    saturation=0.95,
    width=0.75,
    linewidth=2,
    fliersize=10,
    whiskerprops=whisker_style,
    flierprops=outlier_style,
    ax=ax,
)
ax.set(xscale="log")
ax.set_xlabel("Edge Count")
ax.set_ylabel("")
ax.legend(title="Scale Category")

# NOTE(review): the notebook does not say what these three edge counts
# represent; presumably reference sizes from another dataset - TODO confirm.
for edge_count in (1526, 4066, 10820):
    ax.axvline(x=edge_count, c="red", linestyle="-.", zorder=0, alpha=0.7, lw=3)

fig.tight_layout()
fig.savefig(SAVE_PATH / "edges_real_set.pdf", dpi=1200, bbox_inches="tight")
plt.close(fig)

---

## Generate unformatted LaTeX table of real networks

In [None]:
# Display headers for the LaTeX export. "Scale Category" already has its final
# name. The "cluster_coef" entry only takes effect if that column exists;
# rename ignores keys that are not present in the frame.
latex_headers = {
    "nodes": "Num Nodes",
    "edges": "Num Edges",
    "cluster_coef": "Cluster Coefficient",
    "node_edge_ration": "Edge/Node",
    "category": "Type",
    "network_names": "Network Name",
}

# Work on an independent copy so the plotting frame `df` is left untouched.
df_l = df.copy(deep=True).rename(columns=latex_headers)

# The ratio is rounded in df_l but is left out of the printed table below.
df_l["Edge/Node"] = df_l["Edge/Node"].round(3)

latex_table = (
    df_l.drop(columns=["Edge/Node"])
    .sort_values(by=["Scale Category", "Type", "Num Nodes", "Num Edges"])
    .to_latex(index=False, float_format="{:.3f}".format)
)
print(latex_table)