In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import matplotlib.colors as mcolors

def setup_mpl():
    """Apply the notebook-wide matplotlib style by updating ``mpl.rcParams``.

    Sets a 10 pt Helvetica base font with 'small' legend and tick labels,
    0.75-wide axes and tick strokes, short ticks and tight label padding.
    The changes are global: every figure created after the call inherits
    them. Returns nothing.
    """
    mpl.rc('font', size = 10)
    mpl.rcParams.update({
        # Text: sizes are relative to the base font size set above.
        'legend.fontsize': 'small',
        'xtick.labelsize': 'small',
        'ytick.labelsize': 'small',
        'font.family': 'Helvetica',
        'mathtext.default': 'regular',
        # Lines, markers and the axes frame.
        'lines.linewidth': 1,
        'lines.markersize': 6,
        'axes.linewidth': 0.75,
        'axes.labelpad': 2,
        # Gap between the ticks and their labels.
        'xtick.major.pad': '2.3',
        'ytick.major.pad': '2.3',
        # Tick stroke width, matched to the axes frame.
        'xtick.major.width': 0.75,
        'ytick.major.width': 0.75,
        'xtick.minor.width': 0.75,
        'ytick.minor.width': 0.75,
        # Tick length: minor ticks are half as long as major ones.
        'xtick.major.size': 3,
        'ytick.major.size': 3,
        'xtick.minor.size': 1.5,
        'ytick.minor.size': 1.5,
    })
# Apply the style once, up front, so every figure created below inherits it.
setup_mpl()

In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import community.community_louvain as louvain
from collections import defaultdict

In [None]:
def edge_percolation(graph,w):
    """Sizes of the two largest clusters left after thresholding edges by weight.

    Only edges whose ``"weight"`` attribute is strictly below ``w`` are kept.
    The thresholded graph is rebuilt from those edges alone, so a node with no
    surviving edge is not counted as a cluster of its own.

    Parameters
    ----------
    graph : graph exposing ``edges(data=True)``
        Every edge must carry a ``"weight"`` attribute (a missing one raises
        ``KeyError``).
    w : number
        Exclusive upper bound on the weight of retained edges.

    Returns
    -------
    tuple of (int, int)
        Node counts of the largest and second-largest connected components;
        a rank that does not exist is reported as 0.
    """
    percolated = nx.Graph()
    percolated.add_edges_from(
        (src, dst)
        for src, dst, attrs in graph.edges(data = True)
        if attrs["weight"] < w
    )
    # Component sizes, biggest first, padded so both ranks always exist.
    sizes = sorted(map(len, nx.connected_components(percolated)), reverse=True)
    sizes = sizes + [0, 0]
    return sizes[0], sizes[1]

In [None]:
# Store-to-store distance table. The reshape cell below reads a "store_name"
# column and treats every other column as a second store.
# NOTE(review): presumably a square matrix of distances in metres (the
# percolation figure labels its axis "Distance [m]") — confirm against the file.
hdbscan_distance = pd.read_csv("../data/hdbscan_distances.csv")

In [None]:
# Reshape the wide distance table into one row per unordered pair of stores.
#
# Moving "store_name" into the index is guarded so the cell can be re-run:
# after the first run the column no longer exists and an unconditional access
# would raise KeyError.
if "store_name" in hdbscan_distance.columns:
    hdbscan_distance = hdbscan_distance.set_index("store_name")

distance_long = (
    hdbscan_distance.reset_index()
    .melt(id_vars='store_name', value_name='distance', var_name='node2')
    .rename(columns={'store_name': 'node1'})
)

# Drop the diagonal (a store paired with itself). The explicit copy makes the
# column assignment below act on an independent frame rather than on a slice.
distance_long = distance_long[distance_long['node1'] != distance_long['node2']].copy()

# Order-independent key so that (a, b) and (b, a) collapse into a single row;
# the first occurrence in melt order is the one that is kept.
distance_long['pair'] = [
    tuple(sorted(pair))
    for pair in zip(distance_long['node1'], distance_long['node2'])
]
distance_long = distance_long.drop_duplicates(subset="pair",keep = "first")

In [None]:
# Graph loaded from projection_stores.gexf; the next cell overwrites the
# "weight" of its edges with the pairwise distances from distance_long.
Gnew = nx.read_gexf("../data/projection_stores.gexf")

In [None]:
# Replace the weight of every existing edge with the distance between its two
# endpoints. The columns are zipped instead of using iterrows(), which builds a
# Series per row and is slow on an all-pairs table.
# NOTE(review): an edge without a matching row in distance_long keeps the
# weight it was loaded with — confirm every edge is covered before the weights
# are thresholded as distances.
for node1, node2, distance in zip(
    distance_long["node1"], distance_long["node2"], distance_long["distance"]
):
    if Gnew.has_edge(node1, node2):
        Gnew[node1][node2]["weight"] = distance

In [None]:
# Sweep the weight threshold and record the sizes of the two largest clusters
# at each step.
# The loop variable is deliberately not called `bin`: at notebook top level
# that name would replace the builtin bin() for every cell run afterwards.
# NOTE(review): `Gnew` is rebound to a different graph further down the
# notebook, so this cell must run before that one.
bins = np.linspace(500, 10**6, 1_000 + 1)
tmp_edges = []
for threshold in bins:
    largest, second_largest = edge_percolation(Gnew, threshold)
    tmp_edges.append(
        {
            "bin": threshold,
            "largest": largest,
            "second_largest": second_largest
        }
    )
df_edges = pd.DataFrame(tmp_edges)

In [None]:
# Percolation curves: size of the largest and second-largest cluster, each
# scaled by its own maximum over the sweep, against the threshold on a log axis.
fig, ax = plt.subplots(figsize=(3.5, 1), dpi=300)

thresholds = df_edges["bin"]
largest_norm = df_edges["largest"] / df_edges["largest"].max()
second_norm = df_edges["second_largest"] / df_edges["second_largest"].max()

ax.plot(
    thresholds,
    largest_norm,
    ls="solid",
    marker="s",
    markersize=0,
    lw=2,
    alpha=0.7,
    color="firebrick",
    label="Largest Cluster",
)
ax.plot(
    thresholds,
    second_norm,
    ls="dotted",
    color="teal",
    lw=1.5,
    label="Second Largest Cluster",
)

ax.set_xscale("log")
ax.set_ylabel("Normalized Size")
ax.set_xlabel("Distance [m]")
# Vertical reference line at a fixed threshold value.
ax.axvline(26_500, color="silver")
# Legend sits above the axes, laid out in a single row.
ax.legend(bbox_to_anchor=[0.5, 1.1], loc='center', ncol=2, frameon=False)
ax.set_xlim([1_000, 100_000])
ax.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0])

fig.savefig("../figure/fig_2_percolation.pdf", bbox_inches="tight", dpi=300)

In [None]:
# One fixed hex colour per product category, so a category is drawn in the
# same colour wherever it is plotted. Keys are lower-case category names.
category_colors = {
    "dressings": "#9e0142",
    "patee": "#d53e4f",
    "meat": "#f46d43",
    "bread": "#fdae61",
    "ham": "#fee08b",
    "milk": "#ffffbf",
    "desserts snacks": "#e6f598",
    "rye breads": "#abdda4",
    "light breads": "#66c2a5",
    "pork": "#2ca25f",
    "chicken": "#006d2c",
    "ready to eat meals": "#1C8FA6",
    "cheese": "#2166ac",
    "yoghurt": "#084594",
    "sausage": "#5e3c99",
    "beverages": "#762a83",
}


In [None]:
# NOTE(review): this rebinds `Gnew`, which the cells above use for the graph
# from projection_stores.gexf, to the graph from projection_products.gexf.
# Re-running an earlier cell after this one will silently use this graph.
Gnew = nx.read_gexf("../data/projection_products.gexf")

In [None]:
def get_largest_connected_components(graph, n=1):
    """Return the ``n`` largest connected components of ``graph`` as subgraphs.

    Parameters
    ----------
    graph : networkx graph
        Graph to split; it is passed to ``nx.connected_components``.
    n : int, default 1
        Number of components to keep.

    Returns
    -------
    list
        Independent copies of the component subgraphs, ordered from most to
        fewest nodes. The list is shorter than ``n`` when the graph has fewer
        components, and empty when it has none.
    """
    # Rank the node sets first and materialise only the winners: copying every
    # component just to throw most of them away is wasted work.
    node_sets = sorted(nx.connected_components(graph), key=len, reverse=True)
    return [graph.subgraph(nodes).copy() for nodes in node_sets[:n]]
# Keep only the largest connected component for the analysis below.
# Indexing with [0] raises IndexError if the graph has no components at all.
LCC = get_largest_connected_components(Gnew)[0]

In [None]:
# Community structure of the largest component.
# best_partition is stochastic; the seed is pinned so that a fresh
# Restart & Run All reproduces the same partition and modularity score.
LOUVAIN_SEED = 42
partition = louvain.best_partition(LCC, random_state=LOUVAIN_SEED)
communities = defaultdict(set)
for node, comm in partition.items():
    communities[comm].add(node)
community_list = list(communities.values())
modularity_score = nx.algorithms.community.modularity(LCC, community_list)

# Betweenness centrality per node, as a two-column frame.
bc_prods = nx.betweenness_centrality(LCC)
df_prod_new = pd.DataFrame(
    {"product": list(bc_prods), "bc": list(bc_prods.values())}
)

# The grouping below needs a "category" column, which nothing in this notebook
# created: on a fresh kernel the original cell raised KeyError: 'category'.
# NOTE(review): assumes every node of the product graph carries a "category"
# attribute — confirm the attribute name against the GEXF file.
node_categories = nx.get_node_attributes(LCC, "category")
if not node_categories:
    raise ValueError(
        'No "category" node attribute found on the product graph; '
        'df_prod_new needs a category per product for the grouping below.'
    )
df_prod_new["category"] = df_prod_new["product"].map(node_categories)

# Number of products of each category present in the largest component.
dict_prod_in_lcc = df_prod_new.groupby(by = ["category"]).count()["product"].to_dict()

In [None]:
# Per category: how many of its products have a betweenness centrality above
# the 90th percentile, and what share of the category's products that is.
# Categories with no product above the cut-off do not appear in the result.
tmp_bc = (
    df_prod_new.loc[df_prod_new["bc"] > df_prod_new["bc"].quantile(0.9)]
    .groupby(by = ["category"])
    .count()
    .reset_index()
    .assign(total=lambda counts: counts["category"].map(dict_prod_in_lcc))
    .assign(probability=lambda counts: counts["product"] / counts["total"])
)

In [None]:
# Lollipop chart of the per-category probability, highest first.
df_sorted = tmp_bc.sort_values('probability', ascending=False).reset_index(drop=True)

category_spacing = 1.0
offset = 0.0
x_positions = offset + category_spacing * np.arange(len(df_sorted))

fig, ax = plt.subplots(figsize=(3.5, 1), dpi=300)

for x, cat, prob in zip(x_positions, df_sorted['category'], df_sorted['probability']):
    # Categories without an assigned colour are drawn in grey.
    color = category_colors.get(cat, 'grey')
    # Stem from the baseline up to the value ...
    ax.vlines(x, 0, prob, color=color, linewidth=2, alpha=0.7)
    # ... topped by a marker that is smaller for probabilities below 0.05.
    if prob >= 0.05:
        size = 25
    else:
        size = 12.5
    ax.scatter(x, prob, color=color, s=size)

ax.set_xticks(x_positions)
ax.set_xticklabels(df_sorted['category'].str.title(), rotation=45, ha='right')
ax.set_xlabel('')
ax.set_ylabel('Probability')
ax.set_ylim(0,0.6)

fig.savefig("../figure/fig_2_bc.pdf",bbox_inches="tight", pad_inches=0.02, dpi=300)