In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from IPython.display import display
import polars as pl
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from typing import List, Tuple
from pathlib import Path
import os
import torch
from torch import Tensor
from torch_geometric.utils import to_networkx
# Runtime configuration for the notebook.
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Local data directory; created up front (including parents), no error if it already exists.
data_dir = "./data"
Path(data_dir).mkdir(parents=True, exist_ok=True)

In [39]:
#l2g related imports
from l2gv2.patch.clustering import metis_clustering
from l2gv2.graphs.tgraph import TGraph
from l2gv2.patch.patches import create_patch_data

# <font color="grey"> Local2Global new version</font>

###  <a id='chapter1'> <font color="grey">1. Data loading and visualization </font></a>

The package ships with a few datasets that can be loaded with the `get_dataset` function.

In [40]:
from l2gv2.datasets import get_dataset

In [41]:
# There are several datasets available in the l2gv2.datasets module.
# "as-733" (loaded here) is one of them; Cora is loaded further below.
as733 = get_dataset("as-733")
# Index into the dataset and print the edge_index tensor of element 10.
print(as733[10].edge_index)

Loading edge and node data from memory
Loading edge and node data from memory


HBox(children=(HTML(value=''), IntProgress(value=0, max=11965533), HTML(value='')))

HBox(children=(HTML(value=''), IntProgress(value=0, max=3066397), HTML(value='')))

tensor([[    0,     0,     0,  ..., 10728, 10731, 64600],
        [    2,     5,    31,  ...,  3560,  5631,  2547]])


In [42]:
# Convert the as-733 dataset to its Raphtory representation and print the graph summary.
graph = as733.to("raphtory")
print(graph)

Graph(number_of_nodes=7716, number_of_edges=45645, number_of_temporal_edges=11965533, earliest_time=878947200000, latest_time=946771200000)


In [None]:
# Display element 10 of the dataset (the element whose edge_index was printed earlier).
as733[10]

In [43]:
# Load the Cora dataset and show its repr.
# NOTE(review): the name `coral` looks like a typo for `cora`; it is kept
# because a later cell refers to it.
coral = get_dataset("Cora")
coral

Loading edge and node data from memory


HBox(children=(HTML(value=''), IntProgress(value=0, max=10556), HTML(value='')))

HBox(children=(HTML(value=''), IntProgress(value=0, max=2708), HTML(value='')))

CoraDataset()

In [44]:
# Convert Cora to a Raphtory graph; the cell output is the graph summary.
coral.to("raphtory")

Graph(number_of_nodes=2708, number_of_edges=10556, number_of_temporal_edges=10556, earliest_time=0, latest_time=0)

The data can be accessed in three formats: as a pair of [polars dataframes](https://docs.pola.rs/api/python/stable/reference/dataframe/index.html) (one for edges and one for nodes), as a [torch-geometric Data](https://pytorch-geometric.readthedocs.io/en/2.5.2/modules/data.html) object, and as a [Raphtory Graph](https://www.raphtory.com/) object. Internally, the data is stored as a pair of parquet files.

###  <a id='chapter2'> <font color="grey">2. The local2global approach </font></a>

The initial step in creating the patch graph consists of clustering the graph. The clusters then serve as the starting point for building the patches.

The Cora dataset is a well-known dataset in the field of graph research. It consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.

In [None]:
# Scratch check: a length-2 array filled with -1.
# NOTE(review): the result is not used by any visible cell — consider removing.
np.full((2,), -1)

In [None]:
# Degree statistics and degree histogram of the graph held in `data`.
# NOTE(review): `data` is not assigned in any cell visible above — confirm where
# it comes from (presumably a torch-geometric Data object, since it is passed
# to to_networkx).
G = to_networkx(data, to_undirected=True)
degrees = [degree for _, degree in G.degree()]
# One-row summary table (count, mean, std, quantiles) rounded to two decimals.
display(pd.DataFrame(pd.Series(degrees).describe()).T.round(2))
# Number of nodes, then the sum of all degrees, one per line.
print(len(degrees), sum(degrees), sep="\n")
plt.figure(figsize=(10, 6))
plt.hist(degrees, bins=50)
plt.xlabel("node degree")
plt.show()

In [None]:
# Spring-layout drawing of the graph with the most central nodes highlighted.
# NOTE(review): `data` is not assigned in any cell visible above — confirm where
# it comes from before running this cell on a fresh kernel.
G = to_networkx(data, to_undirected=True)
pos = nx.spring_layout(G, seed=42)
cent = nx.degree_centrality(G)
# Marker size is the degree centrality scaled by 500.
node_size = [value * 500 for value in cent.values()]
cent_array = np.array(list(cent.values()))
# Index 10 of the descending ordering, i.e. the 11th-largest centrality value.
# NOTE(review): raises IndexError for graphs with 10 nodes or fewer; confirm
# whether "top 10" (index 9) was intended.
threshold = sorted(cent_array, reverse=True)[10]
print("threshold", threshold)
# 1 for nodes at or above the threshold, 0.1 for all others; used below both
# as the colour value and as the per-node opacity.
cent_bin = np.where(cent_array >= threshold, 1, 0.1)
plt.figure(figsize=(12, 12))
nodes = nx.draw_networkx_nodes(
    G,
    pos,
    nodelist=list(cent.keys()),
    node_size=node_size,
    node_color=cent_bin,
    cmap=plt.cm.plasma,
    alpha=cent_bin,
)
edges = nx.draw_networkx_edges(G, pos, width=0.25, alpha=0.3)
plt.show()

In [None]:
# Human-readable names for the integer class labels 0-6.
# NOTE(review): presumably the Cora class ids — confirm the ordering against the dataset.
label_dict = dict(enumerate([
    "Theory",
    "Reinforcement_Learning",
    "Genetic_Algorithms",
    "Neural_Networks",
    "Probabilistic_Methods",
    "Case_Based",
    "Rule_Learning",
]))
# Show the first ten entries of `data.y`.
# NOTE(review): `data` is not assigned in any visible cell — confirm its origin.
data.y[:10]

In [None]:
# Location of the edge parquet file read in the next cell.
# NOTE(review): this points into "../data", while `data_dir` at the top of the
# notebook is "./data" — confirm which directory is intended.
aspath = Path("../data/snap-as/as_edges.parquet")

In [None]:
# Read the edge parquet file into a polars DataFrame and preview the first rows.
df = pl.read_parquet(aspath)
df.head()

In [None]:
# Parameters: 
# 10 patches
# Average degree k=4
# Overlap between 256 and 1024
# Embedding dimension up to 128

In [None]:
def hierarchical_cluster_and_embed(graph: nx.Graph, m: int, k: int) -> List[Tuple[nx.Graph, List[float]]]:
    """Recursively partition ``graph`` and embed each sufficiently small piece.

    A graph with at most ``m`` nodes is embedded directly. A larger graph is
    split with ``cluster(graph, k)`` and every returned subgraph is processed
    the same way.

    A subgraph that is not smaller than the graph it was cut from is embedded
    as-is instead of being recursed into; otherwise a clustering step that
    fails to shrink the graph would recurse until ``RecursionError``.

    NOTE(review): ``embed`` and ``cluster`` are not defined in the visible
    cells; they must exist in the kernel namespace before this is called.
    The ``List[float]`` element type is what the annotation promises for the
    result of ``embed`` — confirm against its implementation.

    Args:
        graph: Graph to process.
        m: Largest node count for which a graph is embedded without further
            splitting.
        k: Passed through unchanged to ``cluster`` (presumably the number of
            clusters — confirm).

    Returns:
        Flat list of ``(subgraph, embedding)`` pairs, one per leaf, in the
        order in which the clusters were visited.
    """
    parent_size = graph.number_of_nodes()
    if parent_size <= m:
        return [(graph, embed(graph))]

    results = []
    for subgraph in cluster(graph, k):
        if subgraph.number_of_nodes() >= parent_size:
            # No progress was made by clustering: stop here rather than
            # recursing forever on a piece that never gets smaller.
            results.append((subgraph, embed(subgraph)))
        else:
            results.extend(hierarchical_cluster_and_embed(subgraph, m, k))

    return results

In [None]:
# Convert the edge index tensor to a NumPy array and print its shape.
# NOTE(review): `data` is not assigned in any visible cell — confirm its origin.
edge_index = data.edge_index.numpy()
print(edge_index.shape)