In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import numpy as np
import numpy.random as rnd
import numpy.linalg as la
import polars as pl
import pandas as pd
import datetime as dt
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import networkx as nx
import raphtory as rp
import community
import torch
import torch_geometric as tg
from torch_geometric.data import Data
from torch_geometric.utils.convert import from_networkx
from torch_geometric.transforms import LargestConnectedComponents
from torch_geometric.utils import to_networkx
from torch_geometric.nn import Node2Vec

In [14]:
from datasets import DataLoader

In [15]:
import local2global as l2g
import local2global_embedding
from local2global import Patch
from local2global_embedding import patches, clustering
from local2global_embedding.network import graph, TGraph
import local2global_embedding.patches as patches
import induced_subgraph, anomaly_detection
import models

## Local2Global v2 

### <font color="grey">  Table of Contents</font>

1. #### <a href='#chapter1'>Data</a>
2. #### <a href='#chapter2'>Embedding</a>
3. #### <a href='#chapter3'>Patch generation and alignment</a>
4. #### <a href='#chapter4'>Application: anomaly detection</a>
5. #### <a href='#chapter5'>New algorithm</a>

###  <a id='chapter1'> <font color="grey">1. Data </font></a>

The data considered consists of temporal graphs with optional node and edge features. We should be able to efficiently load this data, as well as extract relevant graph-theoretic properties and statistics. A graph generally consists of the following:

* A list of nodes
* A list of edges
* A list of node features
* A list of edge features

In addition, a temporal graph consists of a sequence of graphs indexed by time.


There are different ways of representing graphs:

* List of edges (either as numpy array or part of polars / pandas dataframe)
* [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) [Data](https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html#data-handling-of-graphs) object
* [Raphtory](https://www.raphtory.com/) Graph object
* [NetworkX](https://networkx.org/) Graph object

Below we give a summary of some of the important datasets we consider. Each of these datasets is stored internally as a parquet file containing a polars dataframe, with required columns 'source' and 'dest', as well as an optional column 'timestamp' for temporal graphs and additional columns representing edge features. If the graph also has node features, then an additional file with a 'node' column, an optional 'timestamp' column and columns for node features is provided. A dataloader is provided that is initialized for one of the datasets and provides interfaces for loading the data in a variety of formats, as well as providing a basic summary of the data.

In [16]:
# Supported arguments: 'AS', 'elliptic', 'NFTS'
# NOTE(review): 'nAS' is not in the list above — confirm it is a valid source
# and extend the list, or switch to one of the documented names.
dl = DataLoader(source='nAS')

In [17]:
# Load the node table of the selected source and preview its first rows.
df = dl.get_nodes()
df.head()

timestamp,nodes,nodetype,country,asname
datetime[μs],str,str,str,str
2024-09-14 00:00:00,"""AS7029""","""asn""","""US""","""WINDSTREAM"""
2024-09-14 00:00:00,"""AS32984""","""asn""","""US""","""RUELALA-INC"""
2024-09-14 00:00:00,"""AS136106""","""asn""","""ID""","""FIBERSTAR-AS-I"""
2024-09-14 00:00:00,"""AS58495""","""asn""","""ID""","""HSPNET-AS-I"""
2024-09-14 00:00:00,"""AS3491""","""asn""","""US""","""BTN-ASN"""


The Raphtory Graph data type is suited to temporal graphs and is the one we want to eventually use.

In [18]:
dl.node_features

['nodetype', 'country', 'asname']

In [19]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'DataLoader' object has no attribute 'get_raphtory'".
# The loader method has to be implemented (or the call renamed) before this
# cell, and everything that relies on `rg`, can run.
rg = dl.get_raphtory()
print(rg)

AttributeError: 'DataLoader' object has no attribute 'get_raphtory'

The 'edge_list' format is sometimes used. The data is represented as a dictionary with dates as keys, and each entry simply consists of a list of edges.

In [None]:
# Temporal edge list: a dict keyed by date, each value a list of edges.
el = dl.get_edge_list()
# Named `first_date` rather than `dt`, which would shadow `import datetime as dt`.
first_date = next(iter(el))
print(first_date)
el[first_date][:10]

With all the formats except for Raphtory, there is the option of loading the whole graph and ignoring the timestamps by supplying the 'temp=False' argument.

In [None]:
# temp=False ignores the timestamps and loads the whole graph, so `el` is
# sliced directly here instead of being looked up by date first.
el = dl.get_edge_list(temp=False)
el[:10]

The 'edge_index' format is based on torch.Tensor objects and is used to initialize graphs in pytorch-geometric.

In [None]:
# Temporal edge_index data keyed by date.
ei = dl.get_edge_index()
# Named `snapshot_date` rather than `dt`, which would shadow `import datetime as dt`.
snapshot_date = list(ei.keys())[10]
print(snapshot_date)
ei[snapshot_date]

The networkx format is the format used by the common networkx package. This is a convenient format to explore graphs but it can be slow.

In [None]:
# NetworkX graphs keyed by date.
gx = dl.get_networkx()
# Named `snapshot_date` rather than `dt`, which would shadow `import datetime as dt`.
snapshot_date = list(gx.keys())[20]
print(gx[snapshot_date])

Finally, there is the Data format used by pytorch-geometric, which will be used for computing the graph embeddings.

In [None]:
# NOTE(review): this rebinds `tg`, which the import cell uses as the alias for
# `torch_geometric`; after this cell the module is no longer reachable as `tg`.
# Renaming requires a matching change in the following cell, which indexes `tg`.
tg = dl.get_tgeometric()

In [None]:
# Named `snapshot_date` rather than `dt`, which would shadow `import datetime as dt`.
snapshot_date = list(tg.keys())[30]
print(tg[snapshot_date])

####  <a id='chapter11'> <font color="grey">1.1 Autonomous Systems </font></a>

An autonomous system (AS) is a large network or collection of networks that is managed by a single entity or organization, such as an Internet Service Provider (ISP), a university, or a corporation. ASes use the [Border Gateway Protocol (BGP)](https://en.wikipedia.org/wiki/Border_Gateway_Protocol) to exchange routing information among each other. This allows them to determine the most efficient paths for data to travel across the internet.

The [SNAP autonomous systems AS-733](https://snap.stanford.edu/data/as-733.html) dataset contains 733 daily snapshots that span an interval of 785 days from November 8 1997 to January 2 2000. In each of these datasets, nodes represent autonomous systems and edges indicate whether communication has taken place. The resulting graph is undirected.

In [None]:
dl = DataLoader(source='AS')

In [None]:
gx = dl.get_networkx()

In [None]:
# Keep only the first 10 snapshot graphs; the embedding cells below treat each
# of them as one patch.
As = list(gx.values())[:10]

####  <a id='chapter12'> <font color="grey">1.2 Elliptic Bitcoin transactions </font></a>

The [Elliptic dataset](https://www.kaggle.com/datasets/ellipticco/elliptic-data-set) maps Bitcoin transactions to real entities belonging to licit categories (exchanges, wallet providers, miners, licit services, etc.) versus illicit ones (scams, malware, terrorist organizations, ransomware, Ponzi schemes, etc.). The task on the dataset is to classify the illicit and licit nodes in the graph. The graph consists of $203,769$ nodes representing transactions and $234,355$ directed edges representing payment flows.
A case study is the paper [Anti-Money Laundering in Bitcoin: Experimenting with Graph
Convolutional Networks for Financial Forensics](https://arxiv.org/pdf/1908.02591) by Weber et al.

In [None]:
dl = DataLoader(source='elliptic')

In [None]:
edge_df = dl.get_edges()
node_df = dl.get_nodes()

In [None]:
# NOTE(review): an earlier dl.get_raphtory() call in this notebook recorded an
# AttributeError on DataLoader; this cell will presumably fail the same way
# until that method exists — confirm.
rg = dl.get_raphtory()

In [None]:
print(rg)

Some more work needs to be done here.

####  <a id='chapter13'> <font color="grey">1.3 Ethereum NFTs </font></a>

The [Ethereum NFT dataset](https://www.kaggle.com/datasets/simiotic/ethereum-nfts) represents the activity of the Ethereum non-fungible token (NFT) market between April 1, 2021 and September 25, 2021. These data were collected using Moonstream.to as part of Moonstream's open data efforts. The dataset is based on on-chain NFT Transfer events as its core. 

The data for this dataset still needs to be processed and the dataloader adapted.

###  <a id='chapter2'> <font color="grey">2. Embedding</font></a>

Graph neural networks are used to embed the node features into a vector space, taking into account the graph structure. There are various ways of implementing these. The main method chosen here is the [Variational Graph Autoencoder](https://arxiv.org/abs/1611.07308) (VGAE). 

In [None]:
# Node set of every retained AS snapshot (one set per patch).
nodes_in_each_p = [set(p.nodes) for p in As]
# Nodes that appear in every snapshot.
nodes_in_intersection = set.intersection(*nodes_in_each_p)
# Nodes that appear in at least one snapshot; a direct set union replaces the
# former list concatenation followed by de-duplication.
nodes_tot = set().union(*nodes_in_each_p)

In [None]:
# Convert every NetworkX snapshot into a PyTorch Geometric Data object.
AS_patches = [from_networkx(G) for G in tqdm(As)]
for snapshot, patch in zip(As, AS_patches):
    # Build the id tensor as int32 directly. torch.Tensor(...) would first
    # create float32 values, which cannot represent integers above 2**24
    # exactly, before .int() truncates them back.
    patch.nodes = torch.tensor(list(snapshot.nodes), dtype=torch.int32)
    patch.num_nodes = patch.nodes.size(0)

# Per-patch embeddings from the two models.
vgae_AS_p_emb = models.VGAE_patch_embeddings(AS_patches, dim=2, hidden_dim=32, num_epochs=50, decoder=None, device='cpu', lr=0.01)
n2v_AS_p_emb = models.Node2Vec_patch_embeddings(AS_patches, emb_dim=2, w_length=20, c_size=10, w_per_node=10, n_negative_samples=1, p=1, q=1, num_epochs=50)

# Embedding of the full graph using the embeddings of each patch.
# NOTE(review): the VGAE result is indexed with [0] while the Node2Vec result
# is passed whole — presumably the two helpers return different structures;
# confirm against `models`.
vgae_AS_prob = l2g.utils.WeightedAlignmentProblem(vgae_AS_p_emb[0])
n2v_AS_prob = l2g.utils.WeightedAlignmentProblem(n2v_AS_p_emb)

vgae_AS_emb = vgae_AS_prob.get_aligned_embedding()
n2v_AS_emb = n2v_AS_prob.get_aligned_embedding()

# Materialise the node ordering once so both models are evaluated on exactly
# the same rows of their aligned embeddings.
node_index = list(nodes_tot)
vgae_AS_outliers = set(anomaly_detection.get_outliers(vgae_AS_prob.patches, AS_patches, vgae_AS_emb[node_index], k=3))
n2v_AS_outliers = set(anomaly_detection.get_outliers(n2v_AS_prob.patches, AS_patches, n2v_AS_emb[node_index], k=3))

###  <a id='chapter3'> <font color="grey">3. Patch generation and alignment</font></a>

Describe idea.

###  <a id='chapter4'> <font color="grey">4. Application: anomaly detection </font></a>

Description.

In [None]:
# This is code from a previous version, will have to be adjusted
# NOTE(review): `ell` is not defined anywhere in the visible notebook, so this
# cell cannot run on a fresh kernel as it stands.
ell_outliers=[]
for i, cc in enumerate(ell):
    # Wrap the element's edge data in a TGraph (built with undir=False).
    TG=TGraph(edge_index=cc.edge_index, edge_attr=cc.edge_attr,  num_nodes=cc.num_nodes, ensure_sorted=True, undir=False)
    # Create patch data from a Louvain clustering of TG.
    pt, pgraph= patches.create_patch_data(TG, partition_tensor= clustering.louvain_clustering(TG),
                                               min_overlap=10, target_overlap=100, verbose=True)
    # One induced subgraph of `cc` per patch.
    patch_data = [induced_subgraph.induced_subgraph(cc, p) for p in pt]
    for p in patch_data:
        # Restrict the labels and features of `cc` to the nodes of this patch.
        p.y=cc.y[p.nodes]
        p.x=cc.x[p.nodes]

    ell_p_emb=models.VGAE_patch_embeddings(patch_data, dim=2, hidden_dim=32, num_epochs=50, decoder=None, device='cpu', lr=0.01)

    ell_prob = l2g.utils.WeightedAlignmentProblem(ell_p_emb[0])  #embedding of the full graph using embeddings of each patch
    ell_emb=ell_prob.get_aligned_embedding()
    # Store the outliers of this element as a set (k=3 is passed to get_outliers).
    ell_outliers.append(set(anomaly_detection.get_outliers(ell_prob.patches, patch_data,ell_emb, k=3)))
    
# Total number of outliers, summed over all elements of `ell`.
numb_ell_outliers=np.sum([len(s) for s in ell_outliers])   

In [None]:
# NOTE(review): `Nfts` is not defined anywhere in the visible notebook, and
# this self-assignment truncates it in place, so the cell is neither runnable
# on a fresh kernel nor guaranteed to start from the full list on a re-run.
Nfts=Nfts[:10]
nft_outliers=[]
for i, cc in enumerate(Nfts):
    # Wrap the element's edge data in a TGraph (built with undir=False).
    TG=TGraph(edge_index=cc.edge_index, edge_attr=cc.edge_attr,  num_nodes=cc.num_nodes, ensure_sorted=True, undir=False)
    # Create patch data from a Louvain clustering of TG.
    pt, pgraph= patches.create_patch_data(TG, partition_tensor= clustering.louvain_clustering(TG),
                                               min_overlap=10, target_overlap=100, verbose=True)
    # One induced subgraph of `cc` per patch; unlike the elliptic loop, no
    # x / y attributes are copied onto the patches here.
    patch_data = [induced_subgraph.induced_subgraph(cc, p) for p in pt]

    nft_p_emb=models.VGAE_patch_embeddings(patch_data, dim=2, hidden_dim=32, num_epochs=50, decoder=None, device='cpu', lr=0.01)
    nft_prob = l2g.utils.WeightedAlignmentProblem(nft_p_emb[0])  #embedding of the full graph using embeddings of each patch
    nft_emb=nft_prob.get_aligned_embedding()
    # Store the outliers of this element as a set (k=3 is passed to get_outliers).
    nft_outliers.append(set(anomaly_detection.get_outliers(nft_prob.patches, patch_data,nft_emb, k=3)))
    
# Total number of outliers, summed over the retained elements of `Nfts`.
numb_nft_outliers=np.sum([len(s) for s in nft_outliers])   

###  <a id='chapter5'> <font color="grey">5. New algorithm </font></a>

The idea is to learn the proper alignment. The code below is exploratory at the moment.

In [None]:
def generate_data(n_clusters, scale=1.0, std=0.5, max_size=200, min_size=10, seed=None):
    """Generate 2-D test data: normally-distributed clusters centred on a circle.

    Cluster ``t`` is centred at angle ``2 * pi * t / n_clusters`` on the circle
    of radius ``scale`` around the origin. Each cluster has a random number of
    points drawn uniformly from ``[min_size, max_size]`` (both inclusive).

    :param int n_clusters: Number of clusters

    :param float scale: Radius of the circle holding the cluster centres [default: 1.0]

    :param float std: Standard deviation for cluster points [default: 0.5]

    :param int max_size: maximum cluster size, inclusive [default: 200]

    :param int min_size: minimum cluster size, inclusive [default: 10]

    :param seed: seed (or ``numpy.random.Generator``) forwarded to
        ``numpy.random.default_rng``; ``None`` draws fresh entropy, which
        matches the previous unseeded behaviour [default: None]

    :return: array of shape ``(total_points, 2)`` with the clusters stacked in
        order of their index; shape ``(0, 2)`` when ``n_clusters <= 0``

    :raises ValueError: if ``min_size > max_size`` (raised by numpy)
    """
    rng = np.random.default_rng(seed)

    # np.vstack rejects an empty sequence, so handle "no clusters" explicitly.
    if n_clusters <= 0:
        return np.empty((0, 2))

    # Cluster centres, evenly spaced on the circle of radius `scale`.
    angles = 2 * np.pi * np.arange(n_clusters) / n_clusters
    centers = scale * np.column_stack((np.cos(angles), np.sin(angles)))

    # endpoint=True makes max_size attainable (as documented) and allows
    # min_size == max_size for fixed-size clusters.
    sizes = rng.integers(min_size, max_size, size=n_clusters, endpoint=True)

    # Scale standard-normal samples by `std` and shift them to each centre.
    clusters = [
        rng.standard_normal((int(size), 2)) * std + center
        for center, size in zip(centers, sizes)
    ]
    return np.vstack(clusters)

In [None]:
points = generate_data(2, scale=1)

In [None]:
# Explicit figure/axes with a title and labels so the plot stands on its own.
fig, ax = plt.subplots()
ax.scatter(points[:, 0], points[:, 1])
ax.set(title="Synthetic clusters from generate_data", xlabel="x", ylabel="y")
plt.show()

In [None]:
# Seeded generator so that Restart & Run All reproduces the same matrix.
rng = np.random.default_rng(0)
G = rng.standard_normal((2, 2))
# The Q factor of a QR decomposition is orthogonal; note that its determinant
# may be -1, in which case it is a reflection rather than a pure rotation.
Q, _ = la.qr(G)
Q

In [None]:
# 100 points on the line y = 2x + 1 for x in [-1, 1], stored as rows (x, y).
x = np.linspace(-1, 1, 100)[:, np.newaxis]
y = 2 * x + 1
points = np.hstack((x, y))

In [None]:
# Explicit figure/axes with a title and labels; equal aspect keeps the slope
# visually comparable with the transformed plot.
fig, ax = plt.subplots()
ax.scatter(points[:, 0], points[:, 1])
ax.set(title="Points on the line y = 2x + 1", xlabel="x", ylabel="y")
ax.set_aspect("equal")
plt.show()

In [None]:
# Apply the orthogonal factor Q to every point (points are rows, so Q is
# multiplied from the right).
rpoints = points @ Q

In [None]:
# Explicit figure/axes with a title and labels; equal aspect shows that the
# orthogonal map preserves distances.
fig, ax = plt.subplots()
ax.scatter(rpoints[:, 0], rpoints[:, 1])
ax.set(title="Points after applying Q", xlabel="x", ylabel="y")
ax.set_aspect("equal")
plt.show()