In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import numpy.random as rnd
import numpy.linalg as la
import polars as pl
import pandas as pd
import datetime as dt
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import networkx as nx
import raphtory as rp
import community
import torch
import torch_geometric as tg
from torch_geometric.data import Data
from torch_geometric.utils.convert import from_networkx
from torch_geometric.transforms import LargestConnectedComponents
from torch_geometric.utils import to_networkx
from torch_geometric.nn import Node2Vec

In [3]:
from datasets import DataLoader

# <font color="grey"> $\quad$ New Autonomous Systems dataset </font>

$\newcommand{\vct}[1]{\mathbf{#1}}$
$\newcommand{\mtx}[1]{\mathbf{#1}}$
$\newcommand{\e}{\varepsilon}$
$\newcommand{\norm}[1]{\|#1\|}$
$\newcommand{\minimize}{\mathrm{minimize}\quad}$
$\newcommand{\maximize}{\mathrm{maximize}\quad}$
$\newcommand{\subjto}{\quad\text{subject to}\quad}$
$\newcommand{\R}{\mathbb{R}}$
$\newcommand{\C}{\mathbb{C}}$
$\newcommand{\N}{\mathbb{N}}$
$\newcommand{\Z}{\mathbb{Z}}$
$\newcommand{\Prob}{\mathbb{P}}$
$\newcommand{\Expect}{\mathbb{E}}$
$\newcommand{\Cov}{\mathrm{Cov}}$
$\newcommand{\Var}{\mathrm{Var}}$
$\newcommand{\trans}{T}$
$\newcommand{\ip}[2]{\langle {#1}, {#2} \rangle}$
$\newcommand{\zerovct}{\vct{0}}$
$\newcommand{\diff}[1]{\mathrm{d}{#1}}$
$\newcommand{\conv}{\operatorname{conv}}$
$\newcommand{\inter}{{\operatorname{int}}}$

### <font color="grey">  Table of Contents</font>

1. #### <a href='#chapter1'>Data</a>
2. #### <a href='#chapter2'>Embedding</a>
3. #### <a href='#chapter3'>Visualisation</a>

###  <a id='chapter1'> <font color="grey">1. Data </font></a>

The data can be accessed via the dataloader. It is saved in the datasets/data/nas directory in two parquet files. 

In [4]:
# Create the project data loader, selecting the 'nAS' source
dl = DataLoader(source='nAS')

In [5]:
# Get the edges and preview the first rows
edge_df = dl.get_edges()
edge_df.head()

timestamp,source,dest,weight
datetime[μs],i64,i64,i64
2024-09-29 00:00:00,8151,1840,1
2024-09-29 00:00:00,8151,10420,1
2024-09-29 00:00:00,8151,136907,1
2024-09-29 00:00:00,8151,7173,1
2024-09-29 00:00:00,8151,28391,1


The weight column has no relevance; it is always 1.

In [6]:
# Get the nodes and preview the first rows
node_df = dl.get_nodes()
node_df.head()

timestamp,nodes,country_code
datetime[μs],i64,i64
2024-09-29 00:00:00,8151,150
2024-09-29 00:00:00,1840,150
2024-09-29 00:00:00,25220,52
2024-09-29 00:00:00,198570,52
2024-09-29 00:00:00,4657,191


The nodes have one feature: the country, stored as a numeric code. This can be used to label and identify different subgraphs. To map these codes to countries, use the `country_codes.parquet` file.

In [7]:
# Read the lookup table that pairs each numeric index with a country string
country_codes = pl.read_parquet('./datasets/data/nas/country_codes.parquet')
country_codes

index,country
i64,str
0,"""AD"""
1,"""AE"""
2,"""AF"""
3,"""AG"""
4,"""AI"""
…,…
236,"""YT"""
237,"""ZA"""
238,"""ZM"""
239,"""ZW"""


In [8]:
# The timestamp column consists of datetime objects; inspect the first entry
date = node_df['timestamp'][0]
date

datetime.datetime(2024, 9, 29, 0, 0)

In [9]:
# List the dates provided by the loader; they are used below to index the per-date structures
dates = dl.get_dates()
dates

[datetime.datetime(2024, 9, 29, 0, 0),
 datetime.datetime(2024, 9, 30, 0, 0),
 datetime.datetime(2024, 10, 1, 0, 0),
 datetime.datetime(2024, 10, 2, 0, 0),
 datetime.datetime(2024, 10, 3, 0, 0),
 datetime.datetime(2024, 10, 4, 0, 0),
 datetime.datetime(2024, 10, 5, 0, 0),
 datetime.datetime(2024, 10, 6, 0, 0),
 datetime.datetime(2024, 10, 7, 0, 0),
 datetime.datetime(2024, 10, 8, 0, 0),
 datetime.datetime(2024, 10, 9, 0, 0),
 datetime.datetime(2024, 10, 10, 0, 0),
 datetime.datetime(2024, 10, 11, 0, 0),
 datetime.datetime(2024, 10, 12, 0, 0),
 datetime.datetime(2024, 10, 13, 0, 0)]

In [10]:
# The edge list is indexed by date; show the first ten edges (node pairs) of the second date
edge_list = dl.get_edge_list()
edge_list[dates[1]][:10]

100%|███████████████████████████████████████████| 15/15 [00:02<00:00,  5.72it/s]


[(51185, 8913),
 (51185, 262605),
 (51185, 266989),
 (51185, 28624),
 (51185, 43350),
 (51185, 203478),
 (51185, 268700),
 (51185, 272539),
 (51185, 202146),
 (51185, 273573)]

In [11]:
# Data is available in Raphtory and NetworkX formats. Node features are not implemented yet, so we just get the graph.
# NOTE(review): the progress bar counts 15 steps, so this presumably builds one graph per date — confirm.
nxg = dl.get_networkx()

100%|███████████████████████████████████████████| 15/15 [00:08<00:00,  1.73it/s]


In [14]:
# Finally, we can also get a graph in the torch-geometric format, which is useful for graph neural networks
tg = dl.get_tgeometric()

100%|███████████████████████████████████████████| 15/15 [00:04<00:00,  3.72it/s]


There may be a bug in the 

In [None]:
tg = tg[dates[0]] # Take graph from first day

A description of how pytorch-geometric deals with its graph data structure can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html).

In [None]:
# Show the edges
tg.edge_index

In [None]:
# Show nodes
tg.nodes

In [None]:
# Show node features (see the country dictionary above for the meaning of these codes)
tg.x

###  <a id='chapter2'> <font color="grey">2. Embedding </font></a>

In [None]:
# TODO: compute graph embeddings with a VGAE (variational graph auto-encoder)

###  <a id='chapter3'> <font color="grey">3. Visualisation </font></a>

In [None]:
# TODO: use UMAP to visualise the graph embeddings for different days