In [1]:
import pathlib
import pickle
import re

import geopandas as gpd
import pandas as pd
import networkx as nx
import shapely

import matplotlib.pyplot as plt

# Generate graph
This notebook generates a graph based on the EURIS network. 

## Data model of EURIS compared to a graph
In EURIS each section has 2 unique nodes. If you connect two edges there will be two nodes on the connection point. If you connect three edges in one location, three nodes are used on that location. 
In a graph (e.g. data model in networkx) each node is unique. An edge consists of a pair of nodes. 
To transform the data model of EURIS to a graph we combine nodes that co-occur in one location. For each node we remember the set of euris nodes that corresponds to this node. 

## Steps
Generating the graph consists of several steps:

- Reading the data
    - Defining paths
    - Reading sections and nodes
- Node / edge administration
    - Merging nodes and sections
    - Find first and second node per section
    - Build the edge topology
    - Combine the edge topology with the sections
- Check, build, resolve
    - Check for spatial errors
    - Build the network based on edges
    - Resolve doubled nodes
- Borders
    - Connect border nodes
- Add extra info
    - Compute subgraphs
- Export
    - Store subgraph identifier in nodes and edges

## Read the data
We have already exported all the files from EURIS using the notebook `latest-downloads.ipynb`. These networks are concatenated to a similar structure using the notebook `concat-network.ipynb`. So you have to run those two notebooks first.
All files have similar names. We store the export's version number in the file name. 

In [2]:
# Version tag of the export; it is embedded in every input and output file name.
version = 'v0.1.0'
# Directory (below the user's home) that holds the concatenated EURIS files.
data_dir = pathlib.Path('~/data/euris').expanduser()

In [3]:
# Inputs: the concatenated node and section tables.
node_path = data_dir.joinpath(f'nodes-{version}.geojson')
section_path = data_dir.joinpath(f'sections-{version}.geojson')

# Outputs: graph nodes and edges as GeoJSON, plus the pickled networkx graph.
export_node_path = data_dir.joinpath(f'export-nodes-{version}.geojson')
export_edge_path = data_dir.joinpath(f'export-edges-{version}.geojson')
export_pickle_path = data_dir.joinpath(f'export-graph-{version}.pickle')

In [4]:
# Load both input tables as GeoDataFrames.
node_gdf = gpd.read_file(node_path)
section_gdf = gpd.read_file(section_path)

# Node / edge administration

We're starting by merging nodes and sections. We need several fields:
- `section_gdf.code` -> section id
- `node_gdf.sectionref` -> section id
- `node_gdf.node_id` -> node id (preprocessed by concat networks)

Each node_gdf row is a connection between a node and one of its connecting sections. 
Each section is an edge in the network. Unfortunately it doesn't specify source and target node. So we have to deduce that from the node network. 

## Infer node / edge topology
We will create a list of all node section combinations and then find first and second (last) node per section. 

In [5]:
# Pair every section with the node rows that reference it (inner join on the section id).
section_codes = section_gdf[['code']]
node_refs = node_gdf[['sectionref', 'node_id']]
node_section = pd.merge(
    section_codes,
    node_refs,
    left_on='code',
    right_on='sectionref',
)
# 'code' equals 'sectionref' after the join, so only the latter is kept.
node_section = node_section[['sectionref', 'node_id']]
node_section

Unnamed: 0,sectionref,node_id
0,HU0000100001,HU_J0001
1,HU0000100001,HU_J0002
2,HU0000100002,HU_J0002
3,HU0000100002,HU_J0003
4,HU0000100003,HU_J0003
...,...,...
13473,RO0000123700,RO_19750
13474,RO00001D7500,RO_D5940
13475,RO00001D7500,RO_D7500
13476,RO00005C1160,RO_C1000


In [6]:
# Group once per section; the first listed node becomes the start of the
# edge and the last listed node becomes its end.
nodes_per_section = node_section.groupby('sectionref')
left_df = nodes_per_section.first()
right_df = nodes_per_section.last()

# Combine left / right nodes
Now we have a first and last node per edge. We can combine these into a graph based network administration. 

In [7]:
# Join first and last node on the shared 'sectionref' index and give the
# two node columns graph-style names.
edge_df = (
    left_df
    .merge(right_df, left_index=True, right_index=True, suffixes=('_from', '_to'))
    .rename(columns={"node_id_from": "source", "node_id_to": "target"})
)
edge_df

Unnamed: 0_level_0,source,target
sectionref,Unnamed: 1_level_1,Unnamed: 2_level_1
AT0000100001,AT_J0035,AT_J0072
AT0000100002,AT_J0002,AT_J0074
AT0000100003,AT_J0023,AT_J0053
AT0000100004,AT_J0053,AT_J0054
AT0000100005,AT_J0054,AT_J0027
...,...,...
SK0000100014,SK_J0006,SK_J0007
SK0000100015,SK_J0007,SK_J0008
SK0000100016,SK_J0022,SK_J0018
SK0000200001,SK_J0017,SK_J0003


## Merge administration with edges
Now we can combine our edge administration with our edges. Then we have our old properties back, plus extra columns: `sectionref`, `source`, `target`. 

In [8]:
# Attach source / target node ids to every section.
# `section_gdf` is rebound to the merge result, so any columns left over from a
# previous run of this cell are dropped first. Without that, re-running the cell
# would produce suffixed duplicates (`source_x`, `source_y`, ...) and the graph
# construction would no longer find its `source` / `target` columns.
section_gdf = (
    section_gdf
    .drop(columns=['sectionref', 'source', 'target'], errors='ignore')
    .merge(edge_df.reset_index(), left_on='code', right_on='sectionref')
)
section_gdf.head()

Unnamed: 0,name,name_cb,cntrycode,cntrycode_cb,fw_code,fw_code_cb,seq_nr,seq_nr_cb,code_cb,ww_name,...,active,ww_charges,remark,istentec,code,path,geometry,sectionref,source,target
0,Duna,,HU,,1,,1,,,Duna,...,1,0.0,,,HU0000100001,FairwaySection_HU_20241118.geojson,"LINESTRING (18.8142 45.9087, 18.8131 45.9093, ...",HU0000100001,HU_J0001,HU_J0002
1,Duna,,HU,,1,,2,,,Duna,...,1,0.0,,,HU0000100002,FairwaySection_HU_20241118.geojson,"LINESTRING (18.9228 46.1781, 18.9233 46.1788, ...",HU0000100002,HU_J0002,HU_J0003
2,Duna,,HU,,1,,3,,,Duna,...,1,0.0,,,HU0000100003,FairwaySection_HU_20241118.geojson,"LINESTRING (18.9699 47.0256, 18.9701 47.0247, ...",HU0000100003,HU_J0003,HU_J0004
3,Duna,,HU,,1,,4,,,Duna,...,1,0.0,,,HU0000100004,FairwaySection_HU_20241118.geojson,"LINESTRING (19.0509 47.437, 19.0506 47.4361, 1...",HU0000100004,HU_J0004,HU_J0005
4,Duna,,HU,,1,,5,,,Duna,...,1,0.0,,,HU0000100005,FairwaySection_HU_20241118.geojson,"LINESTRING (19.0518 47.4395, 19.0515 47.4387, ...",HU0000100005,HU_J0005,HU_J0006


## Check administration
Check that all the nodes that we combined are in the same location. 

In [9]:
def _hull_area(points):
    """Area of the convex hull spanned by the points of one group."""
    return shapely.MultiPoint(points).convex_hull.area


# Nodes that share an objectcode within a country should coincide, which
# gives a hull area of 0. The largest areas are listed first;
# almost all are 0, just one junction in Belgium is not.
hull_area_per_node = (
    node_gdf
    .groupby(['objectcode', 'countrycode'])
    .agg({"geometry": _hull_area})
    .sort_values('geometry', ascending=False)
)
hull_area_per_node.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,geometry
objectcode,countrycode,Unnamed: 2_level_1
J8219,BE,5.0000000000000005e-17
00000,RO,0.0
J3127,DE,0.0
J3132,DE,0.0
J3131,NL,0.0


# Create the graph
Now based on the edges we can create our initial graph. 

In [10]:
# One edge per section row, keyed by its source / target node ids;
# all remaining columns are stored as edge attributes.
graph = nx.from_pandas_edgelist(
    section_gdf,
    source='source',
    target='target',
    edge_attr=True,
)

## Update node information
We update the information in the nodes. Some information (e.g. names) will be overwritten: multiple EURIS nodes at the same location have multiple names, and here we only keep the values of the last one. We store the original node information in `euris_nodes`. You can use that to find all the node instances at this node location. 

In [11]:
# Collect the raw EURIS rows per graph node first and assign the lists afterwards.
# Appending to the list that is already stored on the graph would duplicate
# every record each time this cell is re-run.
euris_nodes_by_id = {}
for _, row in node_gdf.iterrows():
    record = row.to_dict()
    node_id = row['node_id']
    euris_nodes_by_id.setdefault(node_id, []).append(record)
    # Node attributes mirror the row; later rows for the same node overwrite
    # earlier ones, so the last row wins.
    graph.nodes[node_id].update(record)

for node_id, records in euris_nodes_by_id.items():
    # All EURIS node rows that were combined into this graph node.
    graph.nodes[node_id]['euris_nodes'] = records

# Connect borders
The borders are not connected by default. There are several ways to connect borders. 
One way is to use the concept of `borderpoints` in the nodes table.  See all the variants in the [developer](https://data.eurisportal.eu/doc/DataModel%20EuRIS%20-%20Explanation_v1.1.pdf) docs (section 3). 

## Approach described in 3.3.1 / 3.3.2
Lookup all border_nodes that are connected using borderpoint and locode.

In [12]:
# Only rows that carry a borderpoint reference take part in border connections.
has_borderpoint = node_gdf['borderpoint'].notna()
border_node_gdf = node_gdf[has_borderpoint]

# This is an example where the first record specifies the locode of the 2nd row.
# The 2nd row does not properly specify the borderpoint (it assumes
# HUXXX00001J000214331 is present, following the second method, which it does not seem to be).
example_columns = ['node_id', 'objectcode', 'sectionref', 'hectom', 'locode', 'function', 'borderpoint']
in_hr_or_hu = border_node_gdf['countrycode'].isin(['HR', 'HU'])
border_node_gdf.loc[in_hr_or_hu, example_columns]



Unnamed: 0,node_id,objectcode,sectionref,hectom,locode,function,borderpoint
138,HU_J0001,J0001,HU0000100001,14332,HUXXX00001J000114332,junction,HRXXX00001J000214331
11814,HR_J0002,J0002,HR0000114331,14331,HRXXX00001J000214331,Junction,HU
11827,HR_J0008,J0008,HR0000113635,12955,HRXXX00001J000812955,Junction,RS


In [13]:
# Resolve border connections: a node whose borderpoint equals the locode of
# another border node is connected to that node.
border_locode_connections = (
    border_node_gdf[['node_id', 'borderpoint']]
    .merge(
        border_node_gdf[['node_id', 'locode']],
        left_on='borderpoint',
        right_on='locode',
    )
    # the left node_id references the right node_id
    .rename(columns={'node_id_x': 'source', 'node_id_y': 'target'})
)
border_locode_connections

Unnamed: 0,source,borderpoint,target,locode
0,SK_J0001,ATXXX00001J001018802,AT_J0010,ATXXX00001J001018802
1,SK_J0002,ATXXX00001J001118727,AT_J0011,ATXXX00001J001118727
2,AT_J0010,SKXXX00001J000118802,SK_J0001,SKXXX00001J000118802
3,AT_J0011,SKXXX00001J000218727,SK_J0002,SKXXX00001J000218727
4,AT_J0023,DEXXX00401J006022018,DE_J0060,DEXXX00401J006022018
...,...,...,...,...
58,BE_F5199,NLSVW0150CJ245100111,NL_J2451,NLSVW0150CJ245100111
59,BE_F6262,NLBUD00121J428800466,NL_J4288,NLBUD00121J428800466
60,BE_F6151,NLSVG00130J198900165,NL_J1989,NLSVG00130J198900165
61,BE_W3531,NLMST00150J234400000,NL_J2344,NLMST00150J234400000


In [14]:
# This is an example of such a border connection
border_codes = border_node_gdf[['node_id', 'locode', 'borderpoint']]
border_codes.query('node_id == "SK_J0022"')

Unnamed: 0,node_id,locode,borderpoint
197,SK_J0022,SKXXX00001J002217938,HUXXX00001J001517938
198,SK_J0022,SKXXX00001J002217938,HUXXX00001J001517938


In [15]:
# An example that does not seem to be connected properly (reported):
# no border node has the locode that SK_J0022 points to.
# HUXXX00003J001500000
# HUXXX00001J001517938
border_codes = border_node_gdf[['node_id', 'locode', 'borderpoint']]
border_codes.query('locode == "HUXXX00001J001517938"')

Unnamed: 0,node_id,locode,borderpoint


In [16]:

def geometry_for_border(row):
    """Return a straight line between the two nodes of a border connection.

    `row` must provide a 'source' and a 'target' entry holding node ids of the
    module-level `graph`. The 'geometry' attribute of both nodes is used as the
    start and end of the line, in that order.
    """
    end_points = [graph.nodes[row[key]]['geometry'] for key in ('source', 'target')]
    return shapely.LineString(end_points)
    
# One straight source -> target line per border connection.
border_geometries = border_locode_connections.apply(geometry_for_border, axis=1)
border_locode_connections['geometry'] = border_geometries

# This is an example of a connected border.
is_sk_source = border_locode_connections['source'].str.contains('SK_J00')
border_locode_connections[is_sk_source]


Unnamed: 0,source,borderpoint,target,locode,geometry
0,SK_J0001,ATXXX00001J001018802,AT_J0010,ATXXX00001J001018802,"LINESTRING (16.97612946 48.17219739, 16.976129..."
1,SK_J0002,ATXXX00001J001118727,AT_J0011,ATXXX00001J001118727,"LINESTRING (17.0572242 48.14373057, 17.0572242..."


In [17]:
# Add the border connections as edges of the main graph.
# Note that these edges only carry borderpoint, locode and geometry;
# they have none of the section properties.
border_graph = nx.from_pandas_edgelist(border_locode_connections, edge_attr=True)
graph.add_edges_from(border_graph.edges(data=True))

# Store for every edge whether it is a border edge
for edge_key, edge_attrs in graph.edges.items():
    edge_attrs['is_border'] = edge_key in border_graph.edges
        
        

In [18]:
# Inspect the attributes of one of the newly added border edges.
graph.edges['SK_J0001', 'AT_J0010']

{'borderpoint': 'SKXXX00001J000118802',
 'locode': 'SKXXX00001J000118802',
 'geometry': <LINESTRING (16.976 48.172, 16.976 48.172)>,
 'is_border': True}

## Approach in 3.3.1
This approach does not seem to be applied. No borderpoints match this construction. 

In [19]:
# Self-join of the border nodes: a node matches another one when its own
# (countrycode, objectcode) equals the other's (borderpoint, objectcode).
border_columns = ['node_id', 'borderpoint', 'objectcode', 'countrycode']
border_objectcode_connections = border_node_gdf[border_columns].merge(
    border_node_gdf[border_columns],
    left_on=['countrycode', 'objectcode'],
    right_on=['borderpoint', 'objectcode'],
)
border_objectcode_connections

Unnamed: 0,node_id_x,borderpoint_x,objectcode,countrycode_x,node_id_y,borderpoint_y,countrycode_y


## Compute subgraphs
To check the connectivity of the network we compute subgraphs. 
This is the full list of subgraphs. Store the subgraph identifier for all nodes and edges. 

In [20]:
# Number the connected components and store that number on every node and
# edge of the component. The subgraph view shares its attribute dicts with
# `graph`, so writing through the view updates the main graph.
for component_id, component_nodes in enumerate(nx.connected_components(graph)):
    component_view = graph.subgraph(component_nodes)
    for _, _, edge_attrs in component_view.edges(data=True):
        edge_attrs['subgraph'] = component_id
    for _, node_attrs in component_view.nodes(data=True):
        node_attrs['subgraph'] = component_id

## Export
Export the network to different formats. GeoJSON is a suitable format as it supports nested lists (used for the EURIS nodes per node). 

In [21]:
# Build flat tables from the graph and write them as GeoJSON.
# Dedicated export_* names are used on purpose: rebinding `edge_df` / `node_gdf`
# here would clobber the tables that the earlier cells read, so those cells
# could no longer be re-run after an export.
export_edge_df = pd.DataFrame(
    data=graph.edges.values(),
    index=graph.edges.keys(),
).reset_index(names=['source', 'target'])
export_edge_gdf = gpd.GeoDataFrame(export_edge_df, crs='EPSG:4326')

export_node_df = pd.DataFrame(
    data=graph.nodes.values(),
    index=graph.nodes.keys(),
).reset_index(names=['n'])
export_node_gdf = gpd.GeoDataFrame(export_node_df, crs='EPSG:4326')

export_edge_gdf.to_file(export_edge_path)
export_node_gdf.to_file(export_node_path)

In [22]:

# Store the complete networkx graph (including all node and edge attributes).
with open(export_pickle_path, mode='wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(graph)