In [110]:
import pandas as pd
import geopandas as gpd
import osmnx as ox
import sys
sys.path.append('../.') 
from src.graph.create_osmnx_graph import OSMnxGraph
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import networkx as nx
%reload_ext autoreload
from src.baseline_models.GCN.supervised_node_classification import SupervisedNodeClassificationGNN
from src.baseline_models.GCN.gcn import GCNModel
import torch
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from src.baseline_models.GCN.GraphData import GraphData
import warnings
warnings.filterwarnings('ignore')
from shapely.geometry import Point

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Wczytanie danych

In [111]:
df_accidents = pd.read_csv("../data/wypadki-pl/accidents.csv")
df_accidents.drop(df_accidents[(df_accidents['mie_nazwa'] != 'Warszawa')].index, inplace=True)
df_accidents.drop(columns='uczestnicy', inplace=True)
geometry = [Point(xy) for xy in zip(df_accidents['wsp_gps_x'], df_accidents['wsp_gps_y'])]
gdf_accidents = gpd.GeoDataFrame(df_accidents, geometry=geometry)
gdf_accidents.drop(columns=['wsp_gps_x', 'wsp_gps_y'], inplace=True)
G =ox.graph.graph_from_place("Warsaw, Poland", network_type="drive")
gdf_nodes, gdf_edges = ox.graph_to_gdfs(G)
gdf_nodes

Unnamed: 0_level_0,y,x,street_count,highway,ref,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26063726,52.152787,21.017454,5,,,POINT (21.01745 52.15279)
26063848,52.157570,20.991392,4,,,POINT (20.99139 52.15757)
26063857,52.166411,20.992301,3,,,POINT (20.99230 52.16641)
26063923,52.170803,20.992247,3,,,POINT (20.99225 52.17080)
26083887,52.176026,20.995141,3,,,POINT (20.99514 52.17603)
...,...,...,...,...,...,...
11795306888,52.284337,21.051939,1,,,POINT (21.05194 52.28434)
11795306892,52.284337,21.052441,3,,,POINT (21.05244 52.28434)
11795306899,52.285418,21.051583,1,,,POINT (21.05158 52.28542)
11795306921,52.284486,21.052052,1,,,POINT (21.05205 52.28449)


In [112]:
gdf_edges

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,osmid,oneway,lanes,name,highway,reversed,length,maxspeed,geometry,access,junction,bridge,tunnel,ref,width
u,v,key,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
26063726,31717945,0,4895484,True,4,Poleczki,secondary,False,25.021,,"LINESTRING (21.01745 52.15279, 21.01780 52.15271)",,,,,,
26063726,29273144,0,"[1014952979, 1014952978, 1007148203, 148749604]",True,"[3, 4]",Puławska,primary,False,336.457,60,"LINESTRING (21.01745 52.15279, 21.01745 52.152...",,,,,,
26063848,1391593855,0,"[13982288, 125098266, 896320940, 1203516007]",False,2,"[Nowy Służewiec, Wyczółki]",residential,"[False, True]",996.078,,"LINESTRING (20.99139 52.15757, 20.99129 52.157...",,,,,,
26063848,258495263,0,23853023,False,2,Zatorze,tertiary,False,353.996,,"LINESTRING (20.99139 52.15757, 20.99127 52.157...",,,,,,
26063848,26083906,0,715903941,False,,Wyczółki,residential,False,66.590,,"LINESTRING (20.99139 52.15757, 20.99147 52.157...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11795306899,11795306883,0,1270353666,False,,,living_street,True,89.591,20,"LINESTRING (21.05158 52.28542, 21.05159 52.285...",,,,,,
11795306921,11795306923,0,1270353669,False,,,living_street,False,14.146,20,"LINESTRING (21.05205 52.28449, 21.05205 52.284...",,,,,,
11795306923,11795306892,0,1270353667,False,,,living_street,False,23.305,20,"LINESTRING (21.05212 52.28437, 21.05212 52.284...",,,,,,
11795306923,11795306888,0,1270353667,False,,,living_street,True,14.195,20,"LINESTRING (21.05212 52.28437, 21.05211 52.284...",,,,,,


## Informacje o przetwarzaniu cech

### Krawędzie
Wszystkie zgromadzone cechy potraktowano analogicznie jak w pracy magisterskiej z road networks za wyjątkiem atrybutów: psv, service, busway, bicycle, cycleway i surface, które nie pojawiły się w zestawieniu (po prostu nie było ich po pobraniu przez osmnx).

Preprocessing obejmował:
* lanes - dodanie domyślnej wartości 2
* maxspeed - dodanie domyślnej wartości 50
* width - dodanie domyślnej wartości 2.0
* pozostałe - dodanie wartości "unspecified"


### Wierzchołki

Preprocessing obejmował rozbicie CountVectorizerem wartości z kolumny "highway", "street_count" pozostawiono jako wartość numeryczną bez zmian



W obu przypadkach (krawędzie i wierzchołki) usunięto kolumnę "ref".


# Agregacja do wierzchołków

## 1. Tworzenie grafu i statystyki

In [113]:
graph_embedder = OSMnxGraph(gdf_accidents, gdf_nodes, gdf_edges)

graph_data = graph_embedder.create_graph(aggregation_type='node')
graph_data

Data(x=[19365, 8], edge_index=[2, 44177], y=[19365], street_count=[19365], highway=[19365], ref=[19365], accidents_count=[19365], osmid=[44177], oneway=[44177], lanes=[44177], name=[44177], edge_highway=[44177], reversed=[44177], length=[44177], maxspeed=[44177], geometry=[44177], access=[44177], junction=[44177], bridge=[44177], tunnel=[44177], edge_ref=[44177], width=[44177], crs=epsg:4326)

In [114]:
graph_embedder.show_statistics()

{'Nodes': 19365,
 'Edges': 44177,
 'Nodes dim': 8,
 'Nodes class': 2,
 'Directed': True,
 'Graph density [%]': 0.012}

In [115]:
graph_data.num_features

8

In [116]:
len(graph_data.y.unique())

2

Poniżej znajduje się wywołanie metody do pozyskiwania cech wierzchołków.

In [117]:
features = graph_embedder.get_node_attrs()
features

Unnamed: 0_level_0,street_count,crossing,mini_roundabout,motorway_junction,speed_camera,traffic_signals,turning_circle,turning_loop
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
26063726,5,0,0,0,0,0,0,0
26063848,4,0,0,0,0,0,0,0
26063857,3,0,0,0,0,0,0,0
26063923,3,0,0,0,0,0,0,0
26083887,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
11795306888,1,0,0,0,0,0,0,0
11795306892,3,0,0,0,0,0,0,0
11795306899,1,0,0,0,0,0,0,0
11795306921,1,0,0,0,0,0,0,0


## 2. Uczenie i testowanie modelu - Supervised node classification

In [118]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_features = graph_data.num_features
hidden_dim = 256
out_dim = 128
num_classes = len(graph_data.y.unique())

gnn = GCNModel(in_dim=num_features, hidden_dim=hidden_dim, out_dim=out_dim)

model = SupervisedNodeClassificationGNN(gnn=gnn, emb_dim=out_dim, num_classes=num_classes, lr=0.0001)

In [119]:

nodes_labels = {'node': features.index.to_numpy(), 'label': graph_data.y.cpu().numpy()}
df_to_split = pd.DataFrame(nodes_labels)

df_train, df_test = train_test_split(df_to_split, test_size=0.2, random_state=42, stratify=df_to_split['label'])
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test['label'])

train_nodes = df_train['node']
val_nodes = df_val['node']
test_nodes = df_test['node']

train_mask = []   
val_mask = []
test_mask = []

for i in range(len(graph_data.y)):

  if i in train_nodes:
    train_mask.append(True)
    val_mask.append(False)
    test_mask.append(False)
  elif i in val_nodes:
    train_mask.append(False)
    val_mask.append(True)
    test_mask.append(False)
  elif i in test_nodes:
    train_mask.append(False)
    val_mask.append(False)
    test_mask.append(True)

graph_data.train_mask = torch.tensor(train_mask).cpu()
graph_data.val_mask = torch.tensor(val_mask).cpu()
graph_data.test_mask = torch.tensor(test_mask).cpu()

In [None]:

trainer = pl.Trainer(max_epochs=50)
datamodule = GraphData([graph_data])
trainer.fit(model=model, datamodule=datamodule)

In [121]:
trainer.test(model=model, datamodule=datamodule, verbose=False)

Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.46it/s]


[{'step': 50.0,
  'test/auc_weighted': 0.4092303216457367,
  'test/f1_weighted': 0.7625361680984497,
  'test/precision_weighted': 0.7003365159034729,
  'test/recall_weighted': 0.8368611335754395,
  'test/accuracy_weighted': 0.8368611335754395}]

In [122]:
test_auc = trainer.test(model=model, datamodule=datamodule, verbose=False)[0]["test/auc_weighted"]

z, y, y_pred = trainer.predict(model=model, datamodule=datamodule)[0]

print(f'AUC test = {test_auc * 100.:.2f}[%]')

Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.53it/s]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 23.22it/s]
AUC test = 40.92[%]


In [123]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [124]:
print(y[test_mask].numpy())

[0 0 0 ... 0 1 0]


# Agregacja do krawędzi

## 1. Tworzenie grafu i statystyki

In [125]:
graph_embedder = OSMnxGraph(gdf_accidents, gdf_nodes, gdf_edges)

graph_data = graph_embedder.create_graph(aggregation_type='edge')
graph_data

Data(x=[44177, 36], edge_index=[2, 44177], y=[44177], street_count=[19365], highway=[19365], ref=[19365], accidents_count=[19365], osmid=[44177], oneway=[44177], lanes=[44177], name=[44177], edge_highway=[44177], reversed=[44177], length=[44177], maxspeed=[44177], geometry=[44177], access=[44177], junction=[44177], bridge=[44177], tunnel=[44177], edge_ref=[44177], width=[44177], edge_accidents_count=[44177], crs=epsg:4326)

In [126]:
graph_embedder.show_statistics()

{'Nodes': 44177,
 'Edges': 44177,
 'Nodes dim': 36,
 'Nodes class': 2,
 'Directed': True,
 'Graph density [%]': 0.002}

In [127]:
features = graph_embedder.get_edge_attrs()

## 2. Uczenie i testowanie modelu - Supervised node classification

In [None]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_features = graph_data.num_features
hidden_dim = 256
out_dim = 128
num_classes = len(graph_data.y.unique())

gnn = GCNModel(in_dim=num_features, hidden_dim=hidden_dim, out_dim=out_dim)

model = SupervisedNodeClassificationGNN(gnn=gnn, emb_dim=out_dim, num_classes=num_classes, lr=0.0001)

nodes_labels = {'node': features.index.to_numpy(), 'label': graph_data.y.cpu().numpy()}
df_to_split = pd.DataFrame(nodes_labels)

df_train, df_test = train_test_split(df_to_split, test_size=0.2, random_state=42, stratify=df_to_split['label'])
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test['label'])

train_nodes = df_train['node']
val_nodes = df_val['node']
test_nodes = df_test['node']

train_mask = []   
val_mask = []
test_mask = []

for i in range(len(graph_data.y)):

  if i in train_nodes:
    train_mask.append(True)
    val_mask.append(False)
    test_mask.append(False)
  elif i in val_nodes:
    train_mask.append(False)
    val_mask.append(True)
    test_mask.append(False)
  elif i in test_nodes:
    train_mask.append(False)
    val_mask.append(False)
    test_mask.append(True)

graph_data.train_mask = torch.tensor(train_mask).cpu()
graph_data.val_mask = torch.tensor(val_mask).cpu()
graph_data.test_mask = torch.tensor(test_mask).cpu()

trainer = pl.Trainer(max_epochs=50)
datamodule = GraphData([graph_data])
trainer.fit(model=model, datamodule=datamodule)

In [129]:
trainer.test(model=model, datamodule=datamodule, verbose=False)

Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 12.81it/s]


[{'step': 50.0,
  'test/auc_weighted': 0.3399044871330261,
  'test/f1_weighted': 0.8851123452186584,
  'test/precision_weighted': 0.8507536053657532,
  'test/recall_weighted': 0.9223629832267761,
  'test/accuracy_weighted': 0.9223629832267761}]

In [130]:
test_auc = trainer.test(model=model, datamodule=datamodule, verbose=False)[0]["test/auc_weighted"]

z, y, y_pred = trainer.predict(model=model, datamodule=datamodule)[0]

print(f'AUC test = {test_auc * 100.:.2f}[%]')

Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.70it/s]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.53it/s]
AUC test = 33.99[%]


In [131]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [132]:
y_pred.max()

0