In [1]:
import pandas as pd
import geopandas as gpd
import osmnx as ox
import sys
sys.path.append('../.') 
from src.graph.create_osmnx_graph import OSMnxGraph

%load_ext autoreload
%autoreload 2

In [2]:
df_accidents = pd.read_csv("../data/wypadki-pl/accidents.csv")
df_accidents.drop(df_accidents[(df_accidents['mie_nazwa'] != 'Warszawa')].index, inplace=True)
df_accidents.drop(columns='uczestnicy', inplace=True)
from shapely.geometry import Point
geometry = [Point(xy) for xy in zip(df_accidents['wsp_gps_x'], df_accidents['wsp_gps_y'])]
gdf_acc = gpd.GeoDataFrame(df_accidents, geometry=geometry)
gdf_acc.drop(columns=['wsp_gps_x', 'wsp_gps_y'], inplace=True)
G =ox.graph.graph_from_place("Warsaw, Poland", network_type="drive")
gdf_nodes, gdf_edges = ox.graph_to_gdfs(G)
gdf_nodes

Unnamed: 0_level_0,y,x,street_count,highway,ref,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26063726,52.152787,21.017454,5,,,POINT (21.01745 52.15279)
26063848,52.157570,20.991392,4,,,POINT (20.99139 52.15757)
26063857,52.166411,20.992301,3,,,POINT (20.99230 52.16641)
26063923,52.170803,20.992247,3,,,POINT (20.99225 52.17080)
26083887,52.176026,20.995141,3,,,POINT (20.99514 52.17603)
...,...,...,...,...,...,...
11707394732,52.294628,21.022652,1,,,POINT (21.02265 52.29463)
11711719556,52.297432,21.043383,3,,,POINT (21.04338 52.29743)
11721637528,52.297728,21.046037,3,,,POINT (21.04604 52.29773)
11730365878,52.204687,20.880493,1,,,POINT (20.88049 52.20469)


## Node aggregation version

In [32]:
graph_embedder = OSMnxGraph(gdf_acc, gdf_nodes, gdf_edges)

graph_data = graph_embedder.create_graph(aggregation_type='node')
graph_data

10649521808
osmid
33124279       [20.9924766, 52.2318132]
3106862744     [20.9923747, 52.2319822]
10649521797    [20.9928828, 52.2314206]
10649521808    [20.9926346, 52.2318443]
10649521814     [20.9925175, 52.232021]
Name: geometry, dtype: object
abcd
osmid
33124279       0.000169
3106862744     0.000291
10649521797    0.000504
10649521808    0.000017
10649521814    0.000201
dtype: float64
<class 'pandas.core.series.Series'>
osmid
10649521808    0.000017
33124279       0.000169
10649521814    0.000201
3106862744     0.000291
10649521797    0.000504
Name: 0, dtype: float64
<class 'pandas.core.frame.DataFrame'>


AttributeError: 'Series' object has no attribute 'osmid'

In [None]:
graph_embedder.show_statistics()

{'Nodes': 19352,
 'Edges': 44148,
 'Nodes dim': 8,
 'Nodes class': 2,
 'Directed': True,
 'Graph density [%]': 0.012}

In [None]:
graph_data.num_features

8

In [7]:
len(graph_data.y.unique())

2

In [8]:
from src.baseline_models.GCN.supervised_node_classification import SupervisedNodeClassificationGNN
from src.baseline_models.GCN.gcn import GCNModel
import torch

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_features = graph_data.num_features
hidden_dim = 256
out_dim = 128
num_classes = len(graph_data.y.unique())

gnn = GCNModel(in_dim=num_features, hidden_dim=hidden_dim, out_dim=out_dim)

model = SupervisedNodeClassificationGNN(gnn=gnn, emb_dim=out_dim, num_classes=num_classes, lr=0.0001)

In [10]:
graph_data.x

tensor([[5., 0., 0.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [3., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])

In [11]:
graph_data

Data(x=[19352, 8], edge_index=[2, 44148], y=[19352], street_count=[19352], highway=[19352], ref=[19352], accidents_count=[19352], osmid=[44148], oneway=[44148], lanes=[44148], name=[44148], edge_highway=[44148], reversed=[44148], length=[44148], maxspeed=[44148], geometry=[44148], access=[44148], junction=[44148], bridge=[44148], tunnel=[44148], edge_ref=[44148], width=[44148], crs=epsg:4326)

In [12]:
graph_data.node_attrs

<bound method BaseData.node_attrs of Data(x=[19352, 8], edge_index=[2, 44148], y=[19352], street_count=[19352], highway=[19352], ref=[19352], accidents_count=[19352], osmid=[44148], oneway=[44148], lanes=[44148], name=[44148], edge_highway=[44148], reversed=[44148], length=[44148], maxspeed=[44148], geometry=[44148], access=[44148], junction=[44148], bridge=[44148], tunnel=[44148], edge_ref=[44148], width=[44148], crs=epsg:4326)>

In [13]:
graph_data.edge_attr

In [14]:
features = graph_embedder.get_node_attrs()
features

Unnamed: 0_level_0,street_count,crossing,mini_roundabout,motorway_junction,speed_camera,traffic_signals,turning_circle,turning_loop
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
26063726,5,0,0,0,0,0,0,0
26063848,4,0,0,0,0,0,0,0
26063857,3,0,0,0,0,0,0,0
26063923,3,0,0,0,0,0,0,0
26083887,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
11707394732,1,0,0,0,0,0,0,0
11711719556,3,0,0,0,0,0,0,0
11721637528,3,0,0,0,0,0,0,0
11730365878,1,0,0,0,0,0,0,0


In [15]:
type(features.index)

pandas.core.indexes.base.Index

In [16]:
len(set(features.index.to_list()))

19352

In [17]:
gdf_nodes.index

Index([   26063726,    26063848,    26063857,    26063923,    26083887,
          26083906,    26083915,    26083926,    26083927,    26083933,
       ...
       11701903469, 11701903470, 11702685791, 11702685797, 11702685811,
       11707394732, 11711719556, 11721637528, 11730365878, 11730365883],
      dtype='int64', name='osmid', length=19352)

In [18]:
len(graph_data.y)

19352

In [19]:
features.index

Index([   26063726,    26063848,    26063857,    26063923,    26083887,
          26083906,    26083915,    26083926,    26083927,    26083933,
       ...
       11701903469, 11701903470, 11702685791, 11702685797, 11702685811,
       11707394732, 11711719556, 11721637528, 11730365878, 11730365883],
      dtype='int64', name='osmid', length=19352)

In [20]:
features.index.unique()

Index([   26063726,    26063848,    26063857,    26063923,    26083887,
          26083906,    26083915,    26083926,    26083927,    26083933,
       ...
       11701903469, 11701903470, 11702685791, 11702685797, 11702685811,
       11707394732, 11711719556, 11721637528, 11730365878, 11730365883],
      dtype='int64', name='osmid', length=19352)

In [21]:
from sklearn.model_selection import train_test_split
nodes_labels = {'node': features.index.to_numpy(), 'label': graph_data.y.cpu().numpy()}
df_to_split = pd.DataFrame(nodes_labels)

df_train, df_test = train_test_split(df_to_split, test_size=0.2, random_state=42, stratify=df_to_split['label'])
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test['label'])

train_nodes = df_train['node']
val_nodes = df_val['node']
test_nodes = df_test['node']

train_mask = []   
val_mask = []
test_mask = []

for i in range(len(graph_data.y)):

  if i in train_nodes:
    train_mask.append(True)
    val_mask.append(False)
    test_mask.append(False)
  elif i in val_nodes:
    train_mask.append(False)
    val_mask.append(True)
    test_mask.append(False)
  elif i in test_nodes:
    train_mask.append(False)
    val_mask.append(False)
    test_mask.append(True)

graph_data.train_mask = torch.tensor(train_mask).cpu()
graph_data.val_mask = torch.tensor(val_mask).cpu()
graph_data.test_mask = torch.tensor(test_mask).cpu()

In [23]:
import pytorch_lightning as pl
from src.baseline_models.GCN.GraphData import GraphData
trainer = pl.Trainer(max_epochs=50)
datamodule = GraphData([graph_data])
trainer.fit(model=model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                 | Type       | Params
----------------------------------------------------
0 | _gnn                 | GCNModel   | 35.2 K
1 | _classification_head | Sequential | 16.8 K
2 | _loss_fn             | NLLLoss    | 0     
----------------------------------------------------
52.0 K    Trainable params
0         Non-trainable params
52.0 K    Total params
0.208     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

loss: 0.6900546550750732


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6874790787696838


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6843132376670837


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6815527677536011


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6794317364692688


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6768854856491089


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6739983558654785


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6717624068260193


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6688829660415649


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6666914224624634


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6644141674041748


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6620421409606934


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6593301296234131


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6568514704704285


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6536284685134888


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6526902914047241


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6496188044548035


Validation: |          | 0/? [00:00<?, ?it/s]

loss: 0.6479377150535583


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6444386839866638


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6424532532691956


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6404063701629639


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6376224756240845


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6360171437263489


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6330955028533936


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6305482387542725


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6279103755950928


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6252679228782654


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6231011748313904


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6208010315895081


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6181723475456238


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6161233186721802


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6132829785346985


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6113113164901733


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6081506609916687


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6057327389717102


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6042128801345825


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.6013122200965881


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5991846323013306


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5966188311576843


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5936952829360962


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5920385718345642


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.589257001876831


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5865276455879211


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5847756862640381


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5823169350624084


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5811008214950562


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.577311635017395


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5748461484909058


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5721794366836548


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


loss: 0.5705140233039856


  _warn_prf(average, modifier, msg_start, len(result))


Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
`Trainer.fit` stopped: `max_epochs=50` reached.


In [24]:
trainer.test(model=model, datamodule=datamodule, verbose=False)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[{'step': 50.0,
  'test/auc_weighted': 0.4011906087398529,
  'test/f1_weighted': 0.7624176144599915,
  'test/precision_weighted': 0.7001954913139343,
  'test/recall_weighted': 0.836776852607727,
  'test/accuracy_weighted': 0.836776852607727}]

In [25]:
test_auc = trainer.test(model=model, datamodule=datamodule, verbose=False)[0]["test/auc_weighted"]

z, y, y_pred = trainer.predict(model=model, datamodule=datamodule)[0]

print(f'AUC test = {test_auc * 100.:.2f}[%]')

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/grymar/studia/gradient/env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

AUC test = 40.12[%]


In [26]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
y_pred_test = y_pred[test_mask]

In [28]:
y_pred_test.max()

0

In [29]:
print(y[test_mask].numpy())

[1 0 0 ... 0 0 1]


In [30]:
len(y)

19352