In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Put the directory two levels above the working directory on the import
# path (the project-local imports that follow are presumably resolved from
# there). The membership check keeps re-running this cell from adding
# duplicate entries.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from generator import RoadNetwork
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm

from models import GAEModel, GCNEncoder, GATEncoder

In [2]:
# Load the Porto road network from the OSM export directory.
network = RoadNetwork()
network.load("../../osm_data/porto")
# df = pd.read_csv("../datasets/trajectories/Porto/road_segment_map_final.csv", sep=";", usecols=["id", "cpath"])

# Per-edge trajectory statistics, keyed by the (u, v, key) edge identifier.
traj_features = pd.read_csv(
    "../../datasets/trajectories/Porto/speed_features_unnormalized.csv"
).set_index(["u", "v", "key"])

# Min-max normalisation of each feature column.
for column in ("util", "avg_speed"):
    lo, hi = traj_features[column].min(), traj_features[column].max()
    traj_features[column] = (traj_features[column] - lo) / (hi - lo)

# Remaining NaNs (missing values in the CSV, or 0/0 from a constant column)
# are replaced with 0.
traj_features = traj_features.fillna(0)

In [3]:
# Pin work to the second GPU only when that device actually exists.
# Calling torch.cuda.set_device(1) unconditionally raises on CPU-only or
# single-GPU hosts; in that case PyTorch's default device is left untouched.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
traj_features.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11331 entries, (25503936, 4722746638, 0) to (9709007543, 415754684, 0)
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         11331 non-null  int64  
 1   util       11331 non-null  float64
 2   avg_speed  11331 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 662.5 KB
None


In [4]:
# Build the graph dataset for the road segments from the loaded network,
# handing over the normalised trajectory features as `traj_data`.
# The result exposes `.x` and `.edge_index` (both are used by later cells).
data = network.generate_road_segment_pyg_dataset(traj_data=traj_features)

In [5]:
# Display the node feature matrix of the generated dataset.
data.x

tensor([[0.2306, 0.0033, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3033, 0.0033, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3040, 0.0123, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0115, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0425, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0425, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [6]:
# For training without features: discard the node feature matrix so that the
# OneHotDegree transform applied further down becomes the only source of
# `data.x`. Skip this cell to keep the generated features.
data.x = None

In [7]:
from torch_geometric.data import Data
import torch_geometric.transforms as T

# Create the PyG dataset: pick the compute device first.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Training without features: node inputs are the one-hot encoded node degree
# (max degree 128 -> 129 columns), then everything is moved to `device`.
transform = T.Compose([T.OneHotDegree(max_degree=128), T.ToDevice(device)])
data = transform(data)
print(data.x.shape)

# Graph auto-encoder with a GAT encoder producing 128-dimensional embeddings.
model = GAEModel(data, device=device, encoder=GATEncoder, emb_dim=128)
model.train(epochs=50000)
# model.save_model(path="../model_states/gaegcn/")
# model.save_emb(path="../model_states/gaegcn/")




torch.Size([11331, 129])
Epoch: 500, avg_loss: 1.078448539018631
Epoch: 1000, avg_loss: 1.0693705228567123
Epoch: 1500, avg_loss: 1.0628761025269826
Epoch: 2000, avg_loss: 1.058221438884735
Epoch: 2500, avg_loss: 1.0548021503925324
Epoch: 3000, avg_loss: 1.0522288338343302
Epoch: 3500, avg_loss: 1.0511149499416352
Epoch: 4000, avg_loss: 1.0494538285136223
Epoch: 4500, avg_loss: 1.0479780311849383
Epoch: 5000, avg_loss: 1.0467378011226653
Epoch: 5500, avg_loss: 1.045554447889328
Epoch: 6000, avg_loss: 1.0461784299016
Epoch: 6500, avg_loss: 1.045638972557508
Epoch: 7000, avg_loss: 1.0449535868849074
Epoch: 7500, avg_loss: 1.0442159131368
Epoch: 8000, avg_loss: 1.0435227753967047
Epoch: 8500, avg_loss: 1.042878923921024
Epoch: 9000, avg_loss: 1.0422102374368243
Epoch: 9500, avg_loss: 1.0415169450483823
Epoch: 10000, avg_loss: 1.0408845550656318
Epoch: 10500, avg_loss: 1.0403793281487057
Epoch: 11000, avg_loss: 1.0397985401803798
Epoch: 11500, avg_loss: 1.039260646218839
Epoch: 12000, avg_

In [10]:
# Compute the node embeddings with the trained encoder. This is inference
# only, so autograd is disabled: no computation graph is recorded and the
# resulting tensor does not keep the encoder's activations alive.
# NOTE(review): if the encoder uses dropout, it may also need to be switched
# to eval mode before encoding - confirm against the GAEModel implementation.
with torch.no_grad():
    z = model.model.encode(data.x, data.edge_index)
z.shape

torch.Size([11331, 128])

In [8]:
# Persist the trained GAT auto-encoder (variant trained without features).
# The path is rooted at "../../" like every other active path in this
# notebook (osm_data, datasets, and the embedding loaded from
# "../../model_states/..."); the previous "../model_states" pointed one
# directory level short of where the saved states are read back from.
model.save_model(path="../../model_states/gaegat/no_features")

In [6]:
# Load a previously stored embedding from disk and rebind `z` to it; any
# embedding computed in-memory under the same name is replaced.
# NOTE(review): the file lives under "gaegcn/" while the model in this
# notebook is built with GATEncoder - confirm this is the embedding that the
# classification cells below are meant to evaluate.
z = model.load_emb("../../model_states/gaegcn/embedding.out")

In [13]:
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Feature matrix: the embedding tensor as a NumPy array on the host.
X = z.detach().cpu().numpy()
# Targets: the encoded highway tag of the road segment behind each node of
# the line graph, collected in the line graph's node order.
y = np.array([
    network.gdf_edges.loc[segment]["highway_enc"]
    for segment in network.line_graph.nodes
])

# mask = ((y==11) | (y==10) | (y==9) | (y==4) | (y==1) | (y==2) | (y==12) | (y==7)) # remove uncommon tags
# X = X[~mask, :]
# y = y[~mask]
# print(np.unique(y, return_counts=True))

# Fixed-seed 80/20 hold-out split for the downstream classifier.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1
)

for caption, split in (
    ('X_train dimension= ', X_train),
    ('X_test dimension= ', X_test),
    ('y_train dimension= ', y_train),
    ('y_test dimension= ', y_test),
):
    print(caption, split.shape)

X_train dimension=  (9064, 128)
X_test dimension=  (2267, 128)
y_train dimension=  (9064,)
y_test dimension=  (2267,)


In [14]:
# Multinomial logistic regression on the embeddings as a simple probe of how
# much road-type information they carry.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and the default already selects the
# multinomial formulation for a multi-class target with the default solver.
# NOTE(review): the recorded run stopped at the iteration limit without
# converging; consider scaling X or raising max_iter before trusting scores.
lm = linear_model.LogisticRegression(max_iter=1000)
lm.fit(X_train, y_train)
# Several rare classes are never predicted, which makes their precision
# undefined. zero_division=0 reports those entries as 0.0 (the same numbers
# as before) without emitting an UndefinedMetricWarning per average.
print(metrics.classification_report(y_test, lm.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.38      0.02      0.03       309
           1       0.18      0.22      0.20        18
           2       0.43      0.10      0.16        31
           3       0.67      0.05      0.08       133
           4       1.00      0.08      0.14        13
           5       0.54      0.95      0.69      1175
           6       0.34      0.17      0.23       301
           7       0.00      0.00      0.00        20
           8       0.14      0.00      0.01       221
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         4
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00        34

    accuracy                           0.52      2267
   macro avg       0.28      0.12      0.12      2267
weighted avg       0.44      0.52      0.40      2267



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

# Reference experiment on the Cora citation graph.
# NOTE: this rebinds the notebook-wide `device` to the CPU.
device = torch.device('cpu')
# Normalise features, move to `device`, then split the edges into
# train/val/test sets with separate positive/negative labels.
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(
        num_val=0.05,
        num_test=0.1,
        is_undirected=True,
        split_labels=True,
        add_negative_train_samples=False,
    ),
])
dataset = Planetoid(root=".", name="Cora", transform=transform)
# The link split turns the single graph into a (train, val, test) triple.
t, v, te = dataset[0]
t

Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[4488], pos_edge_label_index=[2, 4488])

In [21]:
from torch_geometric.utils import train_test_split_edges
# Compare the two edge-splitting utilities on the road-segment graph.
# NOTE: this rebinds the notebook-wide `device` to the CPU.
device = torch.device('cpu')
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      split_labels=True, add_negative_train_samples=False)
])
print(data)
# RandomLinkSplit returns a (train, val, test) triple; keep it instead of
# throwing the result of the transform away.
train_data, val_data, test_data = transform(data)
# train_test_split_edges is deprecated in favour of RandomLinkSplit and
# rewrites its argument in place (it removes `edge_index`). Run it on a clone
# so `data` stays usable by the training/encoding cells when they are re-run.
test = train_test_split_edges(data.clone())

print(test)

Data(x=[11331, 8], edge_index=[2, 26617])




Data(x=[11331, 8], val_pos_edge_index=[2, 699], test_pos_edge_index=[2, 1399], train_pos_edge_index=[2, 23788], train_neg_adj_mask=[11331, 11331], val_neg_edge_index=[2, 699], test_neg_edge_index=[2, 1399])
