In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Put the directory two levels above the working directory on the import path
# (presumably where the project-local `generator` and `models` packages live).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from generator import RoadNetwork, Trajectory
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch_geometric.transforms as T

from models import CLMModel
from models.utils import generate_trajid_to_nodeid

In [2]:
# City key interpolated into the OSM data path, the trajectory dataset path,
# the `dataset=` argument of the road-segment dataset builder, and the cached
# transition-matrix filename used by later cells.
city = "sf"

In [3]:
# Load the road network for `city` from the OSM data directory.
network = RoadNetwork()
network.load(f"../../osm_data/{city}")
# Read the pickled training trajectories for `city`.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
trajectory = pd.read_pickle(
    f"../../datasets/trajectories/{city}/traj_train_test_split/train_69.pkl"
)
# Convert every entry of the "seg_seq" column to a NumPy array.
trajectory["seg_seq"] = trajectory["seg_seq"].map(np.array)
# Build the road-segment dataset object from the network; later cells read `data.x`
# (its first dimension is used as the number of nodes).
data = network.generate_road_segment_pyg_dataset(include_coords=True, dataset=city)

In [21]:
# Transition (co-occurrence) matrix over road segments, built from the training trajectories.
# Entry [a, b] counts how often segment b shows up at or after segment a inside one trajectory.
# `traj_map` translates the ids stored in the trajectories into row/column indices of the matrix.
traj_map = generate_trajid_to_nodeid(network)
trans_mat = np.zeros((data.x.shape[0],) * 2)
for seq in tqdm(trajectory.seg_seq):
    for pos, seg in enumerate(seq):
        src_node = traj_map[seg]
        # The slice starts at `pos`, so every segment is also paired with itself.
        for later_seg in seq[pos:]:
            trans_mat[src_node, traj_map[later_seg]] += 1

# Scale each row by its largest count (the epsilon keeps all-zero rows from dividing by zero),
# then clear the self-pairs on the diagonal.
row_max = trans_mat.max(axis=1, keepdims=True, initial=0.)
trans_mat = trans_mat / (row_max + 1e-9)
np.fill_diagonal(trans_mat, 0)

100%|██████████| 1080963/1080963 [16:02<00:00, 1123.30it/s]


In [22]:
# Cache the transition matrix under a city-specific name. The load cell reads
# "./clm_trans_mat_{city}.gz", so the file must be written to that exact path;
# the previous fixed name "clm_trans_mat.gz" was never picked up again.
np.savetxt(f"./clm_trans_mat_{city}.gz", X=trans_mat)

In [4]:
# Load the cached transition matrix for `city` from the working directory
# (avoids recomputing it from the trajectories).
trans_mat = np.loadtxt(f"./clm_trans_mat_{city}.gz")

In [5]:
# Turn every entry of `trans_mat` above 0.6 into an extra (source, target) edge.
trans_mat_b = (trans_mat > 0.6)
# np.argwhere returns the (row, col) pairs of the True entries in row-major order,
# the same order the previous Python-level scan over the flattened matrix produced,
# without visiting every cell of the N x N matrix in the interpreter.
edge_pairs = np.argwhere(trans_mat_b)
# Kept as a list of (row, col) tuples of Python ints, as before.
aug_edges = [tuple(pair) for pair in edge_pairs.tolist()]
# Edge index of shape (2, num_edges) with int64 entries; it stays well-formed
# (shape (2, 0)) when no entry passes the threshold. The tensor goes to the GPU
# only when CUDA is available, instead of calling .cuda() unconditionally.
aug_edge_index = torch.tensor(edge_pairs.T, dtype=torch.long).to(
    'cuda:0' if torch.cuda.is_available() else 'cpu'
)

In [6]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# The trajectories are passed to the model with the segment-sequence column named "path".
# The rename mutates `trajectory` in place, so any cell that reads `trajectory.seg_seq`
# has to run before this one.
trajectory.rename(columns={"seg_seq": "path"}, inplace=True)

model = CLMModel(
    data,
    device,
    network,
    trans_adj=aug_edge_index,
    traj_data=trajectory,
    batch_size=32,
    emb_dim=32,
)

In [10]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a table of trainable parameter sizes and return their sum.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, parameter)``
            pairs; each parameter must expose ``requires_grad`` and ``numel()``.

    Returns:
        Total number of elements over all parameters with ``requires_grad`` set.
    """
    # Parameters that do not require gradients are left out of both the table and the total.
    trainable = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]

    table = PrettyTable(["Modules", "Parameters"])
    for name, size in trainable:
        table.add_row([name, size])
    total_params = sum(size for _, size in trainable)

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
# Print the trainable-parameter breakdown of `model.model` (the object whose
# state_dict is saved further down); the returned total is shown as the cell output.
count_parameters(model.model)

+--------------------------------------------------------------------+------------+
|                              Modules                               | Parameters |
+--------------------------------------------------------------------+------------+
|                       node_embedding.weight                        |   865248   |
|                  graph_encoder1.layers.0.att_src                   |    128     |
|                  graph_encoder1.layers.0.att_dst                   |    128     |
|                    graph_encoder1.layers.0.bias                    |    128     |
|               graph_encoder1.layers.0.lin_src.weight               |    4096    |
|                  graph_encoder1.layers.1.att_src                   |    128     |
|                  graph_encoder1.layers.1.att_dst                   |    128     |
|                    graph_encoder1.layers.1.bias                    |    128     |
|               graph_encoder1.layers.1.lin_src.weight               |   163

1007328

In [12]:
# Manual recovery step: release the GPU memory held by the current model
# (for example after a CUDA out-of-memory error).
# The reference is dropped first and garbage-collected; torch.cuda.empty_cache()
# only returns cached blocks that are no longer occupied, so calling it while
# `model` is still alive frees nothing that the model holds.
# After this cell `model` no longer exists: re-run the cell that constructs
# CLMModel before running any cell that uses `model`.
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [7]:
# Train the model for 5 epochs.
# NOTE(review): the output recorded for this cell ends in a CUDA out-of-memory
# error on a GPU with 10.92 GiB total capacity; `model` is built with batch_size=32.
model.train(epochs=5)

100%|██████████| 250000/250000 [00:57<00:00, 4380.92it/s]


RuntimeError: CUDA out of memory. Tried to allocate 2.72 GiB (GPU 0; 10.92 GiB total capacity; 8.96 GiB already allocated; 777.44 MiB free; 9.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [10]:
# Write the state_dict of the wrapped network to ./clm.pt in the working directory.
torch.save(model.model.state_dict(), "./clm.pt")

In [11]:
# Fetch the embeddings from the model, detach them from autograd, move them to
# the CPU and convert them to a NumPy array.
# Presumably one row per line-graph node, since the evaluation cell indexes `z`
# with positions over `network.line_graph.nodes` (TODO confirm).
z = model.load_emb().detach().cpu().numpy()

In [12]:
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Downstream probe: predict each road segment's encoded highway type from its embedding.
# Labels are read from network.gdf_edges in the iteration order of network.line_graph.nodes.
idxs = np.arange(len(network.line_graph.nodes))
train_idx, test_idx = model_selection.train_test_split(
    idxs, test_size=0.2, random_state=69
)
y = np.array(
    [network.gdf_edges.loc[n]["highway_enc"] for n in network.line_graph.nodes]
)

# Macro-averaged F1 is the reported metric.
scorer = make_scorer(metrics.f1_score, average="macro")

# Embedding matrices to evaluate; further candidates can be appended to the list.
eva = [z]
for X in eva:
    # NOTE: the hold-out split is materialised here, but the printed score comes
    # from 5-fold cross-validation over the full X and y, not from this split.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    lm = linear_model.LogisticRegression(multi_class="multinomial", max_iter=1000)
    fold_scores = cross_val_score(estimator=lm, X=X, y=y, scoring=scorer, cv=5)
    print(np.mean(fold_scores))

0.3283721609008289
