In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pickle
import networkx as nx

# 从保存的 GNN 图文件加载
gnn_graph_path = r"/content/drive/MyDrive/BAGFormer/final_gnn_traffic_graph.pkl"
with open(gnn_graph_path, "rb") as f:
    G_gnn_parallel = pickle.load(f)

# 从 GNN 图中恢复 traffic_node_mapping
# 这里假设在构建 GNN 图时，你是这样添加节点的：
#   for site, node in traffic_node_mapping.items():
#       G_gnn_parallel.add_node(site, pos=node)
# 因此下面通过遍历节点来恢复这个映射
traffic_node_mapping = {node: data.get("pos") for node, data in G_gnn_parallel.nodes(data=True)}

print("恢复的 traffic_node_mapping 节点数量:", len(traffic_node_mapping))


恢复的 traffic_node_mapping 节点数量: 13649


In [4]:
import pickle
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd

# ================ Step 0: 获得 Transformer 使用的 sensor_categories ================
# 假设 aggregated_df 是 Transformer 数据预处理后的 DataFrame，并已经通过如下代码生成 sensor_id：
# aggregated_df['sensor_id'] = aggregated_df['Site_ID'].astype('category')
# aggregated_df['sensor_id'] = aggregated_df['sensor_id'].cat.set_categories(
#     sorted(aggregated_df['Site_ID'].unique()), ordered=True).cat.codes
# 则统一的 sensor 顺序就是：
aggregated_df = pd.read_csv("/content/drive/MyDrive/BAGFormer/aggregated_df.csv")

sensor_categories = sorted(aggregated_df['Site_ID'].unique())
print("Transformer sensor_categories:", sensor_categories)

# ================ Step 1: 加载已有 GNN 图，并恢复 traffic_node_mapping ================

gnn_graph_path = "/content/drive/MyDrive/BAGFormer/final_gnn_traffic_graph.pkl"
with open(gnn_graph_path, "rb") as f:
    G_gnn_parallel = pickle.load(f)

# 假设在构建图时，每个节点的 key 就是 Site_ID，且图节点的 'pos' 属性存储了 (x, y) 坐标
traffic_node_mapping = {node: data.get("pos") for node, data in G_gnn_parallel.nodes(data=True)}
print("Recovered traffic_node_mapping count:", len(traffic_node_mapping))

# ================ Step 2: 确定用于预训练的传感器列表 ================
# 只保留 Transformer 侧存在且在 GNN 图中有对应位置数据的传感器
sensor_list = [sensor for sensor in sensor_categories if sensor in traffic_node_mapping]
num_sensors = len(sensor_list)
print("Number of sensors used for GNN pretrain:", num_sensors)

# 构造映射：sensor -> index
sensor_to_idx = {sensor: i for i, sensor in enumerate(sensor_list)}

# ================ Step 3: 构造初始特征矩阵 X （利用节点 pos 信息） ================
# 提取 sensor_list 中每个 sensor 的 (x, y) 坐标
pos_list = []
for sensor in sensor_list:
    pos = traffic_node_mapping.get(sensor)
    if pos is None:
        # 如果缺失，可用 (0, 0) 作为默认
        pos = (0.0, 0.0)
    pos_list.append(pos)

pos_array = np.array(pos_list, dtype=np.float32)  # shape: [num_sensors, 2]

# 标准化处理（均值归一化）
mean_pos = pos_array.mean(axis=0, keepdims=True)
std_pos = pos_array.std(axis=0, keepdims=True)
pos_array_normalized = (pos_array - mean_pos) / std_pos

# 转换为 PyTorch Tensor，作为初始特征 X
X = torch.tensor(pos_array_normalized, dtype=torch.float32)  # shape: [num_sensors, 2]
print(f"Initial features X shape: {X.shape}")
print("First 5 nodes X:\n", X[:5])

# ================ Step 4: 构造邻接矩阵 A（只包含 sensor_list 内的节点） ================
A = np.zeros((num_sensors, num_sensors))
for u, v, data in G_gnn_parallel.edges(data=True):
    if (u in sensor_to_idx) and (v in sensor_to_idx):
        i = sensor_to_idx[u]
        j = sensor_to_idx[v]
        weight = data.get('weight', 1.0)
        A[i, j] = weight
        A[j, i] = weight  # 保持无向图对称

def normalize_adjacency(A):
    I = np.eye(A.shape[0])
    A_hat = A + I
    D_hat = np.diag(np.sum(A_hat, axis=1))
    D_hat_inv = np.linalg.inv(D_hat)
    return np.matmul(D_hat_inv, A_hat)

A_norm = normalize_adjacency(A)
A_norm = torch.tensor(A_norm, dtype=torch.float32)

# ================ Step 5: 向量化边信息，用于损失计算 ================
edges = []
for u, v, data in G_gnn_parallel.edges(data=True):
    if (u in sensor_to_idx) and (v in sensor_to_idx):
        i = sensor_to_idx[u]
        j = sensor_to_idx[v]
        weight = data.get('weight', 1.0)
        edges.append((i, j, weight))

edge_indices_i = torch.tensor([e[0] for e in edges], dtype=torch.long)
edge_indices_j = torch.tensor([e[1] for e in edges], dtype=torch.long)
edge_weights = torch.tensor([e[2] for e in edges], dtype=torch.float32)

def compute_edge_loss(embeddings, edge_indices_i, edge_indices_j, edge_weights):
    emb_i = embeddings[edge_indices_i]  # [num_edges, out_dim]
    emb_j = embeddings[edge_indices_j]  # [num_edges, out_dim]
    diffs = emb_i - emb_j
    squared_dists = torch.norm(diffs, p=2, dim=1) ** 2
    loss_edge = (edge_weights * squared_dists).sum() / edge_weights.sum()
    return loss_edge

# ================ Step 6: 定义简单两层 GCN 模型，并预训练 ================
class SimpleGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(SimpleGCN, self).__init__()
        self.gcn1 = nn.Linear(in_dim, hidden_dim)
        self.gcn2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, A_norm, X):
        H = torch.relu(self.gcn1(torch.matmul(A_norm, X)))
        out = self.gcn2(torch.matmul(A_norm, H))
        return out

in_dim = X.shape[1]    # 此处为 2 （(x,y)维度）
hidden_dim = 64
out_dim = 64           # 嵌入维度，一般与 Transformer 中 d_model 保持一致

model_gcn = SimpleGCN(in_dim, hidden_dim, out_dim)
optimizer = optim.Adam(model_gcn.parameters(), lr=1e-3)
lambda_reg = 0.1  # 正则项权重
eps = 1e-5        # 防止对数无穷大

num_epochs = 200
for epoch in tqdm(range(num_epochs), desc="Pretrain GCN"):
    model_gcn.train()
    optimizer.zero_grad()

    embeddings = model_gcn(A_norm, X)  # shape: [num_sensors, out_dim]
    loss_edge = compute_edge_loss(embeddings, edge_indices_i, edge_indices_j, edge_weights)
    var_embeddings = torch.var(embeddings)
    loss_reg = -torch.log(var_embeddings + eps)

    loss = loss_edge + lambda_reg * loss_reg
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}: loss_edge = {loss_edge.item():.6f}, loss_reg = {loss_reg.item():.6f}, total_loss = {loss.item():.6f}")

# ================ Step 7: 保存预训练得到的 GNN 嵌入 ================
gnn_embeds = embeddings.detach()  # shape: [num_sensors, out_dim]
save_path = "/content/drive/MyDrive/BAGFormer/gnn_embeds.pt"
torch.save(gnn_embeds, save_path)
print("Pretrained GNN embeddings saved to", save_path)


Transformer sensor_categories: ['0000001_0105', '0000001_0107', '0000001_0109', '0000001_0112', '0000001_0114', '0000001_0132', '0000001_0141', '0000001_0143', '0000001_0145', '0000001_0156', '0000001_0161', '0000001_0169', '0000001_0172', '0000001_0194', '0000001_0221', '0000001_0225', '0000001_0234', '0000001_0236', '0000001_0238', '0000001_8031', '0000001_8045', '0000001_8049', '0000001_8050', '0000001_8056', '0000001_8061', '0000001_8063', '0000001_8064', '0000001_8066', '0000003_0101', '0000003_0109', '0000003_0112', '0000003_0114', '0000003_0118', '0000003_0121', '0000003_0123', '0000003_0145', '0000003_0147', '0000003_0154', '0000003_0158', '0000003_0194', '0000003_0196', '0000003_0198', '0000003_0201', '0000003_8011', '0000003_8021', '0000003_8035', '0000003_8037', '0000005_0101', '0000005_0105', '0000005_0109', '0000005_0112', '0000005_0114', '0000005_0116', '0000005_0120', '0000005_0123', '0000005_0127', '0000005_0141', '0000005_0143', '0000005_0147', '0000005_0149', '0000005

Pretrain GCN:  10%|█         | 20/200 [00:07<01:03,  2.85it/s]

Epoch 20: loss_edge = 0.009591, loss_reg = 0.896663, total_loss = 0.099257


Pretrain GCN:  20%|██        | 40/200 [00:13<00:50,  3.14it/s]

Epoch 40: loss_edge = 0.018710, loss_reg = 0.000037, total_loss = 0.018714


Pretrain GCN:  30%|███       | 60/200 [00:20<00:45,  3.06it/s]

Epoch 60: loss_edge = 0.029114, loss_reg = -0.600661, total_loss = -0.030952


Pretrain GCN:  40%|████      | 80/200 [00:27<00:38,  3.15it/s]

Epoch 80: loss_edge = 0.038042, loss_reg = -1.027723, total_loss = -0.064731


Pretrain GCN:  50%|█████     | 100/200 [00:34<00:32,  3.11it/s]

Epoch 100: loss_edge = 0.042822, loss_reg = -1.339423, total_loss = -0.091121


Pretrain GCN:  60%|██████    | 120/200 [00:40<00:25,  3.16it/s]

Epoch 120: loss_edge = 0.043209, loss_reg = -1.581677, total_loss = -0.114958


Pretrain GCN:  70%|███████   | 140/200 [00:47<00:19,  3.15it/s]

Epoch 140: loss_edge = 0.040870, loss_reg = -1.788953, total_loss = -0.138025


Pretrain GCN:  80%|████████  | 160/200 [00:54<00:14,  2.71it/s]

Epoch 160: loss_edge = 0.037404, loss_reg = -1.980365, total_loss = -0.160632


Pretrain GCN:  90%|█████████ | 180/200 [01:01<00:06,  3.11it/s]

Epoch 180: loss_edge = 0.033670, loss_reg = -2.162798, total_loss = -0.182610


Pretrain GCN: 100%|██████████| 200/200 [01:08<00:00,  2.91it/s]

Epoch 200: loss_edge = 0.029994, loss_reg = -2.337970, total_loss = -0.203803





Pretrained GNN embeddings saved to /content/drive/MyDrive/BAGFormer/gnn_embeds.pt


In [5]:
# 假设你的 .pt 文件名为 "gnn_embeds.pt"
data = torch.load("/content/drive/MyDrive/BAGFormer/gnn_embeds.pt")

# 如果保存的是单个 tensor，可以直接打印
print(data)
print("Shape:", data.shape)

# 如果保存的是模型的 state_dict（一个字典），可以查看字典的键
if isinstance(data, dict):
    print("Saved keys:", data.keys())

tensor([[ 3.3980,  3.3246,  1.6859,  ..., -3.5180,  3.6075, -3.2389],
        [ 3.3978,  3.3243,  1.6857,  ..., -3.5177,  3.6073, -3.2385],
        [ 3.3971,  3.3234,  1.6854,  ..., -3.5168,  3.6064, -3.2375],
        ...,
        [ 3.2018,  2.9507,  1.8719,  ..., -3.3481,  3.2695, -3.0396],
        [ 3.2005,  2.9575,  1.8641,  ..., -3.3469,  3.2751, -3.0404],
        [ 3.2034,  2.9558,  1.8695,  ..., -3.3500,  3.2730, -3.0424]])
Shape: torch.Size([13645, 64])
