In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# 加载数据
#file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_cleaned.csv'
file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_raw.csv'
data_df = pd.read_csv(file_path_clean)

# 随机选择20分之一个数据集（即5%）
data_df = data_df.sample(frac=0.05, random_state=42)

print(data_df.head())
# 使用 fillna() 方法替换所有的 NaN 值为 0
data_df.fillna(0, inplace=True)
# 检查非数值列
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# 将非数值列转换为数值类型（使用标签编码）
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    data_df[col] = le.fit_transform(data_df[col].astype(str))
    label_encoders[col] = le



# 确保所有特征列都是数值类型
print("Data types after encoding:\n", data_df.info())

           trip_id    usertype gender            starttime  \
7144300    9641299    Customer    NaN  2016-05-21 14:33:00   
3719381    5509312    Customer    NaN  2015-06-06 15:57:00   
5410102    7607966  Subscriber   Male  2015-09-22 20:52:00   
12289637  15804345  Subscriber   Male  2017-08-14 16:43:35   
4834729    7181708  Subscriber   Male  2015-08-29 15:35:00   

                     stoptime  tripduration  temperature        events  \
7144300   2016-05-21 14:54:00          1242         73.9  partlycloudy   
3719381   2015-06-06 16:24:00          1613         70.0  mostlycloudy   
5410102   2015-09-22 21:07:00           899         66.0  partlycloudy   
12289637  2017-08-14 16:47:39           244         81.0  mostlycloudy   
4834729   2015-08-29 15:43:00           500         70.0        cloudy   

          from_station_id          from_station_name  latitude_start  \
7144300                35    Streeter Dr & Grand Ave       41.892278   
3719381               143  Sedgwick St

In [8]:
data_df.columns

Index(['trip_id', 'usertype', 'gender', 'starttime', 'stoptime',
       'tripduration', 'temperature', 'events', 'from_station_id',
       'from_station_name', 'latitude_start', 'longitude_start',
       'dpcapacity_start', 'to_station_id', 'to_station_name', 'latitude_end',
       'longitude_end', 'dpcapacity_end'],
      dtype='object')

In [9]:
# 标准化数值特征
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_df)

# 将标准化后的数据转换为torch张量
x = torch.tensor(data_scaled, dtype=torch.float)


In [4]:


# 示例表格数据特征
columns = ['trip_id', 'usertype', 'gender',
    'starttime', 'stoptime', 'tripduration', 'temperature', 'events',
    'from_station_id', 'from_station_name', 'latitude_start', 'longitude_start',
    'dpcapacity_start', 'to_station_id', 'to_station_name', 'latitude_end',
    'longitude_end', 'dpcapacity_end'
    
]

# 定义特征之间的关系
relations = [
    ('trip_id', 'usertype'),
    ('trip_id', 'gender'),
    ('starttime', 'stoptime'),
    ('starttime', 'latitude_start'),
    ('starttime', 'longitude_start'),
    ('stoptime', 'latitude_end'),
    ('stoptime', 'longitude_end'),
    ('latitude_start', 'longitude_start'),
    ('latitude_end', 'longitude_end'),
    ('from_station_id', 'from_station_name'),
    ('to_station_id', 'to_station_name'),
    ('from_station_id', 'latitude_start'),
    ('from_station_id', 'longitude_start'),
    ('to_station_id', 'latitude_end'),
    ('to_station_id', 'longitude_end'),
    ('tripduration', 'temperature'),
    ('tripduration', 'events'),
    ('dpcapacity_start', 'from_station_id'),
    ('dpcapacity_end', 'to_station_id')
    # 可以添加更多关系
]


# 创建空的无向图
G = nx.Graph()

# 添加节点（每个特征作为一个节点）
for col in columns:
    G.add_node(col)

# 添加边（根据特征之间的关系）
for relation in relations:
    G.add_edge(relation[0], relation[1])

# 将NetworkX图转换为PyTorch Geometric图
data = from_networkx(G)

# 添加节点特征
data.x = x

# 映射特征列到索引
feature_to_index = {col: i for i, col in enumerate(columns)}

# 映射关系到索引
edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

# 添加边
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data.edge_index = edge_index


# 确定设备（GPU 优先）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 将数据移动到设备
data = data.to(device)

print(data)


Data(edge_index=[2, 19], num_nodes=18, x=[474762, 18])


In [5]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    def __init__(self, num_features, hidden_channels, output_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)
        return x

# 初始化模型
num_features = data.num_features
hidden_channels = 128
output_channels = num_features  # 确保输出维度与输入维度一致
model = GCN(num_features, hidden_channels, output_channels).to(device)

# 打印模型结构
print(model)

GCN(
  (conv1): GCNConv(18, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=18, bias=True)
)


In [6]:
from torch_geometric.loader import DataLoader

# 创建数据加载器
loader = DataLoader([data], batch_size=32, shuffle=True)

# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# 训练模型
def train():
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# 训练循环
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0389
Epoch 20, Loss: 0.0253
Epoch 30, Loss: 0.0152
Epoch 40, Loss: 0.0113
Epoch 50, Loss: 0.0079
Epoch 60, Loss: 0.0057
Epoch 70, Loss: 0.0044
Epoch 80, Loss: 0.0033
Epoch 90, Loss: 0.0025
Epoch 100, Loss: 0.0019
Epoch 110, Loss: 0.0014
Epoch 120, Loss: 0.0012
Epoch 130, Loss: 0.0009
Epoch 140, Loss: 0.0006
Epoch 150, Loss: 0.0005
Epoch 160, Loss: 0.0005
Epoch 170, Loss: 0.0007
Epoch 180, Loss: 0.0005
Epoch 190, Loss: 0.0004
Epoch 200, Loss: 0.0004


In [10]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model):
    model.eval()
    instance_embeddings_list = []

    for i in range(len(data_scaled)):
        # 创建子图
        subgraph = G.copy()

        # 创建节点特征张量
        subgraph_data = data_scaled[i]

        # 确保 x 的形状是 [num_nodes, num_features]，即 [1, 23]
        node_features = torch.tensor(subgraph_data, dtype=torch.float).view(1, -1).repeat(len(columns), 1).to(device)

        # 将 NetworkX 子图转换为 PyTorch Geometric 图
        subgraph_data = from_networkx(subgraph)

        # 更新子图的节点特征
        subgraph_data.x = node_features

        # 映射特征列到索引
        feature_to_index = {col: idx for idx, col in enumerate(columns)}

        # 映射关系到索引
        edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

        # 添加边
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
        subgraph_data.edge_index = edge_index.to(device)

        # 获取特征嵌入
        with torch.no_grad():
            node_embeddings = model(subgraph_data)

        # 聚合节点嵌入到实例嵌入
        instance_embedding = node_embeddings.mean(dim=0, keepdim=True)
        instance_embeddings_list.append(instance_embedding)

    instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
    return instance_embeddings

# 获取每个实例的特征嵌入
instance_embeddings = aggregate_instance_embeddings(data_scaled, G, model)
print(instance_embeddings)


tensor([[0.7450, 0.8540, 0.1850,  ..., 1.0302, 0.1460, 0.7051],
        [0.6072, 0.8751, 0.0835,  ..., 0.9954, 0.1512, 0.8842],
        [0.5808, 1.0771, 1.1767,  ..., 1.0833, 0.2451, 0.3280],
        ...,
        [0.8854, 1.0740, 1.2199,  ..., 1.1704, 0.2111, 0.3184],
        [1.1307, 0.7896, 0.1448,  ..., 1.1171, 0.1011, 0.6942],
        [0.9167, 1.0796, 1.1692,  ..., 1.1881, 0.2788, 0.5067]],
       device='cuda:0')


In [11]:
instance_embeddings.shape

torch.Size([688736, 18])

In [12]:
# 将 PyTorch 张量转换为 Pandas DataFrame
instance_embeddings_df = pd.DataFrame(instance_embeddings.cpu().numpy())

instance_embeddings_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.745011,0.853956,0.185043,0.722925,0.77867,0.096756,1.078438,0.32018,0.269563,1.137229,1.009653,0.327138,1.008116,0.052543,0.725219,1.030213,0.145972,0.7051
1,0.607209,0.875115,0.083511,0.562288,0.589511,0.074362,1.147049,0.289266,0.474551,1.062381,0.956597,0.275831,0.415869,0.29321,0.547654,0.995384,0.151217,0.884202
2,0.580834,1.077067,1.176736,0.482022,0.510033,0.038819,1.127422,0.452338,0.341479,0.901456,1.037002,0.288792,0.385471,0.462641,0.468411,1.083296,0.245137,0.328026
3,1.140987,1.065498,1.226609,1.08325,1.098114,-0.005879,1.186787,0.496539,0.391389,0.124939,1.186936,0.165381,0.162638,0.248685,0.972035,1.187482,0.192398,0.317021
4,0.551244,1.099043,1.122709,0.447905,0.453405,0.075066,1.167782,0.082245,0.284745,0.532237,1.131273,0.295978,0.242423,0.1611,0.236787,1.112394,0.301646,0.60362


In [13]:
# 标准化数值特征
scaler = MinMaxScaler()
instance_embeddings_df_scaled = scaler.fit_transform(instance_embeddings_df)
# 将标准化后的数据转换回 DataFrame
instance_embeddings_df_scaled = pd.DataFrame(instance_embeddings_df_scaled, columns=instance_embeddings_df.columns)

instance_embeddings_df_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.519946,0.261383,0.190347,0.528354,0.562063,0.208569,0.74609,0.376459,0.211964,0.834525,0.638875,0.421135,0.833642,0.125224,0.605659,0.703473,0.274618,0.634095
1,0.408307,0.304744,0.11683,0.405334,0.416141,0.191051,0.798349,0.352322,0.360277,0.78398,0.574563,0.342162,0.403105,0.302614,0.473988,0.662697,0.281299,0.765689
2,0.386939,0.718612,0.908407,0.343864,0.35483,0.163247,0.7834,0.479643,0.263996,0.675307,0.672027,0.362111,0.381007,0.427497,0.415227,0.76562,0.400945,0.357042
3,0.840744,0.694902,0.944518,0.804302,0.808489,0.128281,0.828617,0.514153,0.300108,0.150922,0.853771,0.172155,0.219017,0.269796,0.788681,0.887594,0.333761,0.348956
4,0.362968,0.763647,0.869287,0.317736,0.311145,0.191602,0.814141,0.190687,0.222948,0.425971,0.786298,0.373172,0.277017,0.205239,0.24347,0.799685,0.472933,0.559533


In [14]:
# 保存为 CSV 文件
csv_file_path = '/home/sdong/data/chicago_bicycle/data_pr_raw_embeddings.csv'
instance_embeddings_df_scaled.to_csv(csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/chicago_bicycle/data_pr_raw_embeddings.csv
