In [65]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# 加载数据
file_path_clean = '/home/sdong/data/taxi/yellow_tripdata_sample.csv'
#file_path_clean = '/home/sdong/data/taxi/yellow_tripdata_missing.csv'
data_df = pd.read_csv(file_path_clean)

# # 保留指定的列
# columns_to_keep = [
#     "VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count",
#     "trip_distance"
# ]
# columns_to_keep = [
#     "VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count",
#     "trip_distance", "pickup_longitude", "pickup_latitude", "RateCodeID",
#     "store_and_fwd_flag", "dropoff_longitude"
# ]
#data_df = data_df[columns_to_keep]

print(len(data_df))

10000000


In [66]:
# 随机选择20分之一个数据集（即5%）
data_df = data_df.sample(frac=0.1, random_state=42)

# 使用 fillna() 方法替换所有的 NaN 值为 0
data_df.fillna(0, inplace=True)
# 检查非数值列
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# 将非数值列转换为数值类型（使用标签编码）
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    data_df[col] = le.fit_transform(data_df[col].astype(str))
    label_encoders[col] = le


# 确保所有特征列都是数值类型
print("Data types after encoding:\n", data_df.info())

Non-numeric columns: Index(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 919213 to 2187994
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1000000 non-null  int64  
 1   tpep_pickup_datetime   1000000 non-null  int64  
 2   tpep_dropoff_datetime  1000000 non-null  int64  
 3   passenger_count        1000000 non-null  int64  
 4   trip_distance          1000000 non-null  float64
 5   pickup_longitude       1000000 non-null  float64
 6   pickup_latitude        1000000 non-null  float64
 7   RateCodeID             1000000 non-null  int64  
 8   store_and_fwd_flag     1000000 non-null  int64  
 9   dropoff_longitude      1000000 non-null  float64
 10  dropoff_latitude       1000000 non-null  float64
 11  payment_type           1000000 non-null  int64  
 12  fare_amoun

In [67]:
data_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount'],
      dtype='object')

In [68]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 919213 to 2187994
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1000000 non-null  int64  
 1   tpep_pickup_datetime   1000000 non-null  int64  
 2   tpep_dropoff_datetime  1000000 non-null  int64  
 3   passenger_count        1000000 non-null  int64  
 4   trip_distance          1000000 non-null  float64
 5   pickup_longitude       1000000 non-null  float64
 6   pickup_latitude        1000000 non-null  float64
 7   RateCodeID             1000000 non-null  int64  
 8   store_and_fwd_flag     1000000 non-null  int64  
 9   dropoff_longitude      1000000 non-null  float64
 10  dropoff_latitude       1000000 non-null  float64
 11  payment_type           1000000 non-null  int64  
 12  fare_amount            1000000 non-null  float64
 13  extra                  1000000 non-null  float64
 14  mta_tax           

In [69]:
# 标准化数值特征
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_df)
# 将 NumPy 数组转换回 DataFrame
data_scaled_df = pd.DataFrame(data_scaled, columns=data_df.columns)


# 将标准化后的数据转换为torch张量
x = torch.tensor(data_scaled, dtype=torch.float)


In [70]:


# 示例表格数据特征
columns = data_df.columns

# # 定义特征之间的关系
relations = [
    ('VendorID', 'tpep_pickup_datetime'),
    ('VendorID', 'tpep_dropoff_datetime'),
    ('tpep_pickup_datetime', 'tpep_dropoff_datetime'),
    ('tpep_pickup_datetime', 'pickup_longitude'),
    ('tpep_pickup_datetime', 'pickup_latitude'),
    ('tpep_dropoff_datetime', 'dropoff_longitude'),
    ('tpep_dropoff_datetime', 'dropoff_latitude'),
    ('pickup_longitude', 'pickup_latitude'),
    ('dropoff_longitude', 'dropoff_latitude'),
    ('passenger_count', 'trip_distance'),
    ('trip_distance', 'fare_amount'),
    ('fare_amount', 'extra'),
    ('fare_amount', 'mta_tax'),
    ('fare_amount', 'tip_amount'),
    ('fare_amount', 'tolls_amount'),
    ('fare_amount', 'total_amount'),
    ('extra', 'total_amount'),
    ('mta_tax', 'total_amount'),
    ('tip_amount', 'total_amount'),
    ('tolls_amount', 'total_amount'),
    ('RateCodeID', 'store_and_fwd_flag'),
    ('RateCodeID', 'payment_type'),
    ('payment_type', 'total_amount'),
    ('store_and_fwd_flag', 'tpep_pickup_datetime'),
    ('store_and_fwd_flag', 'tpep_dropoff_datetime')
]
# 定义特征之间的关系
# relations = [
#     ('tpep_pickup_datetime', 'tpep_dropoff_datetime'),
#     ('tpep_pickup_datetime', 'pickup_longitude'),
#     ('tpep_pickup_datetime', 'pickup_latitude'),
#     ('tpep_dropoff_datetime', 'dropoff_longitude'),
#     ('pickup_longitude', 'pickup_latitude'),
#     ('passenger_count', 'trip_distance'),
#     ('trip_distance', 'RateCodeID'),
#     ('VendorID', 'store_and_fwd_flag'),
#     ('VendorID', 'RateCodeID'),
#     ('RateCodeID', 'store_and_fwd_flag')
#     # 可以添加更多关系
# ]
# relations = [
#     ("tpep_pickup_datetime", "tpep_dropoff_datetime"),
#     ("tpep_pickup_datetime", "passenger_count"),
#     ("tpep_pickup_datetime", "trip_distance"),
#     ("tpep_dropoff_datetime", "passenger_count"),
#     ("tpep_dropoff_datetime", "trip_distance"),
#     # ("passenger_count", "trip_distance"),
#     # ("VendorID", "tpep_pickup_datetime"),
#     # ("VendorID", "tpep_dropoff_datetime"),
#     # ("VendorID", "passenger_count"),
#     # ("VendorID", "trip_distance")
# ]
# 创建空的无向图
G = nx.Graph()

# 添加节点（每个特征作为一个节点）
for col in columns:
    G.add_node(col)

# 添加边（根据特征之间的关系）
for relation in relations:
    G.add_edge(relation[0], relation[1])

# 将NetworkX图转换为PyTorch Geometric图
data = from_networkx(G)

# 添加节点特征
data.x = x

# 映射特征列到索引
feature_to_index = {col: i for i, col in enumerate(columns)}

# 映射关系到索引
edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

# 添加边
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data.edge_index = edge_index


# 确定设备（GPU 优先）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 将数据移动到设备
data = data.to(device)

print(data)


Data(edge_index=[2, 25], num_nodes=18, x=[1000000, 18])


In [71]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    def __init__(self, num_features, hidden_channels, output_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)
        return x

# 初始化模型
num_features = data.num_features
hidden_channels = 128
output_channels = num_features  # 确保输出维度与输入维度一致
model = GCN(num_features, hidden_channels, output_channels).to(device)

# 打印模型结构
print(model)

GCN(
  (conv1): GCNConv(18, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=18, bias=True)
)


In [72]:
from torch_geometric.loader import DataLoader

# 创建数据加载器
loader = DataLoader([data], batch_size=1280, shuffle=True)

# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# 训练模型
def train():
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# 训练循环
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0171
Epoch 20, Loss: 0.0051
Epoch 30, Loss: 0.0029
Epoch 40, Loss: 0.0013
Epoch 50, Loss: 0.0006
Epoch 60, Loss: 0.0003
Epoch 70, Loss: 0.0002
Epoch 80, Loss: 0.0002
Epoch 90, Loss: 0.0001
Epoch 100, Loss: 0.0001
Epoch 110, Loss: 0.0001
Epoch 120, Loss: 0.0001
Epoch 130, Loss: 0.0001
Epoch 140, Loss: 0.0001
Epoch 150, Loss: 0.0001
Epoch 160, Loss: 0.0000
Epoch 170, Loss: 0.0000
Epoch 180, Loss: 0.0000
Epoch 190, Loss: 0.0000
Epoch 200, Loss: 0.0000


In [73]:
import torch
from torch_geometric.data import Data, Batch
from torch_geometric.utils import from_networkx
import time
import numpy as np
import json

def aggregate_instance_embeddings(data_scaled, G, model, batch_size=4096):
    model.eval()
    instance_embeddings_list = []

    for i in range(0, len(data_scaled), batch_size):
        batch_subgraphs = []
        batch_node_features = []
        for j in range(i, min(i + batch_size, len(data_scaled))):
            # 创建子图
            subgraph = G.copy()

            # 创建节点特征张量
            subgraph_data = data_scaled[j]

            # 确保 x 的形状是 [num_nodes, num_features]，即 [1, 23]
            node_features = torch.tensor(subgraph_data, dtype=torch.float).view(1, -1).repeat(len(columns), 1).to(device)

            # 将 NetworkX 子图转换为 PyTorch Geometric 图
            subgraph_data = from_networkx(subgraph)

            # 更新子图的节点特征
            subgraph_data.x = node_features

            # 映射特征列到索引
            feature_to_index = {col: idx for idx, col in enumerate(columns)}

            # 映射关系到索引
            edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

            # 添加边
            edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
            subgraph_data.edge_index = edge_index.to(device)

            batch_subgraphs.append(subgraph_data)
            batch_node_features.append(node_features)

        # 创建批处理图
        batch_graph = Batch.from_data_list(batch_subgraphs)

        # 获取批处理中所有节点的特征嵌入
        with torch.no_grad():
            node_embeddings = model(batch_graph.to(device))

        # 逐个图聚合节点嵌入到实例嵌入
        batch_size_actual = len(batch_subgraphs)
        start_idx = 0
        for j in range(batch_size_actual):
            num_nodes = len(columns)
            instance_embedding = node_embeddings[start_idx:start_idx + num_nodes].mean(dim=0, keepdim=True)
            instance_embeddings_list.append(instance_embedding)
            start_idx += num_nodes

    instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
    return instance_embeddings

# 设置不同的数据大小
data_sizes = [1000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 用于记录每个数据大小的运行时间
time_records = {}

for size in data_sizes:
    data_scaled = np.random.rand(size, len(columns))  # 随机生成数据

    start_time = time.time()
    instance_embeddings = aggregate_instance_embeddings(data_scaled, G, model)
    end_time = time.time()

    elapsed_time = end_time - start_time
    time_records[size] = elapsed_time
    print(f"Data size: {size}, Time taken: {elapsed_time:.2f} seconds")

# 将时间记录保存到JSON文件中
with open('time_records.json', 'w') as f:
    json.dump(time_records, f, indent=4)

print("Time records:", time_records)
print("Time records have been saved to time_records.json")


Data size: 1000, Time taken: 0.86 seconds
Data size: 5000, Time taken: 4.27 seconds
Data size: 10000, Time taken: 8.56 seconds
Data size: 20000, Time taken: 17.37 seconds
Data size: 50000, Time taken: 43.36 seconds
Data size: 100000, Time taken: 86.69 seconds
Data size: 200000, Time taken: 173.18 seconds
Data size: 500000, Time taken: 432.81 seconds
Data size: 1000000, Time taken: 870.02 seconds
Time records: {1000: 0.8578667640686035, 5000: 4.27446985244751, 10000: 8.555078744888306, 20000: 17.36573314666748, 50000: 43.35615849494934, 100000: 86.6929943561554, 200000: 173.18003010749817, 500000: 432.8054893016815, 1000000: 870.01686668396}
Time records have been saved to time_records.json


In [None]:
import torch
from torch_geometric.data import Data, Batch
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model, batch_size=4096):
    model.eval()
    instance_embeddings_list = []

    for i in range(0, len(data_scaled), batch_size):
        batch_subgraphs = []
        batch_node_features = []
        for j in range(i, min(i + batch_size, len(data_scaled))):
            # 创建子图
            subgraph = G.copy()

            # 创建节点特征张量
            subgraph_data = data_scaled[j]

            # 确保 x 的形状是 [num_nodes, num_features]，即 [1, 23]
            node_features = torch.tensor(subgraph_data, dtype=torch.float).view(1, -1).repeat(len(columns), 1).to(device)

            # 将 NetworkX 子图转换为 PyTorch Geometric 图
            subgraph_data = from_networkx(subgraph)

            # 更新子图的节点特征
            subgraph_data.x = node_features

            # 映射特征列到索引
            feature_to_index = {col: idx for idx, col in enumerate(columns)}

            # 映射关系到索引
            edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

            # 添加边
            edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
            subgraph_data.edge_index = edge_index.to(device)

            batch_subgraphs.append(subgraph_data)
            batch_node_features.append(node_features)

        # 创建批处理图
        batch_graph = Batch.from_data_list(batch_subgraphs)

        # 获取批处理中所有节点的特征嵌入
        with torch.no_grad():
            node_embeddings = model(batch_graph.to(device))

        # 逐个图聚合节点嵌入到实例嵌入
        batch_size_actual = len(batch_subgraphs)
        start_idx = 0
        for j in range(batch_size_actual):
            num_nodes = len(columns)
            instance_embedding = node_embeddings[start_idx:start_idx + num_nodes].mean(dim=0, keepdim=True)
            instance_embeddings_list.append(instance_embedding)
            start_idx += num_nodes

    instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
    return instance_embeddings

# 获取每个实例的特征嵌入
instance_embeddings = aggregate_instance_embeddings(data_scaled, G, model)
print(instance_embeddings)


KeyboardInterrupt: 

In [None]:
# import torch
# from torch_geometric.data import Data
# from torch_geometric.utils import from_networkx

# def aggregate_instance_embeddings(data_scaled, G, model):
#     model.eval()
#     instance_embeddings_list = []

#     for i in range(len(data_scaled)):
#         # 创建子图
#         subgraph = G.copy()

#         # 创建节点特征张量
#         subgraph_data = data_scaled[i]

#         # 确保 x 的形状是 [num_nodes, num_features]，即 [1, 23]
#         node_features = torch.tensor(subgraph_data, dtype=torch.float).view(1, -1).repeat(len(columns), 1).to(device)

#         # 将 NetworkX 子图转换为 PyTorch Geometric 图
#         subgraph_data = from_networkx(subgraph)

#         # 更新子图的节点特征
#         subgraph_data.x = node_features

#         # 映射特征列到索引
#         feature_to_index = {col: idx for idx, col in enumerate(columns)}

#         # 映射关系到索引
#         edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

#         # 添加边
#         edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
#         subgraph_data.edge_index = edge_index.to(device)

#         # 获取特征嵌入
#         with torch.no_grad():
#             node_embeddings = model(subgraph_data)

#         # 聚合节点嵌入到实例嵌入
#         instance_embedding = node_embeddings.mean(dim=0, keepdim=True)
#         instance_embeddings_list.append(instance_embedding)

#     instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
#     return instance_embeddings

# # 获取每个实例的特征嵌入
# instance_embeddings = aggregate_instance_embeddings(data_scaled, G, model)
# print(instance_embeddings)


tensor([[ 2.0560e-01, -3.1195e-02,  3.4954e-01,  ...,  2.7257e-02,
          9.4607e-01,  1.1783e+00],
        [ 9.0782e-01,  1.2033e+00,  1.1596e-01,  ..., -3.6276e-02,
          9.2317e-01,  5.7591e-01],
        [ 1.0101e+00, -1.2002e-01,  5.1213e-02,  ..., -1.7638e-02,
          1.3709e+00,  1.0080e+00],
        ...,
        [ 2.8917e-01,  4.7244e-02,  9.6095e-02,  ..., -5.0912e-02,
          8.7401e-01,  3.8242e-01],
        [ 7.8607e-01,  1.2082e+00,  2.1546e-01,  ..., -8.9726e-04,
          9.0406e-01,  4.1652e-02],
        [ 8.3822e-01,  2.7921e-02,  4.0561e-03,  ..., -1.3699e-02,
          1.3272e+00,  2.8801e-01]], device='cuda:0')


In [None]:
instance_embeddings.shape

torch.Size([100000, 18])

In [None]:
# 将 PyTorch 张量转换为 Pandas DataFrame
instance_embeddings_df = pd.DataFrame(instance_embeddings.cpu().numpy())

instance_embeddings_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.011392,0.576693,0.561337,0.194952,0.003531,1.186569,-0.053824,0.019801,0.010904,1.183097,-0.033423,-0.011998,0.398114,0.012938,1.138101,0.035433,0.008031,0.381434
1,-0.016649,1.082896,1.07318,0.195673,-0.000514,0.037018,0.795961,0.012805,0.004696,0.091569,1.147589,-0.01576,0.368101,0.005186,1.142203,0.017424,0.004471,0.361656
2,-0.01829,0.673057,0.666099,0.194751,0.010105,0.036345,0.80049,0.013701,0.009253,0.08724,1.147548,-0.015181,0.384489,0.007494,1.13808,0.026607,0.008376,0.378201
3,1.159951,0.576652,0.56961,0.194156,-0.000214,0.040761,0.794604,0.012217,0.007623,0.092195,1.144462,-0.006746,0.368006,0.011595,1.14007,0.015737,0.00184,0.356928
4,-0.017218,0.09369,0.08311,0.615355,-0.004182,0.023136,0.806771,0.015518,0.010586,0.083672,1.145515,0.384091,0.377013,0.010267,1.15109,-0.005276,0.005237,0.360295


In [None]:
# 标准化数值特征
scaler = MinMaxScaler()
instance_embeddings_df_scaled = scaler.fit_transform(instance_embeddings_df)
# 将标准化后的数据转换回 DataFrame
instance_embeddings_df_scaled = pd.DataFrame(instance_embeddings_df_scaled, columns=instance_embeddings_df.columns)

instance_embeddings_df_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.079737,0.505302,0.463191,0.164329,0.307375,0.932816,0.039189,0.048556,0.059319,0.954976,0.191044,0.079452,0.268367,0.344981,0.819549,0.306108,0.097158,0.254161
1,0.076005,0.895363,0.861375,0.164925,0.287336,0.057906,0.81123,0.042802,0.054559,0.079511,0.927222,0.076495,0.226035,0.30274,0.824743,0.270138,0.087177,0.230025
2,0.074839,0.579556,0.54469,0.164162,0.339954,0.057394,0.815345,0.043538,0.058053,0.076038,0.927197,0.076951,0.249149,0.315314,0.819523,0.288479,0.098127,0.250215
3,0.911355,0.50527,0.469627,0.163669,0.288823,0.060755,0.809997,0.042318,0.056804,0.080013,0.925273,0.08358,0.225901,0.337662,0.822043,0.266767,0.079801,0.224255
4,0.075601,0.133118,0.091159,0.512327,0.269159,0.047341,0.821051,0.045033,0.059075,0.073176,0.925929,0.390758,0.238606,0.330423,0.835997,0.224797,0.089325,0.228365


In [None]:
# 保存为 CSV 文件
csv_file_path = '/home/sdong/data/hotel_booking/yellow_tripdata_sample_1000000_embeddings.csv'
instance_embeddings_df_scaled.to_csv(csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/hotel_booking/hotel_booking_string_embeddings.csv
