In [85]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# 加载数据
file_path_clean = '/home/sdong/data/hotel_booking/hotel_booking_string.csv'
#file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_raw.csv'
data_df = pd.read_csv(file_path_clean)

# 随机选择20分之一个数据集（即5%）
data_df = data_df.sample(frac=1, random_state=42)

print(data_df.hotel.unique())
# 使用 fillna() 方法替换所有的 NaN 值为 0
data_df.fillna(0, inplace=True)
# 检查非数值列
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# 将非数值列转换为数值类型（使用标签编码）
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    data_df[col] = le.fit_transform(data_df[col].astype(str))
    label_encoders[col] = le


# 确保所有特征列都是数值类型
print("Data types after encoding:\n", data_df.info())

['City Hote' 'Resot' 'Resro Hotesl' 'Resrot Hotesl' 'Ciy Hotle'
 'City Hotles' 'Resrot Hotsl' 'City Hotle' 'Resrot Hotes' 'City Hotl'
 'City' 'Cty Hotle' 'Rsrot Hotesl' 'Resort Hotel' 'City Hotel']
Non-numeric columns: Index(['hotel', 'arrival_date_month', 'meal', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 119390 entries, 30946 to 15795
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype
---  ------                     --------------   -----
 0   hotel                      119390 non-null  int64
 1   is_canceled                119390 non-null  int64
 2   lead_time                  119390 non-null  int64
 3   arrival_date_year          119390 non-null  int64
 4   arrival_date_month         119390 non-null  int64
 5   arrival_date_week_number   119390 non-null  int64
 6   arr

In [86]:
print(data_df.hotel.unique())

[ 1  9 10 12  6  5 13  4 11  3  0  7 14  8  2]


In [87]:
data_df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'babies', 'meal', 'market_segment',
       'distribution_channel', 'is_repeated_guest', 'reserved_room_type',
       'assigned_room_type', 'deposit_type', 'customer_type',
       'reservation_status'],
      dtype='object')

In [88]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119390 entries, 30946 to 15795
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype
---  ------                     --------------   -----
 0   hotel                      119390 non-null  int64
 1   is_canceled                119390 non-null  int64
 2   lead_time                  119390 non-null  int64
 3   arrival_date_year          119390 non-null  int64
 4   arrival_date_month         119390 non-null  int64
 5   arrival_date_week_number   119390 non-null  int64
 6   arrival_date_day_of_month  119390 non-null  int64
 7   stays_in_weekend_nights    119390 non-null  int64
 8   stays_in_week_nights       119390 non-null  int64
 9   adults                     119390 non-null  int64
 10  babies                     119390 non-null  int64
 11  meal                       119390 non-null  int64
 12  market_segment             119390 non-null  int64
 13  distribution_channel       119390 non-null  int64
 14  is_rep

In [89]:
# 标准化数值特征
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_df)
# 将 NumPy 数组转换回 DataFrame
data_scaled_df = pd.DataFrame(data_scaled, columns=data_df.columns)

print(data_scaled_df.hotel.unique())
# 将标准化后的数据转换为torch张量
x = torch.tensor(data_scaled, dtype=torch.float)


[0.07142857 0.64285714 0.71428571 0.85714286 0.42857143 0.35714286
 0.92857143 0.28571429 0.78571429 0.21428571 0.         0.5
 1.         0.57142857 0.14285714]


In [72]:


# 示例表格数据特征
columns = data_df.columns

# 定义特征之间的关系
relations = [
    ('hotel', 'is_canceled'),
    ('hotel', 'reservation_status'),
    ('is_canceled', 'reservation_status'),
    ('lead_time', 'arrival_date_year'),
    ('lead_time', 'arrival_date_month'),
    ('lead_time', 'arrival_date_week_number'),
    ('lead_time', 'arrival_date_day_of_month'),
    ('arrival_date_year', 'arrival_date_month'),
    ('arrival_date_month', 'arrival_date_week_number'),
    ('arrival_date_week_number', 'arrival_date_day_of_month'),
    ('stays_in_weekend_nights', 'stays_in_week_nights'),
    ('stays_in_weekend_nights', 'adults'),
    ('stays_in_week_nights', 'adults'),
    ('adults', 'babies'),
    ('adults', 'customer_type'),
    ('meal', 'market_segment'),
    ('meal', 'distribution_channel'),
    ('market_segment', 'distribution_channel'),
    ('is_repeated_guest', 'customer_type'),
    ('reserved_room_type', 'assigned_room_type'),
    ('deposit_type', 'customer_type'),
    ('reservation_status', 'customer_type'),
    ('hotel', 'meal'),
    ('hotel', 'market_segment'),
    ('hotel', 'distribution_channel'),
    ('hotel', 'adults'),
    ('hotel', 'babies')
]

# 创建空的无向图
G = nx.Graph()

# 添加节点（每个特征作为一个节点）
for col in columns:
    G.add_node(col)

# 添加边（根据特征之间的关系）
for relation in relations:
    G.add_edge(relation[0], relation[1])

# 将NetworkX图转换为PyTorch Geometric图
data = from_networkx(G)

# 添加节点特征
data.x = x

# 映射特征列到索引
feature_to_index = {col: i for i, col in enumerate(columns)}

# 映射关系到索引
edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

# 添加边
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data.edge_index = edge_index


# 确定设备（GPU 优先）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 将数据移动到设备
data = data.to(device)

print(data)


Data(edge_index=[2, 27], num_nodes=20, x=[119390, 20])


In [73]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    def __init__(self, num_features, hidden_channels, output_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)
        return x

# 初始化模型
num_features = data.num_features
hidden_channels = 128
output_channels = num_features  # 确保输出维度与输入维度一致
model = GCN(num_features, hidden_channels, output_channels).to(device)

# 打印模型结构
print(model)

GCN(
  (conv1): GCNConv(20, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=20, bias=True)
)


In [74]:
from torch_geometric.loader import DataLoader

# 创建数据加载器
loader = DataLoader([data], batch_size=1280, shuffle=True)

# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# 训练模型
def train():
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# 训练循环
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0351
Epoch 20, Loss: 0.0146
Epoch 30, Loss: 0.0066
Epoch 40, Loss: 0.0036
Epoch 50, Loss: 0.0022
Epoch 60, Loss: 0.0014
Epoch 70, Loss: 0.0010
Epoch 80, Loss: 0.0011
Epoch 90, Loss: 0.0009
Epoch 100, Loss: 0.0005
Epoch 110, Loss: 0.0004
Epoch 120, Loss: 0.0013
Epoch 130, Loss: 0.0005
Epoch 140, Loss: 0.0003
Epoch 150, Loss: 0.0002
Epoch 160, Loss: 0.0007
Epoch 170, Loss: 0.0003
Epoch 180, Loss: 0.0002
Epoch 190, Loss: 0.0002
Epoch 200, Loss: 0.0010


In [90]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model):
    model.eval()
    instance_embeddings_list = []

    for i in range(len(data_scaled)):
        # 创建子图
        subgraph = G.copy()

        # 创建节点特征张量
        subgraph_data = data_scaled[i]

        # 确保 x 的形状是 [num_nodes, num_features]，即 [1, 23]
        node_features = torch.tensor(subgraph_data, dtype=torch.float).view(1, -1).repeat(len(columns), 1).to(device)

        # 将 NetworkX 子图转换为 PyTorch Geometric 图
        subgraph_data = from_networkx(subgraph)

        # 更新子图的节点特征
        subgraph_data.x = node_features

        # 映射特征列到索引
        feature_to_index = {col: idx for idx, col in enumerate(columns)}

        # 映射关系到索引
        edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]

        # 添加边
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
        subgraph_data.edge_index = edge_index.to(device)

        # 获取特征嵌入
        with torch.no_grad():
            node_embeddings = model(subgraph_data)

        # 聚合节点嵌入到实例嵌入
        instance_embedding = node_embeddings.mean(dim=0, keepdim=True)
        instance_embeddings_list.append(instance_embedding)

    instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
    return instance_embeddings

# 获取每个实例的特征嵌入
instance_embeddings = aggregate_instance_embeddings(data_scaled, G, model)
print(instance_embeddings)


tensor([[ 2.0560e-01, -3.1195e-02,  3.4954e-01,  ...,  2.7257e-02,
          9.4607e-01,  1.1783e+00],
        [ 9.0782e-01,  1.2033e+00,  1.1596e-01,  ..., -3.6276e-02,
          9.2317e-01,  5.7591e-01],
        [ 1.0101e+00, -1.2002e-01,  5.1213e-02,  ..., -1.7638e-02,
          1.3709e+00,  1.0080e+00],
        ...,
        [ 2.8917e-01,  4.7244e-02,  9.6095e-02,  ..., -5.0912e-02,
          8.7401e-01,  3.8242e-01],
        [ 7.8607e-01,  1.2082e+00,  2.1546e-01,  ..., -8.9726e-04,
          9.0406e-01,  4.1652e-02],
        [ 8.3822e-01,  2.7921e-02,  4.0561e-03,  ..., -1.3699e-02,
          1.3272e+00,  2.8801e-01]], device='cuda:0')


In [91]:
instance_embeddings.shape

torch.Size([119390, 20])

In [92]:
# 将 PyTorch 张量转换为 Pandas DataFrame
instance_embeddings_df = pd.DataFrame(instance_embeddings.cpu().numpy())

instance_embeddings_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.205599,-0.031195,0.349542,0.690189,0.307384,1.109467,0.09855,0.159396,0.119838,0.028896,0.005385,-0.055247,1.072833,0.368558,0.011259,0.71015,0.608001,0.027257,0.946067,1.17828
1,0.907816,1.203296,0.115962,0.02828,0.672609,0.654262,0.656545,0.020837,0.046033,0.015123,0.001723,0.022376,1.079807,1.041257,0.000404,0.009746,0.04947,-0.036276,0.923171,0.575909
2,1.010099,-0.120016,0.051213,0.723714,0.301551,1.231973,1.118393,0.009706,0.025346,0.022898,0.008069,-0.00113,1.08873,1.004884,-0.002919,0.036701,1.112588,-0.017638,1.370885,1.007951
3,1.162914,-0.068172,-0.02047,0.694937,0.902967,0.218904,0.354854,0.009096,0.007294,0.018326,0.003664,-0.012732,1.02985,1.028643,0.002393,0.0204,0.038056,0.016756,1.338806,0.973037
4,1.013447,-0.095947,0.109316,1.378456,0.046339,0.302095,0.638419,0.123014,0.067559,0.023728,0.00779,0.995799,0.814329,1.018636,-0.002409,0.02799,0.039518,-0.018762,0.912981,1.068226


In [93]:
# 标准化数值特征
scaler = MinMaxScaler()
instance_embeddings_df_scaled = scaler.fit_transform(instance_embeddings_df)
# 将标准化后的数据转换回 DataFrame
instance_embeddings_df_scaled = pd.DataFrame(instance_embeddings_df_scaled, columns=instance_embeddings_df.columns)

instance_embeddings_df_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.156263,0.130199,0.366345,0.501146,0.26447,0.822896,0.115967,0.174693,0.211111,0.609878,0.116451,0.02707,0.762402,0.305664,0.053078,0.575171,0.443522,0.12821,0.654794,0.84799
1,0.617825,0.925858,0.18786,0.067925,0.496814,0.523606,0.490265,0.075142,0.121218,0.469555,0.108425,0.077691,0.76716,0.773346,0.046156,0.11285,0.066405,0.084553,0.639738,0.413229
2,0.685055,0.072952,0.138383,0.523089,0.260759,0.903442,0.800068,0.067145,0.096021,0.548773,0.122333,0.062362,0.773247,0.748059,0.044037,0.130643,0.784216,0.09736,0.934146,0.725055
3,0.785499,0.106366,0.083609,0.504254,0.643359,0.237364,0.287893,0.066707,0.074035,0.502186,0.112678,0.054796,0.733081,0.764576,0.047425,0.119883,0.058698,0.120994,0.913052,0.699856
4,0.687255,0.088465,0.182781,0.951619,0.098403,0.292061,0.478106,0.148554,0.147436,0.557224,0.121723,0.712481,0.58606,0.757619,0.044363,0.124893,0.059686,0.096588,0.633038,0.768558


In [94]:
# 保存为 CSV 文件
csv_file_path = '/home/sdong/data/hotel_booking/hotel_booking_string_embeddings.csv'
instance_embeddings_df_scaled.to_csv(csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/hotel_booking/hotel_booking_string_embeddings.csv
