In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# Load the aligned Airbnb table and replace every NaN with 0 in one step.
# The fill happens before the dtype check below, so missing values in text
# columns become 0 and are later stringified and label-encoded like any other value.
file_path_clean = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement.csv'
data_df = pd.read_csv(file_path_clean).fillna(0)

# Collect the object-dtype (non-numeric) columns.
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Label-encode each non-numeric column; the fitted encoder is kept per column
# so the integer codes can be mapped back to the original strings later.
label_encoders = {}
for col in non_numeric_cols:
    encoder = LabelEncoder()
    label_encoders[col] = encoder
    data_df[col] = encoder.fit_transform(data_df[col].astype(str))

# Print the column dtypes after encoding for a visual check.
print("Data types after encoding:\n", data_df.dtypes)


Non-numeric columns: Index(['name', 'host_identity_verified', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'cancellation_policy', 'room_type', 'last_review',
       'house_rules'],
      dtype='object')
Data types after encoding:
 id                                  int64
name                                int64
host_id                             int64
host_identity_verified              int64
host_name                           int64
neighbourhood_group                 int64
neighbourhood                       int64
lat                               float64
long                              float64
instant_bookable                     bool
cancellation_policy                 int64
room_type                           int64
construction_year                 float64
price                             float64
service_fee                       float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                         int6

In [13]:
# Min-max scale every column of the encoded table (fit and apply on the same data).
scaler = MinMaxScaler()
scaler.fit(data_df)
data_scaled = scaler.transform(data_df)

# Convert the scaled matrix into a float torch tensor.
x = torch.tensor(data_scaled, dtype=torch.float)


In [14]:
# Display the scaled feature matrix returned by the MinMaxScaler.
data_scaled

array([[0.00000000e+00, 2.57453371e-01, 8.09927687e-01, ...,
        1.80722892e-02, 8.02820721e-02, 2.25202429e-01],
       [1.50444869e-05, 7.81987892e-01, 5.29316929e-01, ...,
        6.02409639e-03, 6.45511256e-02, 6.16902834e-01],
       [2.03845701e-05, 9.03281604e-01, 7.97911752e-01, ...,
        3.01204819e-03, 9.81828044e-02, 2.95546559e-01],
       ...,
       [9.03429953e-02, 2.80331587e-01, 6.98773960e-01, ...,
        3.01204819e-03, 9.54705723e-02, 1.04251012e-01],
       [9.03527884e-02, 1.63623309e-01, 1.11892169e-01, ...,
        3.01204819e-03, 1.07404394e-01, 1.04251012e-01],
       [9.03625993e-02, 5.86315497e-02, 6.89855618e-01, ...,
        3.01204819e-03, 2.14266341e-02, 1.04251012e-01]])

In [4]:


# Feature columns. Each column becomes one graph node, indexed in this order.
columns = [
    'id',
    'name',
    'host_id',
    'host_identity_verified',
    'host_name',
    'neighbourhood_group',
    'neighbourhood',
    'lat',
    'long',
    'instant_bookable',
    'cancellation_policy',
    'room_type',
    'construction_year',
    'price',
    'service_fee',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'review_rate_number',
    'calculated_host_listings_count',
    'availability_365',
    'house_rules',
]

# Hand-picked pairs of related features; each pair becomes one edge.
relations = [
    ('id', 'host_id'),
    ('host_id', 'host_identity_verified'),
    ('host_id', 'host_name'),
    ('neighbourhood_group', 'neighbourhood'),
    ('lat', 'long'),
    ('instant_bookable', 'cancellation_policy'),
    ('room_type', 'price'),
    ('price', 'service_fee'),
    ('minimum_nights', 'number_of_reviews'),
    ('number_of_reviews', 'reviews_per_month'),
    ('reviews_per_month', 'review_rate_number'),
    ('review_rate_number', 'calculated_host_listings_count'),
    ('calculated_host_listings_count', 'availability_365'),
    ('availability_365', 'house_rules'),
]

# Undirected NetworkX graph with one node per feature and one edge per relation.
G = nx.Graph()
G.add_nodes_from(columns)
G.add_edges_from(relations)

# Convert the NetworkX graph into a PyTorch Geometric Data object.
data = from_networkx(G)

# Attach the scaled table as node features.
# NOTE(review): `x` has one row per data record, whereas the graph has one node
# per feature column, so the node count and the number of rows in `x` differ.
# Confirm this is the intended node-feature layout.
data.x = x

# Column name -> node index, following the order of `columns`.
feature_to_index = dict(zip(columns, range(len(columns))))

# Translate every (source, target) name pair into an index pair.
edges = [
    (feature_to_index[source_name], feature_to_index[target_name])
    for source_name, target_name in relations
]

# Replace the converted edge_index with a [2, num_relations] tensor built from `relations`.
# NOTE(review): each relation is listed in one direction only although G is
# undirected; PyG usually expects both directions for undirected graphs.
# Confirm one-way message passing is intended.
edge_index = torch.tensor(edges, dtype=torch.long).transpose(0, 1).contiguous()
data.edge_index = edge_index

print(data)


Data(edge_index=[2, 14], num_nodes=23, x=[69305, 23])


In [5]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    """Two graph-convolution layers followed by a linear output layer.

    Args:
        num_features: Width of the input node features.
        hidden_channels: Width of both graph-convolution layers.
        output_channels: Width of the final linear projection.
    """

    def __init__(self, num_features, hidden_channels, output_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        """Return per-node outputs for a graph object exposing `x` and `edge_index`."""
        edge_index = data.edge_index
        hidden = data.x
        # Each convolution is followed by a ReLU; the linear head has no activation.
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))
        return self.fc(hidden)

# Model dimensions: the input width is taken from the graph's node features.
num_features = data.num_features
hidden_channels = 128
# The output width equals the input width so the output can be compared with the input features.
output_channels = num_features

# Build the network.
model = GCN(
    num_features=num_features,
    hidden_channels=hidden_channels,
    output_channels=output_channels,
)

# Show the layer layout.
print(model)

GCN(
  (conv1): GCNConv(23, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=23, bias=True)
)


In [6]:
from torch_geometric.loader import DataLoader

# Loss: mean squared error.
criterion = torch.nn.MSELoss()

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Loader over a one-element dataset holding the single graph.
loader = DataLoader([data], batch_size=1, shuffle=True)

# One training epoch.
def train():
    """Run one pass over `loader` and return the mean batch loss.

    Uses the module-level `model`, `loader`, `optimizer` and `criterion`.
    The target of the loss is the batch's own node features (`batch.x`).
    """
    model.train()
    batch_losses = []
    for batch in loader:
        optimizer.zero_grad()
        reconstruction = model(batch)
        batch_loss = criterion(reconstruction, batch.x)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

# Train for 200 epochs and report the loss on every 10th epoch.
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 != 0:
        continue
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0786
Epoch 20, Loss: 0.0417
Epoch 30, Loss: 0.0207
Epoch 40, Loss: 0.0097
Epoch 50, Loss: 0.0062
Epoch 60, Loss: 0.0037
Epoch 70, Loss: 0.0023
Epoch 80, Loss: 0.0016
Epoch 90, Loss: 0.0019
Epoch 100, Loss: 0.0011
Epoch 110, Loss: 0.0021
Epoch 120, Loss: 0.0013
Epoch 130, Loss: 0.0008
Epoch 140, Loss: 0.0006
Epoch 150, Loss: 0.0005
Epoch 160, Loss: 0.0008
Epoch 170, Loss: 0.0006
Epoch 180, Loss: 0.0003
Epoch 190, Loss: 0.0003
Epoch 200, Loss: 0.0007


In [15]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model):
    """Compute one embedding per data row by running `model` on the feature graph.

    For every row, each node of the feature graph receives a copy of the whole
    row as its feature vector. The model's per-node outputs are then averaged
    into a single vector for that row.

    Relies on the module-level `columns` (node order and node count) and
    `relations` (edges).

    Args:
        data_scaled: Sized, integer-indexable collection of rows; each row is a
            1-D sequence of numeric feature values.
        G: NetworkX graph of the features, converted with `from_networkx`.
        model: Module called with a graph object exposing `x` and `edge_index`.

    Returns:
        A tensor with one row per input row, holding the mean over nodes of the
        model output.
    """
    model.eval()

    # The graph structure is the same for every row, so it is built once here
    # instead of once per row (copy, conversion, index map and edge tensor).
    graph_data = from_networkx(G.copy())
    feature_to_index = {col: idx for idx, col in enumerate(columns)}
    edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]
    # NOTE(review): each relation is listed in one direction only. Confirm this
    # matches the edge layout the model was trained with.
    graph_data.edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    num_nodes = len(columns)

    instance_embeddings_list = []
    with torch.no_grad():
        for i in range(len(data_scaled)):
            row = data_scaled[i]

            # Repeat the row so every node sees the same vector: [num_nodes, len(row)].
            node_features = torch.tensor(row, dtype=torch.float).view(1, -1).repeat(num_nodes, 1)

            # Only the node features change between rows. The same graph object
            # is reused, which assumes the model does not modify its input.
            graph_data.x = node_features
            node_embeddings = model(graph_data)

            # Average over nodes; keepdim keeps a leading axis for concatenation.
            instance_embeddings_list.append(node_embeddings.mean(dim=0, keepdim=True))

    instance_embeddings = torch.cat(instance_embeddings_list, dim=0)
    return instance_embeddings

# Compute the embedding of every row with the trained model, then print the tensor.
instance_embeddings = aggregate_instance_embeddings(
    data_scaled=data_scaled,
    G=G,
    model=model,
)
print(instance_embeddings)


tensor([[ 8.1696e-04,  2.2841e-01,  9.0309e-01,  ...,  3.3610e-02,
          7.9161e-02,  2.6281e-01],
        [ 1.8874e-03,  8.2570e-01,  6.1290e-01,  ...,  2.4196e-02,
          5.4390e-02,  6.9737e-01],
        [-4.9829e-03,  8.9552e-01,  8.7455e-01,  ...,  2.0388e-02,
          1.4050e-01,  3.9682e-01],
        ...,
        [ 8.8830e-02,  2.7736e-01,  7.8720e-01,  ...,  9.4586e-03,
          1.0766e-01,  1.4798e-01],
        [ 8.6810e-02,  1.1454e-01,  1.6845e-01,  ...,  1.2470e-02,
          1.2308e-01,  1.7409e-01],
        [ 1.3753e-01,  1.6423e-01,  7.2028e-01,  ...,  2.2947e-02,
          4.9947e-02,  9.8170e-02]])


In [16]:
# Inspect the shape of the aggregated embeddings (rows, embedding width).
# NOTE(review): the saved outputs disagree on the row count (the graph cell
# printed x=[69305, 23], this cell shows 102599 rows). The cells were presumably
# run out of order or on different data; re-run top to bottom to confirm.
instance_embeddings.shape

torch.Size([102599, 23])

In [17]:
# Wrap the embedding tensor (via NumPy) in a pandas DataFrame.
instance_embeddings_df = pd.DataFrame(data=instance_embeddings.numpy())

# Preview the first 20 rows.
instance_embeddings_df.head(n=20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.000817,0.228405,0.903086,0.545298,0.62954,0.359283,0.637048,1.215665,0.092474,-0.047036,...,0.856802,0.887323,0.205695,0.006652,0.113142,0.000424,0.922144,0.03361,0.079161,0.262806
1,0.001887,0.825697,0.612899,1.114049,0.463659,0.501175,0.694174,1.191927,0.096524,-0.01413,...,0.097614,0.111331,0.221673,0.092999,0.630479,0.022897,0.91867,0.024196,0.05439,0.697367
2,-0.004983,0.89552,0.874546,0.073093,0.344979,0.52825,0.542141,1.165342,0.157214,1.058975,...,0.535398,0.56365,0.2715,-0.016262,0.007722,0.009327,1.134149,0.020388,0.140503,0.396822
3,-0.000201,-0.030835,0.949625,0.607958,0.356957,0.373027,0.260671,1.162637,0.155808,1.061224,...,0.337681,0.3551,0.29501,0.185314,0.893623,0.062765,0.86827,-0.012259,0.090245,0.181288
4,-0.01605,0.394878,1.04488,1.108192,0.613007,0.493131,0.364837,1.190292,0.096336,-0.014566,...,0.158645,0.184044,0.21374,0.025236,0.198182,0.003554,0.717442,0.024944,0.069941,0.790252
5,0.00075,0.517158,0.540633,1.106703,0.687191,0.503082,0.749124,1.204429,0.083236,1.091774,...,0.493595,0.507369,0.212985,0.118647,0.732939,0.028568,0.701882,0.021983,0.094527,0.639099
6,0.02375,0.152125,0.737456,0.147463,0.141006,0.340436,0.195361,1.042977,0.189862,0.039007,...,0.130989,0.138987,0.251169,0.000879,0.238323,-0.002531,1.042127,-0.036534,0.12429,0.793922
7,0.008559,0.174777,0.995539,0.56639,0.285671,0.350565,0.127371,1.205168,0.117458,-0.009812,...,0.923621,0.955438,0.253155,0.039116,0.199725,0.007609,1.104583,0.016661,0.077932,0.322912
8,-0.069995,0.507579,0.933288,1.128596,0.33237,0.522024,0.538663,1.212869,0.07922,1.09461,...,0.898793,0.936768,0.242,0.260169,0.80586,0.08066,0.701076,-0.00106,0.05021,0.107146
9,0.017373,0.26974,0.8557,0.580562,0.183023,0.521468,1.073456,1.184218,0.115331,-0.02704,...,0.240505,0.25079,0.239179,0.135944,0.835459,0.036736,1.114009,0.00802,0.117485,0.436383


In [18]:
# Min-max scale the embeddings with a fresh scaler (this rebinds `scaler`),
# then wrap the result in a DataFrame that keeps the original column labels.
scaler = MinMaxScaler()
instance_embeddings_df_scaled = pd.DataFrame(
    scaler.fit_transform(instance_embeddings_df),
    columns=instance_embeddings_df.columns,
)

# Preview the first 10 rows.
instance_embeddings_df_scaled.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.087731,0.277903,0.7597,0.485925,0.560251,0.331498,0.511577,0.895001,0.186125,0.021239,...,0.786301,0.784521,0.175918,0.261275,0.120776,0.149842,0.70722,0.073963,0.118308,0.24264
1,0.088619,0.765511,0.526727,0.937501,0.418205,0.47032,0.560032,0.876064,0.18982,0.048189,...,0.119303,0.115024,0.190432,0.405577,0.570463,0.233505,0.704495,0.065883,0.0964,0.609514
2,0.08292,0.822512,0.736786,0.111006,0.316577,0.496809,0.431075,0.854856,0.245189,0.927073,...,0.503926,0.505268,0.235693,0.222982,0.029142,0.182987,0.873533,0.062614,0.17256,0.355782
3,0.086887,0.066268,0.797063,0.535676,0.326835,0.344945,0.192328,0.852697,0.243907,0.928914,...,0.330218,0.325339,0.257048,0.559852,0.799197,0.381927,0.664958,0.034593,0.128111,0.173818
4,0.073741,0.413806,0.873537,0.93285,0.546094,0.46245,0.280683,0.87476,0.189649,0.047833,...,0.172923,0.177758,0.183226,0.292333,0.194696,0.161495,0.546637,0.066525,0.110153,0.687932
5,0.087676,0.51363,0.46871,0.931668,0.609619,0.472186,0.606641,0.886038,0.177697,0.953935,...,0.467199,0.456711,0.18254,0.44844,0.659525,0.254617,0.53443,0.063983,0.131897,0.560322
6,0.106753,0.215631,0.626726,0.170054,0.141912,0.313059,0.136931,0.757237,0.274975,0.091709,...,0.148626,0.138885,0.217225,0.251627,0.229588,0.138845,0.801344,0.013757,0.15822,0.69103
7,0.094153,0.234123,0.833924,0.502672,0.265791,0.322968,0.079261,0.886627,0.208919,0.051726,...,0.845006,0.843287,0.219029,0.315528,0.196037,0.176594,0.850339,0.059415,0.117221,0.293384
8,0.028994,0.50581,0.783947,0.94905,0.30578,0.490718,0.428125,0.892771,0.174033,0.956258,...,0.823193,0.82718,0.208896,0.684947,0.722911,0.448547,0.533798,0.044205,0.092703,0.111225
9,0.101464,0.311648,0.721656,0.513924,0.177892,0.490174,0.881745,0.869914,0.206978,0.037616,...,0.244843,0.235344,0.206333,0.477346,0.748639,0.285025,0.857734,0.051998,0.152203,0.389181


In [19]:
# Write the rescaled embeddings to CSV without the index column.
csv_file_path = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement_embeddings.csv'
instance_embeddings_df_scaled.to_csv(path_or_buf=csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/airbnb/Airbnb_Open_Data_Alignement_embeddings.csv
