In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# Load the aligned Airbnb table and replace every NaN with 0 in one step.
file_path_clean = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement.csv'
data_df = pd.read_csv(file_path_clean).fillna(0)

# Identify the columns that are still stored as Python objects (non-numeric).
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Label-encode each non-numeric column. Values are cast to str first so the
# 0 placeholders introduced by fillna() are encoded like any other category.
# The fitted encoders are kept so the integer codes can be mapped back later.
label_encoders = {}
for col in non_numeric_cols:
    le = label_encoders[col] = LabelEncoder()
    data_df[col] = le.fit_transform(data_df[col].astype(str))

# Sanity check: show the dtype of every column after encoding.
print("Data types after encoding:\n", data_df.dtypes)


Non-numeric columns: Index(['name', 'host_identity_verified', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'cancellation_policy', 'room_type', 'last_review',
       'house_rules'],
      dtype='object')
Data types after encoding:
 id                                  int64
name                                int64
host_id                             int64
host_identity_verified              int64
host_name                           int64
neighbourhood_group                 int64
neighbourhood                       int64
lat                               float64
long                              float64
instant_bookable                     bool
cancellation_policy                 int64
room_type                           int64
construction_year                 float64
price                             float64
service_fee                       float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                         int6

In [21]:
# Min-max normalise every column (MinMaxScaler, default feature range).
# Note: this is range normalisation, not zero-mean/unit-variance standardisation.
scaler = MinMaxScaler().fit(data_df)
data_scaled = scaler.transform(data_df)

# Convert the scaled matrix into a float tensor for PyTorch.
x = torch.tensor(data_scaled, dtype=torch.float)


In [22]:
# Display the min-max scaled feature matrix returned by the scaler.
data_scaled

array([[0.00000000e+00, 2.57453371e-01, 8.09927687e-01, ...,
        1.80722892e-02, 8.02820721e-02, 2.25202429e-01],
       [1.50444869e-05, 7.81987892e-01, 5.29316929e-01, ...,
        6.02409639e-03, 6.45511256e-02, 6.16902834e-01],
       [2.03845701e-05, 9.03281604e-01, 7.97911752e-01, ...,
        3.01204819e-03, 9.81828044e-02, 2.95546559e-01],
       ...,
       [9.03429953e-02, 2.80331587e-01, 6.98773960e-01, ...,
        3.01204819e-03, 9.54705723e-02, 1.04251012e-01],
       [9.03527884e-02, 1.63623309e-01, 1.11892169e-01, ...,
        3.01204819e-03, 1.07404394e-01, 1.04251012e-01],
       [9.03625993e-02, 5.86315497e-02, 6.89855618e-01, ...,
        3.01204819e-03, 2.14266341e-02, 1.04251012e-01]])

In [23]:


# Feature columns (one graph node each) and the hand-picked pairwise
# relations between them (one graph edge each).
columns = [
    'id',
    'name',
    'host_id',
    'host_identity_verified',
    'host_name',
    'neighbourhood_group',
    'neighbourhood',
    'lat',
    'long',
    'instant_bookable',
    'cancellation_policy',
    'room_type',
    'construction_year',
    'price',
    'service_fee',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'review_rate_number',
    'calculated_host_listings_count',
    'availability_365',
    'house_rules',
]

relations = [
    ('id', 'host_id'),
    ('host_id', 'host_identity_verified'),
    ('host_id', 'host_name'),
    ('neighbourhood_group', 'neighbourhood'),
    ('lat', 'long'),
    ('instant_bookable', 'cancellation_policy'),
    ('room_type', 'price'),
    ('price', 'service_fee'),
    ('minimum_nights', 'number_of_reviews'),
    ('number_of_reviews', 'reviews_per_month'),
    ('reviews_per_month', 'review_rate_number'),
    ('review_rate_number', 'calculated_host_listings_count'),
    ('calculated_host_listings_count', 'availability_365'),
    ('availability_365', 'house_rules'),
]

# Undirected NetworkX graph: nodes are inserted in `columns` order, then one
# edge is added per relation.
G = nx.Graph()
G.add_nodes_from(columns)
G.add_edges_from(relations)

# Convert the NetworkX graph into a PyTorch Geometric Data object.
data = from_networkx(G)

# Attach the scaled feature matrix as node features.
# NOTE(review): `x` has one row per CSV record, whereas the graph has one node
# per feature column, so x.shape[0] != num_nodes (the printed summary below
# confirms this). Confirm this is the intended training input.
data.x = x

# Position of each feature column, used to express the relations as integers.
feature_to_index = {name: position for position, name in enumerate(columns)}
edges = [(feature_to_index[a], feature_to_index[b]) for a, b in relations]

# Replace the converted edge_index with a [2, num_relations] tensor holding
# one (src -> dst) pair per relation.
# NOTE(review): only one direction per relation is stored here although G is
# undirected; verify whether a symmetric edge_index was intended.
edge_index = torch.tensor(edges, dtype=torch.long).transpose(0, 1).contiguous()
data.edge_index = edge_index

print(data)


Data(edge_index=[2, 14], num_nodes=23, x=[102599, 23])


In [24]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    """Two GCNConv layers, each followed by ReLU, and a final linear projection.

    Args:
        num_features: Size of each input node feature vector.
        hidden_channels: Width of both graph-convolution layers.
        output_channels: Size of the per-node output vector.
    """

    def __init__(self, num_features, hidden_channels, output_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        """Return one output vector per row of ``data.x``.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        return self.fc(hidden)

# Build the model. The output width is tied to the input width so the network
# emits a vector of the same size as the node features it receives.
hidden_channels = 128
num_features = data.num_features
output_channels = num_features
model = GCN(
    num_features=num_features,
    hidden_channels=hidden_channels,
    output_channels=output_channels,
)

# Show the layer structure.
print(model)

GCN(
  (conv1): GCNConv(23, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=23, bias=True)
)


In [25]:
from torch_geometric.loader import DataLoader

# Loss and optimiser: mean-squared error, minimised with Adam.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Data loader over the single graph object (one batch per epoch).
loader = DataLoader([data], batch_size=1, shuffle=True)

# One training epoch.
def train():
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``loader``, ``optimizer`` and
    ``criterion``. The target of the loss is the batch's own ``x``, i.e. the
    model is trained to reproduce its input features.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        optimizer.zero_grad()
        loss = criterion(model(batch), batch.x)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)

# Training loop: 200 epochs, reporting the loss on every 10th epoch.
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 != 0:
        continue
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0605
Epoch 20, Loss: 0.0421
Epoch 30, Loss: 0.0274
Epoch 40, Loss: 0.0158
Epoch 50, Loss: 0.0077
Epoch 60, Loss: 0.0045
Epoch 70, Loss: 0.0025
Epoch 80, Loss: 0.0012
Epoch 90, Loss: 0.0010
Epoch 100, Loss: 0.0008
Epoch 110, Loss: 0.0007
Epoch 120, Loss: 0.0011
Epoch 130, Loss: 0.0010
Epoch 140, Loss: 0.0006
Epoch 150, Loss: 0.0004
Epoch 160, Loss: 0.0003
Epoch 170, Loss: 0.0013
Epoch 180, Loss: 0.0008
Epoch 190, Loss: 0.0003
Epoch 200, Loss: 0.0003


In [26]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model):
    """Compute one embedding per row of ``data_scaled`` using ``model``.

    For every row, each node of the feature graph receives a copy of the whole
    row as its feature vector (``x`` has shape ``[len(columns), row_length]``).
    The model is run on that graph and its per-node outputs are averaged into
    a single vector for the row.

    The graph structure (the PyG object converted from ``G`` and the
    ``edge_index`` derived from the notebook-level ``columns`` and
    ``relations``) is the same for every row, so it is built once up front
    instead of once per row.

    Args:
        data_scaled: Indexable 2-D collection of numeric rows (e.g. the NumPy
            array produced by the scaler); ``data_scaled[i]`` is one instance.
        G: NetworkX graph whose nodes are the feature columns.
        model: Module called as ``model(data)`` with a PyG ``Data`` object;
            expected to return one output row per node.

    Returns:
        Tensor with one row per instance, rows in the same order as
        ``data_scaled``.
    """
    model.eval()

    # --- Loop-invariant graph structure, built exactly once ---------------
    num_nodes = len(columns)
    feature_to_index = {col: idx for idx, col in enumerate(columns)}
    edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    # from_networkx() does not modify G, so no defensive copy is required.
    graph_data = from_networkx(G)
    graph_data.edge_index = edge_index

    instance_embeddings_list = []

    # Inference only: disable autograd for the entire pass.
    with torch.no_grad():
        for i in range(len(data_scaled)):
            row = data_scaled[i]

            # Broadcast the row to every node: shape [num_nodes, row_length].
            graph_data.x = (
                torch.tensor(row, dtype=torch.float).view(1, -1).repeat(num_nodes, 1)
            )

            node_embeddings = model(graph_data)

            # Mean over nodes -> one [1, out_dim] embedding for this instance.
            instance_embeddings_list.append(node_embeddings.mean(dim=0, keepdim=True))

    return torch.cat(instance_embeddings_list, dim=0)

# Compute the embedding of every instance (row) with the trained model.
instance_embeddings = aggregate_instance_embeddings(
    data_scaled=data_scaled,
    G=G,
    model=model,
)
print(instance_embeddings)


tensor([[ 0.0047,  0.2794,  0.8602,  ...,  0.0282,  0.0540,  0.2287],
        [-0.0089,  0.8593,  0.5658,  ...,  0.0107,  0.0514,  0.6618],
        [ 0.0034,  0.9858,  0.8612,  ...,  0.0099,  0.0543,  0.3110],
        ...,
        [ 0.0931,  0.3086,  0.7573,  ...,  0.0103,  0.0510,  0.0948],
        [ 0.0894,  0.1736,  0.1056,  ...,  0.0087,  0.0575,  0.0962],
        [ 0.0902,  0.0934,  0.7412,  ...,  0.0252,  0.0491,  0.0952]])


In [27]:
# Check the dimensions of the embedding tensor.
instance_embeddings.shape

torch.Size([102599, 23])

In [28]:
# Wrap the embedding tensor in a pandas DataFrame (default integer column labels).
instance_embeddings_df = pd.DataFrame(data=instance_embeddings.numpy())

# Preview the first rows.
instance_embeddings_df.head(n=20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.004669,0.279394,0.860157,0.531713,0.634393,0.328385,0.552324,1.071158,0.000248,0.003147,...,0.875447,0.873805,0.195456,0.006534,0.130678,-0.001579,0.859134,0.028155,0.053952,0.228661
1,-0.008931,0.859276,0.565838,1.087309,0.471292,0.476134,0.626673,1.071081,0.001595,-0.009376,...,0.120113,0.119893,0.192478,0.040418,0.664915,0.013361,0.870009,0.010716,0.051408,0.661755
2,0.003408,0.985769,0.861161,-0.003685,0.296069,0.47644,0.449198,1.078434,0.002118,1.108334,...,0.560351,0.556848,0.193733,0.009207,-0.015056,0.001212,1.087792,0.009851,0.054316,0.310983
3,-0.001735,0.038876,0.886874,0.576978,0.395586,0.348735,0.22067,1.065774,0.000116,1.07523,...,0.340244,0.358067,0.188332,0.120925,0.879054,0.025731,0.836938,0.002478,0.06208,0.089152
4,-0.000857,0.425645,1.002984,1.070894,0.629308,0.463955,0.28639,1.076959,0.003236,-0.000813,...,0.19787,0.1975,0.196546,0.026695,0.208847,0.004441,0.642283,0.011976,0.053935,0.744302
5,-0.009214,0.55235,0.49004,1.089233,0.701196,0.479724,0.676744,1.073332,0.000361,1.091208,...,0.515995,0.520987,0.192154,0.05322,0.771537,0.017268,0.649119,0.004795,0.056195,0.610582
6,0.050353,0.251456,0.609053,0.079637,0.105221,0.344269,0.138505,1.089385,-0.005729,0.094046,...,0.113871,0.085257,0.197436,-0.00099,0.246527,0.009004,0.977611,0.000563,0.029648,0.798535
7,0.081377,0.17578,0.895941,0.509204,0.30064,0.313311,0.143351,1.079645,-0.00801,0.0985,...,0.928344,0.923399,0.202871,-0.010216,0.227911,-0.007631,1.032765,0.01897,0.059298,0.27408
8,-0.00716,0.547584,0.870228,1.100039,0.317936,0.495681,0.461378,1.089088,-0.00547,1.090269,...,0.913964,0.941227,0.184518,0.164265,0.786815,0.038932,0.640678,-0.034247,0.067603,0.056526
9,0.01036,0.346764,0.84434,0.545552,0.179131,0.491422,0.988388,1.086072,-0.003094,-0.011486,...,0.261932,0.267082,0.189675,0.057806,0.854495,0.026046,1.085899,-0.000846,0.0585,0.394819


In [29]:
# Min-max normalise the embeddings column by column and keep the result as a
# DataFrame with the same column labels.
# Note: this rebinds `scaler`, so the scaler fitted on the raw features
# earlier in the notebook is no longer reachable under that name.
scaler = MinMaxScaler()
instance_embeddings_df_scaled = pd.DataFrame(
    scaler.fit_transform(instance_embeddings_df),
    columns=instance_embeddings_df.columns,
)

instance_embeddings_df_scaled.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.040279,0.33281,0.7847,0.465053,0.566194,0.299502,0.481667,0.810724,0.278606,0.069877,...,0.746853,0.76473,0.8534,0.180652,0.151258,0.217388,0.719463,0.108854,0.474487,0.256196
1,0.028789,0.796646,0.532262,0.929377,0.437826,0.444993,0.538531,0.810654,0.289236,0.060066,...,0.11479,0.114642,0.835256,0.250801,0.608739,0.319657,0.728311,0.094298,0.453887,0.630009
2,0.039213,0.897824,0.785561,0.017609,0.299918,0.445295,0.402793,0.817303,0.293359,0.935692,...,0.483181,0.491422,0.842903,0.186187,0.026463,0.236495,0.905514,0.093576,0.477438,0.32725
3,0.034868,0.140425,0.807614,0.502882,0.378243,0.319542,0.228008,0.805855,0.277563,0.909758,...,0.298995,0.320016,0.809995,0.417467,0.79211,0.404336,0.701402,0.087422,0.540295,0.135783
4,0.03561,0.449793,0.907202,0.915658,0.562192,0.433001,0.278272,0.815969,0.302182,0.066775,...,0.179857,0.181561,0.860041,0.222391,0.218197,0.258599,0.543019,0.09535,0.47435,0.701256
5,0.02855,0.551142,0.46725,0.930984,0.618771,0.448528,0.576827,0.81269,0.279494,0.922276,...,0.446064,0.4605,0.833283,0.277302,0.700041,0.346401,0.548581,0.089356,0.492645,0.58584
6,0.078874,0.310463,0.569327,0.087243,0.149713,0.315144,0.165165,0.827207,0.231448,0.141088,...,0.109567,0.084776,0.865465,0.165078,0.250463,0.289832,0.815863,0.085824,0.277715,0.748066
7,0.105084,0.249932,0.815391,0.446241,0.303516,0.284659,0.168872,0.818398,0.213456,0.144578,...,0.791117,0.807494,0.898577,0.145978,0.234522,0.175958,0.86074,0.101187,0.51777,0.295399
8,0.030285,0.54733,0.793337,0.940015,0.317128,0.464242,0.412109,0.826938,0.233492,0.92154,...,0.779084,0.822866,0.786758,0.507189,0.713125,0.494704,0.541712,0.056771,0.585009,0.107624
9,0.045086,0.386698,0.771133,0.476618,0.207883,0.460048,0.815182,0.824211,0.252244,0.058414,...,0.233465,0.241561,0.818178,0.286797,0.77108,0.406495,0.903973,0.084648,0.511307,0.399611


In [30]:
# Persist the rescaled embeddings as CSV (row index omitted).
csv_file_path = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement_embeddings.csv'
instance_embeddings_df_scaled.to_csv(path_or_buf=csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/airbnb/Airbnb_Open_Data_Alignement_embeddings.csv
