In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
# Load the data
file_path_clean = '/home/sdong/data/credit_card/application_record_string.csv'
#file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_raw.csv'
data_df = pd.read_csv(file_path_clean)

# Randomly keep one fifth of the rows (frac=0.2, i.e. 20%); the fixed
# random_state makes the sample reproducible
data_df = data_df.sample(frac=0.2, random_state=42)

print(data_df.head())
# Replace every NaN with 0 (assignment instead of inplace so the step is explicit)
data_df = data_df.fillna(0)
# Find the non-numeric (object dtype) columns
non_numeric_cols = data_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Label-encode each non-numeric column; the fitted encoders are kept per column
# so the integer codes can be mapped back to the original strings later
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    data_df[col] = le.fit_transform(data_df[col].astype(str))
    label_encoders[col] = le

# Check that every feature column is numeric now.
# DataFrame.info() writes its report itself and returns None, so it must not be
# passed to print() (that would append a stray "None" after the header).
print("Data types after encoding:")
data_df.info()

             ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
284481  6127905           F            N               Y             0   
203727  6508890           F            N               Y             1   
233631  5996425           M            N               Y             1   
7505    5028725           M            N               Y             0   
192047  5878879           F            Y               Y             0   

        AMT_INCOME_TOTAL  NAME_INCOME_TYPE NAME_EDUCATION_TYPE  \
284481           90000.0  Low sesffcondary   secondary special   
203727           81000.0       High edtion   secondary special   
233631          189000.0    High education           Secondary   
7505            360000.0        Seconfdary           Secondary   
192047          112500.0     Low secondary   secondary special   

       NAME_FAMILY_STATUS      NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
284481          Separated  secondasfsry spfecial      -23010         365243   


In [13]:
# Display the column labels of data_df
data_df.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'CNT_FAM_MEMBERS'],
      dtype='object')

In [14]:
# Rescale the columns of data_df with min-max scaling
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_df)

# Wrap the scaled matrix in a float32 torch tensor
x = torch.tensor(data_scaled, dtype=torch.float32)


In [15]:


# Column labels of the tabular data; each label becomes one graph node
columns = data_df.columns

# Hand-written pairs of related features; each pair becomes one edge
relations = [
    ('ID', 'CODE_GENDER'),
    ('ID', 'DAYS_BIRTH'),
    ('ID', 'FLAG_MOBIL'),
    ('CODE_GENDER', 'FLAG_OWN_CAR'),
    ('CODE_GENDER', 'FLAG_OWN_REALTY'),
    ('CODE_GENDER', 'CNT_CHILDREN'),
    ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'),
    ('CNT_CHILDREN', 'NAME_FAMILY_STATUS'),
    ('CNT_CHILDREN', 'CNT_FAM_MEMBERS'),
    ('NAME_FAMILY_STATUS', 'CNT_FAM_MEMBERS'),
    ('AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE'),
    ('AMT_INCOME_TOTAL', 'DAYS_EMPLOYED'),
    ('NAME_INCOME_TYPE', 'DAYS_EMPLOYED'),
    ('NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE'),
    ('NAME_HOUSING_TYPE', 'FLAG_OWN_REALTY'),
    ('NAME_HOUSING_TYPE', 'AMT_INCOME_TOTAL'),
    ('NAME_EDUCATION_TYPE', 'AMT_INCOME_TOTAL'),
    ('NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE'),
    ('NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE'),
    ('DAYS_BIRTH', 'DAYS_EMPLOYED'),
    ('FLAG_MOBIL', 'FLAG_WORK_PHONE'),
    ('FLAG_MOBIL', 'FLAG_PHONE'),
    ('FLAG_MOBIL', 'FLAG_EMAIL'),
    ('FLAG_WORK_PHONE', 'FLAG_PHONE'),
    ('FLAG_WORK_PHONE', 'FLAG_EMAIL'),
    ('FLAG_PHONE', 'FLAG_EMAIL')
]

# Undirected NetworkX graph: one node per feature column, one edge per relation
G = nx.Graph()
G.add_nodes_from(columns)
G.add_edges_from(relations)

# Convert to a PyTorch Geometric Data object and attach the feature matrix.
# NOTE(review): x holds one row per data sample, while the graph has one node
# per column, so the number of rows in x differs from num_nodes - confirm that
# this is intended.
data = from_networkx(G)
data.x = x

# Column label -> integer position
feature_to_index = {name: position for position, name in enumerate(columns)}

# Every relation expressed as a pair of integer node indices
edges = [(feature_to_index[first], feature_to_index[second]) for first, second in relations]

# Overwrite the edge_index produced by from_networkx with a [2, num_edges]
# tensor that lists every relation once, in the (src, dst) order written above.
# NOTE(review): this stores a single direction per relation; presumably the
# graph was meant to be undirected - verify whether reverse edges are needed.
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data.edge_index = edge_index

# Pick the device (GPU when available, otherwise CPU) and move the graph there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

print(data)


Data(edge_index=[2, 26], num_nodes=17, x=[87711, 17])


In [10]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    """Two GCNConv layers, each followed by ReLU, then a linear output layer.

    Args:
        num_features: Input size of the first graph convolution.
        hidden_channels: Output size of both graph convolutions and input
            size of the linear layer.
        output_channels: Output size of the linear layer.
    """

    def __init__(self, num_features, hidden_channels, output_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, output_channels)

    def forward(self, data):
        """Apply conv1 -> ReLU -> conv2 -> ReLU -> fc to ``data.x``.

        Args:
            data: Object exposing ``x`` and ``edge_index`` attributes.

        Returns:
            The output of the final linear layer.
        """
        node_features, edge_index = data.x, data.edge_index
        hidden = F.relu(self.conv1(node_features, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        return self.fc(hidden)

# Build the model
num_features = data.num_features
hidden_channels = 128
output_channels = num_features  # output width is kept equal to the input width
model = GCN(
    num_features=num_features,
    hidden_channels=hidden_channels,
    output_channels=output_channels,
).to(device)

# Print the model structure
print(model)

GCN(
  (conv1): GCNConv(17, 128)
  (conv2): GCNConv(128, 128)
  (fc): Linear(in_features=128, out_features=17, bias=True)
)


In [11]:
from torch_geometric.loader import DataLoader

# Data loader over a one-element dataset: the single `data` graph
loader = DataLoader([data], shuffle=True, batch_size=1280)

# Loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
def train():
    """Run one pass over ``loader`` and return the mean loss per batch.

    Uses the module-level ``model``, ``loader``, ``optimizer``, ``criterion``
    and ``device``. The loss compares the model output with the batch's own
    ``x``.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction = model(batch)
        loss = criterion(reconstruction, batch.x)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)

# Training loop: epochs 1..200, printing the loss on every 10th epoch
for epoch in range(1, 201):
    loss = train()
    if epoch % 10:
        continue
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 10, Loss: 0.0438
Epoch 20, Loss: 0.0140
Epoch 30, Loss: 0.0065
Epoch 40, Loss: 0.0035
Epoch 50, Loss: 0.0019
Epoch 60, Loss: 0.0012
Epoch 70, Loss: 0.0008
Epoch 80, Loss: 0.0006
Epoch 90, Loss: 0.0009
Epoch 100, Loss: 0.0005
Epoch 110, Loss: 0.0004
Epoch 120, Loss: 0.0004
Epoch 130, Loss: 0.0004
Epoch 140, Loss: 0.0003
Epoch 150, Loss: 0.0003
Epoch 160, Loss: 0.0003
Epoch 170, Loss: 0.0024
Epoch 180, Loss: 0.0004
Epoch 190, Loss: 0.0003
Epoch 200, Loss: 0.0002


In [16]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def aggregate_instance_embeddings(data_scaled, G, model):
    """Compute one embedding per row of ``data_scaled`` by mean-pooling node outputs.

    For every row, the row is copied onto each of the ``len(columns)`` nodes,
    the model is run on the resulting graph, and the per-node outputs are
    averaged (over dim 0) into a single vector.

    Relies on the module-level names ``columns``, ``relations`` and ``device``.

    Args:
        data_scaled: Indexable collection of rows; ``data_scaled[i]`` must be
            accepted by ``torch.tensor``.
        G: NetworkX graph passed to ``from_networkx`` to create the graph object.
        model: Callable that receives an object with ``x`` and ``edge_index``
            attributes and returns the node embeddings.

    Returns:
        The per-row mean embeddings concatenated along dim 0 (one row per
        entry of ``data_scaled``).
    """
    model.eval()

    # The graph object and its edges are identical for every row, so they are
    # built once here instead of once per row.
    graph = from_networkx(G)
    feature_to_index = {col: idx for idx, col in enumerate(columns)}
    edges = [(feature_to_index[src], feature_to_index[dst]) for src, dst in relations]
    graph.edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous().to(device)
    num_nodes = len(columns)

    instance_embeddings_list = []
    with torch.no_grad():
        for i in range(len(data_scaled)):
            row = torch.tensor(data_scaled[i], dtype=torch.float)

            # Every node gets a copy of the whole row: x is [num_nodes, len(row)]
            graph.x = row.view(1, -1).repeat(num_nodes, 1).to(device)

            node_embeddings = model(graph)

            # Average over the nodes; keepdim keeps a leading axis of size 1
            # so the per-row results can be concatenated below
            instance_embeddings_list.append(node_embeddings.mean(dim=0, keepdim=True))

    return torch.cat(instance_embeddings_list, dim=0)

# Compute the embedding of every instance and print the resulting tensor
instance_embeddings = aggregate_instance_embeddings(data_scaled=data_scaled, G=G, model=model)
print(instance_embeddings)


tensor([[ 4.6121e-01,  3.0720e-02,  5.7163e-02,  ..., -7.7568e-02,
         -1.3338e-02,  1.5783e-02],
        [ 6.1731e-01, -5.2372e-03,  1.6966e-02,  ..., -2.2888e-02,
          6.1607e-02,  1.5378e-01],
        [ 4.3634e-01,  1.2728e+00, -1.0398e-02,  ...,  2.5052e-02,
          2.0377e-02,  1.7452e-01],
        ...,
        [ 4.9952e-01,  1.2684e+00,  1.2374e+00,  ..., -5.0498e-03,
         -8.9565e-03,  2.7761e-01],
        [ 3.5327e-01,  1.2594e+00,  1.2185e+00,  ...,  1.2464e+00,
          1.1206e-02, -3.1460e-02],
        [ 3.8206e-01,  9.0300e-04, -1.1459e-02,  ...,  1.2050e+00,
          2.0539e-02,  2.4681e-01]], device='cuda:0')


In [17]:
# Display the shape of the instance embedding tensor
instance_embeddings.shape

torch.Size([87711, 17])

In [18]:
# Copy the embeddings to host memory as a NumPy array, then wrap it in a DataFrame
embeddings_array = instance_embeddings.cpu().numpy()
instance_embeddings_df = pd.DataFrame(embeddings_array)

instance_embeddings_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.461209,0.03072,0.057163,1.29955,0.016376,0.032741,0.440937,1.216574,0.69013,1.174628,0.059692,1.255372,0.003491,-0.032661,-0.077568,-0.013338,0.015783
1,0.617313,-0.005237,0.016966,1.25204,0.123216,0.027827,0.06547,1.233238,0.325361,1.069075,0.56154,0.06138,-0.013161,1.271114,-0.022888,0.061607,0.153778
2,0.436341,1.272836,-0.010398,1.217566,0.127716,0.046084,0.144033,1.051804,0.297363,1.276768,0.834105,0.036984,-0.011628,-0.003888,0.025052,0.020377,0.17452
3,0.012216,1.306369,-0.016039,1.226419,0.005649,0.050405,1.14005,1.064812,0.308886,1.260318,0.900862,0.015015,-0.00445,0.003828,-0.018275,0.026666,0.067881
4,0.409959,-0.033269,1.294283,1.280013,0.016159,0.024662,0.333958,1.233323,0.312768,0.96558,0.258461,1.256214,0.00902,-0.023095,-0.079765,-0.012743,0.055093


In [19]:
# Min-max scale the embedding columns and put the result straight back into a
# DataFrame that keeps the original column labels
scaler = MinMaxScaler()
instance_embeddings_df_scaled = pd.DataFrame(
    scaler.fit_transform(instance_embeddings_df),
    columns=instance_embeddings_df.columns,
)

instance_embeddings_df_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.364921,0.083284,0.147666,0.901936,0.05867,0.130946,0.377387,0.909268,0.532172,0.865565,0.144687,0.896476,0.485819,0.069077,0.035219,0.081347,0.085553
1,0.475041,0.059149,0.121396,0.871848,0.150307,0.106804,0.113298,0.920403,0.276319,0.790601,0.502285,0.080758,0.330056,0.921658,0.072587,0.130436,0.198479
2,0.347379,0.916978,0.103514,0.850015,0.154167,0.196509,0.168557,0.799178,0.256681,0.938104,0.696504,0.06409,0.344398,0.087893,0.105348,0.10343,0.215453
3,0.048191,0.939485,0.099827,0.855622,0.04947,0.217738,0.869116,0.807869,0.264763,0.926422,0.744073,0.049082,0.411541,0.092939,0.075739,0.10755,0.128187
4,0.328768,0.040335,0.956151,0.889563,0.058484,0.091249,0.302142,0.920459,0.267487,0.717098,0.286323,0.897051,0.537539,0.075333,0.033717,0.081737,0.117722


In [20]:
# Write the scaled embeddings to a CSV file, leaving out the row index
csv_file_path = '/home/sdong/data/credit_card/application_record_string_embeddings.csv'
instance_embeddings_df_scaled.to_csv(path_or_buf=csv_file_path, index=False)

print(f'Instance embeddings saved to {csv_file_path}')

Instance embeddings saved to /home/sdong/data/credit_card/application_record_string_embeddings.csv
