## 数据预处理

In [44]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import torch
import numpy as np

# 加载数据
file_path_clean = '/home/sdong/data/airbnb/airbnb_nyc_clean.csv'
file_path_dirty = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement.csv'
data_clean = pd.read_csv(file_path_clean)
data_dirty = pd.read_csv(file_path_dirty)

# 填充缺失值
data_clean.fillna(0, inplace=True)
data_dirty.fillna(0, inplace=True)


data_clean.head()

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,...,966.0,193.0,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,False,...,142.0,28.0,13.0,45.0,2022-05-21,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,unconfirmed,Elise,Manhattan,Harlem,40.80902,-73.9419,True,...,620.0,124.0,3.0,0.0,2019-06-14,0.79,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and..."
3,1002755,blank,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,True,...,368.0,74.0,13.0,270.0,2019-07-05,4.64,4.0,1.0,322.0,blank
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,False,...,204.0,41.0,10.0,9.0,2018-11-19,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th..."


In [45]:
# 识别非数值列
non_numeric_cols = data_clean.select_dtypes(include=['object']).columns

# 对非数值列进行标签编码
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    # 合并干净和脏的数据
    combined_data = pd.concat([data_clean[col], data_dirty[col]], axis=0)
    le.fit(combined_data.astype(str))
    # 对干净和脏的数据分别进行转换
    data_clean[col] = le.transform(data_clean[col].astype(str))
    data_dirty[col] = le.transform(data_dirty[col].astype(str))
    label_encoders[col] = le

# 确保所有特征都是数值类型
print("Data types after encoding:\n", data_clean.dtypes)


Data types after encoding:
 id                                  int64
name                                int64
host_id                             int64
host_identity_verified              int64
host_name                           int64
neighbourhood_group                 int64
neighbourhood                       int64
lat                               float64
long                              float64
instant_bookable                     bool
cancellation_policy                 int64
room_type                           int64
construction_year                 float64
price                             float64
service_fee                       float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                         int64
reviews_per_month                 float64
review_rate_number                float64
calculated_host_listings_count    float64
availability_365                  float64
house_rules                         int64
dtype:

In [46]:
data_clean = data_clean.astype(np.float64)
data_dirty = data_dirty.astype(np.float64)

In [47]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 分离数值型特征和类别型特征
numeric_cols = data_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = list(set(data_clean.columns) - set(numeric_cols))

# 初始化标准化器
scaler = MinMaxScaler()

# 在干净和脏数据的组合上拟合标准化器
combined_numeric_data = pd.concat([data_clean[numeric_cols], data_dirty[numeric_cols]], axis=0)
scaler.fit(combined_numeric_data)

# 对干净和脏数据进行标准化
data_clean[numeric_cols] = scaler.transform(data_clean[numeric_cols])
data_dirty[numeric_cols] = scaler.transform(data_dirty[numeric_cols])

In [48]:
print("Data types of data_clean:")
print(data_clean.dtypes)

print("Data types of data_dirty:")
print(data_dirty.dtypes)

Data types of data_clean:
id                                float64
name                              float64
host_id                           float64
host_identity_verified            float64
host_name                         float64
neighbourhood_group               float64
neighbourhood                     float64
lat                               float64
long                              float64
instant_bookable                  float64
cancellation_policy               float64
room_type                         float64
construction_year                 float64
price                             float64
service_fee                       float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                       float64
reviews_per_month                 float64
review_rate_number                float64
calculated_host_listings_count    float64
availability_365                  float64
house_rules                       float64
dtype: o

In [49]:
data_clean.head()

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules
0,0.0,0.257449,0.809928,0.5,0.562884,0.285714,0.518828,0.993414,0.003737,0.0,...,0.805,0.804167,0.179528,0.008789,0.669832,0.002333,0.8,0.018072,0.080282,0.225089
1,1.5e-05,0.781975,0.529317,1.0,0.416344,0.428571,0.606695,0.996008,0.003583,0.0,...,0.118333,0.116667,0.179965,0.043945,0.698121,0.004222,0.8,0.006024,0.064551,0.616591
2,2e-05,0.903267,0.797912,0.5,0.253051,0.428571,0.456067,0.997362,0.004147,1.0,...,0.516667,0.516667,0.178509,0.0,0.546777,0.008778,1.0,0.003012,0.098183,0.295397
3,2.7e-05,0.976943,0.861467,0.5,0.308013,0.285714,0.200837,0.994334,0.003907,1.0,...,0.306667,0.308333,0.179965,0.263672,0.55102,0.051556,0.8,0.003012,0.090046,0.975721
4,4.3e-05,0.387128,0.931817,1.0,0.558108,0.428571,0.297071,0.997105,0.004119,0.0,...,0.17,0.170833,0.179528,0.008789,0.50495,0.001111,0.6,0.003012,0.081096,0.692463


## 特征图构建

In [8]:
import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
import torch
import numpy as np
import pandas as pd

# 定义特征列和关系
columns = [
    'id', 'name', 'host_id', 'host_identity_verified', 'host_name',
    'neighbourhood_group', 'neighbourhood', 'lat', 'long',
    'instant_bookable', 'cancellation_policy', 'room_type',
    'construction_year', 'price', 'service_fee', 'minimum_nights',
    'number_of_reviews', 'last_review', 'reviews_per_month',
    'review_rate_number', 'calculated_host_listings_count', 'availability_365',
    'house_rules'
]
relations = [
    ('id', 'host_id'),
    ('host_id', 'host_identity_verified'),
    ('host_id', 'host_name'),
    ('neighbourhood_group', 'neighbourhood'),
    ('lat', 'long'),
    ('instant_bookable', 'cancellation_policy'),
    ('room_type', 'price'),
    ('price', 'service_fee'),
    ('minimum_nights', 'number_of_reviews'),
    ('number_of_reviews', 'reviews_per_month'),
    ('reviews_per_month', 'review_rate_number'),
    ('review_rate_number', 'calculated_host_listings_count'),
    ('calculated_host_listings_count', 'availability_365'),
    ('availability_365', 'house_rules')
]

# 创建特征名称到索引的映射
feature_to_index = {col: idx for idx, col in enumerate(columns)}
index_to_feature = {idx: col for idx, col in enumerate(columns)}

# 创建空的无向图
G = nx.Graph()

# 添加节点（使用索引作为节点）
for idx in range(len(columns)):
    G.add_node(idx)

# 添加边（将特征名称映射到索引）
for src, dst in relations:
    src_idx = feature_to_index[src]
    dst_idx = feature_to_index[dst]
    G.add_edge(src_idx, dst_idx)

# 将 NetworkX 图转换为 PyTorch Geometric 图
data = Data()
data.edge_index = torch.tensor(list(G.edges())).t().contiguous()


## 新的GNN模型 编码器设计

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GINConv

class GAT_GIN_Encoder(nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(GAT_GIN_Encoder, self).__init__()
        # 第一层：GATConv
        self.gat_conv1 = GATConv(num_features, hidden_channels, heads=8, concat=False)
        # 第二层：GINConv
        nn1 = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, hidden_channels)
        )
        self.gin_conv1 = GINConv(nn1)
        # 第三层：GATConv
        self.gat_conv2 = GATConv(hidden_channels, hidden_channels, heads=8, concat=False)
        # 第四层：GINConv
        nn2 = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, hidden_channels)
        )
        self.gin_conv2 = GINConv(nn2)

    def forward(self, x, edge_index):
        x = F.relu(self.gat_conv1(x, edge_index))
        x = F.relu(self.gin_conv1(x, edge_index))
        x = F.relu(self.gat_conv2(x, edge_index))
        x = F.relu(self.gin_conv2(x, edge_index))
        return x


### 解码器设计

In [10]:
class MultiTaskDecoder(nn.Module):
    def __init__(self, hidden_channels, num_features):
        super(MultiTaskDecoder, self).__init__()
        # 数据质量验证解码器
        self.decoder_validation = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, num_features)
        )
        # 数据修复解码器
        self.decoder_repair = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, num_features)
        )

    def forward(self, x):
        # 数据质量验证输出
        out_validation = self.decoder_validation(x)
        # 数据修复输出
        out_repair = self.decoder_repair(x)
        return out_validation, out_repair


### 整合模型

In [11]:
class MultiTaskGNN(nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(MultiTaskGNN, self).__init__()
        self.encoder = GAT_GIN_Encoder(num_features, hidden_channels)
        self.decoder = MultiTaskDecoder(hidden_channels, num_features)

    def forward(self, data):
        x = data.x  # x 的形状应为 [num_nodes, num_node_features]
        edge_index = data.edge_index
        x = self.encoder(x, edge_index)
        out_validation, out_repair = self.decoder(x)
        return out_validation, out_repair



## 准备数据

### 准备图数据

In [12]:
# 创建节点特征矩阵
num_nodes = len(columns)
num_features = num_nodes  # 每个节点的特征维度

def create_node_features(instance):
    # 确保实例中的特征按照 columns 列表的顺序排列
    values = instance[columns].values.astype(np.float32)
    # 将数据转换为张量，形状为 [num_nodes, num_node_features]
    node_features = torch.tensor(values, dtype=torch.float).view(-1, 1)
    return node_features


# 创建 PyTorch Geometric 数据对象
def create_data_object(instance):
    node_features = create_node_features(instance)
    data_instance = Data()
    data_instance.x = node_features
    data_instance.edge_index = data.edge_index
    data_instance.num_nodes = data.num_nodes
    return data_instance



### 数据集划分

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设 data_clean 和 data_dirty 已经经过预处理，并且所有列都是数值类型

# 将 data_clean 随机打乱，并划分为训练集和临时集（50% / 50%）
train_data, temp_data = train_test_split(data_clean, test_size=0.5, random_state=42)

# 将临时集再划分为验证集和测试集1（各占25%）
val_data, test_data_1 = train_test_split(temp_data, test_size=0.5, random_state=42)

# data_dirty 作为测试集2
test_data_2 = data_dirty  # 已经预处理好的脏数据


### 创建数据加载器

In [14]:
from torch.utils.data import Dataset, DataLoader
import os

class GraphDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        instance = self.dataframe.iloc[idx]
        data_instance = create_data_object(instance)
        return data_instance

# 创建数据集对象
train_dataset = GraphDataset(train_data)
val_dataset = GraphDataset(val_data)
test_dataset_1 = GraphDataset(test_data_1)
test_dataset_2 = GraphDataset(test_data_2)



## 训练模型

### 定义损失函数

In [15]:
# def loss_function(out_validation, out_repair, target, lambda_validation=1.0, lambda_repair=1.0):
#     # 数据质量验证损失
#     loss_validation = F.mse_loss(out_validation, target)
#     # 数据修复损失
#     loss_repair = F.mse_loss(out_repair, target)
#     # 总损失
#     loss = lambda_validation * loss_validation + lambda_repair * loss_repair
#     return loss
# 定义损失函数，包含验证损失和修复损失
def loss_function(out_validation, out_repair, target, weights, lambda_validation=1.0, lambda_repair=1.0):
    # 数据质量验证损失，使用加权 MSE Loss
    loss_validation = (weights * F.mse_loss(out_validation, target, reduction='none').sum(dim=1)).mean()
    
    # 数据修复损失，普通的 MSE Loss
    loss_repair = F.mse_loss(out_repair, target)
    
    # 总损失
    loss = lambda_validation * loss_validation + lambda_repair * loss_repair
    return loss

### 5.2 训练循环

In [16]:
from torch_geometric.loader import DataLoader
import warnings

# 抑制所有警告
warnings.filterwarnings("ignore")

# 创建数据加载器
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader_1 = DataLoader(test_dataset_1, batch_size=batch_size, shuffle=False)
test_loader_2 = DataLoader(test_dataset_2, batch_size=batch_size, shuffle=False)

# 初始化模型、优化器
num_features = 1  # 因为每个节点只有一个特征
hidden_channels = 64
model = MultiTaskGNN(num_features, hidden_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 定义保存模型的路径
save_path = '/home/sdong/experiments/GVAE/model/multitask_gnn_model.pth'
# 定义checkpoint文件的路径
checkpoint_path = '/home/sdong/experiments/GVAE/model/checkpoint.pth'

# 加载已保存的checkpoint
if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1  # 从中断的下一轮开始
    loss = checkpoint['loss']

    print(f"恢复训练，从 epoch {start_epoch} 开始")
else:
    start_epoch = 1
    print("没有找到已保存的模型，从头开始训练")

# 训练模型
def train():
    model.train()
    total_loss = 0
    alpha = 0.5  # 控制权重衰减的超参数
    for data in train_loader:
        optimizer.zero_grad()
        
        out_validation, out_repair = model(data)
        target = data.x
        
        # 计算每个样本的重构误差
        reconstruction_error = F.mse_loss(out_validation, target, reduction='none').sum(dim=1)
        
        # 计算权重：误差越小权重越大，误差越大权重越小
        weights = torch.exp(-alpha * reconstruction_error)
        
        # 计算损失
        loss = loss_function(out_validation, out_repair, target, weights)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 训练循环
num_epochs = 30
for epoch in range(start_epoch, num_epochs + 1):
    loss = train()
    if epoch % 5 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')
        
        with open('train_log.txt', 'a') as f:
            if epoch % 5 == 0:
                f.write(f'Epoch {epoch}, Loss: {loss:.4f}\n')
                
        # 保存新的模型状态
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, checkpoint_path)
        print(f'Model saved at epoch {epoch}')
    


恢复训练，从 epoch 11 开始
Epoch 15, Loss: 0.0015
Model saved at epoch 15
Epoch 20, Loss: 0.0014
Model saved at epoch 20
Epoch 25, Loss: 0.0009
Model saved at epoch 25
Epoch 30, Loss: 0.0015
Model saved at epoch 30


## 测试模型
### 定义评估函数

In [17]:
def evaluate_model(model, data_loader):
    model.eval()
    reconstruction_errors = []
    with torch.no_grad():
        for data in data_loader:
            out_validation, _ = model(data)
            target = data.x
            # 计算重构误差
            loss = F.mse_loss(out_validation, target, reduction='none')
            # 对每个样本计算平均误差
            loss_per_sample = loss.mean(dim=1)
            reconstruction_errors.extend(loss_per_sample.tolist())
    # 返回所有样本的重构误差列表
    return reconstruction_errors


### 计算阈值

In [18]:
import numpy as np

# 收集验证集的重构误差
val_errors = evaluate_model(model, val_loader)

# 计算95%分位数作为阈值
threshold = np.quantile(val_errors, 0.95)
print(f"Loss threshold for detecting data quality issues: {threshold}")

# 打印一些统计信息
print(f"Min validation error: {min(val_errors)}")
print(f"Max validation error: {max(val_errors)}")
print(f"Mean validation error: {np.mean(val_errors)}")
print(f"95th percentile of validation errors: {threshold}")


Loss threshold for detecting data quality issues: 0.0024004371254704856
Min validation error: 5.640019701269594e-16
Max validation error: 0.9992021918296814
Mean validation error: 0.0005992085086316146
95th percentile of validation errors: 0.0024004371254704856


## 检测数据质量问题
### 定义检测函数

In [19]:
def detect_quality_issues(model, data_loader, threshold):
    model.eval()
    total_issues = 0
    total_samples = 0
    with torch.no_grad():
        for data in data_loader:
            out_validation, _ = model(data)
            target = data.x
            # 计算重构误差
            loss = F.mse_loss(out_validation, target, reduction='none')
            loss_per_sample = loss.mean(dim=1)
            # 检测超过阈值的样本
            issues = (loss_per_sample > threshold).sum().item()
            total_issues += issues
            total_samples += loss_per_sample.size(0)
    # 计算有问题的样本比例
    issue_ratio = total_issues / total_samples
    return total_issues, total_samples, issue_ratio


### 在测试集1和测试集2上检测

In [20]:
# 检测测试集1（干净数据）的质量问题
issues_test1, samples_test1, ratio_test1 = detect_quality_issues(model, test_loader_1, threshold)
print(f"Test Set 1 (Clean Data): {issues_test1}/{samples_test1} samples are faulty ({ratio_test1 * 100:.2f}%)")

# 检测测试集2（脏数据）的质量问题
issues_test2, samples_test2, ratio_test2 = detect_quality_issues(model, test_loader_2, threshold)
print(f"Test Set 2 (Dirty Data): {issues_test2}/{samples_test2} samples are faulty ({ratio_test2 * 100:.2f}%)")


Test Set 1 (Clean Data): 19671/398521 samples are faulty (4.94%)
Test Set 2 (Dirty Data): 214910/2359777 samples are faulty (9.11%)


## 额外的测试：随机采样测试

In [21]:
def test_random_samples(model, data, threshold, num_tests=50):
    from sklearn.utils import shuffle
    problematic_batches = 0
    total_tests = num_tests
    for seed in range(num_tests):
        # 随机采样20%的数据
        sample_data = data.sample(frac=0.2, random_state=seed).reset_index(drop=True)
        sample_dataset = GraphDataset(sample_data)
        sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        issues, samples, ratio = detect_quality_issues(model, sample_loader, threshold)
        if ratio > 0.06:  # 超过6%的样本有问题
            print(f"Random sample {seed} is problematic: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
            problematic_batches += 1
        else:
            print(f"Random sample {seed} is ok: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
    print(f"Total problematic batches across all tests: {problematic_batches}/{total_tests}")


In [22]:
# 对测试集1（干净数据）执行随机采样测试
print("Testing on Test Set 1 (Clean Data):")
test_random_samples(model, test_data_1, threshold)

# 对测试集2（脏数据）执行随机采样测试
print("Testing on Test Set 2 (Dirty Data):")
test_random_samples(model, test_data_2, threshold)


Testing on Test Set 1 (Clean Data):
Random sample 0 is ok: 3836 out of 79695 samples are faulty (4.81%).
Random sample 1 is ok: 3930 out of 79695 samples are faulty (4.93%).
Random sample 2 is ok: 3925 out of 79695 samples are faulty (4.93%).
Random sample 3 is ok: 3902 out of 79695 samples are faulty (4.90%).
Random sample 4 is ok: 3988 out of 79695 samples are faulty (5.00%).
Random sample 5 is ok: 3929 out of 79695 samples are faulty (4.93%).
Random sample 6 is ok: 3963 out of 79695 samples are faulty (4.97%).
Random sample 7 is ok: 3899 out of 79695 samples are faulty (4.89%).
Random sample 8 is ok: 3915 out of 79695 samples are faulty (4.91%).
Random sample 9 is ok: 3925 out of 79695 samples are faulty (4.93%).
Random sample 10 is ok: 3837 out of 79695 samples are faulty (4.81%).
Random sample 11 is ok: 3887 out of 79695 samples are faulty (4.88%).
Random sample 12 is ok: 3910 out of 79695 samples are faulty (4.91%).
Random sample 13 is ok: 3980 out of 79695 samples are faulty (4.

In [23]:
def test_random_samples(model, data, threshold, num_tests=50):
    from sklearn.utils import shuffle
    problematic_batches = 0
    total_tests = num_tests

    for seed in range(num_tests):
        # 随机采样20%的数据
        sample_data = data.sample(frac=0.2, random_state=seed).reset_index(drop=True)
        sample_dataset = GraphDataset(sample_data)
        sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        issues, samples, ratio = detect_quality_issues(model, sample_loader, threshold)

        if ratio > 0.06:  # 超过6%的样本有问题
            print(f"Random sample {seed} is problematic: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
            problematic_batches += 1
        else:
            print(f"Random sample {seed} is ok: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")

    correct_batches = total_tests - problematic_batches
    correct_ratio = correct_batches / total_tests

    print(f"Total problematic batches across all tests: {problematic_batches}/{total_tests}")
    print(f"Correct ratio: {correct_batches}/{total_tests} ({correct_ratio * 100:.2f}%)")

    return correct_batches, total_tests

# 对测试集1（干净数据）执行随机采样测试
print("Testing on Test Set 1 (Clean Data):")
correct_batches_1, total_tests_1 = test_random_samples(model, test_data_1, threshold)

# 对测试集2（脏数据）执行随机采样测试
print("Testing on Test Set 2 (Dirty Data):")
correct_batches_2, total_tests_2 = test_random_samples(model, test_data_2, threshold)

# 计算总的预测正确的百分比
total_correct_batches = correct_batches_1 + correct_batches_2
total_tests = total_tests_1 + total_tests_2
overall_correct_ratio = total_correct_batches / total_tests

print(f"\nOverall correct ratio across all tests: {total_correct_batches}/{total_tests} ({overall_correct_ratio * 100:.2f}%)")


Testing on Test Set 1 (Clean Data):
Random sample 0 is ok: 3836 out of 79695 samples are faulty (4.81%).
Random sample 1 is ok: 3930 out of 79695 samples are faulty (4.93%).
Random sample 2 is ok: 3925 out of 79695 samples are faulty (4.93%).
Random sample 3 is ok: 3902 out of 79695 samples are faulty (4.90%).
Random sample 4 is ok: 3988 out of 79695 samples are faulty (5.00%).
Random sample 5 is ok: 3929 out of 79695 samples are faulty (4.93%).
Random sample 6 is ok: 3963 out of 79695 samples are faulty (4.97%).
Random sample 7 is ok: 3899 out of 79695 samples are faulty (4.89%).
Random sample 8 is ok: 3915 out of 79695 samples are faulty (4.91%).
Random sample 9 is ok: 3925 out of 79695 samples are faulty (4.93%).
Random sample 10 is ok: 3837 out of 79695 samples are faulty (4.81%).
Random sample 11 is ok: 3887 out of 79695 samples are faulty (4.88%).
Random sample 12 is ok: 3910 out of 79695 samples are faulty (4.91%).
Random sample 13 is ok: 3980 out of 79695 samples are faulty (4.

KeyboardInterrupt: 

In [28]:
def test_random_samples(model, data, threshold, num_tests=50, sample_size=0.2):
    from sklearn.utils import shuffle
    problematic_batches = 0
    total_tests = num_tests

    for seed in range(num_tests):
        # 随机采样指定数量的数据
        sample_data = data.sample(n=sample_size, random_state=seed).reset_index(drop=True)
        sample_dataset = GraphDataset(sample_data)
        sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        issues, samples, ratio = detect_quality_issues(model, sample_loader, threshold)

        if ratio > 0.06:  # 超过6%的样本有问题
            #print(f"Random sample {seed} is problematic: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
            problematic_batches += 1
        #else:
            #print(f"Random sample {seed} is ok: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")

    correct_batches = total_tests - problematic_batches
    correct_ratio = correct_batches / total_tests

    # print(f"Total problematic batches across all tests: {problematic_batches}/{total_tests}")
    # print(f"Correct ratio: {correct_batches}/{total_tests} ({correct_ratio * 100:.2f}%)")

    return correct_batches, total_tests

# 定义不同的采样大小
sample_sizes = [10 ,20,50, 100, 500, 1000]

# 用于保存每个采样大小下的总体正确率
overall_correct_ratios = []

for sample_size in sample_sizes:
    #print(f"\nTesting with sample size: {sample_size}")

    # 对测试集1（干净数据）执行随机采样测试
    #print(f"Testing on Test Set 1 (Clean Data) with sample size {sample_size}:")
    correct_batches_1, total_tests_1 = test_random_samples(model, test_data_1, threshold, sample_size=sample_size)

    # 对测试集2（脏数据）执行随机采样测试
    #print(f"Testing on Test Set 2 (Dirty Data) with sample size {sample_size}:")
    correct_batches_2, total_tests_2 = test_random_samples(model, test_data_2, threshold, sample_size=sample_size)

    # 计算总的预测正确的百分比
    total_correct_batches = correct_batches_1 + (50-correct_batches_2)
    total_tests = total_tests_1 + total_tests_2
    overall_correct_ratio = total_correct_batches / total_tests

    overall_correct_ratios.append((sample_size, overall_correct_ratio))

    #print(f"\nOverall correct ratio for sample size {sample_size}: {total_correct_batches}/{total_tests} ({overall_correct_ratio * 100:.2f}%)")

# 显示不同采样大小下的总体正确率
print("\nSummary of Overall Correct Ratios for Different Sample Sizes:")
for sample_size, ratio in overall_correct_ratios:
    print(f"Sample Size {sample_size}: Correct Ratio = {ratio * 100:.2f}%")



Summary of Overall Correct Ratios for Different Sample Sizes:
Sample Size 10: Correct Ratio = 85.00%
Sample Size 20: Correct Ratio = 93.00%
Sample Size 50: Correct Ratio = 99.00%
Sample Size 100: Correct Ratio = 99.00%
Sample Size 500: Correct Ratio = 100.00%
Sample Size 1000: Correct Ratio = 100.00%


## 6. 数据质量验证和数据修复
### 6.1 检测数据质量问题

### 6.2 修复数据

In [82]:
def repair_data(model, data_loader):
    model.eval()
    repaired_data = []
    with torch.no_grad():
        for data in data_loader:
            _, out_repair = model(data)
            repaired_instance = out_repair.squeeze().numpy()
            repaired_data.append(repaired_instance)
    repaired_data = np.array(repaired_data)
    return repaired_data


### 7. 应用于脏数据集