## 数据预处理

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import torch
import numpy as np

# 加载数据
file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_cleaned.csv'
file_path_dirty = '/home/sdong/data/chicago_bicycle/data_pr_raw.csv'
data_clean = pd.read_csv(file_path_clean)
data_dirty = pd.read_csv(file_path_dirty)

# 填充缺失值
data_clean.fillna(0, inplace=True)
data_dirty.fillna(0, inplace=True)


data_clean.head()

Unnamed: 0,trip_id,usertype,gender,starttime,stoptime,tripduration,temperature,events,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end
0,2355134,Subscriber,Male,2014-06-30 23:57:00,2014-07-01 00:07:00,10.066667,68.0,tstorms,131,Lincoln Ave & Belmont Ave,41.939365,-87.668385,15.0,303,Broadway & Cornelia Ave,41.945512,-87.64598,15.0
1,2355133,Subscriber,Male,2014-06-30 23:56:00,2014-07-01 00:00:00,4.383333,68.0,tstorms,282,Halsted St & Maxwell St,41.86458,-87.64693,15.0,22,May St & Taylor St,41.869482,-87.655486,15.0
2,2355130,Subscriber,Male,2014-06-30 23:33:00,2014-06-30 23:35:00,2.1,68.0,tstorms,327,Sheffield Ave & Webster Ave,41.921687,-87.653714,19.0,225,Halsted St & Dickens Ave,41.919936,-87.64883,15.0
3,2355129,Subscriber,Female,2014-06-30 23:26:00,2014-07-01 00:24:00,58.016667,68.0,tstorms,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0,194,State St & Wacker Dr,41.887155,-87.62775,11.0
4,2355128,Subscriber,Female,2014-06-30 23:16:00,2014-06-30 23:26:00,10.633333,68.0,tstorms,320,Loomis St & Lexington St,41.872187,-87.661501,15.0,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0


In [2]:
# 识别非数值列
non_numeric_cols = data_clean.select_dtypes(include=['object']).columns

# 对非数值列进行标签编码
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    # 合并干净和脏的数据
    combined_data = pd.concat([data_clean[col], data_dirty[col]], axis=0)
    le.fit(combined_data.astype(str))
    # 对干净和脏的数据分别进行转换
    data_clean[col] = le.transform(data_clean[col].astype(str))
    data_dirty[col] = le.transform(data_dirty[col].astype(str))
    label_encoders[col] = le

# 确保所有特征都是数值类型
print("Data types after encoding:\n", data_clean.dtypes)


Data types after encoding:
 trip_id                int64
usertype               int64
gender                 int64
starttime              int64
stoptime               int64
tripduration         float64
temperature          float64
events                 int64
from_station_id        int64
from_station_name      int64
latitude_start       float64
longitude_start      float64
dpcapacity_start     float64
to_station_id          int64
to_station_name        int64
latitude_end         float64
longitude_end        float64
dpcapacity_end       float64
dtype: object


In [3]:
data_clean = data_clean.astype(np.float64)
data_dirty = data_dirty.astype(np.float64)

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 分离数值型特征和类别型特征
numeric_cols = data_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = list(set(data_clean.columns) - set(numeric_cols))

# 初始化标准化器
scaler = MinMaxScaler()

# 在干净和脏数据的组合上拟合标准化器
combined_numeric_data = pd.concat([data_clean[numeric_cols], data_dirty[numeric_cols]], axis=0)
scaler.fit(combined_numeric_data)

# 对干净和脏数据进行标准化
data_clean[numeric_cols] = scaler.transform(data_clean[numeric_cols])
data_dirty[numeric_cols] = scaler.transform(data_dirty[numeric_cols])

In [5]:
print("Data types of data_clean:")
print(data_clean.dtypes)

print("Data types of data_dirty:")
print(data_dirty.dtypes)

Data types of data_clean:
trip_id              float64
usertype             float64
gender               float64
starttime            float64
stoptime             float64
tripduration         float64
temperature          float64
events               float64
from_station_id      float64
from_station_name    float64
latitude_start       float64
longitude_start      float64
dpcapacity_start     float64
to_station_id        float64
to_station_name      float64
latitude_end         float64
longitude_end        float64
dpcapacity_end       float64
dtype: object
Data types of data_dirty:
trip_id              float64
usertype             float64
gender               float64
starttime            float64
stoptime             float64
tripduration         float64
temperature          float64
events               float64
from_station_id      float64
from_station_name    float64
latitude_start       float64
longitude_start      float64
dpcapacity_start     float64
to_station_id        float64
to_sta

In [6]:
data_clean.head()

Unnamed: 0,trip_id,usertype,gender,starttime,stoptime,tripduration,temperature,events,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end
0,0.134103,1.0,1.0,0.057209,0.05847,9.3e-05,0.997216,0.916667,0.206731,0.549849,0.99703,0.001532,0.272727,0.482372,0.07855,0.997176,0.001787,0.272727
1,0.134103,1.0,1.0,0.057208,0.05847,2.8e-05,0.997216,0.916667,0.448718,0.416918,0.995252,0.001776,0.272727,0.032051,0.60574,0.995368,0.001679,0.272727
2,0.134103,1.0,1.0,0.057208,0.05847,1e-06,0.997216,0.916667,0.520833,0.79003,0.996609,0.001699,0.345455,0.357372,0.410876,0.996568,0.001754,0.272727
3,0.134103,1.0,0.5,0.057208,0.05847,0.000648,0.997216,0.916667,0.211538,0.691843,0.995565,0.001745,0.345455,0.307692,0.861027,0.995788,0.001994,0.2
4,0.134103,1.0,0.5,0.057208,0.058469,0.0001,0.997216,0.916667,0.509615,0.569486,0.995433,0.00161,0.272727,0.211538,0.691843,0.995565,0.001745,0.345455


## 特征图构建

In [7]:
import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
import torch
import numpy as np
import pandas as pd

# 示例表格数据特征
columns = ['trip_id', 'usertype', 'gender',
    'starttime', 'stoptime', 'tripduration', 'temperature', 'events',
    'from_station_id', 'from_station_name', 'latitude_start', 'longitude_start',
    'dpcapacity_start', 'to_station_id', 'to_station_name', 'latitude_end',
    'longitude_end', 'dpcapacity_end'
    
]

# 定义特征之间的关系
relations = [
    ('trip_id', 'usertype'),
    ('trip_id', 'gender'),
    ('starttime', 'stoptime'),
    ('starttime', 'latitude_start'),
    ('starttime', 'longitude_start'),
    ('stoptime', 'latitude_end'),
    ('stoptime', 'longitude_end'),
    ('latitude_start', 'longitude_start'),
    ('latitude_end', 'longitude_end'),
    ('from_station_id', 'from_station_name'),
    ('to_station_id', 'to_station_name'),
    ('from_station_id', 'latitude_start'),
    ('from_station_id', 'longitude_start'),
    ('to_station_id', 'latitude_end'),
    ('to_station_id', 'longitude_end'),
    ('tripduration', 'temperature'),
    ('tripduration', 'events'),
    ('dpcapacity_start', 'from_station_id'),
    ('dpcapacity_end', 'to_station_id')
    # 可以添加更多关系
]

# 创建特征名称到索引的映射
feature_to_index = {col: idx for idx, col in enumerate(columns)}
index_to_feature = {idx: col for idx, col in enumerate(columns)}

# 创建空的无向图
G = nx.Graph()

# 添加节点（使用索引作为节点）
for idx in range(len(columns)):
    G.add_node(idx)

# 添加边（将特征名称映射到索引）
for src, dst in relations:
    src_idx = feature_to_index[src]
    dst_idx = feature_to_index[dst]
    G.add_edge(src_idx, dst_idx)

# 将 NetworkX 图转换为 PyTorch Geometric 图
data = Data()
data.edge_index = torch.tensor(list(G.edges())).t().contiguous()


  from .autonotebook import tqdm as notebook_tqdm


## 新的GNN模型 编码器设计

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GINConv

class GAT_GIN_Encoder(nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(GAT_GIN_Encoder, self).__init__()
        # 第一层：GATConv
        self.gat_conv1 = GATConv(num_features, hidden_channels, heads=8, concat=False)
        # 第二层：GINConv
        nn1 = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, hidden_channels)
        )
        self.gin_conv1 = GINConv(nn1)
        # 第三层：GATConv
        self.gat_conv2 = GATConv(hidden_channels, hidden_channels, heads=8, concat=False)
        # 第四层：GINConv
        nn2 = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, hidden_channels)
        )
        self.gin_conv2 = GINConv(nn2)

    def forward(self, x, edge_index):
        x = F.relu(self.gat_conv1(x, edge_index))
        x = F.relu(self.gin_conv1(x, edge_index))
        x = F.relu(self.gat_conv2(x, edge_index))
        x = F.relu(self.gin_conv2(x, edge_index))
        return x


### 解码器设计

In [9]:
class MultiTaskDecoder(nn.Module):
    def __init__(self, hidden_channels, num_features):
        super(MultiTaskDecoder, self).__init__()
        # 数据质量验证解码器
        self.decoder_validation = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, num_features)
        )
        # 数据修复解码器
        self.decoder_repair = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, num_features)
        )

    def forward(self, x):
        # 数据质量验证输出
        out_validation = self.decoder_validation(x)
        # 数据修复输出
        out_repair = self.decoder_repair(x)
        return out_validation, out_repair


### 整合模型

In [10]:
class MultiTaskGNN(nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(MultiTaskGNN, self).__init__()
        self.encoder = GAT_GIN_Encoder(num_features, hidden_channels)
        self.decoder = MultiTaskDecoder(hidden_channels, num_features)

    def forward(self, data):
        x = data.x  # x 的形状应为 [num_nodes, num_node_features]
        edge_index = data.edge_index
        x = self.encoder(x, edge_index)
        out_validation, out_repair = self.decoder(x)
        return out_validation, out_repair



## 准备数据

### 准备图数据

In [11]:
# 创建节点特征矩阵
num_nodes = len(columns)
num_features = num_nodes  # 每个节点的特征维度

def create_node_features(instance):
    # 确保实例中的特征按照 columns 列表的顺序排列
    values = instance[columns].values.astype(np.float32)
    # 将数据转换为张量，形状为 [num_nodes, num_node_features]
    node_features = torch.tensor(values, dtype=torch.float).view(-1, 1)
    return node_features


# 创建 PyTorch Geometric 数据对象
def create_data_object(instance):
    node_features = create_node_features(instance)
    data_instance = Data()
    data_instance.x = node_features
    data_instance.edge_index = data.edge_index
    data_instance.num_nodes = data.num_nodes
    return data_instance



### 数据集划分

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设 data_clean 和 data_dirty 已经经过预处理，并且所有列都是数值类型

# 将 data_clean 随机打乱，并划分为训练集和临时集（50% / 50%）
train_data, temp_data = train_test_split(data_clean, test_size=0.001, random_state=42)

# 将临时集再划分为验证集和测试集1（各占25%）
val_data, test_data_1 = train_test_split(temp_data, test_size=0.5, random_state=42)

# data_dirty 作为测试集2
test_data_2 = data_dirty.head(10000)  # 已经预处理好的脏数据


In [13]:
test_data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trip_id            10000 non-null  float64
 1   usertype           10000 non-null  float64
 2   gender             10000 non-null  float64
 3   starttime          10000 non-null  float64
 4   stoptime           10000 non-null  float64
 5   tripduration       10000 non-null  float64
 6   temperature        10000 non-null  float64
 7   events             10000 non-null  float64
 8   from_station_id    10000 non-null  float64
 9   from_station_name  10000 non-null  float64
 10  latitude_start     10000 non-null  float64
 11  longitude_start    10000 non-null  float64
 12  dpcapacity_start   10000 non-null  float64
 13  to_station_id      10000 non-null  float64
 14  to_station_name    10000 non-null  float64
 15  latitude_end       10000 non-null  float64
 16  longitude_end      1000

### 创建数据加载器

In [14]:
from torch.utils.data import Dataset, DataLoader
import os

class GraphDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        instance = self.dataframe.iloc[idx]
        data_instance = create_data_object(instance)
        return data_instance

# 创建数据集对象
train_dataset = GraphDataset(train_data)
val_dataset = GraphDataset(val_data)
test_dataset_1 = GraphDataset(test_data_1)
test_dataset_2 = GraphDataset(test_data_2)



## 训练模型

### 定义损失函数

In [15]:
def loss_function(out_validation, out_repair, target, lambda_validation=1.0, lambda_repair=1.0):
    # 数据质量验证损失
    loss_validation = F.mse_loss(out_validation, target)
    # 数据修复损失
    loss_repair = F.mse_loss(out_repair, target)
    # 总损失
    loss = lambda_validation * loss_validation + lambda_repair * loss_repair
    return loss


### 5.2 训练循环

In [16]:
from torch_geometric.loader import DataLoader
import warnings

# 抑制所有警告
warnings.filterwarnings("ignore")

# 创建数据加载器
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader_1 = DataLoader(test_dataset_1, batch_size=batch_size, shuffle=False)
test_loader_2 = DataLoader(test_dataset_2, batch_size=batch_size, shuffle=False)



In [17]:
# 初始化模型、优化器
num_features = 1  # 因为每个节点只有一个特征
hidden_channels = 64
model = MultiTaskGNN(num_features, hidden_channels)
# 打印模型结构
print(model)



MultiTaskGNN(
  (encoder): GAT_GIN_Encoder(
    (gat_conv1): GATConv(1, 64, heads=8)
    (gin_conv1): GINConv(nn=Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
    ))
    (gat_conv2): GATConv(64, 64, heads=8)
    (gin_conv2): GINConv(nn=Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
    ))
  )
  (decoder): MultiTaskDecoder(
    (decoder_validation): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
    (decoder_repair): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
  )
)


In [18]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 定义保存模型的路径
save_path = '/home/sdong/experiments/GVAE/model/multitask_gnn_model.pth'
# 定义checkpoint文件的路径
checkpoint_path = '/home/sdong/experiments/GVAE/model/checkpoint_bicycle.pth'

# 加载已保存的checkpoint
if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1  # 从中断的下一轮开始
    loss = checkpoint['loss']

    print(f"恢复训练，从 epoch {start_epoch} 开始")
else:
    start_epoch = 1
    print("没有找到已保存的模型，从头开始训练")

# 训练模型
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        out_validation, out_repair = model(data)
        target = data.x
        loss = loss_function(out_validation, out_repair, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 训练循环
num_epochs = 50
for epoch in range(start_epoch, num_epochs + 1):
    loss = train()
    if epoch % 5 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')
        
        with open('train_log.txt', 'a') as f:
            if epoch % 5 == 0:
                f.write(f'Epoch {epoch}, Loss: {loss:.4f}\n')
                
        # 保存新的模型状态
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, checkpoint_path)
        print(f'Model saved at epoch {epoch}')
    

恢复训练，从 epoch 51 开始


## 测试模型
### 定义评估函数

In [19]:
def evaluate_model(model, data_loader):
    model.eval()
    reconstruction_errors = []
    with torch.no_grad():
        for data in data_loader:
            out_validation, _ = model(data)
            target = data.x
            # 计算重构误差
            loss = F.mse_loss(out_validation, target, reduction='none')
            # 对每个样本计算平均误差
            loss_per_sample = loss.mean(dim=1)
            reconstruction_errors.extend(loss_per_sample.tolist())
    # 返回所有样本的重构误差列表
    return reconstruction_errors


### 计算阈值

In [20]:
import numpy as np

# 收集验证集的重构误差
val_errors = evaluate_model(model, val_loader)

# 计算95%分位数作为阈值
threshold = np.quantile(val_errors, 0.95)
print(f"Loss threshold for detecting data quality issues: {threshold}")

# 打印一些统计信息
print(f"Min validation error: {min(val_errors)}")
print(f"Max validation error: {max(val_errors)}")
print(f"Mean validation error: {np.mean(val_errors)}")
print(f"95th percentile of validation errors: {threshold}")


Loss threshold for detecting data quality issues: 6.1952723626745865e-06
Min validation error: 2.7200464103316335e-15
Max validation error: 0.0030698971822857857
Mean validation error: 2.8008653246326952e-06
95th percentile of validation errors: 6.1952723626745865e-06


## 检测数据质量问题
### 定义检测函数

In [22]:
def detect_quality_issues(model, data_loader, threshold):
    model.eval()
    total_issues = 0
    total_samples = 0
    with torch.no_grad():
        for data in data_loader:
            out_validation, _ = model(data)
            target = data.x
            # 计算重构误差
            loss = F.mse_loss(out_validation, target, reduction='none')
            loss_per_sample = loss.mean(dim=1)
            # 检测超过阈值的样本
            issues = (loss_per_sample > threshold).sum().item()
            total_issues += issues
            total_samples += loss_per_sample.size(0)
    # 计算有问题的样本比例
    issue_ratio = total_issues / total_samples
    return total_issues, total_samples, issue_ratio


### 在测试集1和测试集2上检测

In [29]:
# 检测测试集1（干净数据）的质量问题
issues_test1, samples_test1, ratio_test1 = detect_quality_issues(model, test_loader_1, threshold)
print(f"Test Set 1 (Clean Data): {issues_test1}/{samples_test1} samples are faulty ({ratio_test1 * 100:.2f}%)")

# 检测测试集2（脏数据）的质量问题
issues_test2, samples_test2, ratio_test2 = detect_quality_issues(model, test_loader_2, threshold)
print(f"Test Set 2 (Dirty Data): {issues_test2}/{samples_test2} samples are faulty ({ratio_test2 * 100:.2f}%)")


Test Set 1 (Clean Data): 467/8550 samples are faulty (5.46%)
Test Set 2 (Dirty Data): 489164/1800000 samples are faulty (27.18%)


## 额外的测试：随机采样测试

In [23]:
def test_random_samples(model, data, threshold, num_tests=50):
    from sklearn.utils import shuffle
    problematic_batches = 0
    total_tests = num_tests
    for seed in range(num_tests):
        # 随机采样20%的数据
        sample_data = data.sample(frac=0.2, random_state=seed).reset_index(drop=True)
        sample_dataset = GraphDataset(sample_data)
        sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        issues, samples, ratio = detect_quality_issues(model, sample_loader, threshold)
        if ratio > 0.06:  # 超过6%的样本有问题
            print(f"Random sample {seed} is problematic: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
            problematic_batches += 1
        else:
            print(f"Random sample {seed} is ok: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
    print(f"Total problematic batches across all tests: {problematic_batches}/{total_tests}")


In [31]:
# 对测试集1（干净数据）执行随机采样测试
print("Testing on Test Set 1 (Clean Data):")
test_random_samples(model, test_data_1, threshold)




Testing on Test Set 1 (Clean Data):
Random sample 0 is ok: 91 out of 1710 samples are faulty (5.32%).
Random sample 1 is ok: 83 out of 1710 samples are faulty (4.85%).
Random sample 2 is ok: 89 out of 1710 samples are faulty (5.20%).
Random sample 3 is ok: 89 out of 1710 samples are faulty (5.20%).
Random sample 4 is ok: 94 out of 1710 samples are faulty (5.50%).
Random sample 5 is ok: 86 out of 1710 samples are faulty (5.03%).
Random sample 6 is ok: 100 out of 1710 samples are faulty (5.85%).
Random sample 7 is ok: 91 out of 1710 samples are faulty (5.32%).
Random sample 8 is problematic: 103 out of 1710 samples are faulty (6.02%).
Random sample 9 is ok: 94 out of 1710 samples are faulty (5.50%).
Random sample 10 is problematic: 110 out of 1710 samples are faulty (6.43%).
Random sample 11 is ok: 80 out of 1710 samples are faulty (4.68%).
Random sample 12 is ok: 89 out of 1710 samples are faulty (5.20%).
Random sample 13 is ok: 102 out of 1710 samples are faulty (5.96%).
Random sample 

KeyboardInterrupt: 

In [39]:
# 对测试集2（脏数据）执行随机采样测试
print("Testing on Test Set 2 (Dirty Data):")
test_random_samples(model, test_data_2, threshold)

Testing on Test Set 2 (Dirty Data):
Random sample 0 is problematic: 7263 out of 36000 samples are faulty (20.18%).
Random sample 1 is problematic: 7102 out of 36000 samples are faulty (19.73%).
Random sample 2 is problematic: 7299 out of 36000 samples are faulty (20.28%).
Random sample 3 is problematic: 7191 out of 36000 samples are faulty (19.98%).
Random sample 4 is problematic: 7173 out of 36000 samples are faulty (19.93%).
Random sample 5 is problematic: 7260 out of 36000 samples are faulty (20.17%).
Random sample 6 is problematic: 7109 out of 36000 samples are faulty (19.75%).
Random sample 7 is problematic: 7252 out of 36000 samples are faulty (20.14%).
Random sample 8 is problematic: 7191 out of 36000 samples are faulty (19.98%).
Random sample 9 is problematic: 7174 out of 36000 samples are faulty (19.93%).
Random sample 10 is problematic: 7238 out of 36000 samples are faulty (20.11%).
Random sample 11 is problematic: 7134 out of 36000 samples are faulty (19.82%).
Random sample 

In [24]:
def test_random_samples(model, data, threshold, num_tests=50, sample_size=0.2):
    from sklearn.utils import shuffle
    problematic_batches = 0
    total_tests = num_tests

    for seed in range(num_tests):
        # 随机采样指定数量的数据
        sample_data = data.sample(n=sample_size, random_state=seed).reset_index(drop=True)
        sample_dataset = GraphDataset(sample_data)
        sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        issues, samples, ratio = detect_quality_issues(model, sample_loader, threshold)

        if ratio > 0.06:  # 超过6%的样本有问题
            #print(f"Random sample {seed} is problematic: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")
            problematic_batches += 1
        #else:
            #print(f"Random sample {seed} is ok: {issues} out of {samples} samples are faulty ({ratio * 100:.2f}%).")

    correct_batches = total_tests - problematic_batches
    correct_ratio = correct_batches / total_tests

    # print(f"Total problematic batches across all tests: {problematic_batches}/{total_tests}")
    # print(f"Correct ratio: {correct_batches}/{total_tests} ({correct_ratio * 100:.2f}%)")

    return correct_batches, total_tests

# 定义不同的采样大小
sample_sizes = [10 ,20,50, 100, 500, 1000]

# 用于保存每个采样大小下的总体正确率
overall_correct_ratios = []

for sample_size in sample_sizes:
    #print(f"\nTesting with sample size: {sample_size}")

    # 对测试集1（干净数据）执行随机采样测试
    #print(f"Testing on Test Set 1 (Clean Data) with sample size {sample_size}:")
    correct_batches_1, total_tests_1 = test_random_samples(model, test_data_1, threshold, sample_size=sample_size)

    # 对测试集2（脏数据）执行随机采样测试
    #print(f"Testing on Test Set 2 (Dirty Data) with sample size {sample_size}:")
    correct_batches_2, total_tests_2 = test_random_samples(model, test_data_2, threshold, sample_size=sample_size)

    # 计算总的预测正确的百分比
    total_correct_batches = correct_batches_1 + (50-correct_batches_2)
    total_tests = total_tests_1 + total_tests_2
    overall_correct_ratio = total_correct_batches / total_tests

    overall_correct_ratios.append((sample_size, overall_correct_ratio))

    #print(f"\nOverall correct ratio for sample size {sample_size}: {total_correct_batches}/{total_tests} ({overall_correct_ratio * 100:.2f}%)")

# 显示不同采样大小下的总体正确率
print("\nSummary of Overall Correct Ratios for Different Sample Sizes:")
for sample_size, ratio in overall_correct_ratios:
    print(f"Sample Size {sample_size}: Correct Ratio = {ratio * 100:.2f}%")



Summary of Overall Correct Ratios for Different Sample Sizes:
Sample Size 10: Correct Ratio = 86.00%
Sample Size 20: Correct Ratio = 92.00%
Sample Size 50: Correct Ratio = 89.00%
Sample Size 100: Correct Ratio = 97.00%
Sample Size 500: Correct Ratio = 100.00%
Sample Size 1000: Correct Ratio = 100.00%


## 6. 数据质量验证和数据修复
### 6.1 检测数据质量问题

### 6.2 修复数据

In [14]:
def repair_data(model, data_loader):
    model.eval()
    repaired_data = []
    with torch.no_grad():
        for data in data_loader:
            _, out_repair = model(data)
            repaired_instance = out_repair.squeeze().numpy()
            repaired_data.append(repaired_instance)
    repaired_data = np.array(repaired_data)
    return repaired_data


### 7. 应用于脏数据集

In [15]:
# 创建脏数据的加载器
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# 设置阈值，可以根据验证集的损失分布确定，这里暂时设置为经验值
threshold = 0.1

# 检测数据质量问题
issues_detected, total_samples = detect_quality_issues(model, test_loader, threshold)

# 修复数据
repaired_data = repair_data(model, test_loader)

# 将修复后的数据保存
repaired_df = pd.DataFrame(repaired_data, columns=columns)
repaired_df.to_csv('repaired_data.csv', index=False)
print("Repaired data saved to repaired_data.csv")


KeyboardInterrupt: 