In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np


# 加载数据
file_path_clean = '/home/sdong/data/airbnb/airbnb_nyc_clean_embeddings.csv'
file_path_origi = '/home/sdong/data/airbnb/Airbnb_Open_Data_Alignement_embeddings.csv'
data = pd.read_csv(file_path_clean)
data_dirty = pd.read_csv(file_path_origi)
# 设置显示所有列和部分行
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)  
# 显示数据集的前几行和数据结构

print(data_dirty.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 23 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       102599 non-null  float64
 1   1       102599 non-null  float64
 2   2       102599 non-null  float64
 3   3       102599 non-null  float64
 4   4       102599 non-null  float64
 5   5       102599 non-null  float64
 6   6       102599 non-null  float64
 7   7       102599 non-null  float64
 8   8       102599 non-null  float64
 9   9       102599 non-null  float64
 10  10      102599 non-null  float64
 11  11      102599 non-null  float64
 12  12      102599 non-null  float64
 13  13      102599 non-null  float64
 14  14      102599 non-null  float64
 15  15      102599 non-null  float64
 16  16      102599 non-null  float64
 17  17      102599 non-null  float64
 18  18      102599 non-null  float64
 19  19      102599 non-null  float64
 20  20      102599 non-null  float64
 21  21      10

In [15]:
# 检查两个数据集中是否所有值都在0到1之间
def check_values_in_range(df, lower=0, upper=1):
    return ((df >= lower) & (df <= upper)).all().all()

is_data_in_range = check_values_in_range(data)
is_data_dirty_in_range = check_values_in_range(data_dirty)

print(f"Clean data values are within [0, 1]: {is_data_in_range}")
print(f"Dirty data values are within [0, 1]: {is_data_dirty_in_range}")

if not is_data_in_range:
    print("Clean data contains values out of range [0, 1].")

if not is_data_dirty_in_range:
    print("Dirty data contains values out of range [0, 1].")

# 如果需要，打印出不在范围内的值和对应的索引
def find_out_of_range_values(df, lower=0, upper=1):
    out_of_range = df[(df < lower) | (df > upper)]
    return out_of_range.dropna(how='all')

if not is_data_in_range:
    out_of_range_clean = find_out_of_range_values(data)
    print("Out of range values in clean data:")
    print(out_of_range_clean)

if not is_data_dirty_in_range:
    out_of_range_dirty = find_out_of_range_values(data_dirty)
    print("Out of range values in dirty data:")
    print(out_of_range_dirty)
    
    # 获取不在范围内的值的索引
out_of_range_clean_indices = out_of_range_clean.index
out_of_range_dirty_indices = out_of_range_dirty.index

# 删除不在范围内的行
data = data.drop(out_of_range_clean_indices)
data_dirty = data_dirty.drop(out_of_range_dirty_indices)

Clean data values are within [0, 1]: False
Dirty data values are within [0, 1]: False
Clean data contains values out of range [0, 1].
Dirty data contains values out of range [0, 1].
Out of range values in clean data:
        0   1   2   3   4   5   6   7   8   9  10   11  12  13  14  15  16  \
2173  NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  1.0 NaN NaN NaN NaN NaN   
49653 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  NaN NaN NaN NaN NaN NaN   

       17  18  19  20  21   22  
2173  NaN NaN NaN NaN NaN  NaN  
49653 NaN NaN NaN NaN NaN  1.0  
Out of range values in dirty data:
        0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  \
63948 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN   

        17  18  19  20  21  22  
63948  1.0 NaN NaN NaN NaN NaN  


In [16]:
data_dirty.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102598 entries, 0 to 102598
Data columns (total 23 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       102598 non-null  float64
 1   1       102598 non-null  float64
 2   2       102598 non-null  float64
 3   3       102598 non-null  float64
 4   4       102598 non-null  float64
 5   5       102598 non-null  float64
 6   6       102598 non-null  float64
 7   7       102598 non-null  float64
 8   8       102598 non-null  float64
 9   9       102598 non-null  float64
 10  10      102598 non-null  float64
 11  11      102598 non-null  float64
 12  12      102598 non-null  float64
 13  13      102598 non-null  float64
 14  14      102598 non-null  float64
 15  15      102598 non-null  float64
 16  16      102598 non-null  float64
 17  17      102598 non-null  float64
 18  18      102598 non-null  float64
 19  19      102598 non-null  float64
 20  20      102598 non-null  float64
 21  21      102598 

In [17]:
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split

# 假设 data 已经是一个经过预处理的 DataFrame
data_array = data.values.astype(np.float32)  # 转换为浮点数类型的 NumPy 数组

# 分割数据为训练集和临时测试集（包括真正的测试集和验证集）
train_data, val_test_data = train_test_split(data_array, test_size=0.5, random_state=42)

# 将训练验证集进一步分割为训练集和验证集
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)  # 0.25 x 0.8 = 0.2


# 转换为PyTorch张量


# 创建数据加载器
from torch.utils.data import DataLoader, TensorDataset

batch_size = 128  # 或者任何适合你GPU的大小

train_tensor = torch.tensor(train_data) #0.6
train_dataset = TensorDataset(train_tensor, train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_tensor = torch.tensor(val_data) #0.2
val_dataset = TensorDataset(val_tensor, val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_tensor = torch.tensor(test_data)  #20%
test_dataset = TensorDataset(test_tensor, test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # 50个批次
#len(test_dataset)// 50

data_dirty_array = data_dirty.values.astype(np.float32)  # 转换为浮点数类型的 NumPy 数组
test_dirty_tensor = torch.tensor(data_dirty_array)  #20%
test_dirty_dataset = TensorDataset(test_dirty_tensor, test_dirty_tensor)
test_dirty_loader = DataLoader(test_dirty_dataset, batch_size=batch_size, shuffle=False)  # 50个批次


In [18]:
# # 检查数据中是否有NaN或无穷大的值
# if torch.isnan(train_tensor).any() or torch.isinf(train_tensor).any():
#     print("Data contains NaNs or Infs.")
# # 检查数据中是否有NaN或无穷大的值
# if torch.isnan(test_dirty_tensor).any() or torch.isinf(test_dirty_tensor).any():
#     print("Data contains NaNs or Infs.")


In [19]:
import torch
from torch import nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()
        # Encoder
        self.fc1 = nn.Linear(23, 128)  # Input layer
        self.fc2 = nn.Linear(128, 64)  # Hidden layer
        self.fc31 = nn.Linear(64, 20)  # Output layer for mu
        self.fc32 = nn.Linear(64, 20)  # Output layer for logvar

        # Decoder
        self.fc4 = nn.Linear(20, 64)   # Input layer
        self.fc5 = nn.Linear(64, 128)  # Hidden layer
        self.fc6 = nn.Linear(128, 23)  # Output layer

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        return self.fc31(h2), self.fc32(h2)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar) + 1e-8  # Adding a small constant for numerical stability
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = F.relu(self.fc4(z))
        h4 = F.relu(self.fc5(h3))
        return torch.sigmoid(self.fc6(h4))  # Use sigmoid to ensure output is between 0 and 1

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# Instantiate the model
model = VAE()
print(model)


VAE(
  (fc1): Linear(in_features=23, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc31): Linear(in_features=64, out_features=20, bias=True)
  (fc32): Linear(in_features=64, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=128, bias=True)
  (fc6): Linear(in_features=128, out_features=23, bias=True)
)


In [20]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# # 定义VAE的架构
# class VAE(nn.Module):
#     def __init__(self):
#         super(VAE, self).__init__()
#         self.fc1 = nn.Linear(23, 12)  # 假设有23个特征
#         self.fc21 = nn.Linear(12, 6)  # 均值输出
#         self.fc22 = nn.Linear(12, 6)  # 方差输出
#         self.fc3 = nn.Linear(6, 12)
#         self.fc4 = nn.Linear(12, 23)

#     def encode(self, x):
#         h1 = F.relu(self.fc1(x))
#         return self.fc21(h1), self.fc22(h1)

#     def reparameterize(self, mu, logvar):
#         std = torch.exp(0.5 * logvar) + 1e-8  # 添加一个小常数以提高数值稳定性
#         eps = torch.randn_like(std)
#         return mu + eps*std

#     def decode(self, z):
#         h3 = F.relu(self.fc3(z))
#         return torch.sigmoid(self.fc4(h3))

#     def forward(self, x):
#         mu, logvar = self.encode(x.view(-1, 23))
#         z = self.reparameterize(mu, logvar)
#         return self.decode(z), mu, logvar

# # 实例化模型
# model = VAE()
# print(model)


In [21]:
#device = torch.device("cpu")
model = VAE().to(device)

import torch.optim as optim

# 设置优化器
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_function(recon_x, x, mu, logvar):
    # 确保目标张量也是浮点类型且维度匹配
    recon_x = torch.clamp(recon_x, 0, 1)  # 确保输出值在[0, 1]范围内
    BCE = F.binary_cross_entropy(recon_x, x, reduction='none')  # 保留每个样本的损失
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1, keepdim=True)  # 保持维度
    return BCE + KLD



In [22]:
import torch

def train(epoch):
    model.train()
    total_BCE = 0
    total_KLD = 0
    total_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):  # 使用TensorDataset，数据被重复用作输入和标签
        data = data.to(device)
        # 检查输入数据的范围
        if (data < 0).any() or (data > 1).any():
            raise ValueError("Input data contains values out of range [0, 1]")
        
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        if (recon_batch < 0).any() or (recon_batch > 1).any():
            raise ValueError("Warning: recon_batch contains values out of range [0, 1]")
        loss = loss_function(recon_batch, data, mu, logvar)  # 这里loss是每个样本的损失
        loss.mean().backward()  # 使用.mean()在所有样本上平均损失然后进行反向传播
        total_loss += loss.sum().item()  # 更新总损失
        optimizer.step()
        
        # if batch_idx % 100 == 0:
        #     print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        #         epoch, batch_idx * len(data), len(train_loader.dataset),
        #         100. * batch_idx / len(train_loader), loss.mean().item()))

    # 打印每个epoch的平均损失
    print(f'Epoch: {epoch} Total Loss: {total_loss / len(train_loader.dataset)}')

# 训练模型
num_epochs = 10  # 可根据需要调整
for epoch in range(1, num_epochs + 1):
    train(epoch)

# 保存模型的状态字典
torch.save(model.state_dict(), 'vae_model_airnbnb_graph.pth')
print("Model saved to vae_model_airnbnb_graph.pth")

# 添加调试信息
if torch.cuda.is_available():
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    print("CUDA cache cleared.")


Epoch: 1 Total Loss: 15.016054565891181
Epoch: 2 Total Loss: 14.904005591581953
Epoch: 3 Total Loss: 14.898292319784467
Epoch: 4 Total Loss: 14.895097270004609
Epoch: 5 Total Loss: 14.892928285828015
Epoch: 6 Total Loss: 14.892401732323968
Epoch: 7 Total Loss: 14.89168929206025
Epoch: 8 Total Loss: 14.891398653323948
Epoch: 9 Total Loss: 14.89072084973197
Epoch: 10 Total Loss: 14.890612860255205
Model saved to vae_model_airnbnb_graph.pth
CUDA cache cleared.


In [23]:
def evaluate_model(model, data_loader):
    model.eval()  # 切换到评估模式
    total_loss = 0
    with torch.no_grad():  # 关闭梯度计算
        for inputs, _ in data_loader:  # 假设 data_loader 返回 inputs 和 targets，这里我们不需要 targets
            inputs = inputs.to(device)  # 确保将 inputs 转移到正确的设备
            recon, mu, logvar = model(inputs)
            loss = loss_function(recon, inputs, mu, logvar)  # 每个样本的损失列表
            total_loss += loss.sum().item()  # 累计所有样本的损失

    average_loss = total_loss / len(data_loader.dataset)
    return average_loss

# 加载模型
model = VAE().to(device)
model.load_state_dict(torch.load('vae_model_airnbnb_graph.pth'))

# 计算测试集和验证集上的平均损失
val_loss = evaluate_model(model, val_loader)
test_loss = evaluate_model(model, test_loader)
test_dirty_loss = evaluate_model(model, test_dirty_loader)

print(f"Average loss on validation data: {val_loss}")
print(f"Average loss on test data: {test_loss}")
print(f"Average loss on test dirty data: {test_dirty_loss}")


Average loss on validation data: 14.896268011254085
Average loss on test data: 14.898248455175747
Average loss on test dirty data: 15.14082212836104


In [24]:
import numpy as np

def collect_reconstruction_errors(model, data_loader):
    model.eval()
    reconstruction_errors = []
    with torch.no_grad():
        for inputs, _ in data_loader:  # 假设 data_loader 返回的是 inputs 和 labels，这里我们忽略 labels
            inputs = inputs.to(device)  # 将输入数据移动到正确的设备
            recon, mu, logvar = model(inputs)
            loss = loss_function(recon, inputs, mu, logvar)  # 每个样本的损失列表
            loss_per_sample = loss.sum(dim=1)
            # print("Type of loss:", type(loss_per_sample))
            # print("Shape of loss:", loss_per_sample.shape)
            #print("First few loss values:", loss[:10])  # 打印前10个损失值
            reconstruction_errors.extend(loss_per_sample.tolist())  

    return reconstruction_errors

# 收集验证集的重构误差
val_errors = collect_reconstruction_errors(model, val_loader)

threshold = np.quantile(val_errors, 0.95)  # 计算95%分位数作为阈值
print(f"Loss threshold for detecting data quality issues: {threshold}")

min_val_error = min(val_errors)
max_val_error = max(val_errors)
mean_val_error = sum(val_errors) / len(val_errors)
print(f"Min validation error: {min_val_error}")
print(f"Max validation error: {max_val_error}")
print(f"Mean validation error: {mean_val_error}")
print(f"95th percentile of validation errors: {np.quantile(val_errors, 0.95)}")
print(f"Maximum validation error (100th percentile): {np.quantile(val_errors, 1)}")


Loss threshold for detecting data quality issues: 15.640409708023071
Min validation error: 13.763216018676758
Max validation error: 17.723398208618164
Mean validation error: 14.895785944354213
95th percentile of validation errors: 15.640409708023071
Maximum validation error (100th percentile): 17.723398208618164


## Loop test

In [25]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

def detect_quality_issues(model, data_loader, threshold, seed):
    model.eval()
    current_issues_count = 0
    total_samples = 0  # 累积处理的样本总数

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            recon, mu, logvar = model(inputs)
            losses = loss_function(recon, inputs, mu, logvar)  # 获取每个样本的损失
            loss_per_sample = losses.sum(dim=1)
            # 检查每个样本是否有问题
            for loss in loss_per_sample:
                if loss.item() > threshold:
                    current_issues_count += 1
            total_samples += inputs.size(0)  # 更新处理的样本总数

    # 评估是否有超过10%的样本有问题
    if current_issues_count > total_samples * 0.05*1.2:
        print(f"Random sample {seed} is problematic: {current_issues_count} out of {total_samples} samples are faulty ({(current_issues_count/total_samples * 100):.2f}%).")
        return True
    else:
        print(f"Random sample {seed} is ok: {current_issues_count} out of {total_samples} samples are faulty ({(current_issues_count/total_samples * 100):.2f}%).")
        return False

# 主函数：执行50次随机采样测试
def test_random_samples(model, data, threshold):
    problematic_batches = 0
    for seed in range(50):
        _, sample_data = train_test_split(data, test_size=0.2, random_state=seed)  # 随机采样20%
        sample_tensor = torch.tensor(sample_data, dtype=torch.float32)  # 确保数据类型是float32
        sample_dataset = TensorDataset(sample_tensor, sample_tensor)
        test_dirty_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        if detect_quality_issues(model, test_dirty_loader, threshold, seed):
            problematic_batches += 1

    print(f"Total problematic batches across all tests: {problematic_batches}")

# 假设 model, data_dirty, threshold, device 已经被正确定义和设置
test_random_samples(model, test_data, threshold)


Random sample 0 is ok: 167 out of 3466 samples are faulty (4.82%).
Random sample 1 is ok: 162 out of 3466 samples are faulty (4.67%).
Random sample 2 is ok: 162 out of 3466 samples are faulty (4.67%).
Random sample 3 is ok: 163 out of 3466 samples are faulty (4.70%).
Random sample 4 is ok: 191 out of 3466 samples are faulty (5.51%).
Random sample 5 is ok: 165 out of 3466 samples are faulty (4.76%).
Random sample 6 is ok: 165 out of 3466 samples are faulty (4.76%).
Random sample 7 is ok: 197 out of 3466 samples are faulty (5.68%).
Random sample 8 is ok: 171 out of 3466 samples are faulty (4.93%).
Random sample 9 is ok: 172 out of 3466 samples are faulty (4.96%).
Random sample 10 is ok: 184 out of 3466 samples are faulty (5.31%).
Random sample 11 is ok: 166 out of 3466 samples are faulty (4.79%).
Random sample 12 is ok: 177 out of 3466 samples are faulty (5.11%).
Random sample 13 is ok: 160 out of 3466 samples are faulty (4.62%).
Random sample 14 is ok: 159 out of 3466 samples are faulty

In [26]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

def detect_quality_issues(model, data_loader, threshold, seed):
    model.eval()
    current_issues_count = 0
    total_samples = 0  # 累积处理的样本总数

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            recon, mu, logvar = model(inputs)
            losses = loss_function(recon, inputs, mu, logvar)  # 获取每个样本的损失
            loss_per_sample = losses.sum(dim=1)
            # 检查每个样本是否有问题
            for loss in loss_per_sample:
                if loss.item() > threshold:
                    current_issues_count += 1
            total_samples += inputs.size(0)  # 更新处理的样本总数

    # 评估是否有超过10%的样本有问题
    if current_issues_count > total_samples * 0.06:
        print(f"Random sample {seed} is problematic: {current_issues_count} out of {total_samples} samples are faulty ({(current_issues_count/total_samples * 100):.2f}%).")
        return True
    else:
        print(f"Random sample {seed} is ok: {current_issues_count} out of {total_samples} samples are faulty ({(current_issues_count/total_samples * 100):.2f}%).")
        return False

# 主函数：执行50次随机采样测试
def test_random_samples(model, data, threshold):
    problematic_batches = 0
    for seed in range(50):
        _, sample_data = train_test_split(data, test_size=0.2, random_state=seed)  # 随机采样20%
        sample_data_array = sample_data.values.astype(np.float32)  # 转换为浮点数类型的 NumPy 数组
        sample_tensor = torch.tensor(sample_data_array)  # 确保数据类型是float32
        sample_dataset = TensorDataset(sample_tensor, sample_tensor)
        test_dirty_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
        if detect_quality_issues(model, test_dirty_loader, threshold, seed):
            problematic_batches += 1

    print(f"Total problematic batches across all tests: {problematic_batches}")

# 假设 model, data_dirty, threshold, device 已经被正确定义和设置
test_random_samples(model, data_dirty, threshold)


Random sample 0 is problematic: 1440 out of 20520 samples are faulty (7.02%).
Random sample 1 is problematic: 1459 out of 20520 samples are faulty (7.11%).
Random sample 2 is problematic: 1386 out of 20520 samples are faulty (6.75%).
Random sample 3 is problematic: 1450 out of 20520 samples are faulty (7.07%).
Random sample 4 is problematic: 1387 out of 20520 samples are faulty (6.76%).
Random sample 5 is problematic: 1430 out of 20520 samples are faulty (6.97%).
Random sample 6 is problematic: 1409 out of 20520 samples are faulty (6.87%).
Random sample 7 is problematic: 1462 out of 20520 samples are faulty (7.12%).
Random sample 8 is problematic: 1446 out of 20520 samples are faulty (7.05%).
Random sample 9 is problematic: 1459 out of 20520 samples are faulty (7.11%).
Random sample 10 is problematic: 1452 out of 20520 samples are faulty (7.08%).
Random sample 11 is problematic: 1412 out of 20520 samples are faulty (6.88%).
Random sample 12 is problematic: 1459 out of 20520 samples are

In [13]:
# import torch
# from torch.utils.data import DataLoader, TensorDataset

# def detect_quality_issues(model, data_loader, threshold):
#     model.eval()
#     total_issue_count = 0
#     total_batches_with_issues = 0
#     total_samples = 0
#     current_batch_issues = 0
#     batch_count = 0
#     batch_size = len(data_loader.dataset) // 50  # 你希望的批次大小

#     with torch.no_grad():
#         for inputs, _ in data_loader:
#             inputs = inputs.to(device)  # 将输入数据移动到正确的设备
#             recon, mu, logvar = model(inputs)
#             BCE, KLD = loss_function(recon, inputs, mu, logvar)
#             total_loss = BCE + KLD  # 计算当前样本的总损失
#             total_samples += 1

#             # 判断当前样本是否有问题
#             if total_loss.item() > threshold:
#                 current_batch_issues += 1

#             # 当累积样本数达到你设定的批次大小时，评估这个批次
#             if total_samples % batch_size == 0:
#                 if current_batch_issues >= batch_size * 0.02:  # 判断这个批次是否有超过5%的样本有问题
#                     print(f"Batch {batch_count} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
#                     total_batches_with_issues += 1
#                     total_issue_count = total_issue_count + current_batch_issues
#                 else:
#                     total_issue_count = total_issue_count + current_batch_issues
#                     print(f"Batch {batch_count} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
#                 current_batch_issues = 0
#                 batch_count += 1

#     total_issue_rate = total_issue_count / total_samples
#     print(f"Total batches with issues: {total_batches_with_issues} out of {batch_count}")
#     print(f"Total problematic samples: {total_issue_count} out of {total_samples} ({(total_issue_rate * 100):.2f}%)")
#     return total_issue_rate

# # Example usage
# # test_dataset = TensorDataset(test_tensor, test_tensor)
# # test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
# issue_rate = detect_quality_issues(model, test_loader, threshold)
# print(f"Percentage of data quality issues detected in the test set: {issue_rate * 100:.2f}%")



In [33]:
# import torch
# from torch.utils.data import DataLoader, TensorDataset

# def detect_quality_issues(model, data_loader, threshold):
#     model.eval()
#     total_issue_count = 0
#     total_batches_with_issues = 0
#     total_samples = 0
#     current_batch_issues = 0
#     batch_count = 0
#     batch_size = len(data_loader.dataset) // 50  # 你希望的批次大小

#     with torch.no_grad():
#         for inputs, _ in data_loader:
#             inputs = inputs.to(device)  # 将输入数据移动到正确的设备
#             recon, mu, logvar = model(inputs)
#             BCE, KLD = loss_function(recon, inputs, mu, logvar)
#             total_loss = BCE + KLD  # 计算当前样本的总损失
#             total_samples += 1

#             # 判断当前样本是否有问题
#             if total_loss.item() > threshold:
#                 current_batch_issues += 1

#             # 当累积样本数达到你设定的批次大小时，评估这个批次
#             if total_samples % batch_size == 0:
#                 if current_batch_issues >= batch_size * 0.02:  # 判断这个批次是否有超过5%的样本有问题
#                     print(f"Batch {batch_count} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
#                     total_batches_with_issues += 1
#                     total_issue_count = total_issue_count + current_batch_issues
#                 else:
#                     total_issue_count = total_issue_count + current_batch_issues
#                     print(f"Batch {batch_count} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
#                 current_batch_issues = 0
#                 batch_count += 1

#     total_issue_rate = total_issue_count / total_samples
#     print(f"Total batches with issues: {total_batches_with_issues} out of {batch_count}")
#     print(f"Total problematic samples: {total_issue_count} out of {total_samples} ({(total_issue_rate * 100):.2f}%)")
#     return total_issue_rate

# # Example usage



# # 假设 data_dirty 已经是一个经过预处理的 DataFrame
# data_dirty_array = data_dirty.values.astype(np.float32)  # 转换为浮点数类型的 NumPy 数组
# test_dirty_tensor = torch.tensor(data_dirty_array)  
# test_dirty_dataset = TensorDataset(test_dirty_tensor, test_dirty_tensor)
# test_dirty_loader = DataLoader(test_dirty_dataset, batch_size=1, shuffle=False) 

# issue_rate = detect_quality_issues(model, test_dirty_loader, threshold)
# print(f"Percentage of data quality issues detected in the test set: {issue_rate * 100:.2f}%")


Batch 0 is ok: 12 out of 2051 samples are faulty (0.59%).
Batch 1 is ok: 4 out of 2051 samples are faulty (0.20%).
Batch 2 is ok: 2 out of 2051 samples are faulty (0.10%).
Batch 3 is ok: 3 out of 2051 samples are faulty (0.15%).
Batch 4 is ok: 2 out of 2051 samples are faulty (0.10%).
Batch 5 is ok: 3 out of 2051 samples are faulty (0.15%).
Batch 6 is ok: 2 out of 2051 samples are faulty (0.10%).
Batch 7 is ok: 0 out of 2051 samples are faulty (0.00%).
Batch 8 is ok: 0 out of 2051 samples are faulty (0.00%).
Batch 9 is ok: 3 out of 2051 samples are faulty (0.15%).
Batch 10 is ok: 5 out of 2051 samples are faulty (0.24%).
Batch 11 is ok: 0 out of 2051 samples are faulty (0.00%).
Batch 12 is ok: 1 out of 2051 samples are faulty (0.05%).
Batch 13 is ok: 2 out of 2051 samples are faulty (0.10%).
Batch 14 is ok: 1 out of 2051 samples are faulty (0.05%).
Batch 15 is ok: 1 out of 2051 samples are faulty (0.05%).
Batch 16 is ok: 1 out of 2051 samples are faulty (0.05%).
Batch 17 is ok: 7 out o

## 保存loss 到文件中

In [36]:
# import torch
# import numpy as np
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.model_selection import train_test_split
# import pandas as pd

# def record_losses(model, data_loader, seed):
#     model.eval()
#     losses = []
    
#     with torch.no_grad():
#         for inputs, _ in data_loader:
#             inputs = inputs.to(device)
#             recon, mu, logvar = model(inputs)
#             BCE, KLD = loss_function(recon, inputs, mu, logvar)
#             total_loss = BCE + KLD
#             losses.append(total_loss.item())
    
#     # Save the losses to a CSV file
#     losses_df = pd.DataFrame(losses, columns=['Loss_dirty_graph'])
#     losses_df.to_csv(f'loss_data_dirty_graph.csv', index=False)
#     print(f"Loss data for random sample data_dirty_graph saved.")

# # 主函数：测试2000个随机样本
# def test_random_sample(model, data_dirty):
#     seed = 42  # Use a fixed seed for reproducibility
#     _, sample_data = train_test_split(data_dirty, test_size=2000, train_size=None, random_state=seed)
#     sample_data_array = sample_data.values.astype(np.float32)
#     sample_tensor = torch.tensor(sample_data_array)
#     sample_dataset = TensorDataset(sample_tensor, sample_tensor)
#     test_loader = DataLoader(sample_dataset, batch_size=1, shuffle=False) 
#     record_losses(model, test_loader, seed)

# # 假设 model, data_dirty, device, loss_function 已经被正确定义和设置
# test_random_sample(model, data_dirty)


In [28]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os

def record_losses(model, data_loader, seed, device, loss_function):
    model.eval()
    losses = []
    
    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            recon, mu, logvar = model(inputs)
            loss = loss_function(recon, inputs, mu, logvar)
            loss_per_sample = loss.sum(dim=1)
            losses.extend(loss_per_sample.tolist())  
    
    return losses

# 主函数：测试两个数据集并将结果保存到同一个CSV文件
def test_and_save_combined_losses(model, data_clean, data_dirty, file_name='/home/sdong/experiments/VAE_method/results/combined_loss_data_gragh.csv'):
    seed = 42  # Use a fixed seed for reproducibility
    # Process clean data
    _, sample_data_clean = train_test_split(data_clean, test_size=2000, train_size=None, random_state=seed)
    #sample_clean_array = sample_data_clean.values.astype(np.float32)
    sample_clean_tensor = torch.tensor(sample_data_clean)
    sample_clean_dataset = TensorDataset(sample_clean_tensor, sample_clean_tensor)
    clean_loader = DataLoader(sample_clean_dataset, batch_size=1024, shuffle=False)
    clean_losses = record_losses(model, clean_loader, seed, device, loss_function)
    
    # Process dirty data
    _, sample_data_dirty = train_test_split(data_dirty, test_size=2000, train_size=None, random_state=seed)
    sample_dirty_array = sample_data_dirty.values.astype(np.float32)
    sample_dirty_tensor = torch.tensor(sample_dirty_array)
    sample_dirty_dataset = TensorDataset(sample_dirty_tensor, sample_dirty_tensor)
    dirty_loader = DataLoader(sample_dirty_dataset, batch_size=1024, shuffle=False)
    dirty_losses = record_losses(model, dirty_loader, seed, device, loss_function)
    
    # Combine and save to CSV
    combined_df = pd.DataFrame({
        'Loss_clean_graph': clean_losses,
        'Loss_dirty_graph': dirty_losses
    })
    combined_df.to_csv(file_name, index=False)
    print(f"Combined loss data saved to {file_name}.")

# 假设 model, data_clean, data_dirty, device, loss_function 已经被正确定义和设置
test_and_save_combined_losses(model, test_data, data_dirty)


Combined loss data saved to /home/sdong/experiments/VAE_method/results/combined_loss_data_gragh.csv.
