In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np


# Load the data
file_path_clean = '/home/sdong/data/chicago_bicycle/data_pr_cleaned_embeddings.csv'
file_path_origi = '/home/sdong/data/chicago_bicycle/data_pr_raw_embeddings.csv'
data = pd.read_csv(file_path_clean)
data_dirty = pd.read_csv(file_path_origi)
# Show every column, but cap the number of rows pandas prints
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
# Show the structure of the dataset.
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print(): doing so appended a stray "None" line to the output.
data_dirty.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688736 entries, 0 to 688735
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       688736 non-null  float64
 1   1       688736 non-null  float64
 2   2       688736 non-null  float64
 3   3       688736 non-null  float64
 4   4       688736 non-null  float64
 5   5       688736 non-null  float64
 6   6       688736 non-null  float64
 7   7       688736 non-null  float64
 8   8       688736 non-null  float64
 9   9       688736 non-null  float64
 10  10      688736 non-null  float64
 11  11      688736 non-null  float64
 12  12      688736 non-null  float64
 13  13      688736 non-null  float64
 14  14      688736 non-null  float64
 15  15      688736 non-null  float64
 16  16      688736 non-null  float64
 17  17      688736 non-null  float64
dtypes: float64(18)
memory usage: 94.6 MB
None


In [2]:
# Check whether every value of a dataset lies between 0 and 1
def check_values_in_range(df, lower=0, upper=1):
    """Return True when every cell of ``df`` lies in the closed interval [lower, upper].

    Cells for which the comparison is false (including NaN) count as out of range.
    """
    inside_bounds = df.ge(lower) & df.le(upper)
    columns_ok = inside_bounds.all()  # one boolean per column
    return columns_ok.all()

# Run the range check on both frames. The two flags are reused further down
# in this cell to decide whether the offending rows are listed.
is_data_in_range = check_values_in_range(data)
is_data_dirty_in_range = check_values_in_range(data_dirty)

# Report the outcome of the check for each frame
print(f"Clean data values are within [0, 1]: {is_data_in_range}")
print(f"Dirty data values are within [0, 1]: {is_data_dirty_in_range}")

# Extra, explicit warning line when the clean frame fails the check
if not is_data_in_range:
    print("Clean data contains values out of range [0, 1].")

# Same warning for the dirty frame
if not is_data_dirty_in_range:
    print("Dirty data contains values out of range [0, 1].")

# Helper for listing the out-of-range values together with their row index
def find_out_of_range_values(df, lower=0, upper=1):
    """Return the rows of ``df`` holding at least one value outside [lower, upper].

    In-range cells are replaced by NaN so that only the offending values stay
    visible; rows left without any value are dropped. The original row index
    is preserved.
    """
    violates_bounds = df.lt(lower) | df.gt(upper)
    offending_only = df.where(violates_bounds)
    return offending_only.dropna(how='all')

# Always compute the offending rows, even when a frame passed the range check.
# Previously these names were only bound inside the `if` branches, so the index
# lookups below raised NameError whenever a frame was already fully in range.
# For an in-range frame the helper simply returns an empty frame.
out_of_range_clean = find_out_of_range_values(data)
out_of_range_dirty = find_out_of_range_values(data_dirty)

if not is_data_in_range:
    print("Out of range values in clean data:")
    print(out_of_range_clean)

if not is_data_dirty_in_range:
    print("Out of range values in dirty data:")
    print(out_of_range_dirty)

# Row labels of the rows that contain at least one out-of-range value
out_of_range_clean_indices = out_of_range_clean.index
out_of_range_dirty_indices = out_of_range_dirty.index

# Drop those rows (a no-op when the index is empty)
data = data.drop(out_of_range_clean_indices)
data_dirty = data_dirty.drop(out_of_range_dirty_indices)

Clean data values are within [0, 1]: False
Dirty data values are within [0, 1]: False
Clean data contains values out of range [0, 1].
Dirty data contains values out of range [0, 1].
Out of range values in clean data:
          0   1   2   3   4   5   6   7   8   9  10  11  12  13   14   15  16  \
60400   1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  NaN  NaN NaN   
109362  NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  1.0  NaN NaN   
244595  NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  NaN  1.0 NaN   

        17  
60400  NaN  
109362 NaN  
244595 NaN  
Out of range values in dirty data:
         0    1   2   3   4   5   6   7   8    9  10   11  12  13  14  15  16  \
145205 NaN  NaN NaN NaN NaN NaN NaN NaN NaN  NaN NaN  1.0 NaN NaN NaN NaN NaN   
257843 NaN  1.0 NaN NaN NaN NaN NaN NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN   
560525 NaN  NaN NaN NaN NaN NaN NaN NaN NaN  1.0 NaN  NaN NaN NaN NaN NaN NaN   

        17  
145205 NaN  
257843 NaN  
560525 

In [3]:
# Inspect the dirty frame again now that the out-of-range rows were dropped
data_dirty.info()

<class 'pandas.core.frame.DataFrame'>
Index: 688733 entries, 0 to 688735
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       688733 non-null  float64
 1   1       688733 non-null  float64
 2   2       688733 non-null  float64
 3   3       688733 non-null  float64
 4   4       688733 non-null  float64
 5   5       688733 non-null  float64
 6   6       688733 non-null  float64
 7   7       688733 non-null  float64
 8   8       688733 non-null  float64
 9   9       688733 non-null  float64
 10  10      688733 non-null  float64
 11  11      688733 non-null  float64
 12  12      688733 non-null  float64
 13  13      688733 non-null  float64
 14  14      688733 non-null  float64
 15  15      688733 non-null  float64
 16  16      688733 non-null  float64
 17  17      688733 non-null  float64
dtypes: float64(18)
memory usage: 99.8 MB


In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split

# `data` is assumed to already be a preprocessed DataFrame
data_array = data.values.astype(np.float32)  # convert to a float32 NumPy array

# First split: 50% training data, 50% held out (validation + test together)
train_data, val_test_data = train_test_split(data_array, test_size=0.5, random_state=42)

# Second split: halve the held-out part into validation and test
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)  # 0.5 x 0.5 = 0.25 of all rows each


# Convert to PyTorch tensors


# Create the data loaders
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32  # or whatever size suits your GPU

train_tensor = torch.tensor(train_data) # 50% of the clean data
# Input and target are the same tensor: the network learns to reconstruct its input
train_dataset = TensorDataset(train_tensor, train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_tensor = torch.tensor(val_data) # 25% of the clean data
val_dataset = TensorDataset(val_tensor, val_tensor)
# batch_size=1: one sample per batch, so a batch loss is a per-sample loss
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

test_tensor = torch.tensor(test_data)  # 25% of the clean data
test_dataset = TensorDataset(test_tensor, test_tensor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # one sample per batch
#len(test_dataset)// 50

# The whole dirty frame (no split) is used as an additional evaluation set
data_dirty_array = data_dirty.values.astype(np.float32)  # convert to a float32 NumPy array
test_dirty_tensor = torch.tensor(data_dirty_array)  # 100% of the dirty data
test_dirty_dataset = TensorDataset(test_dirty_tensor, test_dirty_tensor)
test_dirty_loader = DataLoader(test_dirty_dataset, batch_size=1, shuffle=False)  # one sample per batch


In [5]:
# # 检查数据中是否有NaN或无穷大的值
# if torch.isnan(train_tensor).any() or torch.isinf(train_tensor).any():
#     print("Data contains NaNs or Infs.")
# # 检查数据中是否有NaN或无穷大的值
# if torch.isnan(test_dirty_tensor).any() or torch.isinf(test_dirty_tensor).any():
#     print("Data contains NaNs or Infs.")


In [6]:
import torch
from torch import nn
import torch.nn.functional as F

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    With the default arguments the network is identical to the original
    hard-coded one (18 -> 128 -> 64 -> 20 -> 64 -> 128 -> 18). The layer
    attribute names (fc1, fc2, fc31, fc32, fc4, fc5, fc6) are unchanged, so
    state dicts saved from the original class still load.

    Args:
        input_dim: number of features per sample.
        hidden_dims: widths of the two encoder hidden layers, outermost first;
            the decoder mirrors them in reverse order.
        latent_dim: size of the latent vector (and of mu / logvar).

    Raises:
        ValueError: if ``hidden_dims`` does not contain exactly two widths.
    """

    def __init__(self, input_dim=18, hidden_dims=(128, 64), latent_dim=20):
        super().__init__()
        if len(hidden_dims) != 2:
            raise ValueError("hidden_dims must contain exactly two layer widths")
        outer_dim, inner_dim = hidden_dims

        # Encoder
        self.fc1 = nn.Linear(input_dim, outer_dim)    # Input layer
        self.fc2 = nn.Linear(outer_dim, inner_dim)    # Hidden layer
        self.fc31 = nn.Linear(inner_dim, latent_dim)  # Output layer for mu
        self.fc32 = nn.Linear(inner_dim, latent_dim)  # Output layer for logvar

        # Decoder
        self.fc4 = nn.Linear(latent_dim, inner_dim)   # Input layer
        self.fc5 = nn.Linear(inner_dim, outer_dim)    # Hidden layer
        self.fc6 = nn.Linear(outer_dim, input_dim)    # Output layer

    def encode(self, x):
        """Map inputs to the parameters ``(mu, logvar)`` of the latent distribution."""
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        return self.fc31(h2), self.fc32(h2)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` with ``eps`` sampled from a standard normal.

        Noise is sampled on every call, also in eval mode, so repeated forward
        passes on the same input give different reconstructions.
        """
        std = torch.exp(0.5 * logvar) + 1e-8  # Adding a small constant for numerical stability
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors back to feature space; outputs lie in [0, 1]."""
        h3 = F.relu(self.fc4(z))
        h4 = F.relu(self.fc5(h3))
        return torch.sigmoid(self.fc6(h4))  # Use sigmoid to ensure output is between 0 and 1

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# Instantiate the model (on the CPU) only to print its layer structure;
# the next cell builds a fresh instance and moves it to `device`.
model = VAE()
print(model)


VAE(
  (fc1): Linear(in_features=18, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc31): Linear(in_features=64, out_features=20, bias=True)
  (fc32): Linear(in_features=64, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=128, bias=True)
  (fc6): Linear(in_features=128, out_features=18, bias=True)
)


In [7]:
#device = torch.device("cpu")
# Fresh, untrained model placed on the selected device
model = VAE().to(device)

import torch.optim as optim

# Set up the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Define the loss function
def loss_function(recon_x, x, mu, logvar):
    """Return the two VAE loss terms, each summed over the whole batch.

    Args:
        recon_x: reconstruction, shape (batch, features), values expected in [0, 1].
        x: original inputs; reshaped to rows with the same feature width as ``recon_x``.
        mu: latent means produced by the encoder.
        logvar: latent log-variances produced by the encoder.

    Returns:
        ``(BCE, KLD)``: the summed binary cross-entropy between ``recon_x`` and
        ``x``, and the KL divergence of N(mu, exp(logvar)) from N(0, I).
        Callers add the two terms to obtain the total loss.
    """
    # Keep the reconstruction inside [0, 1], as binary_cross_entropy requires
    recon_x = torch.clamp(recon_x, 0, 1)
    # Take the feature width from the reconstruction instead of hard-coding 18,
    # so the loss also works for models with a different input size.
    target = x.reshape(-1, recon_x.size(-1)).float()
    BCE = F.binary_cross_entropy(recon_x, target, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE, KLD


  from .autonotebook import tqdm as notebook_tqdm


In [8]:



# Define the training routine
def train(epoch):
    """Run one optimisation pass over ``train_loader`` and print loss statistics.

    Works on the notebook-level ``model``, ``optimizer``, ``train_loader``,
    ``device`` and ``loss_function``.

    Args:
        epoch: epoch number; only used in the printed messages.

    Raises:
        ValueError: if an input batch or its reconstruction contains a value
            outside [0, 1].
    """
    model.train()
    total_BCE = 0
    total_KLD = 0
    total_loss = 0
    # The dataset yields (input, target) pairs holding the same tensor; the target is ignored.
    for step, (batch, _target) in enumerate(train_loader):
        batch = batch.to(device)

        # Validate the range of the inputs before spending a forward pass on them
        input_out_of_range = (batch < 0).any() or (batch > 1).any()
        if input_out_of_range:
            raise ValueError("Input data contains values out of range [0, 1]")

        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        recon_out_of_range = (reconstruction < 0).any() or (reconstruction > 1).any()
        if recon_out_of_range:
            raise ValueError("Warning: recon_batch contains values out of range [0, 1]")

        BCE, KLD = loss_function(reconstruction, batch, mu, logvar)
        loss = BCE + KLD
        loss.backward()
        optimizer.step()

        # Running sums over the epoch (plain Python floats)
        total_loss += loss.item()
        total_BCE += BCE.item()
        total_KLD += KLD.item()

        # Progress line every 100 batches
        if step % 100 == 0:
            seen = step * len(batch)
            percent_done = 100. * step / len(train_loader)
            per_sample_loss = loss.item() / len(batch)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, seen, len(train_loader.dataset),
                percent_done, per_sample_loss))
    print(f'Epoch: {epoch} Average BCE: {total_BCE / len(train_loader.dataset)} Average KLD: {total_KLD / len(train_loader.dataset)} Total Loss: {total_loss / len(train_loader.dataset)}')

# Train the model
num_epochs = 1  # adjust as needed
for epoch in range(1, num_epochs + 1):
    train(epoch)

# Save the model's state dict
torch.save(model.state_dict(), 'vae_model_bicycle_graph.pth')

print("Model saved to vae_model_bicycle_graph.pth")

# Debugging aid: wait for pending CUDA work, then release cached GPU memory
if torch.cuda.is_available():
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    print("CUDA cache cleared.")


Epoch: 1 Average BCE: 11.761324358538658 Average KLD: 0.0009563937635409178 Total Loss: 11.762280756426684
Model saved to vae_model_bicycle_graph.pth
CUDA cache cleared.


In [9]:
def evaluate_model(model, data_loader):
    """Return the mean loss (BCE + KLD) per dataset sample of ``model`` on ``data_loader``.

    The summed loss of every batch is accumulated and divided by the dataset
    size. The dataset size is printed as a side effect. Uses the notebook-level
    ``device`` and ``loss_function``.
    """
    model.eval()  # evaluation mode

    def _batch_losses():
        # The loader yields (inputs, targets); the targets are not needed here.
        for batch, _targets in data_loader:
            batch = batch.to(device)  # make sure the inputs live on the right device
            reconstruction, mu, logvar = model(batch)
            bce, kld = loss_function(reconstruction, batch, mu, logvar)
            yield (bce + kld).item()  # total loss of this batch as a plain number

    with torch.no_grad():  # gradients are not needed for evaluation
        summed_loss = sum(_batch_losses())

    n_samples = len(data_loader.dataset)
    print(n_samples)
    return summed_loss / n_samples


# Rebuild the network and restore the trained weights. map_location lets the
# checkpoint load even when it was saved on a GPU and the current `device`
# is the CPU (without it torch.load tries to restore onto the saving device).
model = VAE().to(device)
model.load_state_dict(torch.load('vae_model_bicycle_graph.pth', map_location=device))
# Average loss per sample on the validation, test and dirty data
val_loss = evaluate_model(model, val_loader)
test_loss = evaluate_model(model, test_loader)
test_dirty_loss = evaluate_model(model, test_dirty_loader)
print(f"Average loss on validation data: {val_loss}")
print(f"Average loss on test data: {test_loss}")
print(f"Average loss on test dirty data: {test_dirty_loss}")
# 简单的基于阈值的数据质量问题判断
# 这里我们需要设置一个阈值来决定什么样的重构误差被认为是“异常”的，此阈值可以基于训练集或验证集的性能来确定
# 假设我们根据验证集确定阈值
# threshold = np.quantile([loss_function(model(recon, data.to(device), mu, logvar).item() for data, _ in val_loader], 0.95)
# print(f"Loss threshold for detecting data quality issues: {threshold}")

# # 判断测试集
# quality_issues = test_loss > threshold
# print(f"Data quality issues detected: {quality_issues}")


118690
118690
688733
Average loss on validation data: 11.761952425959699
Average loss on test data: 11.758920027615297
Average loss on test dirty data: 12.037338681076745


In [10]:
def collect_reconstruction_errors(model, data_loader):
    """Return one loss value per batch of ``data_loader`` as a list of floats.

    Each value is the batch's total loss (BCE + KLD) divided by the number of
    samples in that batch. Uses the notebook-level ``device`` and
    ``loss_function``.
    """
    model.eval()

    def _mean_batch_loss(batch):
        batch = batch.to(device)  # move the inputs to the right device
        reconstruction, mu, logvar = model(batch)
        bce, kld = loss_function(reconstruction, batch, mu, logvar)
        return (bce + kld).item() / batch.size(0)

    with torch.no_grad():
        # The loader yields (inputs, labels); the labels are ignored.
        return [_mean_batch_loss(batch) for batch, _labels in data_loader]

# Collect the reconstruction errors on the validation set
val_errors = collect_reconstruction_errors(model, val_loader)
threshold = np.quantile(val_errors, 0.95)  # use the 95th percentile as the threshold
threshold = threshold * 1  # scaling factor for the threshold; 1 leaves it unchanged
print(f"Loss threshold for detecting data quality issues: {threshold}")

# Summary statistics of the validation errors, for context around the threshold
min_val_error = min(val_errors)
max_val_error = max(val_errors)
mean_val_error = sum(val_errors) / len(val_errors)
print(f"Min validation error: {min_val_error}")
print(f"Max validation error: {max_val_error}")
print(f"Mean validation error: {mean_val_error}")
print(f"95th percentile of validation errors: {np.quantile(val_errors, 0.95)}")
print(f"Maximum validation error (100th percentile): {np.quantile(val_errors, 1)}")


Loss threshold for detecting data quality issues: 12.56061749458313
Min validation error: 10.639154434204102
Max validation error: 14.13665771484375
Mean validation error: 11.761976583561024
95th percentile of validation errors: 12.56061749458313
Maximum validation error (100th percentile): 14.13665771484375


In [11]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def detect_quality_issues(model, data_loader, threshold, n_groups=50, group_issue_ratio=0.02):
    """Flag samples whose loss exceeds ``threshold`` and report them group by group.

    The loader is expected to yield one sample per batch (batch_size=1): every
    batch is counted as one sample and its summed loss (BCE + KLD) is compared
    with ``threshold``. Consecutive samples are bucketed into groups of
    ``len(dataset) // n_groups`` samples, and a line is printed per full group.
    Uses the notebook-level ``device`` and ``loss_function``.

    Args:
        model: network returning ``(reconstruction, mu, logvar)``.
        data_loader: loader yielding ``(inputs, targets)``; targets are ignored.
        threshold: loss above which a sample counts as faulty.
        n_groups: number of groups the dataset is divided into for reporting.
        group_issue_ratio: a group is reported as problematic when at least this
            fraction of its samples is faulty.

    Returns:
        Fraction of faulty samples among all processed samples (0.0 when the
        loader is empty).
    """
    model.eval()
    total_issue_count = 0
    total_batches_with_issues = 0
    total_samples = 0
    current_batch_issues = 0
    batch_count = 0
    # Group size for reporting. Clamped to 1 so that datasets with fewer than
    # n_groups samples do not produce a zero group size (modulo by zero below).
    batch_size = max(1, len(data_loader.dataset) // n_groups)

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)  # move the inputs to the right device
            recon, mu, logvar = model(inputs)
            BCE, KLD = loss_function(recon, inputs, mu, logvar)
            total_loss = BCE + KLD  # total loss of the current sample
            total_samples += 1

            # Is the current sample faulty?
            if total_loss.item() > threshold:
                current_batch_issues += 1
                # Count it in the grand total right away, so that samples in a
                # trailing, incomplete group are not lost from the final rate.
                total_issue_count += 1

            # A group is complete: evaluate and report it
            if total_samples % batch_size == 0:
                if current_batch_issues >= batch_size * group_issue_ratio:
                    print(f"Batch {batch_count} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
                    total_batches_with_issues += 1
                else:
                    print(f"Batch {batch_count} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
                current_batch_issues = 0
                batch_count += 1

    # Guard against an empty loader
    total_issue_rate = total_issue_count / total_samples if total_samples else 0.0
    print(f"Total batches with issues: {total_batches_with_issues} out of {batch_count}")
    print(f"Total problematic samples: {total_issue_count} out of {total_samples} ({(total_issue_rate * 100):.2f}%)")
    return total_issue_rate

# Example usage: run the detector on the clean held-out test split
# test_dataset = TensorDataset(test_tensor, test_tensor)
# test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
issue_rate = detect_quality_issues(model, test_loader, threshold)
print(f"Percentage of data quality issues detected in the test set: {issue_rate * 100:.2f}%")



Batch 0 is ok: 18 out of 2373 samples are faulty (0.76%).
Batch 1 is ok: 27 out of 2373 samples are faulty (1.14%).
Batch 2 is ok: 28 out of 2373 samples are faulty (1.18%).
Batch 3 is ok: 20 out of 2373 samples are faulty (0.84%).
Batch 4 is ok: 21 out of 2373 samples are faulty (0.88%).
Batch 5 is ok: 18 out of 2373 samples are faulty (0.76%).
Batch 6 is ok: 20 out of 2373 samples are faulty (0.84%).
Batch 7 is ok: 19 out of 2373 samples are faulty (0.80%).
Batch 8 is ok: 23 out of 2373 samples are faulty (0.97%).
Batch 9 is ok: 18 out of 2373 samples are faulty (0.76%).
Batch 10 is ok: 15 out of 2373 samples are faulty (0.63%).
Batch 11 is ok: 32 out of 2373 samples are faulty (1.35%).
Batch 12 is ok: 33 out of 2373 samples are faulty (1.39%).
Batch 13 is ok: 25 out of 2373 samples are faulty (1.05%).
Batch 14 is ok: 22 out of 2373 samples are faulty (0.93%).
Batch 15 is ok: 27 out of 2373 samples are faulty (1.14%).
Batch 16 is ok: 20 out of 2373 samples are faulty (0.84%).
Batch 1

In [24]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def detect_quality_issues(model, data_loader, threshold, n_groups=50, group_issue_ratio=0.02):
    """Flag samples whose loss exceeds ``threshold`` and report them group by group.

    The loader is expected to yield one sample per batch (batch_size=1): every
    batch is counted as one sample and its summed loss (BCE + KLD) is compared
    with ``threshold``. Consecutive samples are bucketed into groups of
    ``len(dataset) // n_groups`` samples, and a line is printed per full group.
    Uses the notebook-level ``device`` and ``loss_function``.

    Args:
        model: network returning ``(reconstruction, mu, logvar)``.
        data_loader: loader yielding ``(inputs, targets)``; targets are ignored.
        threshold: loss above which a sample counts as faulty.
        n_groups: number of groups the dataset is divided into for reporting.
        group_issue_ratio: a group is reported as problematic when at least this
            fraction of its samples is faulty.

    Returns:
        Fraction of faulty samples among all processed samples (0.0 when the
        loader is empty).
    """
    model.eval()
    total_issue_count = 0
    total_batches_with_issues = 0
    total_samples = 0
    current_batch_issues = 0
    batch_count = 0
    # Group size for reporting. Clamped to 1 so that datasets with fewer than
    # n_groups samples do not produce a zero group size (modulo by zero below).
    batch_size = max(1, len(data_loader.dataset) // n_groups)

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)  # move the inputs to the right device
            recon, mu, logvar = model(inputs)
            BCE, KLD = loss_function(recon, inputs, mu, logvar)
            total_loss = BCE + KLD  # total loss of the current sample
            total_samples += 1

            # Is the current sample faulty?
            if total_loss.item() > threshold:
                current_batch_issues += 1
                # Count it in the grand total right away, so that samples in a
                # trailing, incomplete group are not lost from the final rate.
                total_issue_count += 1

            # A group is complete: evaluate and report it
            if total_samples % batch_size == 0:
                if current_batch_issues >= batch_size * group_issue_ratio:
                    print(f"Batch {batch_count} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
                    total_batches_with_issues += 1
                else:
                    print(f"Batch {batch_count} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
                current_batch_issues = 0
                batch_count += 1

    # Guard against an empty loader
    total_issue_rate = total_issue_count / total_samples if total_samples else 0.0
    print(f"Total batches with issues: {total_batches_with_issues} out of {batch_count}")
    print(f"Total problematic samples: {total_issue_count} out of {total_samples} ({(total_issue_rate * 100):.2f}%)")
    return total_issue_rate

# Example usage



# `data_dirty` is assumed to already be a preprocessed DataFrame
data_dirty_array = data_dirty.values.astype(np.float32)  # convert to a float32 NumPy array
test_dirty_tensor = torch.tensor(data_dirty_array)  
test_dirty_dataset = TensorDataset(test_dirty_tensor, test_dirty_tensor)
# One sample per batch, so each batch loss is a per-sample loss
test_dirty_loader = DataLoader(test_dirty_dataset, batch_size=1, shuffle=False) 

# Run the detector on the whole dirty dataset
issue_rate = detect_quality_issues(model, test_dirty_loader, threshold)
print(f"Percentage of data quality issues detected in the test set: {issue_rate * 100:.2f}%")


Batch 0 is problematic: 1907 out of 13774 samples are faulty (13.84%).
Batch 1 is problematic: 1920 out of 13774 samples are faulty (13.94%).


KeyboardInterrupt: 

## Loop test

In [11]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

def detect_quality_issues(model, data_loader, threshold, seed, max_issue_ratio=0.1):
    """Decide whether one random sample of the data looks problematic.

    The loader is expected to yield one sample per batch (batch_size=1): the
    summed loss (BCE + KLD) of each batch is compared with ``threshold``.
    Uses the notebook-level ``device`` and ``loss_function``.

    Args:
        model: network returning ``(reconstruction, mu, logvar)``.
        data_loader: loader yielding ``(inputs, targets)``; targets are ignored.
        threshold: loss above which a sample counts as faulty.
        seed: identifier of the random sample; only used in the printed message.
        max_issue_ratio: the sample is problematic when more than this fraction
            of it is faulty (default 10%).

    Returns:
        True when the sample is problematic, False otherwise (also for an
        empty sample).
    """
    model.eval()
    current_batch_issues = 0
    batch_size = len(data_loader.dataset)  # size of the whole sample, not a loader batch size

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            recon, mu, logvar = model(inputs)
            BCE, KLD = loss_function(recon, inputs, mu, logvar)
            total_loss = BCE + KLD
            # Is the current sample faulty?
            if total_loss.item() > threshold:
                current_batch_issues += 1

    # An empty sample previously crashed with ZeroDivisionError while the
    # percentage was being formatted; report it explicitly instead.
    if batch_size == 0:
        print(f"Random sample {seed} is empty: nothing to evaluate.")
        return False

    # Problematic when more than max_issue_ratio of the samples are faulty
    if current_batch_issues > batch_size * max_issue_ratio:
        print(f"Random sample {seed} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
        return True
    else:
        print(f"Random sample {seed} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
        return False

# Main routine: run the check on 50 random samples
def test_random_samples(model, data_dirty, threshold):
    """Run ``detect_quality_issues`` on 50 random 10% samples of ``data_dirty``.

    ``data_dirty`` is a DataFrame; seeds 0..49 select the samples. Prints how
    many samples were judged problematic.
    """
    verdicts = []
    for seed in range(50):
        # Keep only the held-out 10% of the rows as this round's sample
        sample_data = train_test_split(data_dirty, test_size=0.1, random_state=seed)[1]
        sample_tensor = torch.tensor(sample_data.values.astype(np.float32))
        # One sample per batch, input and target being the same tensor
        test_dirty_loader = DataLoader(
            TensorDataset(sample_tensor, sample_tensor),
            batch_size=1,
            shuffle=False,
        )
        verdicts.append(detect_quality_issues(model, test_dirty_loader, threshold, seed))

    problematic_batches = sum(1 for verdict in verdicts if verdict)
    print(f"Total problematic batches across all tests: {problematic_batches}")

# Assumes model, data_dirty, threshold and device have been defined by earlier cells
test_random_samples(model, data_dirty, threshold)


Random sample 0 is problematic: 14130 out of 68874 samples are faulty (20.52%).
Random sample 1 is problematic: 13941 out of 68874 samples are faulty (20.24%).
Random sample 2 is problematic: 14016 out of 68874 samples are faulty (20.35%).
Random sample 3 is problematic: 14183 out of 68874 samples are faulty (20.59%).
Random sample 4 is problematic: 13949 out of 68874 samples are faulty (20.25%).
Random sample 5 is problematic: 14062 out of 68874 samples are faulty (20.42%).
Random sample 6 is problematic: 13942 out of 68874 samples are faulty (20.24%).
Random sample 7 is problematic: 13903 out of 68874 samples are faulty (20.19%).
Random sample 8 is problematic: 13982 out of 68874 samples are faulty (20.30%).
Random sample 9 is problematic: 13990 out of 68874 samples are faulty (20.31%).
Random sample 10 is problematic: 14107 out of 68874 samples are faulty (20.48%).
Random sample 11 is problematic: 13970 out of 68874 samples are faulty (20.28%).
Random sample 12 is problematic: 13980

KeyboardInterrupt: 

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

def detect_quality_issues(model, data_loader, threshold, seed, max_issue_ratio=0.1):
    """Decide whether one random sample of the data looks problematic.

    The loader is expected to yield one sample per batch (batch_size=1): the
    summed loss (BCE + KLD) of each batch is compared with ``threshold``.
    Uses the notebook-level ``device`` and ``loss_function``.

    Args:
        model: network returning ``(reconstruction, mu, logvar)``.
        data_loader: loader yielding ``(inputs, targets)``; targets are ignored.
        threshold: loss above which a sample counts as faulty.
        seed: identifier of the random sample; only used in the printed message.
        max_issue_ratio: the sample is problematic when more than this fraction
            of it is faulty (default 10%).

    Returns:
        True when the sample is problematic, False otherwise (also for an
        empty sample).
    """
    model.eval()
    current_batch_issues = 0
    batch_size = len(data_loader.dataset)  # size of the whole sample, not a loader batch size

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            recon, mu, logvar = model(inputs)
            BCE, KLD = loss_function(recon, inputs, mu, logvar)
            total_loss = BCE + KLD
            # Is the current sample faulty?
            if total_loss.item() > threshold:
                current_batch_issues += 1

    # An empty sample previously crashed with ZeroDivisionError while the
    # percentage was being formatted; report it explicitly instead.
    if batch_size == 0:
        print(f"Random sample {seed} is empty: nothing to evaluate.")
        return False

    # Problematic when more than max_issue_ratio of the samples are faulty
    if current_batch_issues > batch_size * max_issue_ratio:
        print(f"Random sample {seed} is problematic: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
        return True
    else:
        print(f"Random sample {seed} is ok: {current_batch_issues} out of {batch_size} samples are faulty ({(current_batch_issues/batch_size * 100):.2f}%).")
        return False

# Main routine: run the check on 50 random samples
def test_random_samples(model, data, threshold):
    """Run ``detect_quality_issues`` on 50 random 20% samples of ``data``.

    ``data`` is passed straight to ``torch.tensor`` after splitting, so it is
    expected to be array-like rather than a DataFrame. Seeds 0..49 select the
    samples. Prints how many samples were judged problematic.
    """
    verdicts = []
    for seed in range(50):
        # Keep only the held-out 20% of the rows as this round's sample
        sample_data = train_test_split(data, test_size=0.2, random_state=seed)[1]
        sample_tensor = torch.tensor(sample_data)
        # One sample per batch, input and target being the same tensor
        test_dirty_loader = DataLoader(
            TensorDataset(sample_tensor, sample_tensor),
            batch_size=1,
            shuffle=False,
        )
        verdicts.append(detect_quality_issues(model, test_dirty_loader, threshold, seed))

    problematic_batches = sum(1 for verdict in verdicts if verdict)
    print(f"Total problematic batches across all tests: {problematic_batches}")


# Assumes model, test_data, threshold and device have been defined by earlier cells.
# Here the clean held-out test split (a NumPy array) is sampled, not the dirty data.
test_random_samples(model, test_data, threshold)


Random sample 0 is ok: 1217 out of 23738 samples are faulty (5.13%).


KeyboardInterrupt: 

## Save the losses to a file

In [13]:
# import torch
# import numpy as np
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.model_selection import train_test_split
# import pandas as pd

# def record_losses(model, data_loader, seed):
#     model.eval()
#     losses = []
    
#     with torch.no_grad():
#         for inputs, _ in data_loader:
#             inputs = inputs.to(device)
#             recon, mu, logvar = model(inputs)
#             BCE, KLD = loss_function(recon, inputs, mu, logvar)
#             total_loss = BCE + KLD
#             losses.append(total_loss.item())
    
#     # Save the losses to a CSV file
#     losses_df = pd.DataFrame(losses, columns=['Loss_dirty_graph'])
#     losses_df.to_csv(f'loss_data_dirty_graph.csv', index=False)
#     print(f"Loss data for random sample data_dirty_graph saved.")

# # 主函数：测试2000个随机样本
# def test_random_sample(model, data_dirty):
#     seed = 42  # Use a fixed seed for reproducibility
#     _, sample_data = train_test_split(data_dirty, test_size=2000, train_size=None, random_state=seed)
#     sample_data_array = sample_data.values.astype(np.float32)
#     sample_tensor = torch.tensor(sample_data_array)
#     sample_dataset = TensorDataset(sample_tensor, sample_tensor)
#     test_loader = DataLoader(sample_dataset, batch_size=1, shuffle=False) 
#     record_losses(model, test_loader, seed)

# # 假设 model, data_dirty, device, loss_function 已经被正确定义和设置
# test_random_sample(model, data_dirty)


Loss data for random sample data_dirty_graph saved.


In [17]:
# import torch
# import numpy as np
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.model_selection import train_test_split
# import pandas as pd
# import os

# def record_losses(model, data_loader, seed, device, loss_function):
#     model.eval()
#     losses = []
    
#     with torch.no_grad():
#         for inputs, _ in data_loader:
#             inputs = inputs.to(device)
#             recon, mu, logvar = model(inputs)
#             BCE, KLD = loss_function(recon, inputs, mu, logvar)
#             total_loss = BCE + KLD
#             losses.append(total_loss.item())
    
#     return losses

# # 主函数：测试两个数据集并将结果保存到同一个CSV文件
# def test_and_save_combined_losses(model, data_clean, data_dirty, file_name='/home/sdong/experiments/VAE_method/results/combined_loss_data_gragh.csv'):
#     seed = 42  # Use a fixed seed for reproducibility
#     # Process clean data
#     _, sample_data_clean = train_test_split(data_clean, test_size=2000, train_size=None, random_state=seed)
#     #sample_clean_array = sample_data_clean.values.astype(np.float32)
#     sample_clean_tensor = torch.tensor(sample_data_clean)
#     sample_clean_dataset = TensorDataset(sample_clean_tensor, sample_clean_tensor)
#     clean_loader = DataLoader(sample_clean_dataset, batch_size=1, shuffle=False)
#     clean_losses = record_losses(model, clean_loader, seed, device, loss_function)
    
#     # Process dirty data
#     _, sample_data_dirty = train_test_split(data_dirty, test_size=2000, train_size=None, random_state=seed)
#     sample_dirty_array = sample_data_dirty.values.astype(np.float32)
#     sample_dirty_tensor = torch.tensor(sample_dirty_array)
#     sample_dirty_dataset = TensorDataset(sample_dirty_tensor, sample_dirty_tensor)
#     dirty_loader = DataLoader(sample_dirty_dataset, batch_size=1, shuffle=False)
#     dirty_losses = record_losses(model, dirty_loader, seed, device, loss_function)
    
#     # Combine and save to CSV
#     combined_df = pd.DataFrame({
#         'Loss_clean_graph': clean_losses,
#         'Loss_dirty_graph': dirty_losses
#     })
#     combined_df.to_csv(file_name, index=False)
#     print(f"Combined loss data saved to {file_name}.")

# # 假设 model, data_clean, data_dirty, device, loss_function 已经被正确定义和设置
# test_and_save_combined_losses(model, test_data, data_dirty)


Combined loss data saved to /home/sdong/experiments/VAE_method/results/combined_loss_data_gragh.csv.
