In [1]:
import os
import pandas as pd
import numpy as np

def analysis(base_dir, min_epoch=449):
    """Average the late-epoch training metrics of every setting under ``base_dir``.

    Expected layout: ``base_dir/<setting>/<seed>/metrics.csv``.

    Hyper-parameters are parsed from the ``_``-separated tokens of the setting
    directory name: ``lr<float>``, ``wd<float>``, ``kld<float>`` and
    ``schedgamma<float|None>``. A hyper-parameter whose token is absent stays
    ``None``.

    For each seed, the columns ``KLD``, ``loss``, ``Reconstruction_Loss`` and,
    when present, ``Label_Loss`` are averaged over the rows whose ``epoch`` is
    strictly greater than ``min_epoch`` (missing values are ignored). The
    per-seed means are then averaged over the seeds of the setting.

    Anything that cannot be used is reported with ``print`` and skipped:
    settings whose name fails to parse, seeds without a readable
    ``metrics.csv``, seeds missing a required column, and seeds that have no
    value past ``min_epoch`` for a required metric.

    Args:
        base_dir: Directory containing one sub-directory per setting.
        min_epoch: Only rows with ``epoch > min_epoch`` are averaged.
            Defaults to 449.

    Returns:
        pandas.DataFrame with one row per usable setting and the columns
        ``lr``, ``weight_decay``, ``kld_weight``, ``scheduler_gamma``,
        ``avg_kld``, ``avg_loss``, ``avg_recon_loss``, ``avg_label_loss``
        (``avg_label_loss`` is ``None`` when no seed provided ``Label_Loss``).
        The columns are present even when no setting is usable.
    """
    columns = [
        "lr",
        "weight_decay",
        "kld_weight",
        "scheduler_gamma",
        "avg_kld",
        "avg_loss",
        "avg_recon_loss",
        "avg_label_loss",
    ]
    results = []

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible across machines and runs.
    for setting in sorted(os.listdir(base_dir)):
        setting_path = os.path.join(base_dir, setting)
        if not os.path.isdir(setting_path):
            continue  # ignore stray files next to the setting directories

        # Recover the hyper-parameters encoded in the directory name.
        lr, weight_decay, kld_weight, scheduler_gamma = None, None, None, None
        try:
            for param in setting.split("_"):
                if param.startswith("lr"):
                    lr = float(param.replace("lr", ""))
                elif param.startswith("wd"):
                    weight_decay = float(param.replace("wd", ""))
                elif param.startswith("kld"):
                    kld_weight = float(param.replace("kld", ""))
                elif param.startswith("schedgamma"):
                    gamma_value = param.replace("schedgamma", "")
                    scheduler_gamma = None if gamma_value == "None" else float(gamma_value)
        except ValueError as e:
            print(f"Error parsing parameters in setting: {setting}. Skipping... {e}")
            continue

        # Per-seed means for this setting.
        seed_kld_list = []
        seed_loss_list = []
        seed_recon_loss_list = []
        seed_label_loss_list = []

        for seed_dir in sorted(os.listdir(setting_path)):
            seed_path = os.path.join(setting_path, seed_dir)
            if not os.path.isdir(seed_path):
                continue

            metrics_file = os.path.join(seed_path, "metrics.csv")
            if not os.path.exists(metrics_file):
                print(f"Metrics file not found in {seed_path}")
                continue

            try:
                data = pd.read_csv(metrics_file)
            except Exception as e:
                print(f"Error reading metrics file: {metrics_file}. Skipping... {e}")
                continue

            try:
                # Select the late-epoch rows once and reuse them for every metric.
                tail = data.loc[data['epoch'] > min_epoch]
                kld = tail['KLD'].dropna().mean()
                loss = tail['loss'].dropna().mean()
                recon_loss = tail['Reconstruction_Loss'].dropna().mean()

                # Label_Loss is optional.
                if 'Label_Loss' in data.columns:
                    label_loss = tail['Label_Loss'].dropna().mean()
                else:
                    label_loss = None
            except KeyError as e:
                print(f"Missing required columns in metrics file: {metrics_file}. Skipping... {e}")
                continue
            except Exception as e:
                print(f"Error calculating metrics for: {metrics_file}. Skipping... {e}")
                continue

            # The mean of an empty selection is NaN. Letting it through would
            # turn the average over all seeds of this setting into NaN.
            if pd.isna(kld) or pd.isna(loss) or pd.isna(recon_loss):
                print(
                    f"No values after epoch {min_epoch} for a required metric in "
                    f"{metrics_file}. Skipping..."
                )
                continue

            seed_kld_list.append(kld)
            seed_loss_list.append(loss)
            seed_recon_loss_list.append(recon_loss)
            if label_loss is not None and not pd.isna(label_loss):
                seed_label_loss_list.append(label_loss)

        # The three required lists are filled together, so one check suffices.
        if not seed_kld_list:
            print(f"No valid data found for setting {setting}")
            continue

        results.append({
            "lr": lr,
            "weight_decay": weight_decay,
            "kld_weight": kld_weight,
            "scheduler_gamma": scheduler_gamma,
            "avg_kld": np.mean(seed_kld_list),
            "avg_loss": np.mean(seed_loss_list),
            "avg_recon_loss": np.mean(seed_recon_loss_list),
            # None when no seed of this setting provided a Label_Loss value.
            "avg_label_loss": np.mean(seed_label_loss_list) if seed_label_loss_list else None,
        })

    # Fixing the columns keeps the schema stable when `results` is empty.
    results_df = pd.DataFrame(results, columns=columns)
    return results_df

In [2]:
import os
import pandas as pd
import numpy as np

def calculate_kl_est(base_dir):
    """Average the KL estimates of every setting under ``base_dir`` across seeds.

    Expected layout: ``base_dir/<setting>/<seed>/kld.csv``, where ``kld.csv``
    is a header-less CSV of numeric KL estimates.

    Hyper-parameters are parsed from the ``_``-separated tokens of the setting
    directory name: ``lr<float>``, ``wd<float>``, ``kld<float>`` and
    ``schedgamma<float|None>``. A hyper-parameter whose token is absent stays
    ``None``.

    For each seed the mean of all values in ``kld.csv`` is taken; the per-seed
    means are then averaged over the seeds of the setting.

    Anything that cannot be used is reported with ``print`` and skipped:
    settings whose name fails to parse, seed directories without a readable
    ``kld.csv``, and settings left with no usable seed. Non-directory entries
    are ignored silently.

    Args:
        base_dir: Directory containing one sub-directory per setting.

    Returns:
        pandas.DataFrame with one row per usable setting and the columns
        ``lr``, ``weight_decay``, ``kld_weight``, ``scheduler_gamma``,
        ``avg_kl_est``. The columns are present even when no setting is
        usable, so callers can still sort by ``avg_kl_est``.
    """
    columns = ["lr", "weight_decay", "kld_weight", "scheduler_gamma", "avg_kl_est"]
    results = []

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible across machines and runs.
    for setting in sorted(os.listdir(base_dir)):
        setting_path = os.path.join(base_dir, setting)
        if not os.path.isdir(setting_path):
            continue  # ignore stray files next to the setting directories

        # Recover the hyper-parameters encoded in the directory name.
        lr, weight_decay, kld_weight, scheduler_gamma = None, None, None, None
        try:
            for param in setting.split("_"):
                if param.startswith("lr"):
                    lr = float(param.replace("lr", ""))
                elif param.startswith("wd"):
                    weight_decay = float(param.replace("wd", ""))
                elif param.startswith("kld"):
                    kld_weight = float(param.replace("kld", ""))
                elif param.startswith("schedgamma"):
                    gamma_value = param.replace("schedgamma", "")
                    scheduler_gamma = None if gamma_value == "None" else float(gamma_value)
        except ValueError as e:
            print(f"Error parsing parameters in setting: {setting}. Skipping... {e}")
            continue

        # Mean KL estimate of each usable seed.
        seed_kl_est_list = []

        for seed_dir in sorted(os.listdir(setting_path)):
            seed_path = os.path.join(setting_path, seed_dir)
            if not os.path.isdir(seed_path):
                continue  # a stray file here would otherwise crash read_csv

            kl_file = os.path.join(seed_path, "kld.csv")
            if not os.path.exists(kl_file):
                print(f"KL file not found in {seed_path}")
                continue

            try:
                kl_est = pd.read_csv(kl_file, header=None).values.squeeze()
                avg_kl_est = kl_est.mean()
            except Exception as e:
                print(f"Error reading KL file: {kl_file}. Skipping... {e}")
                continue

            seed_kl_est_list.append(avg_kl_est)

        # np.mean([]) would yield NaN plus a RuntimeWarning; report instead.
        if not seed_kl_est_list:
            print(f"No valid data found for setting {setting}")
            continue

        results.append({
            "lr": lr,
            "weight_decay": weight_decay,
            "kld_weight": kld_weight,
            "scheduler_gamma": scheduler_gamma,
            "avg_kl_est": np.mean(seed_kl_est_list),
        })

    # Fixing the columns keeps the schema stable when `results` is empty.
    results_df = pd.DataFrame(results, columns=columns)
    return results_df

In [3]:
# Rank every setting under logs/sim/SimQRVAE by its seed-averaged KL estimate
# and display the three lowest.
base_dir = "logs/sim/SimQRVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True).iloc[:3,:]

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
5,0.01,0.0,0.8,0.95,0.501239
4,0.005,0.0,0.8,0.95,1.528569
0,0.01,0.0,0.8,0.9,1.586965


In [4]:
# Seed-averaged KL estimate of every setting under logs/sim/SimVAE, lowest first.
base_dir = "logs/sim/SimVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True)

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
0,0.01,0.0,0.8,0.95,1.381858


## Sim_c: KL estimates for the runs under `logs/sim_cond_v2`

In [5]:
# Rank every setting under logs/sim_cond_v2/SimCQRVAE by its seed-averaged KL
# estimate and display the three lowest.
base_dir = "logs/sim_cond_v2/SimCQRVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True).iloc[:3,:]

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
0,0.01,0.0,0.8,0.95,5.040411


In [6]:
# Seed-averaged KL estimate of every setting under logs/sim_cond_v2/SimCVAE,
# lowest first.
base_dir = "logs/sim_cond_v2/SimCVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True)

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
0,0.01,0.0,0.8,0.95,5.607318


In [7]:
# Seed-averaged KL estimate of every setting under logs/sim_cond_v2/SimQRVAE,
# lowest first.
base_dir = "logs/sim_cond_v2/SimQRVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True)

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
0,0.001,0.0,0.8,,6.062596


In [8]:
# Seed-averaged KL estimate of every setting under logs/sim_cond_v2/SimVAE,
# lowest first.
base_dir = "logs/sim_cond_v2/SimVAE"
results_df = calculate_kl_est(base_dir)
results_df.sort_values(by="avg_kl_est", ascending=True)

Unnamed: 0,lr,weight_decay,kld_weight,scheduler_gamma,avg_kl_est
0,0.001,0.0,0.8,,6.05832
