In [None]:
import glob
import os
import scripts
import torch
from torch.utils.tensorboard import SummaryWriter

# import odvae

from diffusers import StableDiffusionPipeline, AutoencoderKL
from diffusers import UNet2DConditionModel


In [8]:
# Local Stable Diffusion v1.5 checkpoint directory; the VAE and the UNet are
# loaded from its "vae" and "unet" subfolders respectively.
diffusion_model_id = "./checkpoints/stable-diffusion-v1-5"
vae = AutoencoderKL.from_pretrained(diffusion_model_id, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(diffusion_model_id, subfolder="unet")

# Print the VAE model structure
print(vae)

In [4]:
# Print the UNet model structure
print(unet)

In [5]:
# Overall VAE structure: print every (sub)module next to its qualified name.
for module_name, submodule in vae.named_modules():
    print(f"{module_name}: {submodule}")

In [6]:
# Keep direct handles to the two halves of the VAE so they can be inspected separately.
encoder = vae.encoder
decoder = vae.decoder

# Print the encoder structure
print(encoder)

In [7]:
# Print the decoder structure (`decoder` is the handle taken from `vae.decoder`)
print(decoder)

In [8]:
# Load the LoRA weights and print them to inspect their structure.
# lora_model_id = "./checkpoints/georgefen-AquaLoRA-Models/models--georgefen--AquaLoRA-Models/snapshots/98688b2a1e762339593ee8fe96ed13762f06b732/ppft_trained/101010101010101010101010101010101010101010101010/pytorch_lora_weights.safetensors"
# NOTE(review): this passes a single .safetensors LoRA file to
# AutoencoderKL.from_pretrained; presumably that loader expects a VAE checkpoint
# directory or repo id, so this call may fail — TODO confirm (the following cells
# read the same file with safetensors' load_file instead).
# NOTE(review): this path starts with "../checkpoints" while most other cells use
# "./checkpoints" — confirm which working directory is intended.
lora_model_id = "../checkpoints/pytorch_lora_weights.safetensors"
lora = AutoencoderKL.from_pretrained(lora_model_id, device_map="auto",
                                     low_cpu_mem_usage=True,
                                     use_safetensors=True)
print(lora)

In [9]:
# This is the LoRA model with the watermark merged in.
from safetensors.torch import load_file

filename = "../checkpoints/pytorch_lora_weights.safetensors"
state_dict = load_file(filename)
model = state_dict  # both names refer to the same mapping of tensor name -> tensor
for tensor_name, tensor in state_dict.items():
    print(tensor_name, tensor)


In [4]:
# This is the structure of the LoRA model without the merged watermark.
from safetensors.torch import load_file

filename = "./checkpoints/pytorch_lora_weights_origin.safetensors"
no_watermarked_state_dict = load_file(filename)
model = no_watermarked_state_dict  # both names refer to the same mapping
for tensor_name, tensor in no_watermarked_state_dict.items():
    print("\nkey:", tensor_name, "\nvalue:", tensor)

In [11]:
# Structure of the watermark-merged LoRA model, grouped by the first
# dot-separated component of every key.
print("\nSummary of LoRA weights:")
lora_num_layers = {}    # prefix -> number of tensors stored under it
lora_layer_shapes = {}  # prefix -> shapes of those tensors, in key order
for key, value in state_dict.items():
    layer_name = key.partition('.')[0]
    lora_layer_shapes.setdefault(layer_name, []).append(value.shape)
    lora_num_layers[layer_name] = lora_num_layers.get(layer_name, 0) + 1

for layer in lora_num_layers:
    count = lora_num_layers[layer]
    print(f"Layer: {layer}, Number of parameters: {count}, Shapes: {lora_layer_shapes[layer]}")


In [12]:
# Structure of the LoRA model without the merged watermark, grouped by the
# first dot-separated component of every key.
print("\nSummary of no-watermarked LoRA weights:")
no_watermarked_lora_num_layers = {}    # prefix -> number of tensors stored under it
no_watermarked_lora_layer_shapes = {}  # prefix -> shapes of those tensors, in key order
for key, value in no_watermarked_state_dict.items():
    layer_name = key.partition('.')[0]
    no_watermarked_lora_layer_shapes.setdefault(layer_name, []).append(value.shape)
    no_watermarked_lora_num_layers[layer_name] = no_watermarked_lora_num_layers.get(layer_name, 0) + 1

for layer in no_watermarked_lora_num_layers:
    count = no_watermarked_lora_num_layers[layer]
    print(f"Layer: {layer}, Number of parameters: {count}, Shapes: {no_watermarked_lora_layer_shapes[layer]}")

In [13]:
# Check whether every 'unet' tensor has the same shape in the watermark-merged
# LoRA checkpoint and in the one without the watermark.
merged_unet_shapes = lora_layer_shapes.get('unet')
origin_unet_shapes = no_watermarked_lora_layer_shapes.get('unet')
if merged_unet_shapes is None or origin_unet_shapes is None:
    # Avoid a bare KeyError when a checkpoint has no keys under the 'unet' prefix.
    print("No 'unet' entries found in at least one of the LoRA summaries.")
elif origin_unet_shapes == merged_unet_shapes:
    print("The layer shapes are the same.")
else:
    # Report a mismatch explicitly instead of printing nothing.
    mismatched = sum(a != b for a, b in zip(origin_unet_shapes, merged_unet_shapes))
    print(f"The layer shapes differ: {len(origin_unet_shapes)} vs {len(merged_unet_shapes)} tensors, "
          f"{mismatched} mismatched shapes among the aligned entries.")


In [14]:
print("\nSummary of LoRA weights:")
num_blocks = {}    # block name -> number of tensors in the block
block_shapes = {}  # block name -> shapes of those tensors, in key order

# Walk over every key in state_dict.
for key, value in state_dict.items():
    # Keys usually look like 'layer.block.subblock.weight'; the first three
    # dot-separated parts are joined to form the block name.
    block_name = '.'.join(key.split('.', 3)[:3])

    # Record the tensor count and the shape for this block.
    num_blocks[block_name] = num_blocks.get(block_name, 0) + 1
    block_shapes.setdefault(block_name, []).append(value.shape)

# Print the information gathered for every block.
for block in num_blocks:
    count = num_blocks[block]
    print(f"Block: {block}, Number of parameters: {count}, Shapes: {block_shapes[block]}")


In [15]:
print("\nLoRA 权重的结构信息：")

# Number of leading dot-separated key parts that make up a block name,
# e.g. 'unet.down_blocks.0.attentions.0'. Adjust to the model's key layout.
block_level = 5

num_blocks = {}  # block name -> {layer name -> list of shapes}
layer_type_count = {}  # layer type -> number of tensors of that type
shape_analysis = {}  # layer type -> list of shapes seen for that type
for key, value in state_dict.items():
    key_parts = key.split('.')

    # The first `block_level` parts name the block, the remainder names the layer.
    block_name = '.'.join(key_parts[:block_level])
    layer_name = '.'.join(key_parts[block_level:])
    # The layer type is the layer name without its last component (the trailing 'weight').
    layer_type = '.'.join(key_parts[block_level:-1])

    # Per-block record of the layers and their shapes.
    num_blocks.setdefault(block_name, {}).setdefault(layer_name, []).append(value.shape)

    # Per-type occurrence count and shape record.
    layer_type_count[layer_type] = layer_type_count.get(layer_type, 0) + 1
    shape_analysis.setdefault(layer_type, []).append(value.shape)

# Print every block together with the layers found under it.
for block, layers in num_blocks.items():
    print(f"\nBlock: {block}")
    for layer, shapes in layers.items():
        print(f"  Layer: {layer}, Shapes: {shapes}")


In [16]:
# Print how many tensors were seen for each layer type.
print("\nLoRA 层类型统计信息：")
for layer_type in layer_type_count:
    print(f"  Layer type: {layer_type}, Count: {layer_type_count[layer_type]}")

In [17]:
# Print the distinct shapes seen for each layer type.
print("\nLoRA 层的形状分析：")
for layer_type, shapes in shape_analysis.items():
    # A set collapses repeated shapes so each one is listed once.
    print(f"  Layer type: {layer_type}, Unique Shapes: {set(shapes)}")


  Layer type: lora.down, Unique Shapes: {torch.Size([320, 1280, 1, 1])}
  Layer type: lora.up, Unique Shapes: {torch.Size([1280, 320, 1, 1])}
以上两个形状对应的是哪些层？
它们是 mid_block 的 proj_in 和 proj_out 的形状。

In [1]:
# Train and collect a number of LoRA models to serve as our training data.
dataset_path = "./checkpoints/lora_weights_dataset"

# TODO


In [5]:
dataset_path = "./checkpoints/lora_weights_dataset/rank320_batchsz8_gpu4"
# Maps every tensor key to its shape and element count. Entries are keyed by
# tensor name only, so a later file overwrites the entry of an earlier one.
original_lora_param_info = {}
# original_lora_data_lengths = []
for file in glob.glob(os.path.join(dataset_path, "*.safetensors")):
    model = load_file(file)
    original_lora_param_info.update(
        (key, {'shape': value.shape, 'length': value.numel()})
        for key, value in model.items()
    )

In [6]:
# Preview the first 20 recorded entries.
idx = 0
for idx, (layer_name, param_info) in enumerate(original_lora_param_info.items(), start=1):
    print(f"Layer: {layer_name}, Shape: {param_info['shape']}, Length: {param_info['length']}\n")
    if idx == 20:
        break


In [7]:
# Pre-process the collected LoRA weights into the format we need: every tensor is
# z-score normalized on its own, flattened, and the pieces are concatenated in key
# order into a single 1-D vector per checkpoint. Each vector is saved next to its
# source file as "normalized_<stem>.pth".
# TODO: may need to be switched to a batched form later.
total_lora_data = []
# Sorted so that the processing order (and therefore total_lora_data[0]) is deterministic.
for file in sorted(glob.glob(os.path.join(dataset_path, "*.safetensors"))):
    single_lora_weights = []
    model = load_file(file)
    for key, value in model.items():
        # Z-score normalization, applied per tensor.
        mean = value.mean()
        std = value.std()
        # A constant tensor has std == 0 and a single-element tensor has std == NaN;
        # dividing by either would fill the vector with NaN/inf, so only center it.
        if not torch.isfinite(std) or std == 0:
            value = value - mean
        else:
            value = (value - mean) / std

        # Flatten to one dimension.
        single_lora_weights.append(value.flatten())
    single_lora_weights = torch.cat(single_lora_weights, dim=0)
    # Keep only the first vector in memory: the inspection cells that follow index
    # total_lora_data[0], while holding every vector at once would not scale.
    if not total_lora_data:
        total_lora_data.append(single_lora_weights)
    # splitext only strips the extension, so stems that contain dots stay distinct
    # ("a.v1.safetensors" and "a.v2.safetensors" no longer map to the same output).
    stem = os.path.splitext(os.path.basename(file))[0]
    torch.save(single_lora_weights,
               os.path.join(dataset_path, "normalized_{}.pth".format(stem)))

In [23]:
# Show the shape of the first entry; requires total_lora_data to hold at least one vector.
total_lora_data[0].shape

In [24]:
# Try to rebuild the flattened LoRA weights into their original shapes.
# The result is a dict keyed like the original weights.
start = 0
reconstructed_lora_weights = {}
for layer_name, param_info in original_lora_param_info.items():
    # layer_name is the tensor key; param_info['length'] is its element count and
    # param_info['shape'] its shape. The next `length` elements of the flat vector
    # belong to this tensor.
    end = start + param_info['length']
    reconstructed_lora_weights[layer_name] = total_lora_data[0][start:end].view(param_info['shape'])
    start = end



In [25]:
# Print the reconstructed LoRA weights for inspection.
for layer_name in reconstructed_lora_weights:
    layer_weight = reconstructed_lora_weights[layer_name]
    print(f"Layer: {layer_name}, Shape: {layer_weight.shape}")
    print(layer_weight)

观察发现重建的结果一致，此方法可行。
2024/10/15 22:20更新: 找到了论文的代码，方法和我一样

In [1]:
from torch.utils.data import DataLoader, TensorDataset

from torch.utils.data import Dataset


class ParameterVectorDataset(Dataset):
    """Dataset that lazily loads one saved object per item.

    Nothing is read from disk until an item is requested; each item is whatever
    object ``torch.save`` wrote to the corresponding path.

    Args:
        data_paths: Iterable of paths to files written with ``torch.save``.
    """

    def __init__(self, data_paths):
        # Materialize to a list so that generators and other one-shot iterables
        # also support len() and indexing.
        self.data_paths = list(data_paths)

    def __len__(self):
        """Return the number of files in the dataset."""
        return len(self.data_paths)

    def __getitem__(self, idx):
        """Load and return the object stored in the ``idx``-th file."""
        # map_location="cpu" keeps the loaded tensors on the CPU regardless of the
        # device they were saved from, which is what DataLoader workers need.
        # NOTE: torch.load unpickles the file — only use it on trusted files.
        return torch.load(self.data_paths[idx], map_location="cpu")

In [4]:
# Create the directory for model checkpoints if it does not exist yet.
# exist_ok=True avoids the check-then-create race that raises FileExistsError
# when several processes run this cell at the same time.
checkpoint_dir = "./checkpoints/lora_vae_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Create the directory for training logs if it does not exist yet.
log_dir = "./logs/lora_vae_logs"
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir)

In [5]:

# Example data-loading settings (replace with your own data-loading logic).
# They size the random placeholder data used as an example.
batch_size, num_samples = 2, 100  # num_samples: adjust to the amount of data you have
in_dim = 135_659_520  # adjust to the dimensionality of your data

In [6]:
dataset_path = "./checkpoints/lora_weights_dataset"

# Sub-directories for the randomly generated validation and test vectors.
rand_val_dataset_path, rand_test_dataset_path = (
    os.path.join(dataset_path, split_dir) for split_dir in ("rand_val", "rand_test")
)

In [19]:

# Generate num_samples random vectors of dimension in_dim with values uniform in
# [-1, 1) and save them as rand_test_normalized_data1.pth, ...,
# rand_test_normalized_data<num_samples>.pth.
# torch.save does not create missing parent directories, so make sure the target exists.
os.makedirs(rand_test_dataset_path, exist_ok=True)
# Dedicated seeded generator: the files are reproducible across runs and the
# global RNG state is left untouched.
rand_test_generator = torch.Generator().manual_seed(0)
for i in range(num_samples):
    data = torch.rand(in_dim, generator=rand_test_generator) * 2 - 1
    torch.save(data, os.path.join(rand_test_dataset_path, f"rand_test_normalized_data{i + 1}.pth"))


In [None]:
# Generate the random validation data.
# num_samples random vectors of dimension in_dim with values uniform in [-1, 1),
# saved as rand_val_normalized_data1.pth, ..., rand_val_normalized_data<num_samples>.pth.
# torch.save does not create missing parent directories, so make sure the target exists.
os.makedirs(rand_val_dataset_path, exist_ok=True)
# Dedicated generator with its own seed: the validation files are reproducible,
# the global RNG state is left untouched, and the stream is specific to this split.
rand_val_generator = torch.Generator().manual_seed(1)
for i in range(num_samples):
    data = torch.rand(in_dim, generator=rand_val_generator) * 2 - 1
    torch.save(data, os.path.join(rand_val_dataset_path, f"rand_val_normalized_data{i + 1}.pth"))

In [7]:

# Collect the random test vector files, wrap them in a dataset and build its loader.
rand_test_data_paths = glob.glob(
    os.path.join(rand_test_dataset_path, "rand_test_normalized_data*.pth")
)
rand_test_data_sets = ParameterVectorDataset(rand_test_data_paths)
rand_test_data_loader = DataLoader(
    rand_test_data_sets,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)


In [8]:
# List of random validation data files.
rand_val_data_paths = glob.glob(os.path.join(rand_val_dataset_path, "rand_val_normalized_data*.pth"))
# Display the collected paths as the cell output.
rand_val_data_paths

In [9]:

# Wrap the random validation files in a dataset and build its loader.
rand_val_data_sets = ParameterVectorDataset(rand_val_data_paths)
rand_val_data_loader = DataLoader(
    rand_val_data_sets,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)


In [10]:

# Training data files: the normalized LoRA vectors written by the preprocessing step.
# Search recursively: those vectors are saved next to their source .safetensors
# files, which live in a run-specific sub-directory (e.g. rank320_batchsz8_gpu4),
# so a non-recursive glob on dataset_path would not find them. The "normalized_"
# prefix keeps the rand_val_* / rand_test_* placeholder files out of the match.
train_data_paths = sorted(
    glob.glob(os.path.join(dataset_path, "**", "normalized_*.pth"), recursive=True)
)
if not train_data_paths:
    print(f"Warning: no normalized_*.pth files found under {dataset_path}")
# Build the dataset.
train_data_sets = ParameterVectorDataset(train_data_paths)

# Build the data loader.
train_data_loader = DataLoader(train_data_sets, batch_size=batch_size, shuffle=True, num_workers=2)

In [12]:
# from tqdm import tqdm
# from torch.cuda.amp import GradScaler, autocast
# from accelerate import Accelerator
# import torch
# import os
# # 试着将重建的lora权重加载到odvae模型中
# from odvae import ODVAE, medium, small
# 
# # 设置模型参数
# latent_dim = 12
# kld_weight = 0.005
# in_dim = 2048  # 请确保 in_dim 设置正确
# 
# # 使用 Accelerator 进行多卡训练
# accelerator = Accelerator()
# 
# # 初始化模型
# ODVAE_model = medium(in_dim=in_dim, latent_dim=latent_dim, kld_weight=kld_weight)
# 
# # 设置优化器
# optimizer = torch.optim.Adam(ODVAE_model.parameters(), lr=1e-3, weight_decay=2e-6)
# 
# # 初始化最佳验证损失
# best_val_loss = float('inf')
# 
# # 定义学习率调度器
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
#                                                        factor=0.5, patience=10,
#                                                        verbose=True, min_lr=1e-6)
# 
# # 定义早停参数
# early_stopping_patience = 20
# early_stopping_counter = 0
# 
# # 定义训练参数
# num_epochs = 30000
# batch_size = 4
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast
from accelerate import Accelerator
import torch
import os
# 试着将重建的lora权重加载到odvae模型中
from odvae import ODVAE, medium, small

# Model hyper-parameters
latent_dim = 256
kld_weight = 0.005
in_dim = 135659520  # make sure in_dim is set correctly

# Use Accelerator for multi-GPU training
accelerator = Accelerator()
def free_memory():
    """Release cached CUDA memory and wait for pending GPU work to finish.

    Does nothing when CUDA is unavailable, so the notebook can also run on
    CPU-only machines, where ``torch.cuda.synchronize()`` would raise.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Initialize the model
ODVAE_model = medium(in_dim=in_dim, latent_dim=latent_dim, kld_weight=kld_weight)

# Set up the optimizer
optimizer = torch.optim.Adam(ODVAE_model.parameters(), lr=1e-3, weight_decay=2e-6)

# Initialize the best validation loss
best_val_loss = float('inf')

# Define the learning-rate scheduler: halve the LR after 10 epochs without
# improvement of the monitored value, never going below 1e-6.
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
# confirm against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=0.5, patience=10,
                                                       verbose=True, min_lr=1e-6)

# Early-stopping parameters
early_stopping_patience = 20
early_stopping_counter = 0

# Training parameters
num_epochs = 30000
# batch_size = 2

In [13]:
# for epoch in range(num_epochs):
#     ODVAE_model.train()
#     epoch_loss = 0.0
#     
#     rand_test_loader_tqdm = tqdm(rand_test_data_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Train", leave=False)
# 
#     
#     for batch_idx, batch in enumerate(rand_test_loader_tqdm):
#         x = batch  # x 的形状为 [batch_size, 135659520]
#         x = x.to(device)
#         optimizer.zero_grad()
#         # with autocast():
#         # 前向传播
#         outputs = ODVAE_model(x)
#         loss = outputs['loss']
#         # 反向传播和优化
#         loss.backward()
#         optimizer.step()
#         
#         epoch_loss += loss.item()
#         writer.add_scalar('Train/Loss', loss.item(), epoch * len(rand_test_data_loader) + batch_idx)
#     avg_test_loss = epoch_loss / len(rand_test_data_loader)
#     
#     # 验证阶段
#     ODVAE_model.eval()
#     val_loss = 0.0
#     rand_val_loader_tqdm = tqdm(rand_val_data_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation", leave=False)
# 
#     
#     with torch.no_grad():
#         for batch_idx, batch in enumerate(rand_val_loader_tqdm):
#             x = batch  # x 的形状为 [batch_size, 135659520]
#             x = x.to(device)
#             outputs = ODVAE_model(x)
#             loss = outputs['loss']
#             val_loss += loss.item()
#     avg_val_loss = val_loss / len(rand_val_data_loader)
#     print(f'Avg Validation Loss: {avg_val_loss:.4f}')
#         
#     scheduler.step(avg_val_loss)
#     writer.add_scalar('AvgLoss/Train', avg_test_loss, epoch)
#     writer.add_scalar('AvgLoss/Validation', avg_val_loss, epoch)
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_test_loss:.4f}')
# 
#     # 记录学习率
#     current_lr = optimizer.param_groups[0]['lr']
#     writer.add_scalar('Learning_Rate', current_lr, epoch)
# 
#         # 检查验证损失是否降低
#     if avg_val_loss < best_val_loss:
#         best_val_loss = avg_val_loss
#         early_stopping_counter = 0  # 重置早停计数器
#         # 保存最佳模型
#         checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
#         torch.save(ODVAE_model.state_dict(), checkpoint_path)
#         print(f'Best ODVAE_model saved at epoch {epoch+1} with validation loss {best_val_loss:.4f}')
#     else:
#         early_stopping_counter += 1
#         if early_stopping_counter >= early_stopping_patience:
#             print(f'Early stopping at epoch {epoch+1}')
#             break
# 
#     # 打印当前 epoch 的训练和验证损失
#         
#     if (epoch + 1) % 1000 == 0:
#         checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pth')
#         torch.save(ODVAE_model.state_dict(), checkpoint_path)
#         print(f'ODVAE_model checkpoint saved at epoch {epoch+1}')
#         
# writer.close()
# Wrap the model, the optimizer and the data loaders with Accelerator.
ODVAE_model, optimizer, rand_test_data_loader, rand_val_data_loader = accelerator.prepare(
    ODVAE_model, optimizer, rand_test_data_loader, rand_val_data_loader
)


def _mean_across_processes(value):
    """Return the mean of a Python float over all Accelerator processes.

    Every process has to act on the same loss value: otherwise the LR scheduler
    and the early-stopping logic diverge between processes, and one process can
    leave the loop while the others keep waiting for it.
    """
    local_value = torch.tensor([value], dtype=torch.float32, device=accelerator.device)
    return accelerator.gather(local_value).mean().item()


# Training loop
for epoch in range(num_epochs):
    ODVAE_model.train()
    epoch_loss = 0.0

    rand_test_loader_tqdm = tqdm(rand_test_data_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Train", leave=False)

    for batch_idx, batch in enumerate(rand_test_loader_tqdm):
        x = batch  # x has shape [batch_size, 135659520]
        optimizer.zero_grad()

        with accelerator.autocast():
            # Forward pass
            outputs = ODVAE_model(x)
            loss = outputs['loss']

        # Backward pass and optimizer step
        accelerator.backward(loss)
        optimizer.step()

        epoch_loss += loss.item()
        if accelerator.is_main_process:
            writer.add_scalar('Train/Loss', loss.item(), epoch * len(rand_test_data_loader) + batch_idx)

        # Release unused GPU memory
        free_memory()
    # Average over the local batches, then over all processes.
    avg_test_loss = _mean_across_processes(epoch_loss / len(rand_test_data_loader))

    # Validation phase
    ODVAE_model.eval()
    val_loss = 0.0
    rand_val_loader_tqdm = tqdm(rand_val_data_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation", leave=False)

    with torch.no_grad():
        for batch_idx, batch in enumerate(rand_val_loader_tqdm):
            x = batch  # x has shape [batch_size, 135659520]
            with accelerator.autocast():
                outputs = ODVAE_model(x)
                loss = outputs['loss']
            val_loss += loss.item()

    # Same value on every process, so the decisions below stay in lockstep.
    avg_val_loss = _mean_across_processes(val_loss / len(rand_val_data_loader))

    # The scheduler changes the optimizer's learning rate, so it must be stepped
    # on every process, not only on the main one.
    scheduler.step(avg_val_loss)

    if accelerator.is_main_process:
        print(f'Avg Validation Loss: {avg_val_loss:.4f}')
        writer.add_scalar('AvgLoss/Train', avg_test_loss, epoch)
        writer.add_scalar('AvgLoss/Validation', avg_val_loss, epoch)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_test_loss:.4f}')

    # Log the learning rate
    current_lr = optimizer.param_groups[0]['lr']
    if accelerator.is_main_process:
        writer.add_scalar('Learning_Rate', current_lr, epoch)

    # Check whether the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stopping_counter = 0  # reset the early-stopping counter
        # Save the best model; unwrap it first so the checkpoint keys match the bare model.
        if accelerator.is_main_process:
            checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
            accelerator.save(accelerator.unwrap_model(ODVAE_model).state_dict(), checkpoint_path)
            print(f'Best ODVAE_model saved at epoch {epoch+1} with validation loss {best_val_loss:.4f}')
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            if accelerator.is_main_process:
                print(f'Early stopping at epoch {epoch+1}')
            break

    # Save a periodic checkpoint every 1000 epochs
    if (epoch + 1) % 1000 == 0 and accelerator.is_main_process:
        checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pth')
        accelerator.save(accelerator.unwrap_model(ODVAE_model).state_dict(), checkpoint_path)
        print(f'ODVAE_model checkpoint saved at epoch {epoch+1}')

if accelerator.is_main_process:
    writer.close()