## unet 2d condition model

[unet](https://huggingface.co/docs/diffusers/en/api/models/unet2d-cond)

In [None]:
# Exploration cell: load the Stable Diffusion v1.5 UNet and walk its attention
# processors, pairing each one with the hidden size of the block it lives in.
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"  # Stable Diffusion v1.5
unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet")

unet_sd = unet.state_dict()
# for key, value in unet_sd.items():
#     print(key, value.shape)
for name in unet.attn_processors.keys():
    # Processors whose name ends in "attn1.processor" get no cross-attention
    # dimension; every other processor uses the one from the UNet config.
    cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim

    # Derive the hidden size from the block the processor belongs to. The block
    # index is read as the single character following the "<kind>_blocks." prefix.
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        # Up blocks index the channel list in reverse order.
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]

    if cross_attention_dim is None:
        pass
    else:
        # Look up the pretrained key/value projection weights of this layer;
        # the lookup raises KeyError if the expected state-dict entries are missing.
        layer_name = name.split(".processor")[0]
        weights = {
            "to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
            "to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
        }
        print(True)

In [1]:
# Reload the SD v1.5 UNet, then print and freeze every parameter whose name
# contains "to_k" or "to_v".
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"  # Stable Diffusion v1.5
unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet")
for name, param in unet.named_parameters():
    # Substring match: any parameter name containing "to_k"/"to_v" is selected.
    if "to_k" in name or "to_v" in name:
        print(name, param.shape)
        param.requires_grad = False


down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k.weight torch.Size([320, 320])
down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v.weight torch.Size([320, 320])
down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k.weight torch.Size([320, 768])
down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v.weight torch.Size([320, 768])
down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k.weight torch.Size([320, 320])
down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v.weight torch.Size([320, 320])
down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k.weight torch.Size([320, 768])
down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v.weight torch.Size([320, 768])
down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.weight torch.Size([640, 640])
down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.weight torch.Size([640, 640])
down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.weight torch.Size([640, 768])
down_blocks.1.attenti

In [None]:
! pip show torch torchvision

In [None]:
unet.config

In [None]:
unet.attn_processors

In [None]:
import os
import random
import argparse
from pathlib import Path
import json
import itertools
import time
import torch.nn as nn
import torch
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
from transformers import CLIPImageProcessor
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from poseCtrl.models.pose_adaptor import VPmatrixEncoder, VPmatrixPoints
from poseCtrl.models.attention_processor import AttnProcessor, PoseAttnProcessor
from poseCtrl.data.dataset import MyDataset, load_base_points

def parse_args(input_args=None):
    """Parse the command-line options of the PoseCtrl training script.

    Args:
        input_args: Optional list of argument strings. When ``None`` (the
            default) the arguments are taken from ``sys.argv``, which keeps
            the original call ``parse_args()`` working unchanged.

    Returns:
        The populated ``argparse.Namespace``. ``local_rank`` is overridden by
        the ``LOCAL_RANK`` environment variable when that variable is set and
        differs from the parsed value.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    # Options that ship with a default are not marked ``required``: argparse
    # ignores the default of a required option, so the two contradict each other.
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default='runwayml/stable-diffusion-v1-5',
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--pretrained_pose_path",
        type=str,
        default=None,
        help="Path to pretrained  posectrl model. If not specified weights are initialized randomly.",
    )
    # parser.add_argument(
    #     "--data_json_file",
    #     type=str,
    #     default=None,
    #     required=True,
    #     help="Training data",
    # )
    parser.add_argument(
        "--base_point_path",
        type=str,
        default=r'F:\Projects\diffusers\Project\PoseCtrl\dataSet\standardVertex.txt',
        help='Path to base model points'
    )
    parser.add_argument(
        "--data_root_path",
        type=str,
        default="F:\\Projects\\diffusers\\ProgramData\\pic",
        help="Training data root path",
    )
    parser.add_argument(
        "--image_encoder_path",
        type=str,
        default="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
        help="Path to CLIP image encoder",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="sd-pose_ctrl",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images"
        ),
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Learning rate to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--num_train_epochs", type=int, default=100)
    parser.add_argument(
        "--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=0,
        help=(
            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
        ),
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=2000,
        help=(
            "Save a checkpoint of the training state every X updates"
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="fp16",
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args(input_args)
    # The LOCAL_RANK environment variable wins over the --local_rank flag.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    return args

class posectrl(nn.Module):
    """Bundle a UNet with the pose modules that are trained alongside it.

    Args:
        unet: Denoising network; called as ``unet(noisy_latents, timesteps,
            encoder_hidden_states)`` and expected to return an object with a
            ``.sample`` attribute.
        vpmatrix_points: Module called as ``vpmatrix_points(V_matrix, P_matrix)``
            to produce the pose tokens used as conditioning.
        atten_modules: Module holding the attention processors' parameters.
        ckpt_path: Optional checkpoint to load into ``vpmatrix_points`` and
            ``atten_modules`` right after construction.
    """

    def __init__(self, unet, vpmatrix_points, atten_modules, ckpt_path=None):
        super().__init__()
        self.unet = unet
        self.vpmatrix_points = vpmatrix_points
        self.atten_modules = atten_modules

        if ckpt_path is not None:
            self.load_from_checkpoint(ckpt_path)

    def forward(self, noisy_latents, timesteps, encoder_hidden_states, V_matrix, P_matrix):
        """Predict the noise residual, conditioned on pose tokens and optional text.

        ``encoder_hidden_states`` may be ``None``; the pose tokens are then the
        only conditioning. Otherwise the pose tokens are concatenated to it
        along ``dim=1``.
        """
        point_tokens = self.vpmatrix_points(V_matrix, P_matrix)
        # Modified: text conditioning is optional so that text can be added later.
        # The check must be `is not None`: truth-testing a tensor with more than
        # one element raises a RuntimeError.
        if encoder_hidden_states is not None:
            encoder_hidden_states = torch.cat([encoder_hidden_states, point_tokens], dim=1)
        else:
            encoder_hidden_states = point_tokens
        # Predict the noise residual
        noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
        return noise_pred

    def load_from_checkpoint(self, ckpt_path: str):
        """Load ``vpmatrix_points`` and ``atten_modules`` weights from ``ckpt_path``.

        The checkpoint must be a dict with the keys ``"vpmatrix_points"`` and
        ``"atten_modules"``, each a state dict loaded with ``strict=True``.

        Raises:
            AssertionError: if the summed parameters of either module are
                unchanged after loading.
        """
        # Calculate original checksums
        orig_VPmatrix_sum = torch.sum(torch.stack([torch.sum(p) for p in self.vpmatrix_points.parameters()]))
        orig_atten_sum = torch.sum(torch.stack([torch.sum(p) for p in self.atten_modules.parameters()]))

        state_dict = torch.load(ckpt_path, map_location="cpu")

        # Load state dicts for the pose-token module and the attention modules
        self.vpmatrix_points.load_state_dict(state_dict["vpmatrix_points"], strict=True)
        self.atten_modules.load_state_dict(state_dict["atten_modules"], strict=True)

        # Calculate new checksums
        new_VPmatrix_sum = torch.sum(torch.stack([torch.sum(p) for p in self.vpmatrix_points.parameters()]))
        new_atten_sum = torch.sum(torch.stack([torch.sum(p) for p in self.atten_modules.parameters()]))

        # Verify if the weights have changed
        assert orig_VPmatrix_sum != new_VPmatrix_sum, "Weights of VPmatrixEncoder did not change!"
        assert orig_atten_sum != new_atten_sum, "Weights of atten_modules did not change!"

        print(f"Successfully loaded weights from checkpoint {ckpt_path}")

def main():
    """Train the pose modules on top of a frozen Stable Diffusion UNet.

    Reads its configuration from ``parse_args()``, builds the frozen base
    models, installs the pose attention processors on the UNet, and runs a
    noise-prediction (MSE) training loop that optimizes only the pose-token
    module and the attention processors. A training-state checkpoint is
    written every ``args.save_steps`` steps.
    """
    args = parse_args()
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

    accelerator = Accelerator(
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )

    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path)

    # freeze parameters of models to save more memory
    unet.requires_grad_(False)
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    image_encoder.requires_grad_(False)

    # vp-matrix encoder
    raw_base_points = load_base_points(args.base_point_path)
    vpmatrix_points_sd = VPmatrixEncoder(raw_base_points)

    # init pose modules
    attn_procs = {}
    unet_sd = unet.state_dict()
    for name in unet.attn_processors.keys():
        # Processors named "...attn1.processor" get the plain AttnProcessor;
        # all others get a PoseAttnProcessor with the UNet's cross-attention dim.
        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim

        # Hidden size of the block the processor lives in; up blocks index the
        # channel list in reverse order.
        if name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif name.startswith("up_blocks"):
            block_id = int(name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        elif name.startswith("down_blocks"):
            block_id = int(name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None:
            attn_procs[name] = AttnProcessor()
        else:
            # Initialise the extra key/value projections from the layer's
            # pretrained to_k / to_v weights.
            layer_name = name.split(".processor")[0]
            weights = {
                "to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
                "to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
            }
            attn_procs[name] = PoseAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
            attn_procs[name].load_state_dict(weights)

    unet.set_attn_processor(attn_procs)

    atten_modules = torch.nn.ModuleList(unet.attn_processors.values())

    pose_ctrl = posectrl(unet, vpmatrix_points_sd, atten_modules, args.pretrained_pose_path)

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16
    #unet.to(accelerator.device, dtype=weight_dtype)
    vae.to(accelerator.device, dtype=weight_dtype)
    text_encoder.to(accelerator.device, dtype=weight_dtype)
    image_encoder.to(accelerator.device, dtype=weight_dtype)

    # optimizer
    # `posectrl` stores the pose-token module as `vpmatrix_points`; the local
    # variable name `vpmatrix_points_sd` is not an attribute of the wrapper.
    params_to_opt = itertools.chain(pose_ctrl.vpmatrix_points.parameters(), pose_ctrl.atten_modules.parameters())
    optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)

    # dataloader
    train_dataset = MyDataset(args.data_root_path)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )

    # Prepare everything with our `accelerator`.
    pose_ctrl, optimizer, train_dataloader = accelerator.prepare(pose_ctrl, optimizer, train_dataloader)

    global_step = 0
    for epoch in range(0, args.num_train_epochs):  # default is 100
        begin = time.perf_counter()
        for step, batch in enumerate(train_dataloader):
            load_data_time = time.perf_counter() - begin
            with accelerator.accumulate(pose_ctrl):
                # Convert images to latent space
                with torch.no_grad():
                    latents = vae.encode(batch["image"].to(accelerator.device, dtype=weight_dtype)).latent_dist.sample()
                    latents = latents * vae.config.scaling_factor

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image; the number of
                # training timesteps is read from the scheduler's config.
                timesteps = torch.randint(
                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
                )
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Text conditioning is optional: without token ids in the
                # batch, the model is conditioned on the pose tokens alone.
                if "text_input_ids" in batch:
                    with torch.no_grad():
                        encoder_hidden_states = text_encoder(batch["text_input_ids"].to(accelerator.device))[0]
                else:
                    encoder_hidden_states = None

                noise_pred = pose_ctrl(noisy_latents, timesteps, encoder_hidden_states, batch['view_matrix'], batch['projection_matrix'])

                loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()

                # Backpropagate
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()

                if accelerator.is_main_process:
                    print("Epoch {}, step {}, data_time: {}, time: {}, step_loss: {}".format(
                        epoch, step, load_data_time, time.perf_counter() - begin, avg_loss))

            global_step += 1

            if global_step % args.save_steps == 0:
                save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                accelerator.save_state(save_path)

            begin = time.perf_counter()

里面结构没改

应该有两个输入：图像和VP

VP： VPmatrixEncoder -> [77,77]

image: 
- :vae ->latent
- :resampler? 写一个 vit或者别的网络
- 那么训练参数会变成三个


# TODO
- 1. visEncoder.py
- 2. attention_processor.py： 加逻辑
- 3. posectrl.py: 加参数和逻辑
- 4. main.py：加参数
- 

In [None]:
# Split a full training checkpoint into the two pose sub-state-dicts and save
# them together as "posectrl.bin"; keys starting with "unet" are dropped.
import torch
ckpt = "checkpoint-50000/pytorch_model.bin"
sd = torch.load(ckpt, map_location="cpu")
VPmatrixEncoder_sd = {}
atten_sd = {}
for k in sd:
    if k.startswith("unet"):
        pass
    elif k.startswith("VPmatrixEncoder"):
        # Strip the module prefix so the keys match the sub-module's own state dict.
        VPmatrixEncoder_sd[k.replace("VPmatrixEncoder.", "")] = sd[k]
    elif k.startswith("atten_modules"):
        atten_sd[k.replace("atten_modules.", "")] = sd[k]

torch.save({"VPmatrixEncoder": VPmatrixEncoder_sd, "atten_modules": atten_sd}, "posectrl.bin")

1. 位置矩阵
2. 本地的坐标：可以和vp矩阵相乘，数学意义，M矩阵，
3. 多样性一点：图的特征，加上正面原图随便的特征。




4. 反向：训练什么

# TODO NEW Version 1

- inference
- VP矩阵不需要处理了
- BasePoints: [2000,4,4] @ [4,4] -> <77 768>（这个是text attention之后的结果，不知道图片他们都是怎么做的）可能可以换个大小，可学习的部分直接写出来更换就行。
- 好像流程就没啥问题了
- 每个要改的地方加上"修改",不然找不到忘记了.

# TODO NEW Version 2
   现在的逻辑是： vp矩阵[4,4], 顶点是[13860,4] (77x180), ->[13860,4] reshape [77,768]
-  要改：
   
~~- base_load~~

~~- pose_adaptor~~

   ~~- posectrl training~~ 
   - 和 posectrl inference   
 
   ~~- attention_processor~~ 和之前没区别

   ~~- train main~~

   ~~- 跑通~~

# TODO NEW Version 3
加了个参考图， 这个图attention 加上

~~- dataset 1024 resize~~, 可以不加数量限制

~~- image sampler~~

~~- posectrl train main~~

~~- attention~~

~~- posectrl.py~~

~~- inference~~

~~- weights new version~~

~~- download~~

  ~~- check in inference and normal pipeline~~
  
  ~~- upload to google drive~~

- validation
- 
~~- 检查一遍整个流程~~
- train 得到weights
- inference

# TODO NEW Version 4
- 如果把输入引入噪声里面比单纯引入attention会好一点吗
- AnimateDiff 是通过改变 attention 来实现的吗
- 如果通过lora去训练模型，怎么实现
- 如果加上我的模型做lora，会不会直接省了我的事情


# FUTURE VERSION
- add lora
- add 正负判断

# Questions
~~1. 需不需要把好坏prompt设置成~~
 
```python
if prompt is None:
    prompt = "best quality, high quality"
if negative_prompt is None:
    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
```

不用应该

- 2. point-e好像是生成3d底模的东西,不知道有没有用

In [None]:
# Extract per-patch CLIP vision features (CLS token removed) for one image.
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# 1. Choose the pretrained CLIP model
model_name = "openai/clip-vit-base-patch16"  # could also be "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# 2. Read and preprocess the image
image_path = r"F:\Projects\diffusers\ProgramData\sample_new\NPC_Avatar_Girl_Sword_Ayaka\feature.png"  # replace with your image path
image = Image.open(image_path)

inputs = processor(images=image, return_tensors="pt")  # preprocessing
image_tensor = inputs["pixel_values"]  # input tensor; original note gives shape (1, 3, 224, 224)

# 3. Get the features of all patches
with torch.no_grad():
    vision_outputs = model.vision_model(image_tensor)  # forward pass through the CLIP vision model
    patch_features = vision_outputs.last_hidden_state  # original note gives shape (B, X+1, 768)

# 4. Drop the CLS token (the first token)
patch_features = patch_features[:, 1:, :]  # (B, X, 768)

print("Patch feature shape:", patch_features.shape)  # expected shape: (B, X, 768)


In [None]:
!pip install safetensors

In [None]:
import torch
import torch
from pathlib import Path
def change_checkpoint(checkpoint_path, new_checkpoint_path):
    """Extract the pose weights from a full checkpoint into ``posectrl.bin``.

    Keys of the source state dict are grouped by their leading module name
    (``vpmatrix_points``, ``atten_modules``, ``image_proj_model``) with that
    prefix stripped; keys starting with ``unet`` and any other keys are dropped.

    Args:
        checkpoint_path: Source checkpoint. Files ending in ``.safetensors``
            are read with ``safetensors.torch.load_file``; anything else with
            ``torch.load``.
        new_checkpoint_path: Directory that receives ``posectrl.bin``; created
            if it does not exist.
    """
    if str(checkpoint_path).endswith(".safetensors"):
        # torch.load cannot read the safetensors format.
        from safetensors.torch import load_file

        sd = load_file(checkpoint_path)
    else:
        sd = torch.load(checkpoint_path, map_location="cpu")
    vpmatrix_points_sd = {}
    atten_sd = {}
    proj_sd = {}
    for k in sd:
        if k.startswith("unet"):
            pass
        # Strip only the leading module prefix (count=1), never a later repeat.
        elif k.startswith("vpmatrix_points"):
            vpmatrix_points_sd[k.replace("vpmatrix_points.", "", 1)] = sd[k]
        elif k.startswith("atten_modules"):
            atten_sd[k.replace("atten_modules.", "", 1)] = sd[k]
        elif k.startswith("image_proj_model"):
            proj_sd[k.replace("image_proj_model.", "", 1)] = sd[k]
    out_dir = Path(new_checkpoint_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    new_checkpoint_path = Path(out_dir, "posectrl.bin")
    print(vpmatrix_points_sd)
    print(atten_sd)
    print(proj_sd)
    # Debug listing of a nested "state" entry; a flat model state dict has no
    # such key, so only list it when it is actually present.
    state = sd.get("state") if isinstance(sd, dict) else None
    if isinstance(state, dict):
        for name in state.keys():
            print(name)
    torch.save({"vpmatrix_points": vpmatrix_points_sd, "atten_modules": atten_sd, "image_proj_model": proj_sd}, new_checkpoint_path)
    print(f"Saved new checkpoint to {new_checkpoint_path}")

# Extract the pose weights of the trained model into the "transfer" directory.
ckpt = r"F:\Projects\diffusers\Project\PoseCtrl\sd-pose_ctrl\model.safetensors"

change_checkpoint(ckpt, r"F:\Projects\diffusers\Project\PoseCtrl\sd-pose_ctrl\transfer")


In [None]:
import torch
from safetensors.torch import load_file
from pathlib import Path

def change_checkpoint(checkpoint_path, new_checkpoint_path):
    """Extract the pose weights from a safetensors checkpoint into ``posectrl2.bin``.

    Keys are grouped by their leading module name (``vpmatrix_points``,
    ``atten_modules``, ``image_proj_model``) with that prefix stripped; keys
    starting with ``unet`` and any other keys are dropped.

    Args:
        checkpoint_path: Source ``.safetensors`` file, read with ``load_file``.
        new_checkpoint_path: Directory that receives ``posectrl2.bin``; created
            if it does not exist.
    """
    # Load the file with safetensors
    sd = load_file(checkpoint_path)

    vpmatrix_points_sd = {}
    atten_sd = {}
    proj_sd = {}

    # Walk the weights and sort them into their group, stripping only the
    # leading module prefix (count=1).
    for k in sd:
        # print(k)
        if k.startswith("unet"):
            pass
        elif k.startswith("vpmatrix_points"):
            vpmatrix_points_sd[k.replace("vpmatrix_points.", "", 1)] = sd[k]
        elif k.startswith("atten_modules"):
            atten_sd[k.replace("atten_modules.", "", 1)] = sd[k]
        elif k.startswith("image_proj_model"):
            proj_sd[k.replace("image_proj_model.", "", 1)] = sd[k]

    # Build the output path, creating the target directory when needed
    out_dir = Path(new_checkpoint_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    new_checkpoint_path = Path(out_dir, "posectrl2.bin")

    print(atten_sd)

    # Save as a new binary checkpoint file
    torch.save({
        "vpmatrix_points": vpmatrix_points_sd,
        "atten_modules": atten_sd,
        "image_proj_model": proj_sd
    }, new_checkpoint_path)

    print(f"Saved new checkpoint to {new_checkpoint_path}")

# Path of the safetensors checkpoint to convert
ckpt = r"F:\Projects\diffusers\Project\PoseCtrl\sd-pose_ctrl\model.safetensors"

# Run the conversion
change_checkpoint(ckpt, r"F:\Projects\diffusers\Project\PoseCtrl\sd-pose_ctrl\transfer")


In [None]:
import torch
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, DDIMScheduler, AutoencoderKL
from PIL import Image
import sys
import os
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl')
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl\poseCtrl')
from poseCtrl.models.pose_adaptor import VPmatrixPoints, ImageProjModel
from poseCtrl.models.attention_processor import AttnProcessor, PoseAttnProcessor
from poseCtrl.data.dataset import CustomDataset, load_base_points
from poseCtrl.models.posectrl import PoseCtrl
import numpy as np


# Inference configuration: base points file, model identifiers, pose checkpoint, device.
base_point_path=r'F:\Projects\diffusers\Project\PoseCtrl\dataSet\standardVertex.txt'
raw_base_points=load_base_points(base_point_path)  

base_model_path = "runwayml/stable-diffusion-v1-5"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = r"F:\Projects\diffusers\Project\sd-pose_ctrl\trail_3\posectrl2000.bin"  # pose weights loaded by PoseCtrl
device = "cuda"

def image_grid(imgs, rows, cols):
    """Paste ``imgs`` into a single RGB grid image, filled row by row.

    Args:
        imgs: Sequence of images exposing ``.size``; the first image's size is
            used as the cell size for the whole grid.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new ``'RGB'`` image of size ``(cols * w, rows * h)``.

    Raises:
        AssertionError: if ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows*cols

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))

    for i, img in enumerate(imgs):
        # Column is i % cols, row is i // cols.
        grid.paste(img, box=(i%cols*w, i//cols*h))
    return grid

# Build the img2img pipeline, take one dataset sample and generate four images
# with the PoseCtrl wrapper.
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)

# load SD pipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# Take the first sample of the dataset.
path = r"F:\\Projects\\diffusers\\ProgramData\\sample_new"
dataset = CustomDataset(path)
data = dataset[0]
from torchvision import transforms

transform = transforms.Resize((256, 256))

image = data['image']
image = transform(image) 

# Move the first axis last and scale to uint8.
# assumes the dataset yields (C, H, W) tensors with values in [0, 1] — TODO confirm
image_np = image.permute(1, 2, 0).cpu().numpy()
image_np = (image_np * 255).astype(np.uint8)
g_image = data['feature']
g_image = transform(g_image) 

g_image_np = g_image.permute(1, 2, 0).cpu().numpy()
g_image_np = (g_image_np * 255).astype(np.uint8)
# Add a leading batch axis and cast both matrices to float16 on the target device.
vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)

pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
images = pose_model.generate(pil_image=image_np, num_samples=4, num_inference_steps=50, seed=42, image=g_image_np, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
# Show the four samples side by side.
grid = image_grid(images, 1, 4)
grid





In [None]:

# Rebuild the PoseCtrl wrapper and repeat the generation with the same inputs and seed.
pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
images = pose_model.generate(pil_image=image_np, num_samples=4, num_inference_steps=50, seed=42, image=g_image_np, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
# Preview the dataset sample's image with matplotlib.
from matplotlib import pyplot as plt
import numpy as np
image = data['image']
# Convert (C, H, W) to (H, W, C) and to a NumPy array
image_np = image.permute(1, 2, 0).cpu().numpy()
image_np = (image_np * 255).astype(np.uint8)
# Display the image
plt.imshow(image_np)
plt.axis('off')  # hide the axes
plt.show()

In [None]:
import os
import torch
from safetensors.torch import load_file
import pickle

def convert_sd_weights_to_bin(folder_path, output_bin_path):
    """Merge every weight file in a folder into a single ``.bin`` state dict.

    Supported inputs are ``.safetensors``, ``.bin``, ``.pkl`` and ``.pt`` files.
    Files are processed in sorted filename order and the first occurrence of a
    key wins, so the result does not depend on the directory listing order.
    If the output file lives inside ``folder_path`` it is never treated as an
    input, so re-running the conversion does not merge a previous result.

    Args:
        folder_path: Directory containing the weight files.
        output_bin_path: Destination path of the merged ``.bin`` file.
    """
    merged_state_dict = {}  # accumulates the weights of all files
    output_abs = os.path.abspath(output_bin_path)

    # Walk the folder in a deterministic order and load every weight file
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        if os.path.abspath(file_path) == output_abs:
            # A previous run's output must not be merged into itself.
            continue

        if filename.endswith('.safetensors'):
            print(f"Loading {file_path} (safetensors)...")
            state_dict = load_file(file_path, device="cpu")
        elif filename.endswith('.bin'):
            print(f"Loading {file_path} (bin)...")
            state_dict = torch.load(file_path, map_location="cpu")
        elif filename.endswith('.pkl'):
            print(f"Loading {file_path} (pkl)...")
            # SECURITY: unpickling executes arbitrary code; only point this
            # function at folders whose contents are trusted.
            with open(file_path, "rb") as f:
                state_dict = pickle.load(f)
        elif filename.endswith('.pt'):
            print(f"Loading {file_path} (pt)...")
            state_dict = torch.load(file_path, map_location="cpu")
        else:
            print(f"Skipping {file_path}, unsupported format.")
            continue

        # Merge the weights; an already-present key is never overwritten
        for key, value in state_dict.items():
            if key not in merged_state_dict:
                merged_state_dict[key] = value

    # Save to the .bin file
    torch.save(merged_state_dict, output_bin_path)
    print(f"Saved merged weights to {output_bin_path}")

# Example usage
folder_path = r"/content/sd-pose_ctrl/checkpoint-20"  # replace with your folder path
output_bin_path = r"/content/sd-pose_ctrl/checkpoint-20/stable_diffusion.bin"  # destination .bin file path

convert_sd_weights_to_bin(folder_path, output_bin_path)


In [None]:
!pip install omegaconf

In [None]:
from diffusers import StableDiffusionPipeline

# Load the .safetensors file
pipeline = StableDiffusionPipeline.from_single_file(r"F:\Projects\diffusers\ProgramData\tmndMix_tmndMixSPRAINBOW.safetensors")

# Save the model in diffusers format
pipeline.save_pretrained(r"F:\Projects\diffusers\ProgramData\basemodel")


In [None]:
import torch
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, DDIMScheduler, AutoencoderKL
from PIL import Image
import sys
import os
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl')
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl\poseCtrl')
from poseCtrl.models.pose_adaptor import VPmatrixPoints, ImageProjModel
from poseCtrl.models.attention_processor import AttnProcessor, PoseAttnProcessor
from poseCtrl.data.dataset import CustomDataset, load_base_points
from poseCtrl.models.posectrl import PoseCtrl
import numpy as np


# Inference configuration; the base model is a local diffusers-format directory.
base_point_path=r'F:\Projects\diffusers\Project\PoseCtrl\dataSet\standardVertex.txt'
raw_base_points=load_base_points(base_point_path)  

base_model_path = r"F:\Projects\diffusers\ProgramData\basemodel"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = r"F:\Projects\diffusers\Project\sd-pose_ctrl\trail_1\posectrl.bin"  # pose weights loaded by PoseCtrl
device = "cuda"

def image_grid(imgs, rows, cols):
    """Tile ``imgs`` row by row into one RGB image of ``rows`` x ``cols`` cells.

    Each cell has the size of the first image. Raises ``AssertionError`` when
    the number of images is not exactly ``rows * cols``.
    """
    assert len(imgs) == rows*cols

    cell_w, cell_h = imgs[0].size
    canvas = Image.new('RGB', size=(cols*cell_w, rows*cell_h))

    for index, tile in enumerate(imgs):
        # divmod yields (row, column) of the cell for a row-major index.
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col*cell_w, row*cell_h))
    return canvas

# Build the img2img pipeline on the local base model, prepare one dataset sample
# plus an external init image, and generate four images with PoseCtrl.
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)

# load SD pipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

path = r"F:\\Projects\\diffusers\\ProgramData\\sample_new"
dataset = CustomDataset(path)
data = dataset[0]
from torchvision import transforms

transform = transforms.Resize((256, 256))


# Convert the sample tensors to PIL images and resize them to 256x256.
image = data['image']
image_pil = transforms.ToPILImage()(image)
image_pil = transform(image_pil) 
# `image` is then replaced by an external picture used as the img2img init image.
# Raw string: the Windows path contains backslashes that are not valid escapes.
image = Image.open(r'F:\Projects\diffusers\Project\PoseCtrl\image.jpg').convert('RGB')
g_image = data['feature']
g_image_pil = transforms.ToPILImage()(g_image)
g_image_pil = transform(g_image_pil) 

# Add a leading batch axis and cast both matrices to float16 on the target device.
vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)

pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
images = pose_model.generate(pil_image=g_image_pil, num_samples=4, num_inference_steps=50, seed=37, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid


In [None]:
image

In [None]:
image_pil

In [None]:
# Repeat the generation with seed 42.
# NOTE(review): this passes the raw dataset `g_image`, whereas the previous call
# passed the resized PIL `g_image_pil` — confirm which one `generate` expects.
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
image_pil

In [None]:
# Rebuild the PoseCtrl wrapper (reloading `ip_ckpt`) and generate again.
# NOTE(review): passes the raw dataset `g_image` rather than `g_image_pil` — confirm.
pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
# Variant run: 100 inference steps, strength 0.5, with the resized PIL images as inputs.
images = pose_model.generate(pil_image=g_image_pil, num_samples=4, num_inference_steps=100, seed=42, image=image_pil, strength=0.5, V_matrix=vmatrix,P_matrix=pmatrix )
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
# Switch to the validation set and prepare sample 56 the same way as before.
path = r"F:\Projects\diffusers\ProgramData\validation"
dataset = CustomDataset(path)
data = dataset[56]
from torchvision import transforms

transform = transforms.Resize((256, 256))


# Convert the sample tensors to PIL images and resize them to 256x256.
image = data['image']
image_pil = transforms.ToPILImage()(image)
image_pil = transform(image_pil) 

g_image = data['feature']
g_image_pil = transforms.ToPILImage()(g_image)
g_image_pil = transform(g_image_pil) 

# Add a leading batch axis and cast both matrices to float16 on the target device.
vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)
image_pil

In [None]:
# Generate for the validation sample.
# NOTE(review): passes the raw dataset tensors `g_image` / `image` rather than the
# resized `g_image_pil` / `image_pil` — confirm which ones `generate` expects.
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42,image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
import torch
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, DDIMScheduler, AutoencoderKL
from PIL import Image
import sys
import os
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl')
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl\poseCtrl')
from poseCtrl.models.pose_adaptor import VPmatrixPoints, ImageProjModel
from poseCtrl.models.attention_processor import AttnProcessor, PoseAttnProcessor
from poseCtrl.data.dataset import CustomDataset, load_base_points
from poseCtrl.models.posectrl import PoseCtrl
import numpy as np


base_point_path=r'F:\Projects\diffusers\Project\PoseCtrl\dataSet\standardVertex.txt'
raw_base_points=load_base_points(base_point_path)  

base_model_path = r"F:\Projects\diffusers\ProgramData\basemodel"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = r"F:\Projects\diffusers\Project\sd-pose_ctrl\trail_4\posectrl.bin"
device = "cuda"

def image_grid(imgs, rows, cols):
    """Tile equally sized PIL images into a single ``rows`` x ``cols`` RGB sheet.

    Images are placed in row-major order and the tile size is taken from the first image.
    The number of images must equal ``rows * cols``.
    """
    assert rows * cols == len(imgs)

    tile_w, tile_h = imgs[0].size
    sheet = Image.new('RGB', size=(cols * tile_w, rows * tile_h))

    for index, tile in enumerate(imgs):
        row, col = divmod(index, cols)
        sheet.paste(tile, box=(col * tile_w, row * tile_h))
    return sheet

noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)

# load SD pipeline
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)
path = r"F:\\Projects\\diffusers\\ProgramData\\sample_new"
dataset = CustomDataset(path)
data = dataset[344]
from torchvision import transforms

transform = transforms.Resize((256, 256))


image = data['image']
image_pil = transforms.ToPILImage()(image)
image_pil = transform(image_pil)  

g_image = data['feature']
g_image_pil = transforms.ToPILImage()(g_image)
g_image_pil = transform(g_image_pil) 

vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)

pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=100, seed=41, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid


In [None]:
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=10, seed=41, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid


In [None]:
g_image_pil

In [None]:
pose_model = PoseCtrl(pipe, image_encoder_path, ip_ckpt, raw_base_points, device)
# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=41, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
data = dataset[125]
from torchvision import transforms

transform = transforms.Resize((256, 256))


image = data['image']
image_pil = transforms.ToPILImage()(image)
image_pil = transform(image_pil) 

g_image = data['feature']
g_image_pil = transforms.ToPILImage()(g_image)
g_image_pil = transform(g_image_pil) 

vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)

# images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
image = Image.open(r"F:\Projects\diffusers\ProgramData\validation\NPC_Homeworld_Avatar_Loli_Catalyst_Klee_Edit\capture_131.png").convert("RGB")

g_image_pil = Image.open(r"F:\Projects\diffusers\ProgramData\validation\NPC_Homeworld_Avatar_Loli_Catalyst_Klee_Edit\feature.png")

images = pose_model.generate(pil_image=g_image_pil, num_samples=4, num_inference_steps=30, seed=42,image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
images = pose_model.generate(pil_image=g_image_pil, num_samples=4, num_inference_steps=30, seed=42, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
# images = pose_model.generate(pil_image=image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
# images = pose_model.generate(pil_image=image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
# images = pose_model.generate(pil_image=image, num_samples=4, num_inference_steps=50, seed=42, image=image, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
images = pose_model.generate(pil_image=g_image_pil, num_samples=4, num_inference_steps=50, seed=42, image=image_pil, strength=0.6, V_matrix=vmatrix,P_matrix=pmatrix )
grid = image_grid(images, 1, 4)
grid

In [None]:
image_pil

In [None]:
import torch

# Load the .bin checkpoint file onto the CPU.
checkpoint = torch.load(r"F:\Projects\diffusers\Project\sd-pose_ctrl\posectrl.bin", map_location="cpu")

# Inspect the top-level keys (checkpoints often contain 'model_state_dict', 'optimizer_state_dict', etc.).
print(checkpoint.keys())

# Look further into one entry's parameters.
# NOTE(review): the key checked here is "image_proj_model", not 'model_state_dict' — presumably
# the layout written by the PoseCtrl training script; confirm against the saver.
if "image_proj_model" in checkpoint:
    print(checkpoint["image_proj_model"].keys())
    print(checkpoint["image_proj_model"].values())


In [None]:
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPProcessor
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(device)
# inputs = processor(images=data['feature'], return_tensors="pt") 
# image_tensor = inputs["pixel_values"] 
image_embeds = image_encoder(data['feature'].unsqueeze(0).to(device)).image_embeds

In [None]:
image  = Image.open(r"F:\Projects\diffusers\ProgramData\sample_new\NPC_Avatar_Girl_Sword_Ayaka\capture_3.png").convert('RGB')
image = image.resize((256,256))
from matplotlib import pyplot as plt
plt.imshow(image)

In [None]:
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, DDIMScheduler, AutoencoderKL
# pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
#     base_model_path,
#     torch_dtype=torch.float16, 
#     scheduler=noise_scheduler,
#     vae=vae,
#     safety_checker=None
# )
base_model_path = r'F:\Projects\diffusers\ProgramData\basemodel'
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16, 
    # scheduler=noise_scheduler,
    # vae=vae,
    safety_checker=None
)
pipe.to("cuda")  # 把整个模型转移到 GPU


# 运行 pipeline
image = pipe(prompt="1girl,black_hair, blush, closed_eyes, dress, elbow_gloves, breasts, gloves, heart, hug, looking_at_viewer, open_mouth").images


In [None]:
from matplotlib import pyplot as plt
plt.imshow(image[0])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models.resnet import ResnetBlock2D, Upsample2D
import sys
import os
import matplotlib.pyplot as plt
from poseCtrl.data.dataset import load_base_points
from PIL import Image
import cv2
class VPmatrixPointsV1(nn.Module):
    """Project a fixed point cloud through view/projection matrices into binary mask images.

    Input:
        V_matrix: [batch, 4, 4]
        P_matrix: [batch, 4, 4]
        raw_base_points: [num_points, 4] (stored as a buffer; the 4th column becomes w
            after the transform)
    Output:
        masks: [batch, 3, image_height, image_width] float tensor whose values are 0.0 or 1.0.
        The result is allocated on the CPU regardless of the input device.
    """

    def __init__(self, raw_base_points, image_width=512, image_height=512):
        super().__init__()
        self.register_buffer("raw_base_points", raw_base_points)
        self.image_width = image_width
        self.image_height = image_height

    def forward(self, V_matrix, P_matrix):
        """Rasterise the transformed points of every batch item into a smoothed binary mask."""
        VP_matrix = torch.bmm(P_matrix, V_matrix)  # [batch, 4, 4]
        points = self.raw_base_points.unsqueeze(0).expand(VP_matrix.shape[0], -1, -1)
        clip_points = torch.bmm(points, VP_matrix.transpose(1, 2))  # [batch, num_points, 4]

        # Perspective divide; points with w == 0 keep their undivided coordinates.
        w = clip_points[..., 3:4]
        ndc = torch.where(w != 0, clip_points[..., :3] / w, clip_points[..., :3])

        image_width, image_height = self.image_width, self.image_height
        # X: [-1, 1] -> [0, width];  Y is flipped: [-1, 1] -> [height, 0].
        pixel_x = ((ndc[..., 0] + 1) * 0.5 * image_width).round().long()
        pixel_y = ((1 - (ndc[..., 1] + 1) * 0.5) * image_height).round().long()
        pixel_x = pixel_x.cpu().numpy()  # [batch, num_points]
        pixel_y = pixel_y.cpu().numpy()

        batch_size = pixel_x.shape[0]
        tensor_images = torch.zeros((batch_size, 3, image_height, image_width), dtype=torch.uint8)
        kernel = np.ones((3, 3), np.uint8)  # loop-invariant dilation kernel

        for b in range(batch_size):
            xs, ys = pixel_x[b], pixel_y[b]
            # Keep only the points that land inside the image, then mark them in one shot.
            inside = (xs >= 0) & (xs < image_width) & (ys >= 0) & (ys < image_height)
            point_mask = np.zeros((image_height, image_width), dtype=np.uint8)
            point_mask[ys[inside], xs[inside]] = 255

            # Grow the points, blur the result and re-threshold to get a clean solid mask.
            dilated_image = cv2.dilate(point_mask, kernel, iterations=1)
            smoothed_image = cv2.GaussianBlur(dilated_image, (7, 7), 0)
            _, binary_mask = cv2.threshold(smoothed_image, 100, 255, cv2.THRESH_BINARY)

            # The [H, W] mask is broadcast across the three channels.
            tensor_images[b] = torch.from_numpy(binary_mask)
        return tensor_images.float() / 255
    

import numpy as np

from poseCtrl.data.dataset import CustomDataset

path = r"F:\\Projects\\diffusers\\ProgramData\\sample_new"
dataset = CustomDataset(path)
data = dataset[0]

# # Generate VP Matrix
# vp_matrix = data['projection_matrix'] @ data['view_matrix']
# model = VPmatrixEncoder()
# vp_matrix_tensor = vp_matrix.float().unsqueeze(0)

# # Model Testing
# model = VPmatrixEncoder()
# output = model(vp_matrix_tensor)

# print("Input shape:", vp_matrix_tensor.shape)  # Expected: (1, 1, 4, 4)
# print("Output shape:", output.shape)  # Expected: (1, 77, 77)


path=r'F:\Projects\diffusers\Project\PoseCtrl\dataSet\standardVertex.txt'
raw_base_points=load_base_points(path)
points = VPmatrixPointsV1(raw_base_points)
with torch.no_grad():
    base_points=points(data['view_matrix'].unsqueeze(0), data['projection_matrix'].unsqueeze(0))
print(base_points.shape)

In [None]:
# NOTE(review): VPmatrixPointsV1.forward already returns `tensor_images.float() / 255`, so if
# `base_points` came from that module this divides by 255 a second time — confirm intent.
base_points = base_points.float() / 255  # convert to float and normalise

torch.max(base_points)


In [None]:
image_pil = transforms.ToPILImage()(base_points[0])

In [None]:
# from torchvision import transforms
# feature = Image.open(r"F:\Projects\diffusers\ProgramData\pic\105901_unit (1)\capture_0.png").convert('RGB')
# # feature.shape
# transform = transforms.Compose([
#             transforms.Resize((512, 512)),  
#             transforms.ToTensor(), 
#         ])
# feature = transform(base_points)
# feature.shape


In [None]:
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPProcessor
image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
inputs = processor(images=base_points, return_tensors="pt") 
image_tensor = inputs["pixel_values"]
point_embeds = image_encoder(image_tensor).image_embeds
# point_embeds.shape


In [None]:
import torch
import cv2
import numpy as np
from PIL import Image

def forward(V_matrix, P_matrix, raw_base_points, image_width=512, image_height=512):
    """Project a point cloud to screen space and rasterise one smoothed mask per batch item.

    Args:
        V_matrix: [batch, 4, 4] view matrices.
        P_matrix: [batch, 4, 4] projection matrices.
        raw_base_points: [num_points, 4] points; the 4th column becomes w after the transform.
        image_width: width of the output masks in pixels.
        image_height: height of the output masks in pixels.

    Returns:
        uint8 tensor of shape [batch, image_height, image_width] with values 0 or 255,
        allocated on the CPU.
    """
    # View-projection matrix.
    VP_matrix = torch.bmm(P_matrix, V_matrix)  # [batch, 4, 4]

    # Share the same point cloud across the batch and transform it.
    points = raw_base_points.unsqueeze(0).expand(VP_matrix.shape[0], -1, -1)
    clip_points = torch.bmm(points, VP_matrix.transpose(1, 2))  # [batch, num_points, 4]

    # Perspective divide; points with w == 0 keep their undivided coordinates.
    w = clip_points[..., 3:4]
    ndc = torch.where(w != 0, clip_points[..., :3] / w, clip_points[..., :3])

    # Map to pixel coordinates. X: [-1, 1] -> [0, width]; Y is flipped: [-1, 1] -> [height, 0].
    pixel_x = ((ndc[..., 0] + 1) * 0.5 * image_width).round().long()
    pixel_y = ((1 - (ndc[..., 1] + 1) * 0.5) * image_height).round().long()
    pixel_x = pixel_x.cpu().numpy()  # [batch, num_points]
    pixel_y = pixel_y.cpu().numpy()

    batch_size = pixel_x.shape[0]
    tensor_images = torch.zeros((batch_size, image_height, image_width), dtype=torch.uint8)
    kernel = np.ones((3, 3), np.uint8)  # loop-invariant dilation kernel

    for b in range(batch_size):
        xs, ys = pixel_x[b], pixel_y[b]
        # Keep only the points that land inside the image, then mark them in one shot.
        inside = (xs >= 0) & (xs < image_width) & (ys >= 0) & (ys < image_height)
        point_mask = np.zeros((image_height, image_width), dtype=np.uint8)
        point_mask[ys[inside], xs[inside]] = 255

        # Dilate once so neighbouring points merge.
        dilated_image = cv2.dilate(point_mask, kernel, iterations=1)

        # Gaussian blur to soften the edges.
        smoothed_image = cv2.GaussianBlur(dilated_image, (7, 7), 0)

        # Re-binarise so the result is a crisp mask.
        _, binary_mask = cv2.threshold(smoothed_image, 100, 255, cv2.THRESH_BINARY)

        tensor_images[b] = torch.from_numpy(binary_mask)

    return tensor_images  # [batch, image_height, image_width]

# 示例调用
# output_tensor = forward(V_matrix, P_matrix, raw_base_points)
# output_tensor.shape -> [batch, 512, 512]



In [None]:
from transformers import CLIPImageProcessor
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPProcessor 
from PIL import Image
from diffusers import UNet2DConditionModel
import torch
image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
processor = CLIPImageProcessor()
image = Image.open(r"F:\Projects\diffusers\ProgramData\image_mirror_resized\batch_00000\6857739.jpg").convert("RGB")
unet = UNet2DConditionModel.from_pretrained(r'F:\Projects\diffusers\ProgramData\basemodel', subfolder="unet")
inputs = processor(images=image, return_tensors="pt").pixel_values
print(inputs.shape)  # 输出形状应为 [1, 3, 224, 224]，表示一个图像的输入张量
image_embeds = image_encoder(inputs).image_embeds

class ImageProjModel(torch.nn.Module):
    """Expand one image embedding per sample into a few cross-attention context tokens.

    A single linear layer maps each embedding to ``clip_extra_context_tokens`` vectors of
    width ``cross_attention_dim``; every token is then layer-normalised.
    """

    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
        super().__init__()

        self.generator = None
        self.cross_attention_dim = cross_attention_dim
        self.clip_extra_context_tokens = clip_extra_context_tokens
        flat_token_width = clip_extra_context_tokens * cross_attention_dim
        self.proj = torch.nn.Linear(clip_embeddings_dim, flat_token_width)
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds):
        """Return normalised tokens of shape [-1, clip_extra_context_tokens, cross_attention_dim]."""
        projected = self.proj(image_embeds)
        tokens = projected.reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
        return self.norm(tokens)
    
image_proj_model = ImageProjModel(
        cross_attention_dim=unet.config.cross_attention_dim,
        clip_embeddings_dim=image_encoder.config.projection_dim,
        clip_extra_context_tokens=4,
    )

feature_tokens = image_proj_model(image_embeds)
print(feature_tokens.shape)  # 输出形状应为 [1, 4, 1024]，表示 4 个额外的上下文 token，每个 token 的维度为 1024

In [None]:
import torch.nn as nn
image_embeds
image_embeds_t = nn.functional.normalize(image_embeds, dim=-1)
loss = torch.nn.functional.mse_loss(image_embeds_t, image_embeds)
print(loss.item())  # 输出损失值

In [None]:
text_encoder = CLIPTextModel.from_pretrained(r'F:\Projects\diffusers\ProgramData\basemodel', subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(r'F:\Projects\diffusers\ProgramData\basemodel', subfolder="tokenizer")
text='a highly detailed anime girl, in front of a pure black background'
text_input_ids = tokenizer(
            text,
            max_length=tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids

encoder_hidden_states = text_encoder(text_input_ids)[0]
encoder_hidden_states.shape

In [None]:
import os
import sys
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl')
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl\poseCtrl')
from poseCtrl.data.dataset import CustomDataset_v4, load_base_points, CombinedDataset
from transformers import CLIPImageProcessor
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPProcessor 
import torch
import torch.nn as nn
tokenizer = CLIPTokenizer.from_pretrained(r'F:\Projects\diffusers\ProgramData\basemodel', subfolder="tokenizer")
data_root_path_2 = r'F:\Projects\diffusers\Backup\image_mirror_resized'
train_dataset = CombinedDataset(
    # path1=args.data_root_path_1,
    path2=data_root_path_2,
    # path3=args.data_root_path_3,
    # path4=args.data_root_path_4,
    # path5=args.data_root_path_5,
    tokenizer=tokenizer
)
class Encoder(torch.nn.Module):
    """MLP that embeds a batch of 4x4 matrices as 1024-dimensional vectors."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        widths = (16, 128, 512, 1024)
        layers = [nn.Flatten()]  # (B, 4, 4) -> (B, 16)
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers.pop()  # no activation after the last projection: output is (B, 1024)
        self.model = nn.Sequential(*layers)

    def forward(self, vp_matrix):
        """Run the MLP on ``vp_matrix`` and return the (B, 1024) embedding."""
        return self.model(vp_matrix)
    
class Decoder(torch.nn.Module):
    """MLP that maps 1024-dimensional embeddings back to 4x4 matrices."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        widths = (1024, 512, 128, 16)
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]
        # The slot after the last linear layer holds the reshape instead of an activation:
        # (B, 16) -> (B, 4, 4)
        layers[-1] = nn.Unflatten(1, (4, 4))
        self.model = nn.Sequential(*layers)

    def forward(self, image_embeds):
        """
        image_embeds: [batch, 1024]
        returns: [batch, 4, 4]
        """
        return self.model(image_embeds)
    
class Autoencodermodel(torch.nn.Module):
    """Pairs a view-projection-matrix Encoder/Decoder with a CLIP vision encoder.

    ``Encoder`` embeds the view-projection matrix, ``Decoder`` predicts a 4x4 matrix from
    the CLIP image embedding, and ``clipencoder`` produces that image embedding.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.Encoder = Encoder()  # matrix -> embedding
        self.Decoder = Decoder()  # embedding -> matrix
        self.clipencoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K").to("cuda")
        # self.set_training()

    def set_training(self):
        """Enable gradients for Encoder/Decoder and freeze the CLIP encoder."""
        # nn.Module exposes `requires_grad_`; the previous `require_grad_` spelling raised
        # AttributeError as soon as this method was called.
        self.Encoder.requires_grad_(True)
        self.Decoder.requires_grad_(True)
        self.clipencoder.requires_grad_(False)

    def forward(self, images, v_matrix, p_matrix):
        """
        images: [batch, 3, 512, 512]
        v_matrix: [batch, 4, 4]
        p_matrix: [batch, 4, 4]

        Returns ``(compare_vp, encoded_vp, image_embeds)``: the Decoder output for the image
        embedding, the Encoder output for ``p_matrix @ v_matrix``, and the CLIP image embedding.

        Relies on a module-level ``processor`` being defined before the first call.
        """
        # presumably images arrive in [-1, 1] and are mapped to [0, 1] here — confirm against the dataset
        images = (images + 1.0) / 2.0
        inputs = processor(images=images, return_tensors="pt").pixel_values.to("cuda")
        image_embeds = self.clipencoder(inputs).image_embeds

        vp_matrix = torch.bmm(p_matrix, v_matrix)
        encoded_vp = self.Encoder(vp_matrix)

        compare_vp = self.Decoder(image_embeds)

        return compare_vp, encoded_vp, image_embeds


model = Autoencodermodel()


In [None]:
from torchvision.transforms import ToPILImage
images = train_dataset[0]['image']
images = (images + 1.0) / 2.0
image = ToPILImage()(images)
image

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.optim as optim

from torch.utils.data import random_split

import os  # needed for the checkpoint paths below; keeps this cell runnable on a fresh kernel

# Hold out 10% of the dataset for testing (90% train / 10% test).
train_len = int(len(train_dataset) * 0.9)
test_len = len(train_dataset) - train_len
train_set, test_set = random_split(train_dataset, [train_len, test_len])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=4)

test_iter = iter(test_loader)  # one test batch is drawn after every training batch


# Build the model and resume the Encoder/Decoder weights from the saved checkpoint.
model = Autoencodermodel()
model.to("cuda")
checkpoint = torch.load(r"F:\Projects\diffusers\Backup\ckpt\autoencoder_checkpoint.pth", map_location="cuda")
model.Encoder.load_state_dict(checkpoint['encoder'])
model.Decoder.load_state_dict(checkpoint['decoder'])
# Freeze the CLIP encoder; only Encoder and Decoder are trained.
model.clipencoder.eval()
model.clipencoder.requires_grad_(False)
model.Encoder.requires_grad_(True)
model.Decoder.requires_grad_(True)
model.Encoder.train()
model.Decoder.train()
save_root = r"F:\Projects\diffusers\Backup\ckpt"
# Loss functions (built once instead of on every step).
mse_loss = nn.MSELoss()
ce_loss = nn.CrossEntropyLoss()
temperature = 0.07  # softmax temperature applied to the similarity logits

# Optimizer (only the Encoder and Decoder parameters are updated).
optimizer = optim.Adam(
    list(model.Encoder.parameters()) + list(model.Decoder.parameters()),
    lr=1e-4
)

# Processor matching the CLIP checkpoint; Autoencodermodel.forward reads this global.
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

# Training loop
epochs = 10
for epoch in range(epochs):
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in pbar:
        # Only the trainable sub-modules go back to train mode. Calling model.train() here
        # would also switch the frozen CLIP encoder out of eval mode.
        model.Encoder.train()
        model.Decoder.train()
        images = batch["image"].to("cuda")
        v_matrix = batch["view_matrix"].to("cuda")
        p_matrix = batch["projection_matrix"].to("cuda")

        optimizer.zero_grad()
        compare_vp, encoded_vp, image_embeds = model(images, v_matrix, p_matrix)
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        encoded_vp = nn.functional.normalize(encoded_vp, dim=-1)
        # Cosine-similarity matrix (B, B); the diagonal holds the matching pairs.
        logits = torch.matmul(encoded_vp, image_embeds.T) / temperature
        labels = torch.arange(logits.size(0), device=logits.device)
        loss_align = ce_loss(logits, labels) + ce_loss(logits.T, labels)
        loss_reconstruct = mse_loss(compare_vp, torch.bmm(p_matrix, v_matrix))
        # loss_align = mse_loss(encoded_vp, image_embeds)
        loss = 20 * loss_reconstruct + loss_align

        loss.backward()
        optimizer.step()

        # ==== Evaluate on one test batch ====
        model.eval()
        try:
            test_batch = next(test_iter)
        except StopIteration:
            # Test loader exhausted: start over from its first batch.
            test_iter = iter(test_loader)
            test_batch = next(test_iter)

        with torch.no_grad():
            images_t = test_batch["image"].to("cuda")
            v_matrix_t = test_batch["view_matrix"].to("cuda")
            p_matrix_t = test_batch["projection_matrix"].to("cuda")

            compare_vp_t, encoded_vp_t, image_embeds_t = model(images_t, v_matrix_t, p_matrix_t)
            # L2-normalise the feature vectors (batch, 1024).
            image_embeds_t = nn.functional.normalize(image_embeds_t, dim=-1)
            encoded_vp_t = nn.functional.normalize(encoded_vp_t, dim=-1)
            # Cosine-similarity matrix (B, B); the diagonal holds the matching pairs.
            logits_t = torch.matmul(encoded_vp_t, image_embeds_t.T) / temperature
            labels_t = torch.arange(logits_t.size(0), device=logits_t.device)

            loss_align_t = ce_loss(logits_t, labels_t) + ce_loss(logits_t.T, labels_t)

            loss_recon_t = mse_loss(compare_vp_t, torch.bmm(p_matrix_t, v_matrix_t))
            # loss_align_t = mse_loss(encoded_vp_t, image_embeds_t)
            loss_test = 20 * loss_recon_t + loss_align_t

        # ==== Progress-bar readout ====
        pbar.set_postfix({
            "train_loss": loss.item(),
            "test_loss": loss_test.item(),
            "recon_t": loss_recon_t.item(),
            "align_t": loss_align_t.item()
        })
    # Save the Encoder/Decoder weights once per epoch in their own sub-directory.
    epoch_dir = os.path.join(save_root, f"epoch_{epoch+1}")
    os.makedirs(epoch_dir, exist_ok=True)

    save_path = os.path.join(epoch_dir, "autoencoder_checkpoint.pth")
    torch.save({
        'encoder': model.Encoder.state_dict(),
        'decoder': model.Decoder.state_dict(),
        'epoch': epoch + 1
    }, save_path)


In [None]:
# === Re-create the model and load the saved weights ===
model = Autoencodermodel()
model.Decoder = Decoder()  # don't forget to attach the decoder
model.to("cuda")

checkpoint = torch.load(r"F:\Projects\diffusers\Backup\ckpt\autoencoder_checkpoint.pth", map_location="cuda")
model.Encoder.load_state_dict(checkpoint['encoder'])
model.Decoder.load_state_dict(checkpoint['decoder'])
model.eval()
model.clipencoder.eval()
print(f"Loaded checkpoint from epoch {checkpoint['epoch']}")
# Take a single sample.
# NOTE(review): the original comment said "from test_set", but the sample is read from
# train_dataset — confirm which split is intended.
sample = train_dataset[2]

# Add a batch dimension and move everything to cuda.
image = sample["image"].unsqueeze(0).to("cuda")              # [1, 3, H, W]
v_matrix = sample["view_matrix"].unsqueeze(0).to("cuda")     # [1, 4, 4]
p_matrix = sample["projection_matrix"].unsqueeze(0).to("cuda")  # [1, 4, 4]

# Build the processor (CLIPProcessor is required here).
from transformers import CLIPProcessor
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

# Convert to a PIL image before handing it to the processor.
# NOTE(review): Autoencodermodel.forward applies (images + 1.0) / 2.0 before the processor;
# this path skips that rescale — confirm the value range of sample["image"].
from torchvision import transforms
image_pil = transforms.ToPILImage()(image[0].cpu())
inputs = processor(images=image_pil, return_tensors="pt").pixel_values.to("cuda")

# Forward pass
with torch.no_grad():
    image_embeds = model.clipencoder(inputs).image_embeds
    vp_matrix = torch.bmm(p_matrix, v_matrix)
    encoded_vp = model.Encoder(vp_matrix)
    decoded_vp = model.Decoder(image_embeds)
    
    # Both losses are plain MSE on the un-normalised tensors.
    loss_recon = nn.MSELoss()(decoded_vp, vp_matrix)
    loss_align = nn.MSELoss()(encoded_vp, image_embeds)
    print(f"[Single Test] Recon Loss: {loss_recon.item():.4f}, Align Loss: {loss_align.item():.4f}")


In [None]:
encoded_vp = nn.functional.normalize(encoded_vp, dim=-1)

In [None]:
image_embeds = nn.functional.normalize(image_embeds, dim=-1)

In [None]:
image_embeds_ = image_embeds

In [None]:
import torch.nn.functional as F
similarity = F.cosine_similarity(encoded_vp, image_embeds_, dim=-1).mean() 
similarity

In [None]:
vp_matrix

In [None]:
decoded_vp

In [7]:
import os

def merge_joints2d(root_dir, output_path):
    """Concatenate every ``selected_joints2d.txt`` found one level below ``root_dir``.

    Each immediate sub-directory of ``root_dir`` that contains a ``selected_joints2d.txt``
    file contributes one section to ``output_path``: a ``# <folder name>`` header, the
    file's lines with surrounding whitespace stripped, and a trailing blank line.

    Sub-directories are visited in sorted name order so the merged file is identical
    across runs and file systems (``os.listdir`` returns entries in arbitrary order).

    Args:
        root_dir: directory whose sub-directories are scanned.
        output_path: file to (over)write with the merged content, encoded as UTF-8.
    """
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for folder_name in sorted(os.listdir(root_dir)):
            joints_path = os.path.join(root_dir, folder_name, 'selected_joints2d.txt')
            # isfile() is False for plain files in root_dir, for folders without a joints
            # file, and for a directory that merely carries the expected name.
            if not os.path.isfile(joints_path):
                continue

            out_file.write(f"# {folder_name}\n")  # section header: the source folder name
            with open(joints_path, 'r', encoding='utf-8') as f:
                out_file.writelines(line.strip() + '\n' for line in f)
            out_file.write('\n')  # blank line between sections

# 使用示例
root_folder = r"F:\Projects\diffusers\ProgramData\points\output_2\output"
output_txt = r"F:\Projects\diffusers\ProgramData\points\output_2\output/merged_joints2d.txt"
merge_joints2d(root_folder, output_txt)


In [3]:
import os
import shutil
from tqdm import tqdm

def collect_all_paths(folder):
    """Return every path under ``folder``, children before parents, ``folder`` itself last.

    The tree is walked bottom-up; for each directory its files are listed before its
    sub-directories.
    """
    ordered = []
    for current, subdirs, filenames in os.walk(folder, topdown=False):
        ordered.extend(os.path.join(current, name) for name in filenames)
        ordered.extend(os.path.join(current, name) for name in subdirs)
    ordered.append(folder)  # the top-level folder is removed last
    return ordered

def delete_folder_with_progress(folder_path):
    """Delete ``folder_path`` and everything below it while showing a tqdm progress bar.

    Prints a message and returns if the path does not exist. A failure on an individual
    entry is printed and skipped so the remaining entries are still processed.
    """
    if not os.path.exists(folder_path):
        print("路径不存在")
        return

    for path in tqdm(collect_all_paths(folder_path), desc="Deleting", unit="item"):
        try:
            # Files and symlinks are unlinked; anything else that is a directory is removed
            # with rmdir, which only succeeds once the directory is empty.
            if os.path.isfile(path) or os.path.islink(path):
                os.remove(path)
                continue
            if os.path.isdir(path):
                os.rmdir(path)
        except Exception as e:
            print(f"无法删除: {path}, 错误: {e}")

# 使用示例
folder_to_delete = r"F:\Projects\diffusers\ProgramData\02_output"
delete_folder_with_progress(folder_to_delete)


Deleting: 100%|██████████| 155970/155970 [01:57<00:00, 1326.50item/s]


In [9]:
import os
import sys
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl')
sys.path.append(r'F:\Projects\diffusers\Project\PoseCtrl\poseCtrl')
from poseCtrl.data.dataset import CustomDataset_v4, load_base_points, CombinedDataset, CombinedDatasetTest
from transformers import CLIPImageProcessor
from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPProcessor 
import torch
import torch.nn as nn
processor = CLIPImageProcessor()
tokenizer = CLIPTokenizer.from_pretrained(r'F:\Projects\diffusers\ProgramData\basemodel', subfolder="tokenizer")
data_root_path_2 = r"F:\Projects\diffusers\ProgramData\test"
txt_subdir_name = r'F:\Projects\diffusers\ProgramData\new_data\image\smpl'
train_dataset = CombinedDatasetTest(
    # path1=args.data_root_path_1,
    path2=data_root_path_2,
    # path3=data_root_path_3,
    # path4=args.data_root_path_4,
    # path5=args.data_root_path_5,
    tokenizer=tokenizer,
    txt_subdir_name=txt_subdir_name
)

Loading data from path2: F:\Projects\diffusers\ProgramData\test


In [11]:
train_dataset[4]['points'].shape

torch.Size([10475, 3])

In [39]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np
class STNkd(nn.Module):
    """Predict a ``k`` x ``k`` transform per sample from per-point features.

    Input:
        x: [batch, k, num_points]
    Output:
        [batch, k, k] — the network output with the identity matrix added, so a network
        that outputs zeros yields the identity transform.
    """

    def __init__(self, k=64):
        super(STNkd, self).__init__()
        self.conv1 = torch.nn.Conv1d(k, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k * k)
        # Not used by forward(); kept so the module's attribute set stays unchanged.
        self.relu = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

        self.k = k

    def forward(self, x):
        """Return the [batch, k, k] transform for ``x`` of shape [batch, k, num_points]."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = torch.max(x, 2, keepdim=True)[0]  # max-pool over the points axis
        x = x.view(-1, 1024)
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        x = self.fc3(x)

        # Build the flattened identity directly on x's device and dtype. The earlier
        # CPU/NumPy construction followed by `.cuda()` always targeted the default GPU
        # and float32, whatever x was.
        iden = torch.eye(self.k, dtype=x.dtype, device=x.device).flatten()
        x = x + iden  # broadcast over the batch
        x = x.view(-1, self.k, self.k)
        return x
    
class PointNetEncoder(nn.Module):
    """Encode a projected point cloud into a ``(seq_len, embed_dim)`` sequence.

    The points are transformed by ``P_matrix @ V_matrix``, perspective-divided,
    passed through pointwise convolutions with an ``STNkd`` feature transform,
    and finally projected along the point axis and the channel axis.

    Args:
        channel: Input channels of the first convolution. ``forward`` always
            feeds the three projected coordinates, so the default of 3 is the
            value that matches.
        num_points: Number of points per cloud. ``feature_proj`` is a linear
            layer over the point axis, so inputs must have exactly this many
            points.
        embed_dim: Size of the last axis of the returned features.
        seq_len: Length of the returned sequence axis.
    """

    def __init__(self, channel=3, num_points=10475, embed_dim=768, seq_len=77):
        super().__init__()
        self.conv1 = torch.nn.Conv1d(channel, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.fstn = STNkd(k=64)

        self.feature_proj = nn.Linear(num_points, embed_dim)
        self.seq_proj = nn.Linear(1024, seq_len)
        self.norm = nn.LayerNorm(embed_dim)
        self.act = nn.GELU()

    def forward(self, x, V_matrix, P_matrix):
        """Project the points and encode them.

        Args:
            x: Point coordinates of shape ``(batch, num_points, 3)``.
            V_matrix: Batched view matrices, ``(batch, 4, 4)``.
            P_matrix: Batched projection matrices, ``(batch, 4, 4)``.

        Returns:
            Tuple ``(features, trans_feat)`` where ``features`` has shape
            ``(batch, seq_len, embed_dim)`` and ``trans_feat`` is the
            ``(batch, 64, 64)`` matrix predicted by ``self.fstn``.
        """
        batch, num_points, _ = x.size()
        trans = torch.bmm(P_matrix, V_matrix)

        # Append a homogeneous coordinate of 1 (same dtype/device as x).
        ones = x.new_ones(batch, num_points, 1)
        x = torch.cat([x, ones], dim=2)              # [b, n, 4]
        x = torch.bmm(x, trans.transpose(1, 2))      # [b, n, 4]

        # Perspective divide where w != 0; done out-of-place so the tensors
        # saved for the division are not overwritten (an in-place slice write
        # can break autograd when gradients flow through these inputs).
        xyz, w = x[..., :3], x[..., 3:4]
        x = torch.where(w != 0, xyz / w, xyz)        # [b, n, 3]

        x = x.transpose(2, 1)                        # [b, 3, n]
        x = F.relu(self.bn1(self.conv1(x)))          # [b, 64, n]
        trans_feat = self.fstn(x)                    # [b, 64, 64]
        x = x.transpose(2, 1)
        x = torch.bmm(x, trans_feat)
        x = x.transpose(2, 1)                        # [b, 64, n]

        x = F.relu(self.bn2(self.conv2(x)))
        x = self.bn3(self.conv3(x))                  # [b, 1024, n]
        x = self.feature_proj(x)                     # [b, 1024, embed_dim]
        x = self.norm(x)
        x = self.act(x)
        x = x.transpose(1, 2)                        # [b, embed_dim, 1024]
        x = self.seq_proj(x)                         # [b, embed_dim, seq_len]
        x = x.transpose(1, 2)                        # [b, seq_len, embed_dim]
        return x, trans_feat
        
pointnet_encoder = PointNetEncoder(channel=3).to("cuda")
pointnet_encoder.eval()

# Fetch the sample once so the points and both matrices come from the same
# __getitem__ call instead of loading the item three times.
sample = train_dataset[4]
input_points = sample['points'].unsqueeze(0).to("cuda")  # [1, N, 3]
v_matrix = sample['view_matrix'].unsqueeze(0).to("cuda")  # [1, 4, 4]
p_matrix = sample['projection_matrix'].unsqueeze(0).to("cuda")  # [1, 4, 4]

# `point` is the (features, trans_feat) tuple returned by the encoder.
with torch.no_grad():
    point = pointnet_encoder(input_points, v_matrix, p_matrix)
print(point[0].shape)  # [1, 77, 768]
print(point[1].shape)  # [1, 64, 64]

torch.Size([1, 77, 768])
torch.Size([1, 64, 64])


In [41]:
def feature_transform_reguliarzer(trans):
    """Penalize how far each transform matrix is from being orthogonal.

    For every matrix ``A`` in the batch this computes the Frobenius norm of
    ``A @ A^T - I`` and returns the mean over the batch.

    Args:
        trans: Batched square matrices of shape ``(batch, d, d)``.

    Returns:
        Scalar tensor with the mean norm.
    """
    d = trans.size()[1]
    # Create the identity on the same device and dtype as `trans`; calling
    # `.cuda()` would always pick the default CUDA device.
    identity = torch.eye(d, device=trans.device, dtype=trans.dtype)[None, :, :]
    gram = torch.bmm(trans, trans.transpose(2, 1))
    return torch.mean(torch.norm(gram - identity, dim=(1, 2)))
# Evaluate the regularizer on point[1], the second element returned by the encoder call.
ft_loss = feature_transform_reguliarzer(point[1])
ft_loss  # bare expression so the notebook displays the value

tensor(3.5578, device='cuda:0')

# TODO
- attention processor (atten_pro)
- inference
- dataset check
- training debug