In [None]:
# 切换工作目录
import os
import time
import torch
import argparse
from omegaconf import OmegaConf

# Make sure the kernel runs from inside the MimicMotion checkout, because the
# cells below use paths relative to it.
current_dir = os.getcwd()
target_dir_name = 'MimicMotion'

target_dir = os.path.join(current_dir, target_dir_name)

# Compare the last path component exactly: a plain string-suffix test would
# also accept directories such as ".../NotMimicMotion".
if os.path.basename(os.path.normpath(current_dir)) == target_dir_name:
    print(f"Already in the target directory: {current_dir}")
elif os.path.isdir(target_dir):
    # isdir (rather than exists) keeps a regular file with the same name
    # from reaching os.chdir and raising.
    os.chdir(target_dir)
    print(f"Current working directory changed to: {os.getcwd()}")
else:
    print(f"Target directory {target_dir} does not exist.")

# 优化模型

## 1. 处理视频数据

In [None]:
import sys
import cv2
import ast
import einops
import numpy as np
import torch
import random
from PIL import Image
from pathlib import Path
from config import mimicmotion_root

def load():
    """Put the MimicMotion source directories at the front of ``sys.path``.

    The directories are ``mimicmotion_root`` itself plus its ``MeshGraphormer``,
    ``mimicmotion/modules`` and ``dataset`` sub-directories. The last one listed
    ends up first on ``sys.path``.
    """
    search_paths = [mimicmotion_root]
    for parts in (('MeshGraphormer',), ('mimicmotion', 'modules'), ('dataset',)):
        search_paths.append(os.path.join(mimicmotion_root, *parts))
    # Inserting each entry at index 0 in turn is the same as prepending the
    # reversed list in one slice assignment.
    sys.path[:0] = search_paths[::-1]
load()

import mediapipe as mp
# from mediapipe.tasks.python.vision import ImageFormat
from controlnet_aux.util import HWC3, resize_image

from mimicmotion.modules.meshgraphormer import MeshGraphormerMediapipe
from mimicmotion.dwpose.util import draw_bodypose, draw_handpose, draw_facepose
from mimicmotion.utils.utils import get_fps, save_videos_from_pil, read_frames, read_handframes

# Shared detector instance; it is passed as `detector` to
# process_batch_videos_hand further down in this notebook.
meshgraphormer = MeshGraphormerMediapipe()

In [None]:
def process_single_video_hand(video_path, detector, root_dir, save_dir, save_mask_dir, detect_resolution=576, image_resolution=512, output_type="pil", padding_bbox=30):
    """Run the hand detector on every frame of one video and save the results.

    Two videos are written: one built from the per-frame depth maps and one
    built from the per-frame masks. The path of ``video_path`` relative to
    ``root_dir`` is reproduced under ``save_dir`` and ``save_mask_dir``.
    If both output files already exist the function returns immediately.

    Args:
        video_path: Path of the source video.
        detector: Object with a ``get_hand(image, padding_bbox, name)`` method
            returning ``(depthmap, mask, info)``; ``depthmap`` may be ``None``.
        root_dir: Directory that ``video_path`` is made relative to.
        save_dir: Root directory for the depth-map video.
        save_mask_dir: Root directory for the mask video.
        detect_resolution: Unused; kept for interface compatibility.
        image_resolution: Unused; kept for interface compatibility.
        output_type: When ``"pil"`` each result is converted to a PIL image
            before being collected.
        padding_bbox: Forwarded unchanged to ``detector.get_hand``.

    Returns:
        None.
    """
    relative_path = os.path.relpath(video_path, root_dir)

    out_path = os.path.join(save_dir, relative_path)
    out_mask_path = os.path.join(save_mask_dir, relative_path)
    print('relative_path, video_path, root_dir', relative_path, video_path, root_dir)
    if os.path.exists(out_path) and os.path.exists(out_mask_path):
        return

    # exist_ok=True already covers the "directory is present" case, so no
    # separate existence check is needed.
    for destination in (out_path, out_mask_path):
        Path(os.path.dirname(destination)).mkdir(parents=True, exist_ok=True)

    fps = get_fps(video_path)
    frames = read_frames(video_path)
    kps_results = []
    kps_mask_results = []
    for i, frame_pil in enumerate(frames):
        print(f"Processing frame {i+1}", end='\r')

        input_image = np.array(frame_pil)
        depthmap, mask, _info = detector.get_hand(input_image, padding_bbox, relative_path)

        if depthmap is None:
            # Size the blank fallback from the current frame. The previous code
            # read H and W before they were assigned, which raised
            # UnboundLocalError whenever the first frame had no detection.
            height, width = input_image.shape[:2]
            depthmap = np.zeros((height, width, 3), dtype=np.uint8)
            mask = np.zeros((height, width, 3), dtype=np.uint8)

        depthmap = HWC3(depthmap)
        mask = HWC3(mask)
        # The former cv2.resize calls resized each array to its own size and
        # therefore changed nothing; they have been dropped.

        if output_type == "pil":
            depthmap = Image.fromarray(depthmap)
            mask = Image.fromarray(mask)

        kps_results.append(depthmap)
        kps_mask_results.append(mask)

    save_videos_from_pil(kps_results, out_path, fps=fps)
    save_videos_from_pil(kps_mask_results, out_mask_path, fps=fps)

def process_batch_videos_hand(video_list, detector, root_dir, save_dir, pose_mask_dir):
    """Process each video in ``video_list`` with ``process_single_video_hand``.

    A progress line with the zero-based position and the total count is
    printed before each video.

    Args:
        video_list: Sized iterable of video paths.
        detector: Detector object forwarded to ``process_single_video_hand``.
        root_dir: Root directory of the source videos.
        save_dir: Output root for the depth-map videos.
        pose_mask_dir: Output root for the mask videos.
    """
    total = len(video_list)
    for position, current_video in enumerate(video_list):
        print(f"Process {position}/{total} video")
        process_single_video_hand(
            video_path=current_video,
            detector=detector,
            root_dir=root_dir,
            save_dir=save_dir,
            save_mask_dir=pose_mask_dir,
        )

In [None]:
# Source videos live under pose_root_dir; the outputs mirror that tree in two
# sibling directories.
pose_root_dir = "assets/test_data/videos"
pose_save_dir = pose_root_dir + "_dwhand"
pose_mask_dir = pose_root_dir + "_dwmask"

# os.walk yields every file once, so no set is needed for de-duplication.
# Sorting gives the same processing order on every kernel run, whereas
# iterating a set of strings does not.
pose_mp4_paths = sorted(
    os.path.join(root, name)
    for root, dirs, files in os.walk(pose_root_dir)
    for name in files
    if name.endswith(".mp4")
)
pose_mp4_paths

In [None]:
# Generate the depth-map and mask videos for every discovered .mp4 file.
process_batch_videos_hand(
    video_list=pose_mp4_paths,
    detector=meshgraphormer,
    root_dir=pose_root_dir,
    save_dir=pose_save_dir,
    pose_mask_dir=pose_mask_dir,
)

## 2. 处理参考图像

In [None]:
def process_image(image_path, output_dir, weights, prompt, strength, seed):
    """Run ``handrefiner.py`` on one image and return the elapsed wall time.

    The command is passed as an argument list instead of a shell string, so
    paths or prompts containing spaces, quotes or shell metacharacters reach
    the script unchanged and cannot be interpreted by a shell.

    Args:
        image_path: Value for ``--input_img``.
        output_dir: Value for ``--out_dir``.
        weights: Value for ``--weights``.
        prompt: Value for ``--prompt``.
        strength: Value for ``--strength``.
        seed: Value for ``--seed``.

    Returns:
        Elapsed time in seconds as a float.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cells do not bring in subprocess.
    import subprocess

    command = [
        'python', 'handrefiner.py',
        '--input_img', str(image_path),
        '--out_dir', str(output_dir),
        '--strength', str(strength),
        '--weights', str(weights),
        '--prompt', str(prompt),
        '--seed', str(seed),
    ]
    start = time.time()
    completed = subprocess.run(command)
    diff = time.time() - start
    if completed.returncode != 0:
        # Report the failure without raising: callers only expect the
        # elapsed time, as before.
        print(f'handrefiner.py exited with status {completed.returncode}')
    return diff

import math
from PIL import Image
import torch
from torchvision.transforms.functional import resize, center_crop, pil_to_tensor, to_pil_image

def load_and_adjust_image(image_path, output_path, resolution=576, aspect_ratio=9/16):
    """Resize and centre-crop an image to the target size, save it, and return it.

    The short target side is ``resolution``. The long target side is
    ``resolution / aspect_ratio`` rounded down to a multiple of 64. Images
    taller than wide get a portrait target; all others get a landscape target.

    Args:
        image_path: Path of the image to load (converted to RGB).
        output_path: Where the adjusted image is saved.
        resolution: Length of the shorter target side.
        aspect_ratio: Short-side / long-side ratio of the target.

    Returns:
        The cropped image as returned by ``center_crop`` (the tensor form,
        not the saved PIL image).
    """
    rgb_image = Image.open(image_path).convert('RGB')
    pixels = pil_to_tensor(rgb_image)

    # The last two dimensions are treated as height and width.
    src_h, src_w = pixels.shape[-2:]

    # Long target side, floored to a multiple of 64.
    long_side = int(resolution / aspect_ratio // 64) * 64
    if src_h > src_w:
        target_w, target_h = resolution, long_side
    else:
        target_w, target_h = long_side, resolution

    # Scale so the image covers the whole target box before cropping.
    src_ratio = float(src_h) / float(src_w)
    if src_ratio < target_h / target_w:
        resize_h = target_h
        resize_w = math.ceil(target_h / src_ratio)
    else:
        resize_h = math.ceil(target_w * src_ratio)
        resize_w = target_w

    resized = resize(pixels, [resize_h, resize_w], antialias=True)
    adjusted_image = center_crop(resized, [target_h, target_w])

    to_pil_image(adjusted_image).save(output_path)

    return adjusted_image

In [None]:
# Inputs for the handrefiner run on a single reference image.
file_path = 'assets/test_data/images/demo3.jpg'
output_folder = 'assets/test_data/images'

# Refinement settings.
prompt = "A woman smiles, her hands folded in front of her."
weights = '../autodl-tmp/models/inpaint_depth_control.ckpt'
strength = 0.55
seed = 1

# The returned elapsed time is shown as the cell output.
process_image(
    image_path=file_path,
    output_dir=output_folder,
    weights=weights,
    prompt=prompt,
    strength=strength,
    seed=seed,
)

## 3. 验证模型

In [None]:
# Inputs for the first test case.
ref_image_path = 'assets/test_data/images/demo1.jpg'
ref_video_path = 'assets/test_data/videos/test1.mp4'
ref_hand_path = 'assets/test_data/videos_dwhand/test1.mp4'
ref_hand_mask_path = 'assets/test_data/videos_dwmask/test1.mp4'
images_depth_path = 'assets/test_data/images_depth/demo1_depth.jpg'
images_mask_path = 'assets/test_data/images_mask/demo1_mask.jpg'

# Inputs for the second test case.
# NOTE(review): this case uses the demo1 image and depth map but the demo2
# mask. Confirm that the mask file name is intended.
ref_image_path2 = 'assets/test_data/images/demo1.jpg'
ref_video_path2 = 'assets/test_data/videos/test2.mp4'
ref_hand_path2 = 'assets/test_data/videos_dwhand/test2.mp4'
ref_hand_mask_path2 = 'assets/test_data/videos_dwmask/test2.mp4'
images_depth_path2 = 'assets/test_data/images_depth/demo1_depth.jpg'
images_mask_path2 = 'assets/test_data/images_mask/demo2_mask.jpg'

# Settings applied to every test case.
num_frames = 16
resolution = 576
frames_overlap = 6
num_inference_steps = 25
noise_aug_strength = 0
guidance_scale = 2.0
sample_stride = 2
fps = 15
seed = 42

# Written once and unpacked into each case; the key order is the order the
# keys appear in the saved YAML.
_shared_case_settings = {
    'num_frames': num_frames,
    'resolution': resolution,
    'frames_overlap': frames_overlap,
    'num_inference_steps': num_inference_steps,
    'noise_aug_strength': noise_aug_strength,
    'guidance_scale': guidance_scale,
    'sample_stride': sample_stride,
    'fps': fps,
    'seed': seed,
}

conf = OmegaConf.create({
    'base_model_path': 'models/SVD/stable-video-diffusion-img2vid-xt-1-1',
    'ckpt_path': 'models/MimicMotion.pth',
    'stage2_path': 'models/motion_module-2400.pth',
    'test_case': [
        {
            'ref_video_path': ref_video_path,
            'ref_image_path': ref_image_path,
            'ref_hand_path': ref_hand_path,
            'ref_hand_mask_path': ref_hand_mask_path,
            'images_depth_path': images_depth_path,
            'images_mask_path': images_mask_path,
            **_shared_case_settings,
        },
        {
            'ref_video_path': ref_video_path2,
            'ref_image_path': ref_image_path2,
            'ref_hand_path': ref_hand_path2,
            'ref_hand_mask_path': ref_hand_mask_path2,
            'images_depth_path': images_depth_path2,
            'images_mask_path': images_mask_path2,
            **_shared_case_settings,
        },
    ],
})
OmegaConf.save(conf, './configs/my_conf.yaml')

In [None]:
# 运行
start = time.time()
!python inference2.py --inference_config configs/my_conf.yaml
end = time.time()

diff = end - start
if diff < 60:
    print(f'耗时：{diff:.3f} 秒。')
else:
    print(f'耗时：{diff/60:.3f} 分。')

# 原始模型

In [None]:
# Inputs for the two baseline test cases.
ref_video_path = 'assets/test_data/videos/test1.mp4'
ref_image_path = 'assets/test_data/images/demo1.jpg'

ref_video_path2 = 'assets/test_data/videos/test2.mp4'
ref_image_path2 = 'assets/test_data/images/demo1.jpg'

# Settings applied to every test case.
num_frames = 16
resolution = 576
frames_overlap = 6
num_inference_steps = 25
noise_aug_strength = 0
guidance_scale = 2.0
sample_stride = 2
fps = 15
seed = 42

# Written once and unpacked into each case; the key order is the order the
# keys appear in the saved YAML.
_shared_case_settings = {
    'num_frames': num_frames,
    'resolution': resolution,
    'frames_overlap': frames_overlap,
    'num_inference_steps': num_inference_steps,
    'noise_aug_strength': noise_aug_strength,
    'guidance_scale': guidance_scale,
    'sample_stride': sample_stride,
    'fps': fps,
    'seed': seed,
}

conf = OmegaConf.create({
    'base_model_path': 'models/SVD/stable-video-diffusion-img2vid-xt-1-1',
    'ckpt_path': 'models/MimicMotion.pth',
    'test_case': [
        {
            'ref_video_path': ref_video_path,
            'ref_image_path': ref_image_path,
            **_shared_case_settings,
        },
        {
            'ref_video_path': ref_video_path2,
            'ref_image_path': ref_image_path2,
            **_shared_case_settings,
        },
    ],
})
OmegaConf.save(conf, './configs/minic_conf.yaml')

In [None]:
start = time.time()
!python inference.py --inference_config configs/minic_conf.yaml
end = time.time()

diff = end - start
if diff < 60:
    print(f'耗时：{diff:.3f} 秒。')
else:
    print(f'耗时：{diff/60:.3f} 分。')