In [None]:
import os
import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image
from rembg import remove

from ip_total.ip_adapter_total import IPAdapterTotal

In [None]:
# Locations of the model weights and the device used for inference.
# All paths use forward slashes: a backslash separator only resolves on Windows,
# and "\m" inside a normal string literal is an invalid escape sequence.
base_model_path = "models/majicmix-realistic-v7"  # pagebrain/majicmix-realistic-v7
image_encoder_path_faceid = "models/CLIP-ViT-H-14-laion2B-s32B-b79K"
image_encoder_path_general = "models/image_encoder"
vae_model_path = "models/sd-vae-ft-mse"
ip_total_ckpt = "models/ip_total.bin"
device = "cuda"

In [None]:
def image_grid(imgs, rows, cols):
    """Tile images into a single ``rows`` x ``cols`` RGB grid.

    Images are pasted left to right, top to bottom, in the order given. The
    cell size is taken from the first image; the other images are pasted
    as-is at their cell's top-left corner (they are assumed to share that size).

    Args:
        imgs: sequence of PIL images; its length must equal ``rows * cols``.
        rows: number of rows in the grid.
        cols: number of columns in the grid.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``, where ``(w, h)`` is
        the size of ``imgs[0]``.

    Raises:
        AssertionError: if ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))

    for i, img in enumerate(imgs):
        # Row-major placement: index -> (row, column) -> pixel offset.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

In [None]:
# Build the Stable Diffusion pipeline from three parts: a DDIM noise scheduler,
# a separately loaded VAE, and the base checkpoint. The VAE and the pipeline
# weights are both held in float16.
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

vae = AutoencoderKL.from_pretrained(vae_model_path)
vae = vae.to(dtype=torch.float16)

pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    vae=vae,
    scheduler=noise_scheduler,
    torch_dtype=torch.float16,
    # The pipeline is loaded without a safety checker or its feature extractor.
    safety_checker=None,
    feature_extractor=None,
)

In [None]:
# Wrap the pipeline with the IP-Adapter, passing both image-encoder paths
# (face-ID and general) and the adapter checkpoint, on the configured device.
ip_model = IPAdapterTotal(
    pipe,
    ip_ckpt=ip_total_ckpt,
    image_encoder_path_faceid=image_encoder_path_faceid,
    image_encoder_path_general=image_encoder_path_general,
    device=device,
)

In [None]:
# Preprocess the template (clothing) image: run it through rembg's `remove`,
# then convert the result to RGB mode.
image_cloth = Image.open("data/input/cloth_img1.jpg")
image_cloth_rmbg = remove(image_cloth).convert("RGB")

image_cloth_rmbg

In [None]:
# Preprocess the face image: detect the face, take its identity embedding, and
# produce an aligned 224x224 RGB crop.
face_app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
face_app.prepare(ctx_id=0, det_size=(640, 640))

img_path = "data/input/face_img1.jpg"
image = cv2.imread(img_path)
# cv2.imread reports a missing or unreadable file by returning None rather than
# raising, so fail here with a clear message instead of deep inside the detector.
if image is None:
    raise FileNotFoundError(f"could not read face image: {img_path}")

faces = face_app.get(image)
# Without this guard an image with no detectable face fails with a bare IndexError.
if not faces:
    raise ValueError(f"no face detected in: {img_path}")

# Only the first detection is used.
face = faces[0]
faceid_embeds = torch.from_numpy(face.normed_embedding).unsqueeze(0)
face_image = face_align.norm_crop(image, landmark=face.kps, image_size=224) # you can also segment the face
# The image was read with OpenCV as BGR; convert to RGB before handing it to PIL.
face_image = cv2.cvtColor(face_image, cv2.COLOR_BGR2RGB)
face_image = Image.fromarray(face_image)
face_image

In [None]:
# Generation settings.
prompt = "photo of a beautiful girl wearing casual shirt in a bar"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
scale_faceid = 1.0
scale_general = 1.0
width = 768
height = 768
# Single source of truth for the batch size; the result grid below adapts to it.
num_samples = 4

# Generate images conditioned on the prompt, the face (aligned crop + identity
# embedding), and the background-removed clothing template. The seed is fixed.
images = ip_model.generate(
    prompt=prompt,
    negative_prompt=negative_prompt,
    scale_faceid=scale_faceid,
    scale_general=scale_general,
    face_image=face_image,
    faceid_embeds=faceid_embeds,
    template_img=image_cloth_rmbg,
    width=width,
    height=height,
    num_samples=num_samples,
    num_inference_steps=30,
    seed=21839,
)

# Show the results in a single row, sized to however many images came back so
# the grid never goes out of sync with num_samples.
grid = image_grid(images, 1, len(images))
grid