In [1]:
import diffusers
from diffusers import DiffusionPipeline
from diffusers import schedulers, AutoencoderKL, UNet2DConditionModel, PNDMScheduler, StableDiffusionPipeline
import torch
from torch.utils.data import DataLoader
import transformers.models
from transformers import CLIPTextModel, CLIPTokenizer
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
import utils
import cv2, os, math
from tqdm import tqdm_notebook
import torch.nn.functional as F
import numpy as np
import logging
import itertools

%load_ext autoreload
%autoreload 2

In [2]:
pretrained_model_name_or_path = "stablediffusionapi/realistic-vision-v51"


In [4]:
text_encoder: transformers.models.clip.modeling_clip.CLIPTextModel = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder"
)
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path, subfolder="vae"
)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="unet"
)
tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
)
# pipeline = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path)

### Pipeline test

In [21]:
test_img = pipeline("Young girl sitting at a cafe, portrait, cute, detailed face", width=264, height=384, num_inference_steps=45, negative_prompt="ugly face, deformed").images[0]

  0%|          | 0/45 [00:00<?, ?it/s]

100%|██████████| 45/45 [00:17<00:00,  2.61it/s]


In [22]:
test_img.save('test3.png')

### Textual Inversion

In [2]:
placeholder_token = "<xyz>"

In [6]:
tokenizer.add_tokens(placeholder_token)
text_encoder.resize_token_embeddings(len(tokenizer))
placeholder_token_id = tokenizer.convert_tokens_to_ids(placeholder_token)
placeholder_token_id

49408

In [7]:
text_encoder.text_model.embeddings

CLIPTextEmbeddings(
  (token_embedding): Embedding(49409, 768)
  (position_embedding): Embedding(77, 768)
)

In [8]:
pipeline = StableDiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    vae=vae,
    unet=unet,
)
pipeline.save_pretrained("chilloutmix")

safety_checker/model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [6]:
# Freeze everything but the MLP
def freeze_params(params):
    for param in params:
        param.requires_grad = False

# Freeze vae and unet
freeze_params(vae.parameters())
freeze_params(unet.parameters())
# Freeze all parameters except for the token embeddings in text encoder
freeze_params(text_encoder.text_model.encoder.parameters())
freeze_params(text_encoder.text_model.final_layer_norm.parameters())
freeze_params(text_encoder.text_model.embeddings.position_embedding.parameters())

### Training

In [7]:
train_dataset = utils.TextualInversionDataset('data', tokenizer, placeholder_token=placeholder_token)
def create_dataloader(train_batch_size=1):
    return DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

  0%|          | 0/202599 [00:00<?, ?it/s]

100%|██████████| 202599/202599 [00:00<00:00, 948485.90it/s]


In [8]:
embedding_predictor = utils.MetaTextInversion()
noise_scheduler = PNDMScheduler.from_config(pretrained_model_name_or_path, subfolder="scheduler")

  deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)


In [3]:
hyperparameters = {
    "learning_rate": 5e-04,
    "scale_lr": True,
    "max_train_steps": 750,
    "save_steps": 250,
    "train_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": True,
    "mixed_precision": "fp16",
    "seed": 42,
    "output_dir": "meta-text-inversion-output"
}

In [34]:
logger = logging.getLogger()

# def save_progress(text_encoder, placeholder_token_id, accelerator, save_path):
#     logger.info("Saving embeddings")
#     learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id]
#     learned_embeds_dict = {placeholder_token: learned_embeds.detach().cpu()}
#     torch.save(learned_embeds_dict, save_path)

def training_function(text_encoder, vae, unet, global_step=0):
    train_batch_size = hyperparameters["train_batch_size"]
    gradient_accumulation_steps = hyperparameters["gradient_accumulation_steps"]
    learning_rate = hyperparameters["learning_rate"]
    max_train_steps = hyperparameters["max_train_steps"]
    output_dir = hyperparameters["output_dir"]
    gradient_checkpointing = hyperparameters["gradient_checkpointing"]

    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
        mixed_precision=hyperparameters["mixed_precision"]
    )

    if gradient_checkpointing:
        text_encoder.gradient_checkpointing_enable()
        unet.enable_gradient_checkpointing()

    train_dataloader = create_dataloader(train_batch_size)

    if hyperparameters["scale_lr"]:
        learning_rate = (
            learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
        )

    # Initialize the optimizer
    optimizer = torch.optim.AdamW(
        itertools.chain(
            embedding_predictor.parameters(),
            text_encoder.get_input_embeddings().parameters()
        ),  # only optimize the embeddings and the embedding predictor MLP
        lr=learning_rate,
    )

    text_encoder, optimizer, train_dataloader = accelerator.prepare(
        text_encoder, optimizer, train_dataloader
    )

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move vae and unet to device
    vae.to(accelerator.device, dtype=weight_dtype)
    unet.to(accelerator.device, dtype=weight_dtype)
    embedding_predictor.to(accelerator.device, dtype=weight_dtype)

    # Keep vae in eval mode as we don't train it
    vae.eval()
    # Keep unet in train mode to enable gradient checkpointing
    unet.train()

    # Train embedding predictor
    embedding_predictor.train()

    
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

    # Train!
    total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm_notebook(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    for epoch in range(num_train_epochs):
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            if torch.min(batch['valid']) == 0: continue
            with accelerator.accumulate(text_encoder):
                # Convert images to latent space
                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
                latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device).long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                embeddings = text_encoder.get_input_embeddings().weight.data
                mapped_embeddings = embedding_predictor(batch["face_id"].to(dtype=weight_dtype))
                batched_hidden_states = []
                for b in range(train_batch_size):
                    embeddings[placeholder_token_id] = mapped_embeddings[b]
                    encoder_hidden_states = text_encoder(batch["input_ids"][[b]])[0]
                    batched_hidden_states.append(encoder_hidden_states)
                batched_hidden_states = torch.vstack(batched_hidden_states)

                # Predict the noise residual
                noise_pred = unet(noisy_latents, timesteps, batched_hidden_states.to(weight_dtype)).sample

                 # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                loss = F.mse_loss(noise_pred, target, reduction="none").mean([1, 2, 3]).mean()
                accelerator.backward(loss)

                # Zero out the gradients for all token embeddings except the newly added
                # embeddings for the concept, as we only want to optimize the concept embeddings
                if accelerator.num_processes > 1:
                    grads = text_encoder.module.get_input_embeddings().weight.grad
                else:
                    grads = text_encoder.get_input_embeddings().weight.grad
                # Get the index for tokens that we want to zero the grads for
                index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
                grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0)

                optimizer.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                if global_step % hyperparameters["save_steps"] == 0:
                    save_path = os.path.join(output_dir, f"learned_embeds-step-{global_step}.bin")
                    # save_progress(text_encoder, placeholder_token_id, accelerator, save_path)
                    torch.save(embedding_predictor.state_dict(), os.path.join(output_dir, f'mlp-{global_step}.bin'))

            logs = {"loss": loss.detach().item()}
            progress_bar.set_postfix(**logs)

            if global_step >= max_train_steps:
                break

        accelerator.wait_for_everyone()


    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline.from_pretrained(
            pretrained_model_name_or_path,
            text_encoder=accelerator.unwrap_model(text_encoder),
            tokenizer=tokenizer,
            vae=vae,
            unet=unet,
        )
        pipeline.save_pretrained(output_dir)
        # Also save the newly trained embeddings
        # save_path = os.path.join(output_dir, f"learned_embeds.bin")
        torch.save(embedding_predictor.state_dict(), os.path.join(output_dir, f'mlp-final.bin'))
        # save_progress(text_encoder, placeholder_token_id, accelerator, save_path)


In [38]:
import accelerate
accelerate.notebook_launcher(training_function, args=(text_encoder, vae, unet, 500), num_processes=1)

for param in itertools.chain(unet.parameters(), text_encoder.parameters()):
  if param.grad is not None:
    del param.grad  # free some memory
  torch.cuda.empty_cache()

Launching training on one GPU.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)


  0%|          | 0/250 [00:00<?, ?it/s]

safety_checker/model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

### Evaluation

In [4]:
torch.cuda.empty_cache()
pipe: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
    "attempt-ghostnet",
    scheduler=PNDMScheduler.from_pretrained("attempt-ghostnet", subfolder="scheduler"),
    torch_dtype=torch.float16,
).to("cuda")
pipe.safety_checker = None

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [2]:
# chillout mix
pipe: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained("chilloutmix", torch_dtype=torch.float16).to("cuda")
pipe.safety_checker = None

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [150]:
del pipe
torch.cuda.empty_cache()

In [5]:
dataset = utils.TextualInversionDataset('data', pipe.tokenizer, placeholder_token=placeholder_token, skip_faceid=False)
dataset[0].keys()

  0%|          | 0/202599 [00:00<?, ?it/s]

100%|██████████| 202599/202599 [00:00<00:00, 1719017.25it/s]


dict_keys(['valid', 'face_id', 'input_ids', 'prompt', 'pixel_values'])

In [6]:
target_face_id = dataset[20]["face_id"] # utils.get_face_embedding("original-test/test2.png")[-1]
# target_face_id = utils.get_face_embedding("out-6300/1-f1.png")[-1]
embedding_predictor = utils.MetaTextInversion()
embedding_predictor.eval().to('cuda', dtype=torch.float16)
embedding_predictor.load_state_dict(torch.load("attempt4/mlp-1900.bin"))
with torch.no_grad():
    target_embedding = embedding_predictor(torch.tensor(target_face_id).to('cuda', dtype=torch.float16).unsqueeze(0))
torch.cuda.empty_cache()

In [18]:
from ghostfacenetsv2 import GhostFaceNetsV2
embedding_predictor = GhostFaceNetsV2(image_size=178, num_classes=768, dropout=0).eval().to('cuda', dtype=torch.float16)
embedding_predictor.load_state_dict(torch.load("attempt-ghostnet/ghostnet-6300.bin"))
with torch.no_grad():
    example = torch.tensor(dataset[0]['pixel_values']).to('cuda', dtype=torch.float16).unsqueeze(0)
    print(example.shape)
    target_embedding = embedding_predictor(example)

torch.Size([1, 3, 178, 178])


  example = torch.tensor(dataset[0]['pixel_values']).to('cuda', dtype=torch.float16).unsqueeze(0)


In [7]:
del embedding_predictor
torch.cuda.empty_cache()

In [8]:
placeholder_token_id = pipe.tokenizer.convert_tokens_to_ids(placeholder_token)
placeholder_token_id

49408

In [17]:
pipe.tokenizer.convert_tokens_to_ids("a")

64

In [127]:
for i in [5]:
    target_face_id = dataset[i]["face_id"] # utils.get_face_embedding("original-test/test2.png")[-1]
    # target_face_id = utils.get_face_embedding(f"poolf/1700-9-f7.png")[-1]
    embedding_predictor = utils.MetaTextInversion()
    embedding_predictor.eval().to('cuda', dtype=torch.float16)
    embedding_predictor.load_state_dict(torch.load("attempt4/mlp-1900.bin"))
    with torch.no_grad():
        target_embedding = embedding_predictor(torch.tensor(target_face_id).to('cuda', dtype=torch.float16).unsqueeze(0))
    torch.cuda.empty_cache()
    
    embeddings = pipe.text_encoder.get_input_embeddings().weight.data
    embeddings[placeholder_token_id] = target_embedding[0] # exclude batch dimension
    for j in [1]:
        conditioned_img = pipe(f"a photo of {placeholder_token}, cute girl, beautiful, film grain, natural lighting, 8K UHD", width=360, height=480, num_inference_steps=50, negative_prompt="disfigured, ugly, bad, cartoon, anime, 3d, painting, blurry", safety_checker=None).images[0]
        conditioned_img.save(f'test/conditioned/${j}_.png')
        # conditioned_img.save(f'poolf/1900-{i}-{j}.png')
        # conditioned_img = pipe(f"{placeholder_token}, a photo of a {placeholder_token}, film grain, natural lighting, sharp focus, 8K UHD, handsome", width=384, height=512, num_inference_steps=50, negative_prompt="disfigured, ugly, bad, immature, cartoon, anime, 3d, painting, blurry, woman", safety_checker=None).images[0]
        # conditioned_img.save(f'poolm-chilloutmix/1900-{i}-f{j}.png')

  0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
# Replace embedding with target
for i in [4]:
    # embeddings = pipe.text_encoder.get_input_embeddings().weight.data
    # embeddings[placeholder_token_id] = target_embedding[0] # exclude batch dimension
    # 2-4
    # conditioned_img = pipe(f"one man, film grain, sharp focus, 8K UHD, handsome", width=384, height=512, num_inference_steps=50, negative_prompt="disfigured, ugly, bad, immature, cartoon, anime, 3d, painting, blurry", safety_checker=None).images[0]
    # conditioned_img = pipe(f"a girl, cute, film grain, natural lighting, 8K UHD", width=360, height=480, num_inference_steps=50, negative_prompt="disfigured, ugly, bad, cartoon, anime, 3d, painting", safety_checker=None).images[0]
    # conditioned_img = pipe(f"a girl, cute, beautiful, rim lighting, natural lighting, dslr, ultra quality, sharp focus, tack sharp, film grain, Fujifilm XT3, crystal clear, 8K UHD, detailed glossy eyes, high detailed skin, skin pores, teen, [Kpop idol]", width=504, height=640, num_inference_steps=46,
    #                        negative_prompt="disfigured, ugly, bad, cartoon, anime, 3d, painting, watermark, extra limbs, monochrome, grayscale, skin spots, pubic hair, unclear eyes", safety_checker=None).images[0]
    # conditioned_img = pipe(f"a girl, cute, beautiful, rim lighting, natural lighting, dslr, ultra quality, sharp focus, tack sharp, film grain, crystal clear, 8K UHD, detailed glossy eyes, high detailed skin, skin pores, teen, (Kpop idol:0.8)", width=592, height=792, num_inference_steps=55,
    #                        negative_prompt="disfigured, ugly, bad, cartoon, anime, 3d, painting, watermark, monochrome, blurry, extra limbs, pubic hair, unclear eyes", safety_checker=None).images[0]
    conditioned_img.save(f'test/vanilla/{i}.png')

  0%|          | 0/55 [00:00<?, ?it/s]

In [217]:
conditioned_img.save('out-1900-1-4.png')

In [4]:
pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16).to('cuda')
pipeline.safety_checker = None

unet/diffusion_pytorch_model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [79]:
del pipe
torch.cuda.empty_cache()

In [130]:
def folder_similarity(dir: str):
    files = sorted(os.listdir(dir))
    embeds = [utils.get_face_embedding(os.path.join(dir, files[i]))[1] for i in range(len(files)) if not files[i].endswith("1.png")]
    if np.min(np.max(embeds, axis=1)) == 0:
        print("detected 0 somewhere")
        print(np.array(embeds)[:, :2])
        return
    sims = []
    for i in tqdm_notebook(range(len(embeds) - 1)):
        for j in range(i, len(embeds)):
            sims.append(utils.cosine_similarity(embeds[i], embeds[j]))
    
    return np.mean(sims)

In [99]:
folder_similarity("test/vanilla")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(len(embeds) - 1)):


  0%|          | 0/9 [00:00<?, ?it/s]

0.9429690063392598

In [131]:
folder_similarity("test/conditioned")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(len(embeds) - 1)):


  0%|          | 0/9 [00:00<?, ?it/s]

0.9539687089097488