# The notebook for training the text to image model

## Package Preparation

### Import packages

In [3]:
!pip install -q datasets
!pip install -q transformers
!pip install -q accelerate
!pip install -q git+https://github.com/huggingface/diffusers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
import logging
import math
import os
import random
import glob
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
from diffusers.loaders import AttnProcsLayers
from diffusers.models.attention_processor import LoRAAttnProcessor
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available

### Check diffuser version & Save model card

In [5]:
check_min_version("0.16.0.dev0")

logger = get_logger(__name__, log_level="INFO")

In [6]:
def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
    img_str = ""
    for i, image in enumerate(images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true
---
    """
    model_card = f"""
# LoRA text2image fine-tuning - {repo_id}
These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
{img_str}
"""
    with open(os.path.join(repo_folder, "README.md"), "w") as f:
        f.write(yaml + model_card)

## Set Basic Arguments

### Saving Directory

In [7]:
#@markdown If model weights should be saved directly in google drive (takes around 4-5 GB).
save_to_gdrive = False #@param {type:"boolean"}
if save_to_gdrive:
    from google.colab import drive
    drive.mount('/content/drive')

#@markdown Name/Path of the initial model.
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5" #@param {type:"string"}

#@markdown Enter the directory name to save model at.

output_dir = "ml_stable_diffusion_weights/lora" #@param {type:"string"}
if save_to_gdrive:
    output_dir = "/content/drive/MyDrive/" + output_dir
else:
    output_dir = "/content/" + output_dir

print(f"[*] Weights will be saved at {output_dir}")

!mkdir -p $output_dir

[*] Weights will be saved at /content/ml_stable_diffusion_weights/lora


### Configure Accelerator

In [8]:
logging_dir = os.path.join(output_dir, "logs")
accelerator_project_config = ProjectConfiguration(total_limit=None)

accelerator = Accelerator(
        gradient_accumulation_steps=1,
        mixed_precision="fp16",
        log_with="tensorboard",
        logging_dir=logging_dir,
        project_config=accelerator_project_config,
    )



### Handle Repository Creation

In [9]:
if accelerator.is_main_process:
        if output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)

### Load scheduler, tokenizer, models

In [10]:
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path, subfolder="tokenizer", revision=None
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", revision=None
)
vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", revision=None)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="unet", revision=None
)
# freeze parameters of models to save more memory
unet.requires_grad_(False)
vae.requires_grad_(False)

text_encoder.requires_grad_(False)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [11]:
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
    weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16

### Move unet, vae, text_encoder to device

In [12]:
print(accelerator.device)

cuda


In [13]:
unet.to(accelerator.device, dtype=weight_dtype)
vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

## Start adding LoRA weights to attention layers

    # It's important to realize here how many attention weights will be added and of which sizes
    # The sizes of the attention layers consist only of two different variables:
    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.

    # Let's first see how many attention processors we will have to set.
    # For Stable Diffusion, it should be equal to:
    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
    # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
    # => 32 layers

### Set correct lora layers

In [14]:
lora_attn_procs = {}
for name in unet.attn_processors.keys():
  cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
  # print(name)
  if name.startswith("mid_block"):
    # print(unet.config.block_out_channels)
    hidden_size = unet.config.block_out_channels[-1]
  elif name.startswith("up_blocks"):
    block_id = int(name[len("up_blocks.")])
    hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    # print(hidden_size)
  elif name.startswith("down_blocks"):
    block_id = int(name[len("down_blocks.")])
    hidden_size = unet.config.block_out_channels[block_id]
    # print(hidden_size)

  lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)

unet.set_attn_processor(lora_attn_procs)
lora_layers = AttnProcsLayers(unet.attn_processors)


### Initalize optimizers

In [15]:
#@markdown Parameters for adamW

optimizer_cls = torch.optim.AdamW
learning_rate = 1e-4 #@param {type:"number"}
adam_beta1 = 0.9 #@param {type:"number"}
adam_beta2 = 0.999 #@param {type:"number"}
adam_weight_decay = 1e-2 #@param {type:"number"}
adam_epsilon = 1e-08 #@param {type:"number"}

In [16]:
optimizer = optimizer_cls(
    lora_layers.parameters(),
    lr=learning_rate,
    betas=(adam_beta1,adam_beta2),
    weight_decay=adam_weight_decay,
    eps=adam_epsilon,
)

### Load Quickdraw Dataset

Read the class name

In [17]:
!wget 'https://raw.githubusercontent.com/zaidalyafeai/zaidalyafeai.github.io/master/sketcher/mini_classes.txt'

--2023-05-02 08:47:55--  https://raw.githubusercontent.com/zaidalyafeai/zaidalyafeai.github.io/master/sketcher/mini_classes.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 760 [text/plain]
Saving to: ‘mini_classes.txt.1’


2023-05-02 08:47:55 (37.4 MB/s) - ‘mini_classes.txt.1’ saved [760/760]



In [18]:
f = open("mini_classes.txt","r")
# And for reading use
classes = f.readlines()
f.close()

In [19]:
classes = [c.replace('\n','').replace(' ','_') for c in classes]
print(len(classes))

100


Download Data

In [20]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [21]:
import urllib.request
from tqdm.auto import tqdm
def download():
    base = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/'
    for c in tqdm(classes):        
        cls_url = c.replace('_', '%20')
        path = base+cls_url+'.npy'
        # print(path)
        urllib.request.urlretrieve(path, 'data/'+c+'.npy')

In [22]:
download()

  0%|          | 0/100 [00:00<?, ?it/s]

load the data

In [23]:
# def load_data(root, vfold_ratio=0.2, max_items_per_class= 4000 ):
#     all_files = glob.glob(os.path.join(root, '*.npy'))

#     #initialize variables 
#     x = np.empty([0, 784])
#     y = np.empty([0])
#     class_names = []

#     #load each data file 
#     for idx, file in enumerate(all_files):
#         data = np.load(file)
#         data = data[0: max_items_per_class, :]
#         labels = np.full(data.shape[0], idx)

#         x = np.concatenate((x, data), axis=0)
#         y = np.append(y, labels)

#         class_name, ext = os.path.splitext(os.path.basename(file))
#         class_names.append(class_name)

#     data = None
#     labels = None
    
#     #randomize the dataset 
#     permutation = np.random.permutation(y.shape[0])
#     x = x[permutation, :]
#     y = y[permutation]

#     #separate into training and testing 
#     vfold_size = int(x.shape[0]/100*(vfold_ratio*100))

#     x_test = x[0:vfold_size, :]
#     y_test = y[0:vfold_size]

#     x_train = x[vfold_size:x.shape[0], :]
#     y_train = y[vfold_size:y.shape[0]]
#     return x_train, y_train, x_test, y_test, class_names

In [24]:
# x_train, y_train, x_test, y_test, class_names = load_data('data')
# num_classes = len(class_names)
# image_size = 28

In [25]:
# print(len(x_train))

In [26]:
# import matplotlib.pyplot as plt
# from random import randint
# %matplotlib inline  
# idx = randint(0, len(x_train))
# plt.imshow(x_train[idx].reshape(28,28)) 
# print(class_names[int(y_train[idx].item())])

In [27]:
# print(len(classes))
# print(len(y_train))

In [28]:
def load_data_for_diffusion(root, max_items_per_class= 4000 ):
    all_files = glob.glob(os.path.join(root, '*.npy'))

    #initialize variables
    data_dic = {"text":[], "img":[]}
    # imgs = np.empty([0, 784])
    # labels = []

    for idx, file in enumerate(all_files):
      data = np.load(file)
      data = data[0: max_items_per_class, :]

      class_name, ext = os.path.splitext(os.path.basename(file))
      data_dic['text'].extend([class_name for i in range(data.shape[0])])
      data_dic['img'].append(data)
      print(data.shape)

      # imgs = np.concatenate((imgs, data), axis=0)


    return data_dic
    

In [None]:
data_dic = load_data_for_diffusion('data')
# print(imgs.shape)
# print(len(labels))


(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
(4000, 784)
