<a href="https://colab.research.google.com/github/Tyler-Pickett/Stable_Diffuse/blob/main/Text_to_Image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Stable Diffusion Pipeline to generate images from text.

Phase 1 - Build our own Stable Diffusion pipeline from its individual components

Phase 2 - Use Hugging Face pre-trained stable diffusion

* diffusers = 🤗 Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves as a modular toolbox for inference and training of diffusion models.
https://pypi.org/project/diffusers/#description

* transformers = 🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
  https://pypi.org/project/transformers/



In [1]:
# Install the notebook's dependencies. diffusers is pinned to 0.3.0;
# transformers, scipy and ftfy are installed unpinned.
# NOTE(review): `--q` is presumably accepted by pip as an abbreviation of
# `--quiet` — confirm.
!pip install diffusers==0.3.0 --q
!pip install transformers scipy ftfy --q
!pip install "ipywidgets>=7,<8" --q # Jupyter Widgets
import IPython.display # Inline image display(show images inside notebook)

In [2]:
from transformers import AutoModel

import os  # imported here so this cell is self-contained

# Hugging Face access token used for the authenticated model downloads below.
# It is read from the environment instead of being pasted into the notebook,
# so the secret never ends up in the saved/committed file. Set HF_TOKEN (or
# the older HUGGING_FACE_HUB_TOKEN) before starting the kernel. When neither
# is set the value falls back to the empty string, as before.
access_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or ""
)

# model = AutoModel.from_pretrained("private/model", use_auth_token=access_token)

In [3]:
import gc # garbage collector
import torch # PyTorch
from PIL import Image
import IPython.display
from torch import autocast
from tqdm.auto import tqdm
# from kaggle_secrets import UserSecretsClient (nonexistent module)
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import StableDiffusionPipeline
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import LMSDiscreteScheduler, PNDMScheduler

# Short alias for the Hugging Face access token; it is passed as
# `use_auth_token` when the VAE and UNet checkpoints are loaded.
HF_t = access_token

Create a configuration class that holds the hyperparameters

In [4]:
class config:
  """Hyperparameters shared by the generation cells of this notebook.

  All values are class attributes and are read directly as ``config.<name>``.
  """

  # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
  if torch.cuda.is_available():
    device = "cuda"
  else:
    device = "cpu"

  # Target size; the latent tensor is created at (height // 8, width // 8).
  height, width = 512, 512

  # Number of steps handed to the scheduler's set_timesteps().
  num_infer_steps = 500

  # Weight applied to (text - unconditional) when combining noise predictions.
  guidance_scale = 7.5

  # torch.manual_seed() seeds torch's RNG and returns the generator object,
  # which is later passed to torch.randn() when sampling the latents.
  generator = torch.manual_seed(48)

  # Number of empty (unconditional) prompts and of latent samples created.
  batch_size = 1

In [5]:
print(config.__dict__) # __dict__ is an attribute, not a method; accessed on the class it lists the class-level attributes defined above plus interpreter-added entries such as __module__ and __doc__

{'__module__': '__main__', 'device': 'cpu', 'height': 512, 'width': 512, 'num_infer_steps': 500, 'guidance_scale': 7.5, 'generator': <torch._C.Generator object at 0x7f912af49cb0>, 'batch_size': 1, '__dict__': <attribute '__dict__' of 'config' objects>, '__weakref__': <attribute '__weakref__' of 'config' objects>, '__doc__': None}


Helper Function

In [6]:
def image_grid(imgs, rows, cols):
    """Paste a list of images into a single ``rows`` x ``cols`` grid image.

    Images are placed left to right, top to bottom, in the order given.
    Every cell is sized from the first image, so all images are expected
    to share that size.

    Args:
        imgs: Sequence of PIL images; must contain exactly ``rows * cols``.
        rows: Number of rows in the grid.
        cols: Number of columns in the grid.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``, where ``(w, h)``
        is the size of ``imgs[0]``.

    Raises:
        AssertionError: If ``len(imgs)`` is not ``rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected rows*cols = {rows * cols} images, got {len(imgs)}"
    )
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        # The column index advances fastest, so each row fills before wrapping.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

In [7]:
# Autoencoder from the 'vae' subfolder of the Stable Diffusion v1-4 checkpoint.
# `stable_diffuser` holds the loaded model; `sd_vae` is the handle on the
# configured device that the decode cell uses.
stable_diffuser = AutoencoderKL.from_pretrained(
    'CompVis/stable-diffusion-v1-4', subfolder='vae', use_auth_token=HF_t
)
sd_vae = stable_diffuser.to(config.device)

# CLIP tokenizer and text encoder used to embed the prompt.
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')
encoder = CLIPTextModel.from_pretrained('openai/clip-vit-large-patch14')
text_encoder = encoder.to(config.device)

# UNet from the 'unet' subfolder of the same checkpoint, moved to the device.
unet = UNet2DConditionModel.from_pretrained(
    'CompVis/stable-diffusion-v1-4', subfolder='unet', use_auth_token=HF_t
).to(config.device)

Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.18.self_attn.out_proj.weight', 'vision_model.encoder.layers.18.self_attn.k_proj.weight', 'vision_model.encoder.layers.15.layer_norm2.bias', 'vision_model.encoder.layers.7.self_attn.k_proj.bias', 'vision_model.post_layernorm.bias', 'vision_model.encoder.layers.1.self_attn.out_proj.weight', 'vision_model.encoder.layers.13.self_attn.k_proj.bias', 'vision_model.encoder.layers.14.layer_norm1.weight', 'vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_model.encoder.layers.12.self_attn.k_proj.bias', 'vision_model.encoder.layers.8.self_attn.k_proj.bias', 'vision_model.encoder.layers.17.self_attn.k_proj.bias', 'vision_model.encoder.layers.4.self_attn.k_proj.weight', 'vision_model.encoder.layers.23.self_attn.out_proj.weight', 'vision_model.encoder.layers.18.layer_norm2.weight', 'vision_model.encoder.layers.10.self_attn.out_proj.weight', 

In [8]:
# Set up the LMS discrete scheduler explicitly (the default is PNDM).
lms_scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule='scaled_linear',
    num_train_timesteps=1000,
)

Image Generation Test

In [9]:
# Text prompt(s) fed to the tokenizer, given as a list of strings.
# NOTE(review): the unconditional prompts and the latents are sized from
# config.batch_size, so len(prompt) presumably has to equal config.batch_size
# — confirm before adding more prompts.
prompt = ["a curious explorer discovers a massive sprawling underground city in a huge cave system. city has churches, european - style buildings and big towers, and is really far away, illuminated by dim ambient lighting. waterfalls are flowing between different levels of the city. award winning digital art, concept art, breathtaking, imaginative, detailed., 8k"]

In [10]:
# Tokenize the prompt, padded and truncated to the tokenizer's maximum length.
input_text = tokenizer(
    prompt,
    padding='max_length',
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors='pt',
)
max_length = input_text.input_ids.shape[-1]

# One empty prompt per batch item, padded to the same token length.
uncond_input = tokenizer(
    [""] * config.batch_size,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)

# Encode both token batches without tracking gradients; element [0] of the
# text encoder's output is used as the embedding.
with torch.no_grad():
  embedded_text = text_encoder(input_text.input_ids.to(config.device))[0]
  uncond_embedded = text_encoder(uncond_input.input_ids.to(config.device))[0]

# Unconditional embeddings first, prompt embeddings second: the denoising loop
# splits the UNet output with chunk(2) in this same order.
embedded_text = torch.cat([uncond_embedded, embedded_text])

print(f"Embedded Text Shape: {embedded_text.shape}")

Embedded Text Shape: torch.Size([2, 77, 768])


In [11]:
# Sample the starting noise with the seeded generator, then move it to the
# compute device. Spatial dimensions are the configured size divided by 8.
latent_shape = (
    config.batch_size,
    unet.in_channels,
    config.height // 8,
    config.width // 8,
)
latents = torch.randn(latent_shape, generator=config.generator).to(config.device)

print(f"Latent Shape: {latents.shape}")

Latent Shape: torch.Size([1, 4, 64, 64])


In [12]:
# Configure the scheduler for the requested number of inference steps, then
# scale the starting noise by the scheduler's first sigma.
# NOTE: `latents` is replaced by its scaled value, so re-running this cell
# without re-sampling the latents first applies the scaling a second time.
lms_scheduler.set_timesteps(config.num_infer_steps)
latents = latents * lms_scheduler.sigmas[0]

In [None]:
# Denoising loop with classifier-free guidance: each step runs the UNet once on
# a doubled batch (unconditional + prompt), blends the two noise predictions
# with config.guidance_scale, and lets the scheduler update the latents.
with autocast(config.device):
  # tqdm wraps the timesteps themselves (not the enumerate object) so it can
  # read their length and show a real progress bar with total and ETA.
  for i, t in enumerate(tqdm(lms_scheduler.timesteps)):
    # Duplicate the latents so one UNet call covers both halves of
    # `embedded_text` (unconditional first, prompt second).
    latent_model_input = torch.cat([latents] * 2)
    sigma = lms_scheduler.sigmas[i]
    latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

    with torch.no_grad():
      noise_pred = unet(latent_model_input, t, encoder_hidden_states=embedded_text).sample
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + config.guidance_scale * (noise_pred_text - noise_pred_uncond)

    # The scheduler step is given the loop index `i` rather than the timestep
    # value `t`; this is unchanged from the original code.
    latents = lms_scheduler.step(noise_pred, i, latents).prev_sample

In [None]:
# Rescale the latents before decoding.
# NOTE(review): 0.18215 is presumably the latent scaling factor of the Stable
# Diffusion v1 autoencoder — confirm against the model configuration.
VAE_SCALING_FACTOR = 0.18215

# The rescaled tensor goes into its own variable, so `latents` is no longer
# overwritten and re-running this cell does not apply the scaling twice.
decoder_input = 1 / VAE_SCALING_FACTOR * latents

with torch.no_grad():
  image = sd_vae.decode(decoder_input).sample

print(f"Image Shape: {image.shape}")

In [None]:
# Shift and scale the decoded tensor by /2 + 0.5 and clamp it to [0, 1]
# (assumes the decoder output is roughly in [-1, 1] — TODO confirm).
image = (image / 2 + 0.5).clamp(0, 1)
# Move to the CPU, reorder axes (0, 2, 3, 1) and convert to a NumPy array.
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()

# Fix: the dtype name is 'uint8'. The original 'unit8' is not a valid NumPy
# dtype, so astype() raised a TypeError and no image was ever produced.
images = (image * 255).round().astype('uint8')

# One PIL image per batch item; the last expression displays the first one.
pil_images = [Image.fromarray(frame) for frame in images]
pil_images[0]