In [1]:
# Step up one directory before importing anything. Per the recorded cell output the
# working directory becomes the Kandinsky-2 checkout (presumably so that the
# `kandinsky2` package is importable from the source tree - confirm).
%cd ..

/home/raid_storage/karachev/diffusions/Kandinsky-2


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from copy import deepcopy

import torch
import torch.nn as nn
from matplotlib import pyplot as plt

from kandinsky2 import get_kandinsky2

### Helper functions

In [3]:
def show_image(image, figsize=(5, 5), cmap=None, title='', xlabel=None, ylabel=None, axis=False):
    """Draw one image in a new matplotlib figure and display it.

    Args:
        image: anything accepted by ``imshow`` (array-like or PIL image).
        figsize: figure size in inches, as ``(width, height)``.
        cmap: colormap forwarded to ``imshow``.
        title: text placed above the image.
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        axis: forwarded to ``Axes.axis``; the default ``False`` hides the
            axis lines, ticks and labels.
    """
    # A single-axes figure; equivalent to plt.figure() followed by pyplot calls.
    _, canvas = plt.subplots(figsize=figsize)
    canvas.imshow(image, cmap=cmap)
    canvas.set_title(title)
    canvas.set_xlabel(xlabel)
    canvas.set_ylabel(ylabel)
    canvas.axis(axis)
    plt.show()

def show_images(images, n_rows=1, title='', figsize=(5, 5), cmap=None, xlabel=None, ylabel=None, axis=False):
    """Display a sequence of images on a grid with ``n_rows`` rows.

    The number of columns is ``ceil(len(images) / n_rows)``, so every image is
    shown even when the count is not a multiple of ``n_rows``; grid cells left
    without an image are hidden. A single image on a single row is delegated
    to ``show_image``.

    Args:
        images: sized sequence of images accepted by ``imshow``.
        n_rows: number of grid rows (must be >= 1).
        title: title applied to every subplot.
        figsize: size of the whole figure in inches, as ``(width, height)``.
        cmap: colormap forwarded to ``imshow``.
        xlabel: x-axis label applied to every subplot.
        ylabel: y-axis label applied to every subplot.
        axis: forwarded to ``Axes.axis``; the default ``False`` hides the axes.

    Raises:
        ValueError: if ``images`` is empty or ``n_rows`` is smaller than 1.
    """
    n_images = len(images)
    if n_rows < 1:
        raise ValueError(f'n_rows must be a positive integer, got {n_rows}')
    if n_images == 0:
        raise ValueError('show_images() needs at least one image')

    # Ceiling division: floor division would silently drop the trailing images
    # whenever len(images) is not a multiple of n_rows.
    n_cols = -(-n_images // n_rows)

    if n_rows == n_cols == 1:
        show_image(images[0], title=title, figsize=figsize, cmap=cmap, xlabel=xlabel, ylabel=ylabel, axis=axis)
        return

    # squeeze=False always yields a 2-D array of Axes, whatever the grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    fig.tight_layout(pad=0.0)
    cells = axes.ravel()
    for ax, img in zip(cells, images):
        ax.imshow(img, cmap=cmap)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.axis(axis)
    # Hide grid cells that did not receive an image (incomplete last row).
    for ax in cells[n_images:]:
        ax.set_visible(False)
    plt.show()
        
def add_new_embeds(model, placeholder_token, embeds_path):
    """Register a learned placeholder token in both text encoders of ``model``.

    The file at ``embeds_path`` is read with ``torch.load`` and indexed as
    ``learned_token['t1'][placeholder_token]`` (row written into the input
    embeddings of ``model.text_encoder.model.transformer``) and
    ``learned_token['t2'][placeholder_token]`` (row written into
    ``model.clip_model.token_embedding``).

    ``model`` is modified in place:
      * ``model.tokenizer1`` gets the new token and the transformer's token
        embeddings are resized to the new tokenizer length;
      * ``model.tokenizer2`` gets new ``encoder`` / ``decoder`` / ``cache``
        entries for the token;
      * ``model.clip_model.token_embedding`` is replaced by an ``nn.Embedding``
        with one extra row, created on the same device as the old weights.

    Note: the function is not idempotent - every call grows the CLIP embedding
    table by one row, even for a token that was added before.

    Args:
        model: Kandinsky model object exposing the attributes listed above.
        placeholder_token: the token string the embeddings were learned for.
        embeds_path: path to the file with the learned embeddings.

    Returns:
        The same ``model`` object, for call chaining.
    """
    # NOTE(review): torch.load unpickles the file - only load embeddings from a
    # trusted source. map_location='cpu' keeps loading independent of the GPU
    # index the file was saved from; the row assignments below copy the values
    # onto the device of the destination weights.
    learned_token = torch.load(embeds_path, map_location='cpu')

    # --- first text encoder -------------------------------------------------
    model.tokenizer1.add_tokens(placeholder_token)
    placeholder_token_id = model.tokenizer1.convert_tokens_to_ids(placeholder_token)

    transformer = model.text_encoder.model.transformer
    transformer.resize_token_embeddings(len(model.tokenizer1))
    transformer.get_input_embeddings().weight.data[placeholder_token_id] = learned_token['t1'][placeholder_token]

    # --- second (CLIP) text encoder ------------------------------------------
    # The new token takes the next free id in the tokenizer vocabulary.
    t2p_index_to_add = len(model.tokenizer2.encoder)
    model.tokenizer2.encoder[placeholder_token] = t2p_index_to_add
    model.tokenizer2.decoder[t2p_index_to_add] = placeholder_token
    # presumably the cache entry makes the tokenizer treat the placeholder as a
    # single, unsplit token - verify against the tokenizer implementation
    model.tokenizer2.cache[placeholder_token] = placeholder_token

    t2p_tok = model.tokenizer2.encode(placeholder_token)

    old_weight = model.clip_model.token_embedding.weight
    old_vocab_size, embed_dim = old_weight.shape

    # Build the enlarged table next to the old one instead of relying on a
    # notebook-global `device` that may not exist yet (or may differ).
    new_embed = nn.Embedding(old_vocab_size + 1, embed_dim).to(old_weight.device)
    new_embed.weight.data[:old_vocab_size, :] = old_weight.data
    # assumes the first id returned by encode() is the placeholder's own id
    # (i.e. old_vocab_size) - TODO confirm for tokenizers that prepend tokens
    new_embed.weight.data[t2p_tok[0], :] = learned_token['t2'][placeholder_token]

    model.clip_model.token_embedding = new_embed

    return model

### Initialize default Kandinsky 2.1 model (careful with cache dir)

In [4]:
# Runtime configuration for the model; these are notebook-level globals.
device = 'cuda'
task_type = 'text2img'
# Forwarded to get_kandinsky2 as `cache_dir` - presumably the directory where the
# model weights are stored/downloaded (the heading above asks to be careful with
# it); confirm and point it at a suitable location before running.
cache_dir = '/tmp/kandinsky2'

# Build the Kandinsky 2.1 text-to-image model with the settings above.
model = get_kandinsky2(
    device=device, 
    task_type=task_type, 
    model_version='2.1', 
    cache_dir=cache_dir, 
    use_flash_attention=False);



making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.


### Set your placeholder token and the path to the file with its learned embeddings

In [5]:
# Fill in both values before running this cell.
placeholder_token = None  # your word here: the token string the embeddings were learned for
embeds_path = None  # your path to the embedding file here

# Fail with an actionable message instead of a SyntaxError / obscure loader error.
if placeholder_token is None or embeds_path is None:
    raise ValueError('Set `placeholder_token` and `embeds_path` before running this cell.')

# Registers the token in the model's tokenizers and embedding tables (in place).
model = add_new_embeds(model, placeholder_token, embeds_path)

### Generation with p_sampler

In [9]:
# Prompt built around the learned placeholder token.
prompt = f'Professional high-quality art of a {placeholder_token}. photorealistic, 4k, HQ'

# All sampling options for the p_sampler run, gathered in one place.
p_sampler_settings = {
    'num_steps': 50,
    'batch_size': 4,
    'guidance_scale': 7.5,
    'h': 768,
    'w': 768,
    'sampler': 'p_sampler',
    'prior_cf_scale': 2,
    'prior_steps': '4',
}

# Two independent generation rounds; each batch is shown on a two-row grid.
for _ in range(2):
    images = model.generate_text2img(prompt, **p_sampler_settings)

    show_images(images, n_rows=2, figsize=(15, 15))

### Generation with ddim_sampler

In [10]:
# All sampling options for the ddim_sampler run, gathered in one place.
ddim_sampler_settings = {
    'num_steps': 50,
    'batch_size': 4,
    'guidance_scale': 7.5,
    'h': 768,
    'w': 768,
    'sampler': 'ddim_sampler',
    'prior_cf_scale': 4,
    'prior_steps': 'ddim25',
}

# Reuses `prompt` from the p_sampler cell; two independent generation rounds,
# each batch shown on a two-row grid.
for _ in range(2):
    images = model.generate_text2img(prompt, **ddim_sampler_settings)

    show_images(images, n_rows=2, figsize=(15, 15))