In [3]:
import os
import pandas as pd
import pickle
import torch
import io
import urllib
from matplotlib import pyplot as plt
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
import PIL.Image
from torchvision import transforms
from tqdm import tqdm

import utils

# User-agent string sent with every image request, as provided by the
# `datasets` library helper.
USER_AGENT = get_datasets_user_agent()

# Root directory that holds the pickled DataFrames (df.pkl) and, inside it,
# the sub-directory where downloaded images are written.
dir_path = "dataset"
images_dir = "images"
# Name of the DataFrame column recording where each row's image was saved.
filepath_label = 'filepath'

# Number of leading rows of each split for which a download is attempted.
train_num = 3500 # Expect 15% of these images to not download
valid_num = 900  # Expect 15% of these images to not download

def get_img(url, retries=2):
    """Download ``url`` and open it as a PIL image, returning ``None`` on failure.

    Args:
        url: Address of the image to fetch.
        retries: Number of extra attempts after the first one fails, so at
            most ``retries + 1`` requests are made. A negative value makes no
            attempt at all and yields ``None``.

    Returns:
        The image returned by ``PIL.Image.open``, or ``None`` if every
        attempt raised an error.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded. Import it explicitly so a missing attribute can never be
    # mistaken for a failed download.
    import urllib.request

    # Bound up front so the final `return` is valid even with zero attempts.
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(url, data=None, headers={'user-agent': USER_AGENT})
            with urllib.request.urlopen(request) as req:
                # Read the whole body while the connection is open; PIL then
                # works on the in-memory copy.
                image = PIL.Image.open(io.BytesIO(req.read()))
            break
        except Exception:
            # Best-effort download: any error counts as one failed attempt.
            # Catching `Exception` instead of using a bare `except` keeps
            # KeyboardInterrupt/SystemExit able to stop a long download loop.
            image = None

    return image

def load_data(key, df, num):
    """Download the first ``num`` images of one split and record their paths.

    The ``filepath_label`` column acts as a per-row status flag:
    ``"null"`` means no download has been attempted yet, a missing or empty
    value means an earlier attempt failed (the row is skipped), and any other
    value is treated as the path of an image that is already saved.

    Args:
        key: Split name; selects ``df[key]`` and prefixes the saved file names.
        df: Dict mapping split names to DataFrames that contain ``image_url``
            and ``filepath_label`` columns. The selected frame is updated
            in place.
        num: Number of leading rows to process. Clamped to the frame length
            so an oversized value no longer raises ``KeyError``.

    Returns:
        The same ``df`` dict that was passed in.
    """
    frame = df[key]
    count = 0
    for i in tqdm(range(min(num, len(frame)))):
        current = frame.at[i, filepath_label]

        # Failed rows are stored as None, which may read back as NaN/NA
        # depending on the column dtype; treat all of those as "skip".
        if pd.isna(current) or not current:
            continue

        if current != "null":
            # Already retrieved by an earlier run.
            count += 1
            continue

        image = get_img(frame.at[i, 'image_url'])
        saved_path = None
        if image is not None:
            filepath = os.path.join(dir_path, images_dir, f'{key}_{i}.jpg')
            try:
                image.save(filepath)
                saved_path = filepath
            except Exception:
                # Saving failed; record the row as failed, exactly like a
                # failed download.
                saved_path = None

        # Write with a single `.at` call. The previous chained form
        # `df[key][col][i] = value` can assign into a temporary copy and
        # leave the real frame unchanged.
        frame.at[i, filepath_label] = saved_path
        if saved_path is not None:
            count += 1

    print(f'{count} files retrieved in the "{key}" split.')
    return df

# Location of the pickled {'train': DataFrame, 'validation': DataFrame} dict.
df_pickle_path = os.path.join(dir_path, 'df.pkl')

# Creates dir_path and its images sub-directory; a no-op when both exist.
os.makedirs(os.path.join(dir_path, images_dir), exist_ok=True)

# Key the first-run setup on the pickle itself rather than on the directory.
# An interrupted first run can leave the directory behind without df.pkl,
# which previously made every later run fail when opening the pickle.
if not os.path.exists(df_pickle_path):
    dset = load_dataset("conceptual_captions")

    # Every row starts as "null", meaning "download not attempted yet".
    filepaths = ["null"] * len(dset['train'])
    dset['train'] = dset['train'].add_column(filepath_label, filepaths)

    filepaths = ["null"] * len(dset['validation'])
    dset['validation'] = dset['validation'].add_column(filepath_label, filepaths)

    df_train = dset['train'].to_pandas()
    df_valid = dset['validation'].to_pandas()

    df = {
        'train': df_train,
        'validation': df_valid
    }

    with open(df_pickle_path, 'wb') as handle:
        pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE: pickle.load can execute arbitrary code from the file. Only load a
# df.pkl that this notebook wrote itself.
with open(df_pickle_path, 'rb') as handle:
    df = pickle.load(handle)

df = load_data('train', df, train_num)
df = load_data('validation', df, valid_num)

# Persist the updated file-path column so rows that are already handled are
# not downloaded again on the next run.
with open(df_pickle_path, 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

#image = transforms.Compose([transforms.ToTensor(), utils.Rescale(64)])(image)
#plt.figure(figsize=(8, 8))
#plt.imshow(image.permute(1, 2, 0))
#plt.show()

100%|██████████| 3500/3500 [2:01:51<00:00,  2.09s/it]    


2919 files retrieved in the "train" split.


100%|██████████| 900/900 [53:51<00:00,  3.59s/it]   


710 files retrieved in the "validation" split.


In [5]:
import os
import pandas as pd
import pickle
import torch
import io
import urllib
from matplotlib import pyplot as plt
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
import PIL.Image
from torchvision import transforms, datasets
from tqdm import tqdm

import utils

# Folder that receives the FGVC-Aircraft archive and its extracted contents.
dir_path = 'aircraft'

# The dataset object is the cell's last expression, so its summary is shown
# as the cell output once the download and extraction finish.
datasets.FGVCAircraft(root=dir_path, download=True)

Downloading https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz to aircraft/fgvc-aircraft-2013b.tar.gz


  0%|          | 0/2753340328 [00:00<?, ?it/s]

Extracting aircraft/fgvc-aircraft-2013b.tar.gz to aircraft


Dataset FGVCAircraft
    Number of datapoints: 6667
    Root location: aircraft

In [1]:
import torch

# Ask PyTorch whether a CUDA device is usable; the boolean is displayed as
# the cell's output.
torch.cuda.is_available()

True

In [58]:
import torch
from transformers import T5Tokenizer, T5EncoderModel

# Encode a small batch of sentences with the T5-small encoder and zero out
# the hidden states at padded positions.
text = ["This is a sentence and again"]
max_length = 64

tokenizer = T5Tokenizer.from_pretrained('t5-small', model_max_length=max_length)
encoder = T5EncoderModel.from_pretrained('t5-small')

# Use the GPU when present, otherwise fall back to the CPU so the cell does
# not crash on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = encoder.to(device)
encoder.eval()

# Calling the tokenizer directly on a list of strings encodes it as a batch;
# this replaces the legacy `batch_encode_plus` entry point.
tokenized = tokenizer(text, padding='longest', max_length=max_length,
                      truncation=True, return_tensors='pt')

print(tokenized)

input_ids = tokenized.input_ids.to(device)
attention_mask = tokenized.attention_mask.to(device)

with torch.no_grad():
    t5_out = encoder(input_ids=input_ids, attention_mask=attention_mask)
    # No gradients are tracked inside no_grad, so no extra .detach() is needed.
    final_encoding = t5_out.last_hidden_state

# Broadcast the mask over the hidden dimension and zero every padded token.
final_encoding = final_encoding.masked_fill(~attention_mask.unsqueeze(2).bool(), 0.)

print(final_encoding.shape)
print(attention_mask.bool())

Some weights of the model checkpoint at t5-small were not used when initializing T5EncoderModel: ['decoder.block.5.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.4.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.5.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.2.Dens

{'input_ids': tensor([[ 100,   19,    3,    9, 7142,   11,  541,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 8, 512])
tensor([[True, True, True, True, True, True, True, True]], device='cuda:0')


In [2]:
import os
import sys

import torch
import numpy as np
from tqdm import tqdm
from torch import autocast

from stablediffusion.ldm.models.diffusion.ddim import DDIMSampler
from stablediffusion.ldm.models.diffusion.plms import PLMSSampler
from stablediffusion.scripts.txt2img import load_model_from_config

device = torch.device("cuda")

# The two concepts to compose and their mixing weights.
prompt_i = "a painting of a virus monster playing guitar"
prompt_j = "a forested landscape"

w_i = 0.5
w_j = 0.5

config = "../stablediffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml"

timesteps = 1000

n = 1 # Number of samples / batch size
ch = 4 # Latent channels
f = 8 # Downsample factor
h = 512 # Image height
w = 512 # Image width

scale = 7.5 # Unconditional guidance scale
ddim_eta = 0.0 # 0.0 corresponds to deterministic sampling
shape = [ch, h // f, w // f]

b = n

# NOTE(review): the reference txt2img script appears to pass an already
# parsed config object (not a path string) to load_model_from_config --
# confirm against the installed stablediffusion version.
ldm_model = load_model_from_config(config, '../sd-v2-1.ckpt')
ldm_model = ldm_model.to(device)

# Encode the prompts with the diffusion model itself. ema_scope() and
# get_learned_conditioning() are called on it here, before it is wrapped,
# because the PLMS sampler wrapper is not expected to expose them.
with torch.no_grad():
    with autocast('cuda'):
        with ldm_model.ema_scope():
            uc = ldm_model.get_learned_conditioning(n * [""])
            c_i = ldm_model.get_learned_conditioning(n * [prompt_i])
            c_j = ldm_model.get_learned_conditioning(n * [prompt_j])

# As before, `model` ends up bound to the PLMS sampler wrapping the model.
model = PLMSSampler(ldm_model)

@torch.no_grad()
def p_sample(model, x, c, ts, index, old_eps=None, t_next=None):
    """Run one PLMS step under conditioning ``c`` and record its noise estimate.

    Reads the module-level ``scale`` (guidance scale) and ``uc``
    (unconditional conditioning) when calling the sampler.

    Args:
        model: Sampler object exposing ``p_sample_plms``.
        x: Current sample handed to the sampler.
        c: Conditioning used for this step.
        ts: Timestep argument for this step.
        index: Schedule index for this step.
        old_eps: History list of earlier noise estimates for this
            conditioning. A fresh list is started when ``None``; otherwise
            the given list is updated in place.
        t_next: Timestep argument of the following step.

    Returns:
        The history list with the new estimate appended, trimmed so that it
        holds at most the three most recent estimates.
    """
    if old_eps is None:
        # The default used to reach `.append` as None and raise.
        old_eps = []

    # Forward the history and next timestep instead of dropping them.
    # NOTE(review): p_sample_plms in the reference PLMS sampler accepts
    # `old_eps` / `t_next` keywords -- confirm against the installed version.
    _, _, e_t = model.p_sample_plms(
        x, c, ts, index=index,
        unconditional_guidance_scale=scale,
        unconditional_conditioning=uc,
        old_eps=old_eps, t_next=t_next,
    )

    old_eps.append(e_t)
    if len(old_eps) >= 4:
        old_eps.pop(0)

    return old_eps

# Composed sampling: at every step, combine the noise estimates of the two
# concepts with the unconditional estimate and draw the next sample.
with torch.no_grad():
    with autocast('cuda'):
        # `model` is the PLMS sampler wrapper at this point; the EMA context
        # belongs to the diffusion model it wraps.
        # NOTE(review): assumes the sampler keeps the wrapped model as
        # `.model` -- confirm against the installed stablediffusion version.
        with model.model.ema_scope():
            # Initialize sample x_T to N(0,I)
            x = torch.randn((n, ch, h // f, w // f), device=device)

            model.make_schedule(ddim_num_steps=timesteps, ddim_eta=ddim_eta, verbose=False)
            timesteps = model.ddim_timesteps
            time_range = np.flip(timesteps)
            total_steps = timesteps.shape[0]

            # One noise-estimate history per conditioning. `e_t` was
            # previously read before it was ever assigned.
            e_ti = []
            e_tj = []
            e_t = []
            for i, step in enumerate(tqdm(time_range, desc='PLMS Sampler', total=total_steps)):
                index = total_steps - i - 1
                ts = torch.full((b,), step, device=device, dtype=torch.long)
                ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

                # Compute conditional scores for each concept c_i
                e_ti = p_sample(model, x, c_i, ts, index, e_ti, ts_next)
                e_tj = p_sample(model, x, c_j, ts, index, e_tj, ts_next)
                e_i = e_ti[-1]
                e_j = e_tj[-1]

                # Compute unconditional score
                e_t = p_sample(model, x, uc, ts, index, e_t, ts_next)
                e = e_t[-1]

                # Sampling
                mean = x - (e + w_i * (e_i - e) + w_j * (e_j - e))
                covar = model.betas[ts]
                # torch.normal expects a standard deviation, and a covariance
                # of beta_t * I means independent noise on every element.
                # Scale same-shape, same-device noise by sqrt(beta_t) instead
                # of building a CPU identity matrix, which fails against CUDA
                # tensors and has zero std off the diagonal.
                std = covar.sqrt().view(-1, 1, 1, 1)
                x = mean + std * torch.randn_like(mean)

ModuleNotFoundError: No module named 'ldm'