In [2]:

import torch
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
from torch.utils.data import Subset, DataLoader
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from torchvision import datasets

In [3]:
# Folder holding the training images; torchvision's ImageFolder indexes it on construction.
image_root = '/kaggle/input/homedataset'
dataset = datasets.ImageFolder(root=image_root)

In [4]:
# Bare expression: the notebook renders the ImageFolder summary (datapoint count and root).
dataset

Dataset ImageFolder
    Number of datapoints: 31
    Root location: /kaggle/input/homedataset

In [5]:
from datasets import load_dataset
import os

# Fetch the interior-design dataset and keep a handle on its train split.
ds = load_dataset("ellljoy/interior-design")
train_split = ds["train"]

# Target folder for the exported PNG files (created on first run, reused afterwards).
save_dir = "downloaded_images"
os.makedirs(save_dir, exist_ok=True)

# Each entry of the "images" column is saved directly via its own .save() method,
# named after its position in the split.
for i, image in enumerate(train_split["images"]):
    image.save(os.path.join(save_dir, f"image_{i}.png"))

print(f"Downloaded {len(ds['train'])} images to {save_dir}")

README.md:   0%|          | 0.00/287 [00:00<?, ?B/s]

(…)-00000-of-00001-dda5b1ba796822a6.parquet:   0%|          | 0.00/45.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30 [00:00<?, ? examples/s]

Downloaded 30 images to downloaded_images


In [6]:
# Re-display the ImageFolder summary; the download cell above does not modify `dataset`.
dataset

Dataset ImageFolder
    Number of datapoints: 31
    Root location: /kaggle/input/homedataset

In [7]:
import torch
from transformers import CLIPTokenizer

# Checkpoint whose "tokenizer" subfolder provides the CLIP tokenizer.
# `tokenizer` is a module-level name: `collate_fn` below reads it as a global.
model_id = "CompVis/stable-diffusion-v1-4"
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")

def collate_fn(examples):
    """Merge a list of dataset examples into one training batch.

    Args:
        examples: list of dicts, each with
            - "instance_prompt_ids": token ids of the prompt, either a flat
              sequence of ints or a tensor such as the (1, seq_len) tensor a
              tokenizer returns with ``return_tensors="pt"``;
            - "instance_images": an image tensor (all examples must share one shape).

    Returns:
        dict with
            - "input_ids": padded token ids, one row per example;
            - "pixel_values": stacked float image tensor in contiguous memory format.

    Relies on the module-level ``tokenizer`` for padding.
    """
    # Flatten every prompt to a plain 1-D list of ints before padding. When the ids
    # arrive as (1, seq_len) tensors, padding them as-is would treat the outer
    # length-1 axis as the sequence, so nothing would be padded and the batch would
    # carry an extra singleton dimension.
    input_ids = [
        torch.as_tensor(example["instance_prompt_ids"]).reshape(-1).tolist()
        for example in examples
    ]

    pixel_values = torch.stack([example["instance_images"] for example in examples])
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

    # padding=True pads every sequence to the longest one in this batch.
    input_ids = tokenizer.pad(
        {"input_ids": input_ids}, padding=True, return_tensors="pt"
    ).input_ids

    batch = {
        "input_ids": input_ids,
        "pixel_values": pixel_values,
    }
    return batch

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

In [8]:
from datasets import load_dataset
from torch.utils.data import Dataset
from torchvision import transforms

# def push_data_to_hf_hub(dataset_name,local_data_dir):
#     dataset = load_dataset("imagefolder", data_dir=local_data_dir)
#     # Remove the dummy label column
#     dataset = dataset.remove_columns("label")
#     # Push to Hub
#     dataset.push_to_hub(dataset_name)

# def pull_dataset_from_hf_hub(dataset_id='StatsGary/dreambooth-hackathon-images'):
#     dataset_id = dataset_id
#     dataset = load_dataset(dataset_id, split="train")
#     print(f"Loaded dataset number of rows: {len(dataset)}")
#     return dataset


class DreamBoothDataset(Dataset):
    """Pairs every image of a wrapped dataset with one fixed, tokenized prompt.

    Args:
        dataset: indexable collection whose items unpack as ``(image, label)``;
            the label is discarded.
        instance_prompt: prompt text attached to every image.
        tokenizer: callable used to tokenize ``instance_prompt``; its result must
            expose ``.input_ids``.
        size: edge length passed to both the resize and the center-crop step.
    """

    def __init__(self, dataset, instance_prompt, tokenizer, size=512):
        self.dataset = dataset
        self.instance_prompt = instance_prompt
        self.tokenizer = tokenizer
        self.size = size

        # Resize, center-crop, convert to tensor, then normalize with mean 0.5 / std 0.5.
        preprocessing_steps = [
            transforms.Resize(size),
            transforms.CenterCrop(size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
        self.transforms = transforms.Compose(preprocessing_steps)

    def __len__(self):
        """Number of items in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the preprocessed image at ``index`` together with the prompt token ids."""
        image, _label = self.dataset[index]  # second element (the label) is not used
        pixel_tensor = self.transforms(image)
        prompt_encoding = self.tokenizer(self.instance_prompt, return_tensors="pt")
        return {
            "instance_images": pixel_tensor,
            "instance_prompt_ids": prompt_encoding.input_ids,
        }

In [9]:
from PIL import Image

def image_grid(imgs, rows, cols):
    """Tile ``imgs`` into a single RGB image of ``rows`` x ``cols`` cells.

    Images are placed left to right, top to bottom. The cell size is taken from
    the first image's ``size``.

    Args:
        imgs: sequence of images; its length must equal ``rows * cols``.
        rows: number of grid rows.
        cols: number of grid columns.

    Returns:
        The newly created grid image.
    """
    assert len(imgs) == rows * cols

    tile_w, tile_h = imgs[0].size
    canvas = Image.new("RGB", size=(cols * tile_w, rows * tile_h))
    for position, tile in enumerate(imgs):
        # divmod gives the cell's (row, column) for a row-major position.
        row, col = divmod(position, cols)
        canvas.paste(tile, box=(col * tile_w, row * tile_h))
    return canvas

In [10]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.4
Note: you may need to restart the kernel to use updated packages.


In [11]:
#Define training loop
import math
import torch.nn.functional as F
from accelerate import Accelerator
from accelerate.utils import set_seed
from diffusers import DDPMScheduler, PNDMScheduler, StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import bitsandbytes as bnb
import torch


def train_dreambooth(text_encoder, vae, unet, tokenizer, feature_extractor, train_dataset, train_batch_size=1, max_train_steps=400, shuffle_train=True,
                        beta_start=0.00085, beta_end=0.012, beta_scheduler="scaled_linear", num_train_timesteps=1000, seed=3434554,
                        gradient_checkpoint=True, gradient_accumulation_steps=8, use_8bit_ADAM=True,
                        learning_rate=2e-06, max_grad_norm=1.0, output_dir='stable-diffusion-trained'):
    """Fine-tune the UNet on ``train_dataset`` and save the resulting Stable Diffusion pipeline.

    Only the UNet's parameters are given to the optimizer; the VAE and the text
    encoder are used under ``torch.no_grad()`` to produce latents and text embeddings.

    Args:
        text_encoder: text model; called on ``batch["input_ids"]``, first output is used.
        vae: autoencoder; ``vae.encode(...).latent_dist.sample()`` produces the latents.
        unet: denoising network being trained.
        tokenizer: tokenizer stored in the saved pipeline.
        feature_extractor: feature extractor stored in the saved pipeline.
        train_dataset: dataset whose batches (built by the module-level ``collate_fn``)
            provide ``"pixel_values"`` and ``"input_ids"``.
        train_batch_size: per-process batch size of the dataloader.
        max_train_steps: number of optimizer updates to run.
        shuffle_train: whether the dataloader shuffles the dataset.
        beta_start, beta_end, beta_scheduler: beta schedule for the training noise
            scheduler and for the scheduler stored in the saved pipeline.
        num_train_timesteps: number of diffusion timesteps of the training scheduler.
        seed: value passed to ``set_seed``.
        gradient_checkpoint: enable gradient checkpointing on the UNet.
        gradient_accumulation_steps: batches accumulated per optimizer update.
        use_8bit_ADAM: use ``bnb.optim.AdamW8bit`` instead of ``torch.optim.AdamW``.
        learning_rate: optimizer learning rate.
        max_grad_norm: gradient-norm clipping threshold.
        output_dir: directory the pipeline is saved to.

    Returns:
        The assembled ``StableDiffusionPipeline`` on the main process, ``None`` on
        any other process.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    # The accelerator handles device placement and gradient accumulation.
    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
    )
    # Seed the random number generators for reproducibility.
    set_seed(seed)
    if gradient_checkpoint:
        unet.enable_gradient_checkpointing()

    if use_8bit_ADAM:
        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    optimizer = optimizer_class(
        unet.parameters(),  # only optimize unet
        lr=learning_rate,
    )

    # Scheduler that adds noise to the latents during training (forward diffusion).
    noise_scheduler = DDPMScheduler(
        beta_start=beta_start,
        beta_end=beta_end,
        beta_schedule=beta_scheduler,
        num_train_timesteps=num_train_timesteps
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=shuffle_train,
        collate_fn=collate_fn
    )

    unet, optimizer, train_dataloader = accelerator.prepare(
        unet, optimizer, train_dataloader
    )

    # Without this check an empty dataset would surface as a ZeroDivisionError below.
    if len(train_dataloader) == 0:
        raise ValueError("train_dataset produced no batches; cannot train on an empty dataset")

    # Move text encoder and VAE to the training device.
    text_encoder.to(accelerator.device)
    vae.to(accelerator.device)

    # Latent scaling factor: prefer the value stored in the VAE config and fall back
    # to the previously hard-coded 0.18215 when the config does not provide one.
    latent_scale = getattr(getattr(vae, "config", None), "scaling_factor", 0.18215)

    # Recalculate the number of epochs, as the dataloader length may have changed
    # after `accelerator.prepare`.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / gradient_accumulation_steps
    )
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

    total_batch_size = (
        train_batch_size
        * accelerator.num_processes
        * gradient_accumulation_steps
    )
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(
        range(max_train_steps), disable=not accelerator.is_local_main_process
    )
    progress_bar.set_description(f"Steps based on batch size {total_batch_size}")
    global_step = 0

    for epoch in range(num_train_epochs):
        print(f'Epoch: {epoch + 1} of {num_train_epochs}')
        unet.train()
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
                # Convert images to latent space
                with torch.no_grad():
                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                    latents = latents * latent_scale

                # Sample the target noise directly with the latents' shape, dtype and device.
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(
                    0,
                    noise_scheduler.config.num_train_timesteps,
                    (bsz,),
                    device=latents.device,
                ).long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                with torch.no_grad():
                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
                noise_pred = unet(
                    noisy_latents, timesteps, encoder_hidden_states
                ).sample
                # Per-sample mean squared error, then averaged over the batch.
                loss = (
                    F.mse_loss(noise_pred, noise, reduction="none")
                    .mean([1, 2, 3])
                    .mean()
                )

                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

            logs = {"loss": loss.detach().item()}
            progress_bar.set_postfix(**logs)

            if global_step >= max_train_steps:
                break

        accelerator.wait_for_everyone()

        # Do not start another epoch once the requested number of updates is done.
        if global_step >= max_train_steps:
            break

    # Create the pipeline using the trained modules and save it.
    pipeline = None
    if accelerator.is_main_process:
        print(f"Loading pipeline and saving to {output_dir}...")
        scheduler = PNDMScheduler(
            beta_start=beta_start,
            beta_end=beta_end,
            beta_schedule=beta_scheduler,
            skip_prk_steps=True,
            steps_offset=1,
        )
        pipeline = StableDiffusionPipeline(
            text_encoder=text_encoder,
            vae=vae,
            unet=accelerator.unwrap_model(unet),
            tokenizer=tokenizer,
            scheduler=scheduler,
            safety_checker=StableDiffusionSafetyChecker.from_pretrained(
                "CompVis/stable-diffusion-safety-checker"
            ),
            feature_extractor=feature_extractor,
        )
        pipeline.save_pretrained(output_dir)

    # Callers that assign the result get the saved pipeline instead of nothing.
    return pipeline

# DreamBooth portion completed

In [None]:
from transformers import CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPFeatureExtractor, CLIPTextModel

In [13]:
# Single configuration dictionary read by the training and evaluation cells.
params = {
    # Pretrained checkpoints
    "stable_diffusion_backbone": "CompVis/stable-diffusion-v1-4",
    "feature_extractor": "openai/clip-vit-base-patch32",
    "hugging_face_image_store": "StatsGary/dreambooth-hackathon-images",

    # Optimisation settings
    "learning_rate": 2e-06,
    "max_train_steps": 400,
    "resolution": 512,
    "train_bs": 1,
    "grad_accum_steps": 8,
    "max_gradient_norm": 1.0,

    # Number of images generated by the evaluation cell
    "sample_batch_size": 10,

    # Output directory of the trained pipeline (name not changed for this concept)
    "model_checkpoint_name": "norweigen-fjords-dreambooth",
    "random_shuffle_train_set": True,
    "use_8bit_optimizer": True,

    # Words used to build the instance prompt
    "concept_name": "Interior Design",
    "item_type": "Rooms",  # change to person, cartoon, food, etc.

    # Evaluation settings
    "eval_params": {
        "image_save_path": "images",
        "eval_prompt": "Dining table at the center classic paintings on each side of the wall of a room",
    },
}

In [14]:
# Unpack the configuration dictionary into typed module-level variables.
train_params = params

# Pretrained model identifiers and data location
STABLE_DIFFUSION_NAME = train_params['stable_diffusion_backbone']
FEATURE_EXTRACTOR = train_params['feature_extractor']
hf_data_location = train_params['hugging_face_image_store']

# Optimisation hyperparameters, coerced to the expected numeric types
learning_rate = float(train_params['learning_rate'])
max_train_steps = int(train_params['max_train_steps'])
resolution = int(train_params['resolution'])
train_batch_size = int(train_params['train_bs'])
grad_accum_steps = int(train_params['grad_accum_steps'])
max_gradient_norm = float(train_params['max_gradient_norm'])

# Sampling, output location and switches
sample_batch_size = int(train_params['sample_batch_size'])
model_checkpoint_name = str(train_params['model_checkpoint_name'])
shuffle_train = bool(train_params['random_shuffle_train_set'])
use_8bit_optim = bool(train_params['use_8bit_optimizer'])

# Words that make up the instance prompt
name_of_your_concept = train_params['concept_name']
object_type = train_params['item_type']


In [None]:
if __name__ =='__main__':
    # Build the instance prompt from the configured concept name and item type.
    type_of_thing = object_type
    instance_prompt = f"a photo of {name_of_your_concept} {type_of_thing}"
    print(f"Instance prompt: {instance_prompt}")

    # Load the CLIP tokenizer
    model_id = STABLE_DIFFUSION_NAME
    tokenizer = CLIPTokenizer.from_pretrained(
        model_id,
        subfolder="tokenizer")

    # Wrap the ImageFolder `dataset` loaded earlier; images are resized and cropped
    # to the configured resolution rather than the class default.
    train_dataset = DreamBoothDataset(dataset, instance_prompt, tokenizer, size=resolution)

    # Get text encoder, UNET and VAE
    text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
    feature_extractor = CLIPFeatureExtractor.from_pretrained(FEATURE_EXTRACTOR)

    # Train the model. The optimizer choice follows the configuration
    # (`use_8bit_optim`) instead of being hard-coded.
    model = train_dreambooth(
        text_encoder=text_encoder,
        vae=vae,
        unet=unet,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        train_dataset=train_dataset,
        train_batch_size=train_batch_size,
        max_train_steps=max_train_steps,
        shuffle_train=shuffle_train,
        gradient_accumulation_steps=grad_accum_steps,
        use_8bit_ADAM=use_8bit_optim,
        learning_rate=learning_rate,
        max_grad_norm=max_gradient_norm,
        output_dir=model_checkpoint_name
    )

Instance prompt: a photo of Interior Design Rooms


  0%|          | 0/400 [00:00<?, ?it/s]

Epoch: 1 of 100
Epoch: 2 of 100
Epoch: 3 of 100
Epoch: 4 of 100
Epoch: 5 of 100
Epoch: 6 of 100
Epoch: 7 of 100
Epoch: 8 of 100
Epoch: 9 of 100
Epoch: 10 of 100
Epoch: 11 of 100
Epoch: 12 of 100
Epoch: 13 of 100
Epoch: 14 of 100
Epoch: 15 of 100
Epoch: 16 of 100
Epoch: 17 of 100
Epoch: 18 of 100
Epoch: 19 of 100
Epoch: 20 of 100
Epoch: 21 of 100
Epoch: 22 of 100
Epoch: 23 of 100
Epoch: 24 of 100
Epoch: 25 of 100
Epoch: 26 of 100
Epoch: 27 of 100
Epoch: 28 of 100
Epoch: 29 of 100
Epoch: 30 of 100
Epoch: 31 of 100
Epoch: 32 of 100
Epoch: 33 of 100
Epoch: 34 of 100
Epoch: 35 of 100
Epoch: 36 of 100
Epoch: 37 of 100
Epoch: 38 of 100
Epoch: 39 of 100
Epoch: 40 of 100
Epoch: 41 of 100
Epoch: 42 of 100
Epoch: 43 of 100
Epoch: 44 of 100


In [None]:
from diffusers import StableDiffusionPipeline
import torch
import os

# Evaluation settings come from the shared `params` dictionary.
eval_settings = params['eval_params']
eval_prompt = eval_settings['eval_prompt']
image_save_path = eval_settings['image_save_path']
model_checkpoint_name = params['model_checkpoint_name']
sample_batch_size = params['sample_batch_size']

# Output folder for the generated samples (no error if it already exists).
os.makedirs(image_save_path, exist_ok=True)

# Load the pipeline saved by the training step in half precision, then move it to the GPU.
pipeline = StableDiffusionPipeline.from_pretrained(
    model_checkpoint_name,
    torch_dtype=torch.float16,
)
pipeline = pipeline.to("cuda")

# One pipeline call per sample; files are numbered starting at 1.
for i in range(sample_batch_size):
    print(f"Generating image {i + 1}...")
    image = pipeline(eval_prompt).images[0]
    output_file = os.path.join(image_save_path, f"generated_image_{i + 1}.png")
    image.save(output_file)

print(f"Generated images saved to {image_save_path}")