## SDXL training 

In [None]:
import os

import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import (
    DDPMScheduler,
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
)
from peft import LoraConfig, get_peft_model
from PIL import Image
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

# --- Dataset Loader ---
class ImageCaptionDataset(Dataset):
    """Image/caption pairs read from two parallel directories.

    Every ``.jpg``/``.png`` file (case-insensitive) in ``images_dir`` is one
    sample. Its caption is the stripped content of the file with the same
    base name and a ``.txt`` extension in ``captions_dir``; a missing caption
    file yields an empty caption.

    Args:
        images_dir: Directory containing the training images.
        captions_dir: Directory containing the ``<image base name>.txt`` files.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned (e.g. a torchvision transform pipeline).
    """

    def __init__(self, images_dir, captions_dir, transform=None):
        self.images_dir = images_dir
        self.captions_dir = captions_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order; sorting keeps the index -> sample mapping stable so that a
        # seeded shuffle is actually reproducible.
        self.image_files = sorted(
            f for f in os.listdir(images_dir) if f.lower().endswith((".jpg", ".png"))
        )

    def __len__(self):
        """Return the number of images found in ``images_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "caption": ...}`` for sample ``idx``."""
        img_file = self.image_files[idx]
        base = os.path.splitext(img_file)[0]
        caption_file = os.path.join(self.captions_dir, base + ".txt")

        # Image.open() is lazy and keeps the file handle open; convert()
        # forces the decode and returns an independent image, so the handle
        # can be released immediately instead of waiting for GC.
        with Image.open(os.path.join(self.images_dir, img_file)) as img:
            image = img.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        # Read explicitly as UTF-8 so captions decode identically on every
        # platform; a missing caption file means "no caption".
        try:
            with open(caption_file, "r", encoding="utf-8") as f:
                caption = f.read().strip()
        except FileNotFoundError:
            caption = ""

        return {"image": image, "caption": caption}

# --- Parameters ---
images_dir = "/home/jovyan/lora_dataset/instance_images"
captions_dir = "/home/jovyan/lora_dataset/instance_captions"
output_dir = "/home/jovyan/lora_dreambooth_model"
pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"

resolution = 512
train_batch_size = 1
gradient_accumulation_steps = 2
learning_rate = 5e-5
num_train_epochs = 5
seed = 42

torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = device.type == "cuda"
# Frozen weights are kept in half precision on GPU to save memory; on CPU
# half precision is poorly supported, so fall back to float32 there.
weight_dtype = torch.float16 if use_amp else torch.float32

# --- Load pipeline ---
# The checkpoint is SDXL, which has two text encoders and extra UNet
# conditioning inputs, so it needs the XL pipeline class.
pipeline = StableDiffusionXLPipeline.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=weight_dtype,
    variant="fp16",
)
# Place the modules on the training device directly. Model CPU offload is an
# inference-time memory saver and is not meant to be combined with training.
pipeline.to(device)
# The SDXL VAE is prone to overflow in fp16; the diffusers training examples
# keep it in float32 for the same reason.
pipeline.vae.to(dtype=torch.float32)
for frozen_module in (pipeline.vae, pipeline.text_encoder, pipeline.text_encoder_2):
    frozen_module.requires_grad_(False)
    frozen_module.eval()

# Training uses the DDPM forward process regardless of the sampler the
# checkpoint ships for inference.
noise_scheduler = DDPMScheduler.from_pretrained(
    pretrained_model_name_or_path, subfolder="scheduler"
)

# --- Compel for long captions ---
# SDXL conditions on both text encoders plus the pooled output of the second.
# truncate_long_prompts=False is what actually lets captions exceed 77
# tokens; Compel's default silently truncates them.
compel = Compel(
    tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2],
    text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
    truncate_long_prompts=False,
)

# --- LoRA config ---
# No task_type: "UNET" is not a PEFT TaskType, and a diffusers UNet needs
# only the generic PeftModel wrapper.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q", "to_v"],  # attention layers
    lora_dropout=0.05,
    bias="none",
)

# Attach LoRA to UNet (freezes the base weights, leaves the adapters trainable)
pipeline.unet = get_peft_model(pipeline.unet, lora_config)

# Optimise the adapters in float32: AdamW updates on fp16 parameters are
# numerically unreliable and GradScaler refuses to unscale fp16 gradients.
trainable_params = [p for p in pipeline.unet.parameters() if p.requires_grad]
for param in trainable_params:
    param.data = param.data.to(torch.float32)

# --- Dataset + DataLoader ---
# Without a transform the dataset yields PIL images, which the default
# collate function cannot batch. Resize/crop to a fixed square and scale
# pixels to [-1, 1], the range the VAE expects.
train_transforms = transforms.Compose([
    transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.CenterCrop(resolution),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])
dataset = ImageCaptionDataset(images_dir, captions_dir, transform=train_transforms)
dataloader = DataLoader(dataset, batch_size=train_batch_size, shuffle=True)

# --- Optimizer ---
optimizer = optim.AdamW(trainable_params, lr=learning_rate)
# Loss scaling protects small fp16 gradients from underflow; it is a no-op
# when running in float32 on CPU.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# SDXL micro-conditioning: (original_h, original_w, crop_top, crop_left,
# target_h, target_w). Original sizes are not tracked by the dataset, so the
# training resolution with a zero crop offset is used for every sample.
add_time_ids = torch.tensor(
    [[resolution, resolution, 0, 0, resolution, resolution]],
    device=device,
    dtype=weight_dtype,
)

# --- Training Loop ---
num_batches = len(dataloader)
pipeline.unet.train()
for epoch in range(num_train_epochs):
    for step, batch in enumerate(dataloader):
        images = batch["image"].to(device=device, dtype=torch.float32)
        captions = list(batch["caption"])
        batch_size = images.shape[0]

        # VAE and text encoders are frozen: no autograd graph needed.
        with torch.no_grad():
            latents = pipeline.vae.encode(images).latent_dist.sample()
            latents = latents * pipeline.vae.config.scaling_factor
            # Convert captions to embeddings (handles >77 tokens)
            prompt_embeds, pooled_prompt_embeds = compel(captions)

        # Forward diffusion: one independent random timestep per sample.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (batch_size,),
            device=device,
        ).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
            model_pred = pipeline.unet(
                noisy_latents.to(weight_dtype),
                timesteps,
                encoder_hidden_states=prompt_embeds,
                added_cond_kwargs={
                    "text_embeds": pooled_prompt_embeds,
                    "time_ids": add_time_ids.repeat(batch_size, 1),
                },
            ).sample

        # SDXL base is an epsilon-prediction model: the target is the noise.
        loss = nn.functional.mse_loss(model_pred.float(), noise.float(), reduction="mean")

        # Divide so the accumulated gradient is the mean over micro-batches.
        scaler.scale(loss / gradient_accumulation_steps).backward()

        # Also step on the last batch so leftover gradients are not dropped
        # (or leaked into the next epoch) when the batch count is not a
        # multiple of gradient_accumulation_steps.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

    print(f"Epoch {epoch+1}/{num_train_epochs} complete")

# --- Save LoRA weights ---
os.makedirs(output_dir, exist_ok=True)
pipeline.unet.save_pretrained(output_dir)
print(f"✅ LoRA weights saved to {output_dir}")

In [None]:
import torch
from diffusers import StableDiffusionPipeline, DDPMScheduler
from peft import LoraConfig, get_peft_model
from torchvision import transforms
from torch.utils.data import DataLoader

# LoRA hyper-parameters: rank-16 adapters (alpha 32, 5% dropout, no bias
# training) on the attention query and value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["to_q", "to_v"],
)

# Image preprocessing: bicubic resize of the short side to 320 px, centre
# crop to 320x320, convert to a tensor, then normalise with mean 0.5 and
# std 0.5 per channel.
train_transforms = transforms.Compose(
    [
        transforms.Resize(size=320, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(size=320),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Stable Diffusion v1.5 in half precision, plus the DDPM scheduler stored in
# the same checkpoint's "scheduler" subfolder.
model_id = "runwayml/stable-diffusion-v1-5"
pipeline = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
noise_scheduler = DDPMScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
)

# Wrap the UNet so that it carries the LoRA adapters configured above.
pipeline.unet = get_peft_model(pipeline.unet, lora_config)

# 4. Training step: noise-prediction objective.
# Call train_step(batch) once per batch inside your training loop.
def train_step(batch):
    """Compute the noise-prediction MSE loss for one training batch.

    Uses the module-level ``pipeline`` (VAE, text encoder, LoRA-wrapped UNet)
    and ``noise_scheduler``.

    Args:
        batch: Mapping with ``"pixel_values"`` (image tensor fed to the VAE
            encoder) and ``"input_ids"`` (token ids fed to the text encoder).
            Both are assumed to have the batch on dimension 0 — TODO confirm
            against the collate function.

    Returns:
        Scalar float32 tensor: mean squared error between the UNet output and
        the noise that was added to the latents.
    """
    vae = pipeline.vae

    # The VAE and text encoder are not being trained here (only the UNet
    # carries LoRA adapters), so skip building an autograd graph for them.
    with torch.no_grad():
        # Match the VAE's device/dtype; the pipeline may be loaded in half
        # precision while the dataloader yields float32 images.
        pixel_values = batch["pixel_values"].to(device=vae.device, dtype=vae.dtype)
        latents = vae.encode(pixel_values).latent_dist.sample()
        # Use the checkpoint's own latent scale instead of a hard-coded
        # 0.18215 so the step stays correct for other VAEs.
        latents = latents * vae.config.scaling_factor

        # Text conditioning using CLIP (limited to 77 tokens).
        # Assumes the text encoder lives on the same device as the VAE.
        input_ids = batch["input_ids"].to(latents.device)
        encoder_hidden_states = pipeline.text_encoder(input_ids)[0]

    # Forward diffusion: draw one independent timestep per sample, created on
    # the latents' device so the UNet's timestep embedding does not mix
    # devices.
    noise = torch.randn_like(latents)
    timesteps = torch.randint(
        0,
        noise_scheduler.config.num_train_timesteps,
        (latents.shape[0],),
        device=latents.device,
    ).long()
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

    # Predict the noise and compare in float32 for a stable loss value.
    model_pred = pipeline.unet(noisy_latents, timesteps, encoder_hidden_states).sample
    loss = torch.nn.functional.mse_loss(model_pred.float(), noise.float(), reduction="mean")
    return loss