## SDXL training 

In [None]:
import os

import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import (
    DDPMScheduler,
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
)
from peft import LoraConfig, get_peft_model
from PIL import Image
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# --- Dataset Loader ---
class ImageCaptionDataset(Dataset):
    """Pairs every image in a directory with an optional same-named caption.

    An image ``foo.png`` in ``images_dir`` is matched with
    ``captions_dir/foo.txt``; a missing caption file yields an empty string.

    Args:
        images_dir: Directory that is scanned (non-recursively) for images.
        captions_dir: Directory holding the ``<image stem>.txt`` captions.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.
    """

    # Same extensions the captioning cell of this notebook accepts.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, images_dir, captions_dir, transform=None):
        self.images_dir = images_dir
        self.captions_dir = captions_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable so a seeded shuffle is reproducible.
        self.image_files = sorted(
            name
            for name in os.listdir(images_dir)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )

    def __len__(self):
        """Return the number of images found in ``images_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "caption": ...}`` for the image at ``idx``."""
        img_file = self.image_files[idx]
        base = os.path.splitext(img_file)[0]
        caption_file = os.path.join(self.captions_dir, base + ".txt")

        # convert() returns an independent in-memory copy, so the file handle
        # can be released as soon as the conversion is done.
        with Image.open(os.path.join(self.images_dir, img_file)) as handle:
            image = handle.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        # A missing caption is allowed and maps to the empty prompt.
        try:
            with open(caption_file, "r", encoding="utf-8") as f:
                caption = f.read().strip()
        except FileNotFoundError:
            caption = ""

        return {"image": image, "caption": caption}

# --- Parameters ---
# Seed the global PyTorch RNG so runs of this cell are repeatable.
seed = 42
torch.manual_seed(seed)

# Base checkpoint to fine-tune.
pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"

# Filesystem layout: training images, their captions, and where the trained
# adapter is written.
images_dir = "/home/jovyan/lora_dataset/instance_images"
captions_dir = "/home/jovyan/lora_dataset/instance_captions"
output_dir = "/home/jovyan/lora_dreambooth_model"

# Optimisation hyper-parameters.
num_train_epochs = 5
learning_rate = 5e-5
train_batch_size = 1
gradient_accumulation_steps = 2

# Training image side length in pixels.
resolution = 512

# --- Device and precision ---
# Training needs every sub-model resident on one device, so the
# inference-only `enable_model_cpu_offload()` hooks are not used here.
device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"
weight_dtype = torch.float16 if use_amp else torch.float32

# --- Load pipeline ---
# The SDXL checkpoint ships two tokenizers / text encoders and has to be
# loaded with the XL pipeline class, not the SD 1.x one.
pipeline = StableDiffusionXLPipeline.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=weight_dtype,
    variant="fp16",
)
pipeline.to(device)
# The stock SDXL VAE is prone to overflow in fp16; keep it in fp32.
pipeline.vae.to(dtype=torch.float32)

# Scheduler used only to noise the latents during training.
noise_scheduler = DDPMScheduler.from_pretrained(
    pretrained_model_name_or_path, subfolder="scheduler"
)

# Only the LoRA adapter is trained; everything else is a frozen feature
# extractor.
for frozen_module in (pipeline.vae, pipeline.text_encoder, pipeline.text_encoder_2):
    frozen_module.requires_grad_(False)
    frozen_module.eval()

# --- Compel for long captions ---
# SDXL needs both encoders plus the pooled output of the second one.
# `truncate_long_prompts=False` is what actually lets captions exceed the
# 77-token CLIP window (Compel truncates by default).
# NOTE(review): Compel parses its own prompt-weighting syntax, so captions
# containing characters such as "(", ")", "+" or "-" may be reinterpreted.
compel = Compel(
    tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2],
    text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
    truncate_long_prompts=False,
    device=device,
)

# --- LoRA config ---
# No `task_type`: PEFT's task types describe transformers heads and "UNET" is
# not one of them; omitting it yields the generic PeftModel wrapper.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q", "to_v"],  # attention query / value projections
    lora_dropout=0.05,
    bias="none",
)

# Attach LoRA to UNet (base weights are frozen by get_peft_model).
pipeline.unet = get_peft_model(pipeline.unet, lora_config)

# Keep the trainable adapter weights in fp32 so AdamW and the gradient
# scaler work even though the frozen base weights are fp16.
trainable_params = [p for p in pipeline.unet.parameters() if p.requires_grad]
for param in trainable_params:
    param.data = param.data.to(torch.float32)

# --- Dataset + DataLoader ---
# The default collate function cannot batch PIL images, so convert to
# tensors normalised to [-1, 1], the range the VAE expects.
image_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)
dataset = ImageCaptionDataset(images_dir, captions_dir, transform=image_transforms)
dataloader = DataLoader(dataset, batch_size=train_batch_size, shuffle=True)

# --- Optimizer ---
optimizer = optim.AdamW(trainable_params, lr=learning_rate)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# SDXL micro-conditioning: (original_size, crop_top_left, target_size).
# Simplification: every sample is described as an uncropped square image.
add_time_ids = torch.tensor(
    [[resolution, resolution, 0, 0, resolution, resolution]],
    device=device,
    dtype=weight_dtype,
)

# --- Training Loop ---
pipeline.unet.train()
num_batches = len(dataloader)
for epoch in range(num_train_epochs):
    # Start each epoch from clean gradients.
    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(dataloader):
        pixel_values = batch["image"].to(device=device, dtype=torch.float32)
        captions = list(batch["caption"])
        batch_size = pixel_values.shape[0]

        with torch.no_grad():
            # Images -> scaled VAE latents.
            latents = pipeline.vae.encode(pixel_values).latent_dist.sample()
            latents = (latents * pipeline.vae.config.scaling_factor).to(weight_dtype)
            # Captions -> (token embeddings, pooled embedding).
            conditioning, pooled = compel(captions)

        # Standard diffusion objective: noise the latents at a random
        # timestep per sample and train the UNet to recover the noise.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (batch_size,),
            device=device,
        ).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        if noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)
        else:
            target = noise

        with torch.autocast(device_type=device, enabled=use_amp):
            model_pred = pipeline.unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states=conditioning.to(device=device, dtype=weight_dtype),
                added_cond_kwargs={
                    "text_embeds": pooled.to(device=device, dtype=weight_dtype),
                    "time_ids": add_time_ids.repeat(batch_size, 1),
                },
            ).sample

        loss = nn.functional.mse_loss(model_pred.float(), target.float())

        # Scale so the accumulated gradient is an average, not a sum.
        scaler.scale(loss / gradient_accumulation_steps).backward()

        # Also step on the final batch so a partial accumulation window is
        # applied instead of leaking into the next epoch.
        is_last_batch = step + 1 == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

    print(f"Epoch {epoch+1}/{num_train_epochs} complete")

# --- Save LoRA weights ---
os.makedirs(output_dir, exist_ok=True)
pipeline.unet.save_pretrained(output_dir)
print(f"✅ LoRA weights saved to {output_dir}")

In [None]:
import torch
from diffusers import StableDiffusionPipeline, DDPMScheduler
from peft import LoraConfig, get_peft_model
from torchvision import transforms
from torch.utils.data import DataLoader

# 1. Preprocessing: bicubic resize of the shorter side to 320 px, centre crop
#    to 320x320, then map pixel values from [0, 1] to [-1, 1].
train_transforms = transforms.Compose(
    [
        transforms.Resize(
            320,
            interpolation=transforms.InterpolationMode.BICUBIC,
        ),
        transforms.CenterCrop(320),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# 2. Load SD v1.5: the training noise scheduler and the fp16 pipeline.
model_id = "runwayml/stable-diffusion-v1-5"
noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
pipeline = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)

# 3. LoRA configuration: rank-16 adapters on the attention query and value
#    projections, with light dropout and no bias terms trained.
lora_config = LoraConfig(
    target_modules=["to_q", "to_v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the UNet so that only the adapter weights are trainable.
pipeline.unet = get_peft_model(pipeline.unet, lora_config)

# 4. Training step with noise prediction
# (Call this from inside the training loop.)
def train_step(batch):
    """Compute the noise-prediction loss for one batch.

    Args:
        batch: Mapping with a ``"pixel_values"`` image tensor and an
            ``"input_ids"`` token tensor. Assumed to be normalised images and
            CLIP token ids truncated to 77 tokens -- confirm against the
            collate function that builds the batch.

    Returns:
        Scalar tensor: mean squared error between the UNet's prediction and
        the noise that was added to the latents.
    """
    vae = pipeline.vae
    text_encoder = pipeline.text_encoder

    # The VAE and text encoder are frozen feature extractors here; running
    # them without autograd avoids building graphs that are never optimised.
    with torch.no_grad():
        # Convert images to latents using the VAE.
        pixel_values = batch["pixel_values"].to(device=vae.device, dtype=vae.dtype)
        latents = vae.encode(pixel_values).latent_dist.sample()
        # Read the latent scale from the checkpoint config instead of
        # hard-coding it.
        latents = latents * vae.config.scaling_factor

        # Text conditioning using CLIP (limited to 77 tokens).
        input_ids = batch["input_ids"].to(text_encoder.device)
        encoder_hidden_states = text_encoder(input_ids)[0]

    # Add noise according to the scheduler, one random timestep per sample,
    # created on the same device as the latents.
    noise = torch.randn_like(latents)
    timesteps = torch.randint(
        0,
        noise_scheduler.config.num_train_timesteps,
        (latents.shape[0],),
        device=latents.device,
    ).long()
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

    # Predict the noise and calculate the MSE loss in fp32.
    model_pred = pipeline.unet(noisy_latents, timesteps, encoder_hidden_states).sample
    loss = torch.nn.functional.mse_loss(model_pred.float(), noise.float(), reduction="mean")
    return loss

In [None]:
# NOTE: lines starting with `!` are IPython shell escapes, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
print("--- Starting Captioned LoRA Training (Report Methodology) ---")

# --- 1. Library Installations ---
print("\n--- Installing/Updating necessary libraries ---")
!pip install -q Pillow transformers accelerate bitsandbytes xformers peft
# Replace any installed diffusers with the current GitHub version; presumably
# so the library matches the training script fetched from `main` below.
!pip uninstall -y -q diffusers
!pip install -q git+https://github.com/huggingface/diffusers
print("--- Library installation complete ---")

# --- 2. Download Text-to-Image LoRA Script ---
# The report approach requires the text_to_image script, NOT the dreambooth script.
print("\n--- Downloading Text-to-Image LoRA training script ---")
script_url = "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/text_to_image/train_text_to_image_lora.py"
script_name = "train_text_to_image_lora.py"
# `os` is used by the later sections of this cell (paths, directory listing).
import os
# Save the script into the current working directory under `script_name`.
!wget -q -O {script_name} {script_url}
print(f"Script '{script_name}' downloaded.")

# --- 3. Loading BLIP for Captioning ---
print("\n--- Loading BLIP model for unique captioning ---")
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import json
import shutil

# Captioning is inference only, so fall back to CPU when no GPU is visible
# instead of failing on a hard-coded "cuda".
caption_device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(caption_device)

# --- 4. Data Preparation & Metadata Generation ---
# This part implements the "Automated Annotation Pipeline" from the report:
# caption every image, copy it into the dataset folder and record the
# (file_name, text) pair in metadata.jsonl for the training script.
print("\n--- Generating Unique Captions & Metadata ---")
source_dir = "/content"
dataset_dir = "/content/lora_dataset"
os.makedirs(dataset_dir, exist_ok=True)

# Find images directly inside the source directory; sorted so the captioning
# order and metadata.jsonl are the same on every run.
image_exts = ('.jpg', '.jpeg', '.png')
image_paths = sorted(f for f in os.listdir(source_dir) if f.lower().endswith(image_exts))
if not image_paths:
    print(f"WARNING: no images with extensions {image_exts} found in {source_dir}")

metadata = []
# Text prompt handed to BLIP together with each image.
# NOTE(review): with a text prompt BLIP continues the prompt, so the decoded
# caption presumably begins with this context string -- confirm that is the
# intended caption format.
mugshot_context = "a detailed mugshot photo of a person, identification portrait, facial features"

for img_name in image_paths:
    img_path = os.path.join(source_dir, img_name)
    # convert() returns an in-memory copy, so the file can be closed at once.
    with Image.open(img_path) as img_handle:
        raw_image = img_handle.convert("RGB")

    # Generate caption
    inputs = processor(raw_image, mugshot_context, return_tensors="pt").to(caption_device)
    out = model.generate(**inputs, max_new_tokens=70) # Report suggests ~73 token limit
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Copy image to dataset folder (the original file is left in place)
    dest_path = os.path.join(dataset_dir, img_name)
    shutil.copy(img_path, dest_path)

    # Add to metadata list (Requirement for text-to-image script)
    metadata.append({"file_name": img_name, "text": caption})
    print(f" - Captioned {img_name}: {caption}")

# Write metadata.jsonl: one JSON object per line
with open(os.path.join(dataset_dir, "metadata.jsonl"), "w", encoding="utf-8") as f:
    for entry in metadata:
        json.dump(entry, f)
        f.write("\n")

print(f"--- Dataset ready with {len(metadata)} captioned images ---")

# --- 5. Training Parameters (Aligned with Report) ---
print("\n--- Defining LoRA Training Parameters ---")
pretrained_model = "runwayml/stable-diffusion-v1-5"
output_dir = "lora_face_model"

accelerate_command = (
    f"accelerate launch {script_name} "
    f"--pretrained_model_name_or_path='{pretrained_model}' "
    f"--train_data_dir='{dataset_dir}' "
    f"--caption_column='text' " # Tells script to look at the 'text' key in jsonl
    f"--resolution=512 "       # You can use 320 as per report, but 512 is standard
    f"--random_flip "
    f"--train_batch_size=1 "
    f"--gradient_accumulation_steps=4 "
    f"--max_train_steps=1000 "
    f"--learning_rate=1e-4 "    # Report uses higher LR for LoRA
    f"--lr_scheduler='constant' "
    f"--lr_warmup_steps=0 "
    f"--seed=42 "
    f"--output_dir='{output_dir}' "
    f"--mixed_precision='fp16' "
    f"--enable_xformers_memory_efficient_attention "
    f"--checkpointing_steps=500 "
)

# --- 6. Execute Training ---
print("\n--- Executing LoRA Training ---")
!{accelerate_command}

# --- 7. Inference ---
# Load the base model, apply the freshly trained LoRA weights if they exist,
# and render one test image.
print("\n--- Starting Inference with Trained LoRA ---")
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(pretrained_model, torch_dtype=torch.float16).to("cuda")
lora_weight_path = os.path.join(output_dir, "pytorch_lora_weights.safetensors")

if os.path.exists(lora_weight_path):
    pipe.load_lora_weights(lora_weight_path)
    print("LoRA weights loaded.")
else:
    # Make the fallback visible: without this message the test image would
    # silently come from the unmodified base model.
    print(
        f"WARNING: LoRA weights not found at '{lora_weight_path}'; "
        "generating with the base model only."
    )

test_prompt = "a detailed mugshot photo of a person with dark hair and a serious expression"
image = pipe(test_prompt, num_inference_steps=30).images[0]
image.save("test_output.png")
image.show()