In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The `google.colab` package is imported here, in the same cell that uses it;
# the call may prompt for authorisation the first time it runs in a session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
from pathlib import Path

import numpy as np
import torch
from accelerate import Accelerator
from datasets import load_dataset
from diffusers import AutoencoderKL
from diffusers import StableDiffusionPipeline, DDPMScheduler
from diffusers import UNet2DConditionModel
from peft import LoraConfig, get_peft_model
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

# ---- Config ---- #
# Hub identifier of the base checkpoint that every sub-model is loaded from.
pretrained_model_name = "runwayml/stable-diffusion-v1-5"

# Locations on the mounted Google Drive: training inputs and training outputs.
dataset_dir = "/content/drive/MyDrive/CDAC Project/dataset"
output_dir = "/content/drive/MyDrive/CDAC Project/lora_output"

# Data pipeline: images are resized to image_size x image_size.
image_size = 512
batch_size = 1

# Optimisation schedule. The training loop runs `train_steps` outer iterations
# and writes a checkpoint every `checkpoint_interval` of them.
learning_rate = 1e-4
train_steps = 100
checkpoint_interval = 20

# Rank of the LoRA adapter matrices; tune to trade capacity against size.
lora_rank = 64

# Single seed shared by all random number generators below.
seed = 42

# ---- Seed ---- #
# Seed Python, NumPy and PyTorch so that repeated runs draw the same numbers.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# ---- Dataset ---- #
class ImageCaptionDataset(Dataset):
    """Image/caption pairs read from a flat folder.

    Every image file in ``folder`` that has a sibling ``<same stem>.txt`` file
    becomes one sample; the text file's stripped contents are the caption.
    Images without a caption file are skipped.

    Args:
        folder: Directory containing the images and their ``.txt`` captions.
        tokenizer: Callable tokenizer; it is invoked with ``padding``,
            ``max_length``, ``truncation`` and ``return_tensors`` keyword
            arguments and must expose ``model_max_length`` and return an
            object with an ``input_ids`` attribute.
        size: Edge length images are resized to (square). When ``None`` the
            module-level ``image_size`` setting is used.
    """

    # Lower-case file suffixes treated as images (matched case-insensitively).
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, folder, tokenizer, size=None):
        self.tokenizer = tokenizer
        self.image_paths = []
        self.captions = []
        for image_path, txt_path in self._find_pairs(folder):
            self.image_paths.append(image_path)
            with open(txt_path, "r", encoding="utf-8") as f:
                self.captions.append(f.read().strip())

        if size is None:
            # Fall back to the notebook-wide setting from the config section.
            size = image_size
        self.transform = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
            # Map [0, 1] pixel values to [-1, 1].
            transforms.Normalize([0.5], [0.5]),
        ])

    @staticmethod
    def _find_pairs(folder):
        """Return ``(image_path, caption_path)`` tuples found in ``folder``.

        File names are visited in sorted order because ``os.listdir`` returns
        entries in arbitrary order, which would make sample indices differ
        between runs even with a fixed random seed.
        """
        pairs = []
        for fname in sorted(os.listdir(folder)):
            if not fname.lower().endswith(ImageCaptionDataset.IMAGE_EXTENSIONS):
                continue
            image_path = os.path.join(folder, fname)
            txt_path = os.path.splitext(image_path)[0] + ".txt"
            if os.path.exists(txt_path):
                pairs.append((image_path, txt_path))
        return pairs

    def __len__(self):
        """Number of image/caption pairs discovered at construction time."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, transform and tokenize sample ``idx``.

        Returns:
            dict with ``"pixel_values"`` (the transformed image) and
            ``"input_ids"`` (the first row of the tokenizer's ``input_ids``).
        """
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied by convert(), instead of waiting for GC.
        with Image.open(self.image_paths[idx]) as img:
            image = self.transform(img.convert("RGB"))
        caption = self.tokenizer(
            self.captions[idx],
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        ).input_ids[0]
        return {"pixel_values": image, "input_ids": caption}

# ---- Load Components ---- #
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name, subfolder="text_encoder")
unet = UNet2DConditionModel.from_pretrained(pretrained_model_name, subfolder="unet")
# Load just the VAE sub-model. Building a whole StableDiffusionPipeline to
# grab `.vae` would instantiate a second UNet and text encoder as well.
vae = AutoencoderKL.from_pretrained(pretrained_model_name, subfolder="vae")
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name, subfolder="scheduler")

# Only the LoRA adapter is trained; the text encoder and VAE are fixed feature
# extractors, so disable their gradients and put them in eval mode.
text_encoder.requires_grad_(False)
vae.requires_grad_(False)
text_encoder.eval()
vae.eval()

# Apply LoRA to UNet (adapters on the attention query/key/value projections)
lora_config = LoraConfig(r=lora_rank, lora_alpha=lora_rank * 2, target_modules=["to_q", "to_k", "to_v"], bias="none")
unet = get_peft_model(unet, lora_config)

# ---- Dataset & Dataloader ---- #
train_dataset = ImageCaptionDataset(dataset_dir, tokenizer)
if len(train_dataset) == 0:
    # Fail early with an actionable message instead of training on nothing.
    raise ValueError(f"No image/caption pairs found in {dataset_dir!r}")
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# ---- Optimizer ---- #
# Hand the optimizer only the parameters that actually receive gradients.
trainable_params = [param for param in unet.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)

# ---- Accelerator Setup ---- #
accelerator = Accelerator()
# Only objects that take part in optimisation go through prepare(); the frozen
# models are simply moved to the training device.
unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)
text_encoder.to(accelerator.device)
vae.to(accelerator.device)


# ---- Training ---- #
# NOTE: each outer iteration runs over the whole dataloader, so `train_steps`
# counts full passes over the dataset, not individual optimizer updates.
unet.train()
progress_bar = tqdm(range(train_steps))
for step in progress_bar:
    for batch in train_dataloader:
        pixel_values = batch["pixel_values"].to(accelerator.device)
        input_ids = batch["input_ids"].to(accelerator.device)

        # The conditioning and latents come from frozen models, so no autograd
        # graph is needed for them.
        with torch.no_grad():
            # Encode text
            encoder_hidden_states = text_encoder(input_ids)[0]

            # Encode image -> latents, scaled by the factor stored in the VAE config
            latents = vae.encode(pixel_values).latent_dist.sample() * vae.config.scaling_factor

        # Sample noise and one random timestep per batch element
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=latents.device).long()

        # Noise the latents for the sampled timesteps and predict the noise
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        loss = torch.nn.functional.mse_loss(model_pred, noise)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

    # Show the most recent batch loss next to the progress bar.
    progress_bar.set_postfix(loss=loss.item())

    if step % checkpoint_interval == 0 or step == train_steps - 1:
        accelerator.save_state(os.path.join(output_dir, f"checkpoint-{step}"))

# ---- Save final LoRA weights ---- #
# Unwrap first so save_pretrained() is called on the PEFT model itself even if
# prepare() wrapped it.
accelerator.wait_for_everyone()
accelerator.unwrap_model(unet).save_pretrained(output_dir)
print("✅ Training complete. LoRA saved at:", output_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]