# Download all models

In [None]:
from huggingface_hub import snapshot_download
import os

# Output dir
# Used as the Hugging Face cache directory for every download below. Later
# cells hard-code snapshot paths underneath it, so changing this value means
# updating those paths too.
OUTPUT_DIR = "/kaggle/working/models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Models
# Hugging Face Hub repository ids fetched by the download loop in this cell.
models = [
    "ritishshrirao/controlnet-coco-bbox-filled",
    "ritishshrirao/controlnet-coco-multi",
    "ritishshrirao/animatediff-controlnet-coco-segmentation",
    "ritishshrirao/Controlnet_SD1.5_coco_segmentation"
]

# Download
def download_model(repo_id, output_dir):
    """Download a complete snapshot of a Hugging Face Hub repository.

    Args:
        repo_id: Hub repository id, e.g. ``"user/name"``.
        output_dir: Directory handed to ``snapshot_download`` as its cache
            directory.

    Returns:
        The local path reported by ``snapshot_download``.
    """
    print(f"\nDownloading repository: {repo_id}")
    # force_download=True re-fetches the files even if a cached copy exists.
    request = {
        "repo_id": repo_id,
        "cache_dir": output_dir,
        "force_download": True,
    }
    snapshot_path = snapshot_download(**request)
    print(f"Downloaded to: {snapshot_path}")
    return snapshot_path

# Map each repository id to the local snapshot directory returned for it.
downloaded_dirs = {}
for model in models:
    local_path = download_model(model, OUTPUT_DIR)
    downloaded_dirs[model] = local_path

# Reached only if every download above returned without raising.
print("\nAll models downloaded successfully!")

# Module imports and common setup

In [None]:
import os
import torch
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
from torchvision import transforms
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel
)
from transformers import AutoTokenizer, CLIPTextModel
from diffusers import AutoencoderKL, UNet2DConditionModel

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only selected together with CUDA. The CPU fallback uses
# float32, matching the multi-ControlNet cell, because float16 inference on
# CPU is poorly supported.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# COCO paths
# Validation images and instance annotations under the Kaggle COCO 2017 mount.
COCO_ROOT = "/kaggle/input/coco-2017-dataset/coco2017"
IMG_DIR = os.path.join(COCO_ROOT, "val2017")
ANN_FILE = os.path.join(COCO_ROOT, "annotations/instances_val2017.json")

# Hub id of the Stable Diffusion 1.5 base model.
BASE_MODEL_ID = "runwayml/stable-diffusion-v1-5"

# Image transforms
# RGB images: resize, centre-crop to 512x512, then convert to a tensor.
image_transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(512),
    transforms.ToTensor()
])

# Conditioning images: same geometry, but resized with nearest-neighbour
# interpolation (presumably so colour-coded label maps are not blended at
# region borders - TODO confirm against the training code).
cond_transform = transforms.Compose([
    transforms.Resize(512, interpolation=transforms.InterpolationMode.NEAREST),
    transforms.CenterCrop(512),
    transforms.ToTensor()
])

# Segmentation controlnet demo

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pycocotools.coco import COCO
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UNet2DConditionModel
from safetensors.torch import load_file

# Config
# Device and precision are hard-coded: this cell assumes a CUDA GPU.
DEVICE = "cuda"
DTYPE = torch.float16

BASE_MODEL = "runwayml/stable-diffusion-v1-5"
# Snapshot directory inside the local Hub cache; the trailing hash pins one
# specific revision of the repository.
CONTROLNET_DIR = "/kaggle/working/models/models--ritishshrirao--Controlnet_SD1.5_coco_segmentation/snapshots/ecc3bbd29715db6a3a972dbb07e2e2429d3e3a1c"

# COCO val2017 images and their instance annotations.
IMG_DIR = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
ANN_FILE = "/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_val2017.json"

NUM_SAMPLES = 8   # number of annotated images drawn at random
BATCH_SIZE = 8    # images sent to the pipeline per call
RESOLUTION = 512  # side length of conditioning and ground-truth images

# The base UNet is loaded only to derive a ControlNet with a matching layout.
print("Loading base UNet...")
unet = UNet2DConditionModel.from_pretrained(BASE_MODEL, subfolder="unet")

print("Building ControlNet from UNet...")
controlnet = ControlNetModel.from_unet(unet)

# Load checkpoint manually
ckpt_path = os.path.join(CONTROLNET_DIR, "diffusion_pytorch_model.safetensors")
state_dict = load_file(ckpt_path, device="cpu")
# For keys that start with "module.", remove that substring (presumably a
# DataParallel/DDP wrapper prefix left over from training - TODO confirm).
cleaned = {k.replace("module.", "") if k.startswith("module.") else k: v for k, v in state_dict.items()}

# strict=False tolerates key mismatches, so the counts printed below are the
# only indication of whether the checkpoint actually matched the model.
missing, unexpected = controlnet.load_state_dict(cleaned, strict=False)
print(f"Missing keys: {len(missing)}, Unexpected keys: {len(unexpected)}")

controlnet = controlnet.to(DEVICE, dtype=DTYPE)

# Stable Diffusion pipeline with the ControlNet attached and the safety
# checker disabled.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    BASE_MODEL,
    controlnet=controlnet,
    safety_checker=None,
    torch_dtype=DTYPE
).to(DEVICE)
pipe.set_progress_bar_config(disable=True)

def build_prompt(coco, anns):
    """Build a text prompt naming every distinct category present in ``anns``.

    Args:
        coco: Object exposing ``loadCats(ids)`` that returns category dicts
            with a ``"name"`` key.
        anns: Annotation dicts, each carrying a ``"category_id"``.

    Returns:
        A prompt listing the unique category names in alphabetical order, or
        a generic prompt when there are no names.
    """
    category_ids = [ann["category_id"] for ann in anns]
    unique_names = sorted({cat["name"] for cat in coco.loadCats(category_ids)})
    if not unique_names:
        return "A photorealistic image"
    return f"A photorealistic image containing {', '.join(unique_names)}"

def build_seg_map(coco, img_info, anns):
    """Render annotations as a colour-coded RGB segmentation image.

    Args:
        coco: Object exposing ``annToMask(ann)``.
        img_info: Image record providing ``"height"`` and ``"width"``.
        anns: Annotation dicts with ``"area"`` and ``"category_id"``.

    Returns:
        A PIL image in which each annotated pixel carries a colour derived
        from its category id; unannotated pixels stay black.
    """
    canvas = np.zeros((img_info["height"], img_info["width"], 3), dtype=np.uint8)
    # Paint large regions first so that smaller ones drawn later stay visible.
    largest_first = sorted(anns, key=lambda ann: ann["area"], reverse=True)
    for ann in largest_first:
        category = ann["category_id"]
        rgb = tuple((category * factor) % 255 for factor in (37, 17, 29))
        canvas[coco.annToMask(ann) == 1] = rgb
    return Image.fromarray(canvas)

coco = COCO(ANN_FILE)
# Keep only images that have at least one annotation.
img_ids = [i for i in coco.getImgIds() if len(coco.getAnnIds(imgIds=i)) > 0]
# Unseeded draw: a different set of images is selected on every run.
selected_ids = np.random.choice(img_ids, NUM_SAMPLES, replace=False)

# One (conditioning image, ground truth, prediction, prompt) tuple per sample.
results = []

for start in range(0, NUM_SAMPLES, BATCH_SIZE):
    batch_ids = selected_ids[start:start + BATCH_SIZE]

    prompts, cond_imgs, gt_imgs = [], [], []

    for img_id in batch_ids:
        # int(...) converts the NumPy integer produced by np.random.choice.
        info = coco.loadImgs(int(img_id))[0]
        img = Image.open(os.path.join(IMG_DIR, info["file_name"])).convert("RGB")
        anns = coco.loadAnns(coco.getAnnIds(imgIds=int(img_id)))

        # Nearest-neighbour resize keeps the label colours unblended. The
        # resize targets a square directly, so the aspect ratio is not kept.
        seg_map = build_seg_map(coco, info, anns).resize((RESOLUTION, RESOLUTION), Image.NEAREST)

        prompts.append(build_prompt(coco, anns))
        cond_imgs.append(seg_map)
        gt_imgs.append(img.resize((RESOLUTION, RESOLUTION)))

    # The whole batch is generated in a single pipeline call.
    with torch.autocast("cuda", dtype=DTYPE):
        preds = pipe(
            prompt=prompts,
            image=cond_imgs,
            num_inference_steps=25,
            controlnet_conditioning_scale=1.0
        ).images

    for i in range(len(preds)):
        results.append((cond_imgs[i], gt_imgs[i], preds[i], prompts[i]))

# One row per sample: conditioning map, ground truth, generated image.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(
    len(results), 3, figsize=(15, 5 * len(results)), squeeze=False
)

for i, (cond, gt, pred, prompt) in enumerate(results):
    # The generated image is titled with the first 60 characters of its prompt.
    columns = (
        (cond, "Segmentation Map"),
        (gt, "Ground Truth"),
        (pred, prompt[:60]),
    )
    for ax, (picture, title) in zip(axes[i], columns):
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis("off")

plt.tight_layout()
plt.show()

# Bounding box controlnet demo

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from pycocotools.coco import COCO
import hashlib
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler

# Device and precision are hard-coded: this cell assumes a CUDA GPU.
DEVICE = "cuda"
DTYPE = torch.float16

BASE_MODEL = "runwayml/stable-diffusion-v1-5"
# Snapshot directory inside the local Hub cache; the trailing hash pins one
# specific revision of the repository.
CONTROLNET_PATH = "/kaggle/working/models/models--ritishshrirao--controlnet-coco-bbox-filled/snapshots/af6076a87396d1a9437d2b02cc0a6add2298920c"

# COCO val2017 images and their instance annotations.
IMG_DIR = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
ANN_FILE = "/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_val2017.json"

NUM_SAMPLES = 8   # number of annotated images drawn at random
BATCH_SIZE = 8    # images sent to the pipeline per call
RESOLUTION = 512  # side length of conditioning and ground-truth images

# Load the bounding-box ControlNet directly from its snapshot directory.
controlnet = ControlNetModel.from_pretrained(CONTROLNET_PATH, torch_dtype=DTYPE).to(DEVICE)

# Stable Diffusion pipeline with the ControlNet attached and the safety
# checker disabled.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    BASE_MODEL,
    controlnet=controlnet,
    safety_checker=None,
    torch_dtype=DTYPE
).to(DEVICE)

# Replace the pipeline's default scheduler with UniPC, reusing its config.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=True)

def color_map(coco):
    """Assign every category a deterministic RGB colour.

    The colour is made of the first three bytes of the MD5 digest of the
    category id written as a decimal string.

    Args:
        coco: Object exposing ``getCatIds()`` and ``loadCats(ids)``.

    Returns:
        Dict mapping category id to an ``(r, g, b)`` tuple of ints in 0..255.
    """
    def _rgb_for(category_id):
        digest = hashlib.md5(str(category_id).encode()).digest()
        return tuple(digest[:3])

    categories = coco.loadCats(coco.getCatIds())
    return {cat["id"]: _rgb_for(cat["id"]) for cat in categories}

def draw_bbox(img_size, anns, cmap):
    """Draw every annotation's bounding box as a filled rectangle.

    Args:
        img_size: ``(width, height)`` of the canvas to create.
        anns: Annotation dicts with ``"area"``, ``"bbox"`` given as
            ``[x, y, width, height]``, and ``"category_id"``.
        cmap: Dict mapping category id to a fill colour.

    Returns:
        An RGB PIL image holding the filled boxes.
    """
    width, height = img_size
    canvas = Image.new("RGB", (width, height))
    painter = ImageDraw.Draw(canvas)
    # Large boxes go down first so smaller ones drawn later remain visible.
    largest_first = sorted(anns, key=lambda ann: ann["area"], reverse=True)
    for ann in largest_first:
        left, top, box_w, box_h = ann["bbox"]
        corners = [left, top, left + box_w, top + box_h]
        painter.rectangle(corners, fill=cmap[ann["category_id"]])
    return canvas

def build_prompt(coco, anns):
    """Return a prompt that lists the distinct category names found in ``anns``.

    Names are de-duplicated and sorted alphabetically; a generic prompt is
    returned when no category name is available.
    """
    categories = coco.loadCats([ann["category_id"] for ann in anns])
    labels = sorted(set(cat["name"] for cat in categories))
    prompt = "A photorealistic image"
    if labels:
        prompt = f"A photorealistic image containing {', '.join(labels)}"
    return prompt

coco = COCO(ANN_FILE)
cmap = color_map(coco)
# Keep only images that have at least one annotation.
img_ids = [i for i in coco.getImgIds() if len(coco.getAnnIds(imgIds=i)) > 0]
# Unseeded draw: a different set of images is selected on every run.
selected = np.random.choice(img_ids, NUM_SAMPLES, replace=False)

# One (conditioning image, ground truth, prediction, prompt) tuple per sample.
results = []

for start in range(0, NUM_SAMPLES, BATCH_SIZE):
    batch_ids = selected[start:start+BATCH_SIZE]

    prompts, cond_imgs, gt_imgs = [], [], []

    for img_id in batch_ids:
        # int(...) converts the NumPy integer produced by np.random.choice.
        info = coco.loadImgs(int(img_id))[0]
        img = Image.open(os.path.join(IMG_DIR, info["file_name"])).convert("RGB")
        anns = coco.loadAnns(coco.getAnnIds(imgIds=int(img_id)))

        # Boxes are drawn at the original image size, then resized with
        # nearest-neighbour so the fill colours are not blended.
        bbox_img = draw_bbox(img.size, anns, cmap).resize((RESOLUTION, RESOLUTION), Image.NEAREST)

        prompts.append(build_prompt(coco, anns))
        cond_imgs.append(bbox_img)
        gt_imgs.append(img.resize((RESOLUTION, RESOLUTION)))

    # The whole batch is generated in a single pipeline call.
    with torch.autocast("cuda"):
        preds = pipe(
            prompt=prompts,
            image=cond_imgs,
            num_inference_steps=20,
            controlnet_conditioning_scale=1.0
        ).images

    for i in range(len(preds)):
        results.append((cond_imgs[i], gt_imgs[i], preds[i], prompts[i]))

# One row per sample: filled-box conditioning, ground truth, generated image.
fig, axes = plt.subplots(len(results), 3, figsize=(15, 5*len(results)))
if len(results) == 1:
    # A single row comes back one-dimensional; restore the row axis.
    axes = axes.reshape(1, -1)

for i, (cond, gt, pred, prompt) in enumerate(results):
    # The generated image is titled with the first 60 characters of its prompt.
    row_titles = ("Filled BBox", "Ground Truth", prompt[:60])
    for ax, picture, title in zip(axes[i], (cond, gt, pred), row_titles):
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis("off")

plt.tight_layout()
plt.show()

# Multi-controlnet demo (Segmentation + bounding box)

In [None]:
# Imports
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from pycocotools.coco import COCO
import hashlib
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    MultiControlNetModel,
    UNet2DConditionModel,
    UniPCMultistepScheduler,
    PNDMScheduler
)
from safetensors.torch import load_file

# Use the GPU in half precision when available; otherwise CPU in float32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# Sub-folders of one pinned snapshot of the multi-ControlNet repository in the
# local Hub cache.
BBOX_PATH = "/kaggle/working/models/models--ritishshrirao--controlnet-coco-multi/snapshots/8963e8536dc8e7c2c0fe16d1eeab179149871994/bbox"
SEG_PATH = "/kaggle/working/models/models--ritishshrirao--controlnet-coco-multi/snapshots/8963e8536dc8e7c2c0fe16d1eeab179149871994/segmentation"

BASE_MODEL = "runwayml/stable-diffusion-v1-5"
# COCO val2017 images and their instance annotations.
IMG_DIR = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
ANN_FILE = "/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_val2017.json"

NUM_SAMPLES = 1   # number of annotated images drawn at random
BATCH_SIZE = 1    # images sent to the pipeline per call
RESOLUTION = 512  # side length of conditioning and ground-truth images

def load_controlnet_robust(path):
    """Load a ControlNet from ``path``, falling back to a raw weight load.

    The regular ``ControlNetModel.from_pretrained`` loader is tried first. If
    it fails for any reason, a ControlNet is derived from the base UNet and
    the first ``.safetensors`` file in the directory (alphabetical order) is
    loaded into it non-strictly.

    Args:
        path: Directory containing the ControlNet, or a file inside that
            directory.

    Returns:
        The ControlNet on ``DEVICE`` with dtype ``DTYPE``.

    Raises:
        FileNotFoundError: If the fallback finds no ``.safetensors`` file.
    """
    if os.path.isfile(path):
        path = os.path.dirname(path)

    try:
        return ControlNetModel.from_pretrained(path, torch_dtype=DTYPE).to(DEVICE)
    except Exception as err:
        # Deliberate best-effort: any loader failure triggers the manual
        # fallback, but the reason is reported instead of being discarded.
        print(f"from_pretrained failed for {path} ({err!r}); loading weights manually.")

    # Sorted so the chosen file does not depend on directory listing order.
    weight_files = sorted(f for f in os.listdir(path) if f.endswith('.safetensors'))
    if not weight_files:
        raise FileNotFoundError(f"No .safetensors file found in {path}")

    unet = UNet2DConditionModel.from_pretrained(BASE_MODEL, subfolder="unet")
    cn = ControlNetModel.from_unet(unet).to(DEVICE, dtype=DTYPE)

    state_dict = load_file(os.path.join(path, weight_files[0]))
    # Strip "module." only where it is a leading prefix; a blanket replace
    # would also mangle keys that contain the substring further in.
    prefix = "module."
    state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    # strict=False tolerates mismatches, so report them: a large count means
    # the checkpoint did not really populate the model.
    missing, unexpected = cn.load_state_dict(state_dict, strict=False)
    print(f"Missing keys: {len(missing)}, Unexpected keys: {len(unexpected)}")
    return cn

controlnet_bbox = load_controlnet_robust(BBOX_PATH)
controlnet_seg = load_controlnet_robust(SEG_PATH)

# Order matters: segmentation first, bounding box second. The conditioning
# images and conditioning scales passed to the pipeline use the same order.
multi_controlnet = MultiControlNetModel([controlnet_seg, controlnet_bbox]).to(DEVICE)

# Stable Diffusion pipeline with both ControlNets attached and the safety
# checker disabled.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    BASE_MODEL,
    controlnet=multi_controlnet,
    safety_checker=None,
    torch_dtype=DTYPE
).to(DEVICE)

# Rebuild the scheduler as PNDM from the pipeline's existing scheduler config.
pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
pipe.set_progress_bar_config(disable=True)

def generate_color_map(coco):
    """Build a deterministic category-id -> RGB palette.

    Each colour consists of the first three bytes of the MD5 hex digest of
    the category id written as a decimal string.

    Args:
        coco: Object exposing ``getCatIds()`` and ``loadCats(ids)``.

    Returns:
        Dict mapping category id to an ``(r, g, b)`` tuple of ints in 0..255.
    """
    palette = {}
    for cat in coco.loadCats(coco.getCatIds()):
        hex_digest = hashlib.md5(str(cat['id']).encode()).hexdigest()
        # Consecutive two-character hex pairs give the R, G and B channels.
        palette[cat['id']] = tuple(
            int(hex_digest[pos:pos + 2], 16) for pos in range(0, 6, 2)
        )
    return palette

def training_transform(img, size=512, resample=Image.BILINEAR):
    """Resize the shorter side to ``size`` and centre-crop a square.

    Args:
        img: PIL image to transform.
        size: Target side length of the square output.
        resample: PIL resampling filter used for the resize.

    Returns:
        A ``size`` x ``size`` PIL image.
    """
    w, h = img.size
    # Pin the shorter side to exactly `size` and scale only the longer one.
    # Computing both sides as int(side * (size / short)) can round the short
    # side down to size - 1, and the crop would then pad a black line.
    if w <= h:
        new_w, new_h = size, int(size * h / w)
    else:
        new_w, new_h = int(size * w / h), size
    img = img.resize((new_w, new_h), resample=resample)
    left = (new_w - size) // 2
    top = (new_h - size) // 2
    return img.crop((left, top, left + size, top + size))

def draw_seg(coco, info, anns, cmap):
    """Render annotations as a colour-coded RGB segmentation image.

    Annotations whose mask cannot be produced or applied are skipped with a
    printed warning instead of aborting the whole image.

    Args:
        coco: Object exposing ``annToMask(ann)``.
        info: Image record providing ``"height"`` and ``"width"``.
        anns: Annotation dicts with ``"area"`` and ``"category_id"``.
        cmap: Dict mapping category id to an RGB colour; unknown ids are
            drawn in white.

    Returns:
        A PIL image of the painted mask; unannotated pixels stay black.
    """
    mask = np.zeros((info["height"], info["width"], 3), dtype=np.uint8)
    # Paint large regions first so that smaller ones drawn later stay visible.
    for ann in sorted(anns, key=lambda x: x["area"], reverse=True):
        cid = ann["category_id"]
        color = cmap.get(cid, (255, 255, 255))
        try:
            mask[coco.annToMask(ann) == 1] = color
        except Exception as exc:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit and no longer hides why an annotation was dropped.
            print(f"Skipping annotation {ann.get('id')}: {exc!r}")
            continue
    return Image.fromarray(mask)

def draw_bbox(size, anns, cmap):
    """Draw every annotation's bounding box as a filled rectangle.

    Args:
        size: ``(width, height)`` of the canvas to create.
        anns: Annotation dicts with ``"area"``, ``"bbox"`` given as
            ``[x, y, width, height]``, and ``"category_id"``.
        cmap: Dict mapping category id to a fill colour.

    Returns:
        An RGB PIL image holding the filled boxes.
    """
    canvas = Image.new("RGB", size)
    pen = ImageDraw.Draw(canvas)
    # Large boxes go down first so smaller ones drawn later remain visible.
    ordered = sorted(anns, key=lambda item: item["area"], reverse=True)
    for item in ordered:
        left, top, box_w, box_h = item["bbox"]
        right, bottom = left + box_w, top + box_h
        pen.rectangle([left, top, right, bottom], fill=cmap[item["category_id"]])
    return canvas

def build_prompt(coco, anns):
    """Return a prompt that names the distinct categories present in ``anns``.

    Names are de-duplicated and sorted alphabetically; a generic prompt is
    returned when there are none.
    """
    category_ids = [ann["category_id"] for ann in anns]
    names = sorted({cat["name"] for cat in coco.loadCats(category_ids)})
    if names:
        return f"A high-quality, photorealistic image containing {', '.join(names)}"
    return "A high-quality photorealistic image"

coco = COCO(ANN_FILE)
cmap = generate_color_map(coco)
# Keep only images that have at least one annotation.
img_ids = [i for i in coco.getImgIds() if len(coco.getAnnIds(imgIds=i)) > 0]
np.random.seed(99)  # fixed seed: the same images are selected on every run
selected = np.random.choice(img_ids, NUM_SAMPLES, replace=False)

# One (seg input, bbox input, ground truth, both, seg-only, bbox-only) tuple
# per generated sample.
results = []

for start in range(0, NUM_SAMPLES, BATCH_SIZE):
    batch_ids = selected[start:start + BATCH_SIZE]
    prompts, segs, bboxes, gts = [], [], [], []
    for img_id in batch_ids:
        info = coco.loadImgs(int(img_id))[0]
        img_path = os.path.join(IMG_DIR, info["file_name"])
        try:
            raw_img = Image.open(img_path).convert("RGB")
        except OSError as exc:
            # Missing or unreadable image: skip it, but say so.
            print(f"Skipping {img_path}: {exc!r}")
            continue
        anns = coco.loadAnns(coco.getAnnIds(imgIds=int(img_id)))
        r_seg = draw_seg(coco, info, anns, cmap)
        r_bbox = draw_bbox(raw_img.size, anns, cmap)
        # Conditioning maps use nearest-neighbour so label colours stay exact.
        seg_img = training_transform(r_seg, RESOLUTION, Image.NEAREST)
        bbox_img = training_transform(r_bbox, RESOLUTION, Image.NEAREST)
        gt_img = training_transform(raw_img, RESOLUTION, Image.BILINEAR)
        prompts.append(build_prompt(coco, anns))
        segs.append(seg_img)
        bboxes.append(bbox_img)
        gts.append(gt_img)
    if not prompts:
        continue
    # One [segmentation, bbox] pair per sample, in the ControlNet order.
    control_inputs = [[s, b] for s, b in zip(segs, bboxes)]
    g_cuda = torch.Generator(device=DEVICE)
    with torch.autocast(DEVICE):
        # Re-seed before every call so the three variants start from the same
        # initial noise and differ only in their conditioning scales.
        g_cuda.manual_seed(42)
        preds_both = pipe(
            prompt=prompts,
            image=control_inputs,
            num_inference_steps=30,
            controlnet_conditioning_scale=[1.0, 1.0],
            generator=g_cuda
        ).images
        g_cuda.manual_seed(42)
        preds_seg = pipe(
            prompt=prompts,
            image=control_inputs,
            num_inference_steps=30,
            controlnet_conditioning_scale=[1.0, 0.0],
            generator=g_cuda
        ).images
        g_cuda.manual_seed(42)
        preds_bbox = pipe(
            prompt=prompts,
            image=control_inputs,
            num_inference_steps=30,
            controlnet_conditioning_scale=[0.0, 1.0],
            generator=g_cuda
        ).images
    results.extend(zip(segs, bboxes, gts, preds_both, preds_seg, preds_bbox))

# Six panels per sample: both conditioning inputs, the ground truth, and the
# three generated variants.
if results:
    rows = len(results)
    cols = 6
    fig, axes = plt.subplots(rows, cols, figsize=(24, 4 * rows))
    if rows == 1:
        # A single row comes back one-dimensional; restore the row axis.
        axes = axes[None, :]
    titles = (
        "Seg Input",
        "BBox Input",
        "Ground Truth",
        "Result: Both",
        "Result: Seg Only",
        "Result: BBox Only",
    )
    for i, (seg, bbox, gt, both, p_seg, p_bbox) in enumerate(results):
        pictures = (seg, bbox, gt, both, p_seg, p_bbox)
        for ax, picture, title in zip(axes[i], pictures, titles):
            ax.imshow(picture)
            ax.set_title(title)
            ax.axis("off")
    plt.tight_layout()
    plt.show()

# AnimateDiff segmentation ControlNet demo

In [None]:
# Imports
import os
import random
import torch
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
import torchvision.transforms.functional as TF
from diffusers import (
    AnimateDiffControlNetPipeline,
    ControlNetModel,
    MotionAdapter,
    EulerDiscreteScheduler
)
from diffusers.utils import export_to_gif

# Hub id of the Stable Diffusion 1.5 base model.
SD_MODEL_ID = "runwayml/stable-diffusion-v1-5"

# Pinned snapshot of the segmentation ControlNet in the local Hub cache.
CONTROLNET_PATH = (
    "/kaggle/working/models/"
    "models--ritishshrirao--Controlnet_SD1.5_coco_segmentation/"
    "snapshots/ecc3bbd29715db6a3a972dbb07e2e2429d3e3a1c"
)

# Weight file of the motion adapter inside its pinned snapshot.
MOTION_WEIGHTS_FILE = (
    "/kaggle/working/models/"
    "models--ritishshrirao--animatediff-controlnet-coco-segmentation/"
    "snapshots/fe676e774aed9b78a9268932ab810e2745447fcf/"
    "diffusion_pytorch_model.safetensors"
)
# The adapter is loaded from the directory that contains the weight file.
MOTION_WEIGHTS_DIR = os.path.dirname(MOTION_WEIGHTS_FILE)

# COCO val2017 images and their instance annotations.
IMG_DIR = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
ANN_FILE = "/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_val2017.json"

RESOLUTION = 512  # side length of conditioning frames and generated frames
NUM_FRAMES = 16   # frames per generated clip
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only selected together with CUDA. The CPU fallback uses
# float32, matching the multi-ControlNet cell, because float16 inference on
# CPU is poorly supported.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

def generate_color_map(coco):
    """Build a deterministic category-id -> RGB palette.

    Each channel is an affine function of the category id reduced modulo 255.

    Args:
        coco: Object exposing ``getCatIds()`` and ``loadCats(ids)``.

    Returns:
        Dict mapping category id to an ``(r, g, b)`` tuple of ints.
    """
    def _rgb_for(cat_id):
        return (
            (cat_id * 37) % 255,
            (cat_id * 17 + 128) % 255,
            (cat_id * 123 + 55) % 255,
        )

    return {
        cat['id']: _rgb_for(cat['id'])
        for cat in coco.loadCats(coco.getCatIds())
    }

def create_segmentation_mask(coco, img_id, img_shape, color_map):
    """Render an image's annotations as a colour-coded mask.

    Args:
        coco: Object exposing ``getAnnIds``, ``loadAnns``, ``annToMask`` and
            ``loadCats``.
        img_id: Id of the image whose annotations are drawn.
        img_shape: ``(width, height)`` of the image, as given by PIL's
            ``Image.size``.
        color_map: Dict mapping category id to an RGB colour; unknown ids are
            drawn in white.

    Returns:
        Tuple ``(mask_image, category_names)``: the painted mask as a PIL
        image and the distinct category names in alphabetical order.
    """
    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    # NumPy arrays are (rows, cols), hence (height, width) here.
    mask = np.zeros((img_shape[1], img_shape[0], 3), dtype=np.uint8)
    cat_names = set()
    # Paint large regions first so that smaller ones drawn later stay visible.
    for ann in sorted(anns, key=lambda x: x['area'], reverse=True):
        cat_id = ann['category_id']
        mask[coco.annToMask(ann) == 1] = color_map.get(cat_id, (255, 255, 255))
        cat_names.add(coco.loadCats(cat_id)[0]['name'])
    # Sorted so the name order, and any prompt built from it, is the same on
    # every run instead of following set iteration order.
    return Image.fromarray(mask), sorted(cat_names)

def apply_synthetic_motion(pil_img, motion_type="zoom_in"):
    """Produce ``NUM_FRAMES`` frames that animate a still image.

    The motion is selected by substring: a ``motion_type`` containing
    ``"zoom"`` scales linearly from 1.0 to 1.15, and one containing ``"pan"``
    translates linearly by up to 30 along x. Both may be combined.

    Args:
        pil_img: Image to animate.
        motion_type: Motion description matched by substring.

    Returns:
        List of frames, each resized to ``RESOLUTION`` x ``RESOLUTION`` with
        nearest-neighbour interpolation.
    """
    final_zoom = 1.15 if "zoom" in motion_type else 1.0
    final_shift_x = 30 if "pan" in motion_type else 0.0
    final_shift_y = 0.0
    # max(1, ...) avoids a division by zero when only one frame is requested.
    last_index = max(1, (NUM_FRAMES - 1))

    def _render(frame_index):
        # `t` runs from 0.0 on the first frame to 1.0 on the last one.
        t = frame_index / last_index
        warped = TF.affine(
            pil_img,
            angle=0,
            translate=(final_shift_x * t, final_shift_y * t),
            scale=1.0 + (final_zoom - 1.0) * t,
            shear=0,
            interpolation=TF.InterpolationMode.NEAREST,
            fill=0,
        )
        return TF.resize(
            warped,
            (RESOLUTION, RESOLUTION),
            interpolation=TF.InterpolationMode.NEAREST,
        )

    return [_render(frame_index) for frame_index in range(NUM_FRAMES)]

def main():
    """Generate one AnimateDiff clip conditioned on a moving segmentation map.

    A random annotated COCO image is chosen, its segmentation mask is animated
    with a synthetic pan and zoom, and the frames are used as ControlNet
    conditioning. Two GIFs are written to the current working directory: the
    generated clip and a side-by-side view of conditioning and result.
    """
    # Pipeline: SD 1.5 + motion adapter + segmentation ControlNet.
    motion_adapter = MotionAdapter.from_pretrained(MOTION_WEIGHTS_DIR, torch_dtype=DTYPE)
    seg_controlnet = ControlNetModel.from_pretrained(CONTROLNET_PATH, torch_dtype=DTYPE)
    pipe = AnimateDiffControlNetPipeline.from_pretrained(
        SD_MODEL_ID,
        motion_adapter=motion_adapter,
        controlnet=seg_controlnet,
        torch_dtype=DTYPE,
    )
    pipe = pipe.to(DEVICE)
    scheduler_overrides = {
        "timestep_spacing": "trailing",
        "beta_schedule": "linear",
    }
    pipe.scheduler = EulerDiscreteScheduler.from_config(
        pipe.scheduler.config, **scheduler_overrides
    )
    pipe.enable_vae_slicing()

    # Pick a random image that has at least one annotation.
    coco = COCO(ANN_FILE)
    palette = generate_color_map(coco)
    annotated_ids = [
        image_id
        for image_id in coco.getImgIds()
        if len(coco.getAnnIds(imgIds=image_id)) > 0
    ]
    sample_id = random.choice(annotated_ids)
    file_name = coco.loadImgs(sample_id)[0]['file_name']
    orig_image = Image.open(os.path.join(IMG_DIR, file_name)).convert("RGB")

    # Conditioning frames: the still mask with synthetic camera motion.
    control_mask, categories = create_segmentation_mask(
        coco, sample_id, orig_image.size, palette
    )
    prompt = f"A photorealistic high-quality cinematic video of {', '.join(categories)}, 4k, trending on artstation"
    negative_prompt = "bad quality, distorted, low resolution, watermark, cartoon, sketch"
    conditioning_frames = apply_synthetic_motion(control_mask, motion_type="pan_right_zoom")

    # The seed is random but recorded in the output file names.
    seed = random.randint(0, 100000)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)
    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_frames=NUM_FRAMES,
        conditioning_frames=conditioning_frames,
        controlnet_conditioning_scale=1.0,
        num_inference_steps=25,
        guidance_scale=7.5,
        generator=generator,
        width=RESOLUTION,
        height=RESOLUTION,
    )
    frames = output.frames[0]
    export_to_gif(frames, f"output_{sample_id}_seed{seed}.gif")

    # Debug clip: conditioning frame on the left, generated frame on the right.
    side_by_side = []
    for cond_frame, out_frame in zip(conditioning_frames, frames):
        panel = Image.new('RGB', (out_frame.width * 2, out_frame.height))
        panel.paste(cond_frame.resize(out_frame.size), (0, 0))
        panel.paste(out_frame, (out_frame.width, 0))
        side_by_side.append(panel)
    export_to_gif(side_by_side, f"debug_{sample_id}_seed{seed}.gif")


if __name__ == "__main__":
    main()

In [None]:
# Delete every .gif file in the current working directory (the AnimateDiff
# demo writes its output and debug GIFs there). This line relies on IPython
# resolving `rm` as a shell alias; it is not valid plain Python.
rm *.gif

In [None]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory. `torch` is not imported in this cell, so an earlier cell must have
# imported it.
# NOTE(review): objects still referenced by notebook globals (e.g. `pipe`)
# are presumably not freed by this - confirm whether they should be deleted
# first.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
# Terminate the interpreter process immediately with exit status 0, without
# running normal cleanup handlers. In a notebook this ends the kernel
# (presumably to force all memory to be released - confirm the intent).
os._exit(0)