In [3]:
!pip -q install -U diffusers transformers accelerate safetensors huggingface_hub


from pathlib import Path
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageOps
from diffusers import AutoPipelineForInpainting
import random

# Subjects that can be inserted into a scene; the generation loop draws one at
# random per image. The configuration cell assigns the same two lists again.
objects = [
    "medium-size dog",
    "cat",
    "deer",
    "fox",
    "wolf",
]

# Viewpoint phrases appended to the prompt; also drawn at random per image.
angles = [
    "front view",
    "side view",
    "three-quarter view",
    "from behind",
    "low angle",
    "slightly top-down",
]

Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


In [4]:
# Mount Google Drive at /content/drive; the project ROOT used by later cells
# lives under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Project root on the mounted Drive; every data path below is derived from it.
ROOT = Path("/content/drive/MyDrive/Autonomous_Project")

# Inputs: road masks and the source frames they belong to.
MASKS_DIR  = ROOT.joinpath("data", "subset_100", "masks_sam3_road")
IMAGES_DIR = ROOT.joinpath("data", "subset_100", "images")

# Output folder for the augmented images (created on first run).
# NOTE(review): "animels" looks like a typo of "animals"; the spelling is kept
# because it is the on-disk location results are written to.
OUT_DIR    = ROOT.joinpath("data", "subset_100", "aug", "animels")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Prompt vocabulary: one subject and one viewpoint are drawn at random per image.
# These two lists are also assigned in the imports cell with the same values.
objects = [
    "medium-size dog",
    "cat",
    "deer",
    "fox",
    "wolf",
]
angles = [
    "front view",
    "side view",
    "three-quarter view",
    "from behind",
    "low angle",
    "slightly top-down",
]

In [6]:
def find_matching_mask(img_path: Path, masks_dir: Path) -> Path:
    """Return the mask file in ``masks_dir`` that belongs to ``img_path``.

    Lookup order:
      1. ``<stem>_mask.<ext>`` (the expected naming scheme).
      2. Any file whose name starts with ``<stem>`` and contains ``mask``.

    The image stem is glob-escaped so stems containing ``*``, ``?`` or ``[``
    are matched literally, and candidates are sorted so the same file is
    returned on every run when several match.

    Args:
        img_path: Path of the image; only its stem and name are used.
        masks_dir: Directory that is searched (non-recursively) for the mask.

    Returns:
        Path of the first matching mask in sorted order.

    Raises:
        FileNotFoundError: If neither pattern matches any file.
    """
    from glob import escape  # local import keeps the cell self-contained

    stem = escape(img_path.stem)

    # Try the strict naming scheme first, then the looser "contains mask" one.
    for pattern in (stem + "_mask.*", stem + "*mask*.*"):
        candidates = sorted(masks_dir.glob(pattern))
        if candidates:
            return candidates[0]

    raise FileNotFoundError(f"No matching mask for {img_path.name}")

def spot_mask_from_road(road_mask_L, image_size, spot_size=(320, 260), y_min_frac=0.72, y_max_frac=0.95):
    """Pick a random road location and build an elliptical inpainting mask around it.

    Args:
        road_mask_L: Single-channel road mask (anything ``np.asarray`` turns into
            a 2-D array). Pixels with value > 128 count as road (road = white).
        image_size: ``(width, height)`` of the image the spot mask is made for.
        spot_size: ``(width, height)`` of the ellipse's bounding box before it
            is clipped to the image.
        y_min_frac, y_max_frac: Vertical band, as fractions of the mask height,
            from which the centre is preferably sampled.

    Returns:
        ``(spot_mask, (cx, cy), bbox)`` where ``spot_mask`` is an "L" image of
        ``image_size`` with the ellipse filled with 255, ``(cx, cy)`` is the
        sampled centre in image coordinates and ``bbox`` is
        ``(left, top, right, bottom)`` clipped to the image.

    Raises:
        ValueError: If the mask is not 2-D.

    Centre selection, in order of preference:
      1. a road pixel inside the vertical band,
      2. any road pixel (when the band contains no road),
      3. a uniformly random pixel (when the mask contains no road at all).
    Randomness comes from the global ``np.random`` state, so call
    ``np.random.seed(...)`` beforehand for reproducible spots.
    """
    mask_np = np.asarray(road_mask_L)
    if mask_np.ndim != 2:
        raise ValueError(
            f"road mask must be single-channel (2-D), got array of shape {mask_np.shape}"
        )
    H, W = mask_np.shape
    img_w, img_h = image_size
    w, h = spot_size

    road = mask_np > 128  # assumes road=white
    ys, xs = np.nonzero(road)

    # Prefer road pixels inside the requested vertical band.
    y_min, y_max = int(y_min_frac * H), int(y_max_frac * H)
    in_band = (ys >= y_min) & (ys <= y_max)
    if in_band.any():
        ys, xs = ys[in_band], xs[in_band]
    # Otherwise keep every road pixel: an off-band road spot still beats a
    # random location that may not be on the road at all.

    if len(xs) == 0:
        # No road anywhere in the mask: last-resort random location.
        cx = int(np.random.randint(W))
        cy = int(np.random.randint(H))
    else:
        i = np.random.randint(len(xs))
        cx, cy = int(xs[i]), int(ys[i])

    # The centre was sampled in mask pixels; map it to image pixels so the spot
    # lands correctly even when the mask was not resized to the image.
    # (Identity when both sizes agree.)
    cx = cx * img_w // W
    cy = cy * img_h // H

    # Bounding box of the ellipse, clipped to the image borders.
    x0, y0 = max(0, cx - w // 2), max(0, cy - h // 2)
    x1, y1 = min(img_w, cx + w // 2), min(img_h, cy + h // 2)

    spot_mask = Image.new("L", (img_w, img_h), 0)
    ImageDraw.Draw(spot_mask).ellipse([x0, y0, x1, y1], fill=255)

    bbox = (x0, y0, x1, y1)  # left, top, right, bottom
    return spot_mask, (cx, cy), bbox

def inpaint_on_crop(pipe, image, spot_mask, bbox, prompt, negative,
                    crop_pad=180, crop_res=768, guidance=14.0, steps=40, strength=0.98,
                    generator=None):
    """Inpaint the masked spot on a padded crop and merge it back into the image.

    Args:
        pipe: Inpainting pipeline; called with keyword arguments and expected
            to return an object whose ``.images[0]`` is the generated image.
        image: Full-size source image (PIL-like).
        spot_mask: Full-size single-channel mask; white marks the area to repaint.
        bbox: ``(left, top, right, bottom)`` of the masked spot in image pixels.
        prompt, negative: Positive and negative text prompts.
        crop_pad: Context, in pixels, added around ``bbox`` on every side
            (clipped to the image).
        crop_res: Side length of the square the crop is resized to before it is
            passed to the pipeline.
        guidance, steps, strength: Forwarded as ``guidance_scale``,
            ``num_inference_steps`` and ``strength``.
        generator: Optional random generator forwarded to the pipeline for
            reproducible results; omitted from the call when ``None``.

    Returns:
        A copy of ``image`` in which only the masked region of the crop has been
        replaced by generated content.

    Raises:
        ValueError: If the padded ``bbox`` does not overlap the image.
    """
    W, H = image.size
    x0, y0, x1, y1 = bbox

    # padding for context
    x0p, y0p = max(0, x0 - crop_pad), max(0, y0 - crop_pad)
    x1p, y1p = min(W, x1 + crop_pad), min(H, y1 + crop_pad)
    if x1p <= x0p or y1p <= y0p:
        raise ValueError(f"bbox {bbox} does not overlap the {W}x{H} image")

    crop_box = (x0p, y0p, x1p, y1p)
    img_crop = image.crop(crop_box)
    mask_crop = spot_mask.crop(crop_box)

    # The pipeline is run on a square crop_res x crop_res input. The crop is
    # generally not square, so this resize scales x and y by different factors;
    # the inverse resize below undoes it.
    img_r = img_crop.resize((crop_res, crop_res))
    mask_r = mask_crop.resize((crop_res, crop_res))

    # Only pass `generator` when the caller supplied one, so the call is
    # unchanged for existing callers.
    extra = {} if generator is None else {"generator": generator}

    out_r = pipe(
        prompt=prompt,
        negative_prompt=negative,
        image=img_r,
        mask_image=mask_r,
        guidance_scale=guidance,
        num_inference_steps=steps,
        strength=strength,
        width=crop_res,
        height=crop_res,
        num_images_per_prompt=1,
        **extra,
    ).images[0]

    out_crop = out_r.resize(img_crop.size)

    # Paste through the mask: pixels outside the spot keep their original
    # values instead of being overwritten by the resized, regenerated crop.
    merged = image.copy()
    merged.paste(out_crop, (x0p, y0p), mask_crop)
    return merged

In [7]:
# Use the GPU in half precision when one is available, otherwise full-precision CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the run log below shows "`torch_dtype` is deprecated! Use `dtype`
# instead!". The argument is kept as is until it is confirmed that the installed
# diffusers `from_pretrained` accepts `dtype`.
pipe = AutoPipelineForInpainting.from_pretrained(
    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    variant="fp16" if device == "cuda" else None,
)

# On GPU, let diffusers manage device placement via CPU offload instead of
# calling pipe.to("cuda") directly.
if device == "cuda":
    pipe.enable_model_cpu_offload()

# Memory savers for attention and for VAE encode/decode.
pipe.enable_attention_slicing()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

# xformers is optional: keep going without it, but say why it was skipped
# instead of silently swallowing the error.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as exc:
    print(f"xformers attention not enabled ({type(exc).__name__}: {exc}); using default attention")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

scheduler_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

text_encoder_2/model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/5.14G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

The config attributes {'decay': 0.9999, 'inv_gamma': 1.0, 'min_decay': 0.0, 'optimization_step': 37000, 'power': 0.6666666666666666, 'update_after_step': 0, 'use_ema_warmup': False} were passed to UNet2DConditionModel, but are not expected and will be ignored. Please verify your config.json configuration file.
`torch_dtype` is deprecated! Use `dtype` instead!


In [10]:
# How many images to augment, and the seed that fixes which ones are picked.
N_CHOSEN = 80
SELECTION_SEED = 0

# Sorted so the candidate order, and therefore the seeded sample below, does not
# depend on the order in which the directory happens to be listed.
img_files = sorted(p for p in IMAGES_DIR.iterdir() if p.is_file())

# Keep only images for which a road mask exists.
valid_imgs = []
for p in img_files:
    try:
        find_matching_mask(p, MASKS_DIR)
    except FileNotFoundError:
        continue
    valid_imgs.append(p)

print("total images:", len(img_files))
print("images with masks:", len(valid_imgs))

# A dedicated Random instance makes the selection reproducible without touching
# the global `random` state used elsewhere in the notebook.
chosen_imgs = random.Random(SELECTION_SEED).sample(valid_imgs, k=min(N_CHOSEN, len(valid_imgs)))
print("chosen:", [p.name for p in chosen_imgs])

total images: 100
images with masks: 91
chosen: ['000912.png', '000801.png', '000128.png', '000992.png', '000580.png', '000794.png', '000952.png', '001784.png', '001099.png', '001610.png', '001780.png', '000245.png', '001112.png', '001232.png', '000869.png', '001631.png', '001868.png', '001337.png', '000395.png', '000413.png', '001735.png', '000234.png', '001160.png', '000004.png', '001178.png', '001534.png', '001137.png', '000148.png', '000943.png', '000511.png', '000225.png', '001644.png', '000331.png', '000345.png', '001701.png', '000315.png', '001647.png', '000576.png', '000477.png', '000659.png', '001564.png', '000624.png', '000525.png', '000209.png', '000162.png', '001732.png', '000243.png', '001211.png', '000105.png', '000302.png', '001668.png', '000860.png', '000503.png', '001258.png', '000015.png', '000574.png', '001526.png', '000242.png', '001003.png', '001419.png', '000286.png', '000748.png', '000965.png', '000518.png', '000411.png', '001073.png', '000610.png', '001192.png',

In [11]:
# Generate one augmented image per chosen frame: pick a random subject and
# viewpoint, pick a spot on the road, inpaint it, save and display the result.
results = []
n_total = len(chosen_imgs)

for idx, img_path in enumerate(chosen_imgs, start=1):
    mask_path = find_matching_mask(img_path, MASKS_DIR)

    # Context managers close the underlying files; convert() returns a loaded
    # copy, so the images stay usable afterwards.
    with Image.open(img_path) as src:
        init_image = src.convert("RGB")
    with Image.open(mask_path) as src:
        road_mask = src.convert("L").resize(init_image.size)

    # If the road is black rather than white in your masks, uncomment:
    # road_mask = ImageOps.invert(road_mask)

    obj = random.choice(objects)
    ang = random.choice(angles)

    prompt = (
        f"a realistic {obj} standing on the asphalt road, {ang}, full body, "
        "clearly visible, occupies a noticeable portion of the frame, "
        "correct scale and perspective, natural daylight, sharp focus, "
        "cast shadow on the road, photo-realistic, high detail"
    )
    negative = "cartoon, painting, blurry, lowres, deformed, extra legs, bad anatomy, floating, no shadow"

    # bigger spot -> easier to see object
    spot_w = random.randint(300, 420)
    spot_h = random.randint(220, 320)
    spot_mask, (cx, cy), bbox = spot_mask_from_road(
        road_mask,
        init_image.size,
        spot_size=(spot_w, spot_h),
        y_min_frac=0.72,
        y_max_frac=0.95
    )

    # debug overlay (optional): darken the spot so its placement is visible
    ov = np.array(init_image).copy()
    m  = np.array(spot_mask) > 10
    ov[m] = (ov[m] * 0.55).astype(np.uint8)
    fig = plt.figure(figsize=(12,4))
    plt.title(f"{idx}/{n_total} overlay | {img_path.name} | bbox={bbox}")
    plt.imshow(ov)
    plt.axis("off")
    plt.show()
    plt.close(fig)  # do not accumulate open figures over the whole batch

    out = inpaint_on_crop(
        pipe=pipe,
        image=init_image,
        spot_mask=spot_mask,
        bbox=bbox,
        prompt=prompt,
        negative=negative,
        crop_pad=180,
        crop_res=768,   # 1024 gives a sharper result but is heavier to run
        guidance=14.0,
        steps=30,
        strength=0.98
    )

    out_path = OUT_DIR / f"{img_path.stem}_{obj.replace(' ','_')}_{ang.replace(' ','_').replace('-','_')}.png"
    out.save(out_path)

    results.append((img_path.name, obj, ang, str(out_path)))

    fig = plt.figure(figsize=(12,4))
    plt.title(f"{idx}/{n_total} | {img_path.name} | {obj} | {ang} | {init_image.size[0]}x{init_image.size[1]}")
    plt.imshow(out)
    plt.axis("off")
    plt.show()
    plt.close(fig)

print("Saved outputs:")
for r in results:
    print(r)

Output hidden; open in https://colab.research.google.com to view.

In [12]:
# Post-run cleanup: run Python's garbage collector first, then (only when CUDA
# is available) ask PyTorch to empty its CUDA cache.
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# Printed unconditionally, i.e. also on CPU-only runs where only gc.collect() ran.
print("cache cleared")

cache cleared
