## Generate images with Stable Diffusion

In [7]:
import os

# Model
%env CUDA_VISIBLE_DEVICES=3,2
#%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32

# HF Cache
os.environ["HF_HOME"] = "../.cache"
!echo $HF_HOME
!huggingface-cli whoami

env: CUDA_VISIBLE_DEVICES=3,2
../.cache
TorgeSchwark
[1morgs: [0m Basket-AEye


## Flux 1 dev 

In [2]:
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map="balanced") # Auto testen
#pipe.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

prompt = "A high-resolution photo of a single jar of tomato sauce on a plain white background, realistic lighting, label visible, in the style of commercial product photography, supermarket packaging design with label, front-facing, no reflections, no props."
image = pipe(
    prompt,
    height=256,
    width=256,
    num_images_per_prompt=4,
    guidance_scale=3.5,
    num_inference_steps=50,
    max_sequence_length=512,
    generator=torch.Generator("cpu").manual_seed(0)
).images[0]
image.save("generated_images/sauce2.png")


  from .autonotebook import tqdm as notebook_tqdm
2025-04-30 12:35:11.483712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746009311.502990  629570 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746009311.508819  629570 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-30 12:35:11.529426: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GatedRepoError: 401 Client Error. (Request ID: Root=1-6811fce3-0c0112a017e8421947e79e21;66857834-4090-4284-b620-4dc97be3b4dd)

Cannot access gated repo for url https://huggingface.co/black-forest-labs/FLUX.1-dev/resolve/main/model_index.json.
Access to model black-forest-labs/FLUX.1-dev is restricted. You must have access to it and be authenticated to access it. Please log in.

## Stable diffusion 3.5 M

In [None]:
from diffusers import StableDiffusion3Pipeline
import torch
import gc
from huggingface_hub import login



pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium",
    torch_dtype=torch.bfloat16,
    device_map="balanced",  # this splits the model
)

  from .autonotebook import tqdm as notebook_tqdm
Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]it/s]
Loading pipeline components...: 100%|██████████| 9/9 [00:06<00:00,  1.42it/s]


In [None]:
image = pipe(
    "A single supermarket tomato sauce on a plain background with a sauce product label at front",
    num_inference_steps=15,
    guidance_scale=6.0,
    height=512,
    width=512,
    num_images_per_prompt=1
).images[0]

image.save("generated_images/stable_diffusion_3/sauce.png")



100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


In [22]:
import os
import gc
import torch
from PIL import Image

def generate_images_from_prompts(
    pipe,
    prompt_dict,
    num_total_images_per_class=10,
    output_dir="generated_images/stable_diffusion_3"
):
    init_image = Image.open("./generated_images/barilla_pasta_sauce_basilico_gro.jpg").convert("RGB")

    for class_name, prompts in prompt_dict.items():
        class_dir = os.path.join(output_dir, class_name)
        os.makedirs(class_dir, exist_ok=True)
        count = 0
        prompt_index = 0

        while count < num_total_images_per_class:
            prompt = prompts[prompt_index % len(prompts)]
            prompt_index += 1

            image = pipe(
                prompt,
                num_inference_steps=15,
                guidance_scale=7.0,
                height=512,
                width=512,
                num_images_per_prompt=1
            ).images[0]

            save_path = os.path.join(class_dir, f"{count}.png")
            image.save(save_path)
            print(f"Saved: {save_path}")
            count += 1

    # Speicher aufräumen
    torch.cuda.empty_cache()
    gc.collect()


In [None]:
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
import torch
from PIL import Image

# Lade das ControlNet Modell
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", 
                                             torch_dtype=torch.float16, 
                                             variant="fp16", 
                                             use_safetensors=True)

# Erstelle die Stable Diffusion Pipeline mit ControlNet
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)
pipeline.enable_model_cpu_offload()

input_image = Image.open("generated_images/barilla_pasta_sauce_basilico_gro.jpg").convert("RGB")

# Dein Prompt
prompt = "A realistic supermarket tomato sauce on a neutral background."

# Optional: Lade ein Bild für ControlNet als Initialbild (falls gewünscht)
# init_image = Image.open("path_to_your_image.jpg").convert("RGB")  # Beispielbild laden, falls du ControlNet einsetzen möchtest

# Führe die Bildgenerierung durch
output_image = pipeline(
    prompt=prompt,
    image=input_image,
    num_inference_steps=20,   # Die Anzahl der Schritte für die Bildgenerierung
    guidance_scale=5.0,       # Höherer Wert sorgt für genauere Bildausgabe entsprechend dem Text
).images[0]

# Zeige das erzeugte Bild an
output_image.save("generated_images/tomato_sauce_controlnet_output.png")



Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 20/20 [00:57<00:00,  2.88s/it]


In [23]:
prompt_dict = {
    "tomato sauce": [
        "A supermarket tomato sauce bottle on a neutral background",
        "A tomato sauce bottle with a clear label on a plain background",
        "A tomato sauce bottle placed on a simple neutral background"
    ],
    "apples": [
        "A single red apple on a plain neutral background",
        "A green apple on a simple neutral background",
        "A yellow apple on a plain background"
    ],
    "bananas": [
        "A bunch of ripe bananas on a neutral background",
        "A single banana on a plain background",
        "A bunch of bananas on a simple neutral background"
    ],
    "cola bottles": [
        "A cola bottle with a label on a neutral background",
        "A cola bottle on a plain background",
        "A cola bottle with a simple design on a neutral background"
    ],
    "milk": [
        "A milk carton on a neutral background",
        "A plastic milk bottle on a plain background",
        "A single container of milk on a neutral background"
    ],
    "muesli": [
        "A muesli box on a simple neutral background",
        "A muesli box with a label on a plain background",
        "A muesli box on a neutral background"
    ],
    "avocado": [
        "A ripe avocado on a neutral background",
        "A single avocado on a plain background",
        "A whole avocado on a neutral background"
    ],
    "cucumber": [
        "A fresh cucumber on a neutral background",
        "A cucumber on a plain background",
        "A single cucumber on a neutral background"
    ],
    "water": [
        "A bottle of water on a neutral background",
        "A plastic water bottle on a plain background",
        "A clear water bottle on a neutral background"
    ],
    "quark": [
        "A container of quark on a neutral background",
        "A tub of quark on a simple background",
        "A container of quark with a label on a neutral background"
    ]
}


generate_images_from_prompts(pipe, prompt_dict)

100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


Saved: generated_images/stable_diffusion_3/tomato sauce/0.png


100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


Saved: generated_images/stable_diffusion_3/tomato sauce/1.png


100%|██████████| 15/15 [00:22<00:00,  1.47s/it]


Saved: generated_images/stable_diffusion_3/tomato sauce/2.png


100%|██████████| 15/15 [00:22<00:00,  1.50s/it]


Saved: generated_images/stable_diffusion_3/tomato sauce/3.png


100%|██████████| 15/15 [00:22<00:00,  1.52s/it]


KeyboardInterrupt: 

### VRAM reduction using quantization

In [None]:
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel
from diffusers import StableDiffusion3Pipeline
import torch

model_id = "stabilityai/stable-diffusion-3.5-medium"

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_nf4 = SD3Transformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16
)

pipeline = StableDiffusion3Pipeline.from_pretrained(
    model_id, 
    transformer=model_nf4,
    torch_dtype=torch.bfloat16,
    
)
pipeline.enable_model_cpu_offload()
#! Extra
#pipeline.enable_sequential_cpu_offload()

prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus, basking in a river of melted butter amidst a breakfast-themed landscape. It features the distinctive, bulky body shape of a hippo. However, instead of the usual grey skin, the creature's body resembles a golden-brown, crispy waffle fresh off the griddle. The skin is textured with the familiar grid pattern of a waffle, each square filled with a glistening sheen of syrup. The environment combines the natural habitat of a hippo with elements of a breakfast table setting, a river of warm, melted butter, with oversized utensils or plates peeking out from the lush, pancake-like foliage in the background, a towering pepper mill standing in for a tree.  As the sun rises in this fantastical world, it casts a warm, buttery glow over the scene. The creature, content in its butter river, lets out a yawn. Nearby, a flock of birds take flight"

image = pipeline(
    prompt=prompt,
    num_inference_steps=40,
    guidance_scale=4.5,
    max_sequence_length=512,
).images[0]
image.save("whimsical.png")



Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Token indices sequence length is longer than the specified maximum sequence length for this model (200 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['. the skin is textured with the familiar grid pattern of a waffle, each square filled with a glistening sheen of syrup. the environment combines the natural habitat of a hippo with elements of a breakfast table setting, a river of warm, melted butter, with oversized utensils or plates peeking out from the lush, pancake - like foliage in the background, a towering pepper mill standing in for a tree. as the sun rises in this fantastical world, it casts a warm, buttery glow over the scene. the creature, content in its butter river, lets out a yawn. nearby, a flock of birds take flight']
Token indices sequence length is l

  0%|          | 0/40 [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA___slow_conv2d_forward)

## Stable diffusion 2

In [2]:
from diffusers import StableDiffusionPipeline
import torch
# Lade das Stable Diffusion v2.1 Modell von Stability AI
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
pipe.to("cuda:0")  # Sicherstellen, dass es auf der GPU läuft

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.33.1",
  "_name_or_path": "stabilityai/stable-diffusion-2",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "DDIMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

## Generate images

In [5]:
from diffusers import StableDiffusionPipeline
import torch
import os

# Modell laden (stable diffusion v2.1)

output_dir = "generated_backgrounds"
os.makedirs(output_dir, exist_ok=True)

# Liste von 10 verschiedenen Prompts
prompts = [
    "A neutral simple background with not to much structure"

]

# Batch-Größed: Hier generieren wir 8 Bilder pro Batch
batch_size = 1

# Zielauflösung für die generierten Bilder (z.B. 512x512, 768x768, etc.)
width = 256
height = 256

# 100 Bilder in Batches von 8 erzeugen
for batch_idx in range(0, 100, batch_size):
    # Nächste Batches von Prompts auswählen
    batch_prompts = prompts[(batch_idx // batch_size) % len(prompts)] * batch_size

    # Bilder für den aktuellen Batch generieren
    images = pipe(batch_prompts).images

    # Speichern der Bilder
    for i, image in enumerate(images):
        image.save(f"{output_dir}/background{batch_idx + i + 1}.png")

print("Fertig! 100 Bilder gespeichert in:", output_dir)


100%|██████████| 50/50 [03:57<00:00,  4.75s/it]
 28%|██▊       | 14/50 [01:11<03:03,  5.11s/it]


KeyboardInterrupt: 