## Generate images with Stable Diffusion

In [1]:
import os

# Model
%env CUDA_VISIBLE_DEVICES=2
#%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32

# HF Cache
os.environ["HF_HOME"] = "../.cache"
!echo $HF_HOME
!huggingface-cli whoami

env: CUDA_VISIBLE_DEVICES=2
../.cache


Maats
[1morgs: [0m DBD-research-group,Basket-AEye


## Flux 1 dev 

In [None]:
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map="balanced") # Auto testen
#pipe.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

prompt = "A high-resolution photo of a single jar of tomato sauce on a plain white background, realistic lighting, label visible, in the style of commercial product photography, supermarket packaging design with label, front-facing, no reflections, no props."
image = pipe(
    prompt,
    height=256,
    width=256,
    num_images_per_prompt=4,
    guidance_scale=3.5,
    num_inference_steps=50,
    max_sequence_length=512,
    generator=torch.Generator("cpu").manual_seed(0)
).images[0]
image.save("generated_images/sauce2.png")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


  0%|          | 0/50 [00:00<?, ?it/s]

## Stable diffusion 3.5 M

In [2]:
from diffusers import StableDiffusion3Pipeline
import torch

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium",
    torch_dtype=torch.bfloat16,
    device_map="balanced",  # this splits the model
)

image = pipe(
    "A single supermarket tomato sauce on a plain background with a sauce product label at front",
    num_inference_steps=40,
    guidance_scale=4.5,
    height=256,
    width=256,
    num_images_per_prompt=1
).images[0]

image.save("generated_images/stable_diffusion_3/sauce.png")

Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


  0%|          | 0/40 [00:00<?, ?it/s]

### VRAM reduction using quantization

In [None]:
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel
from diffusers import StableDiffusion3Pipeline
import torch

model_id = "stabilityai/stable-diffusion-3.5-medium"

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_nf4 = SD3Transformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16
)

pipeline = StableDiffusion3Pipeline.from_pretrained(
    model_id, 
    transformer=model_nf4,
    torch_dtype=torch.bfloat16,
    
)
pipeline.enable_model_cpu_offload()
#! Extra
#pipeline.enable_sequential_cpu_offload()

prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus, basking in a river of melted butter amidst a breakfast-themed landscape. It features the distinctive, bulky body shape of a hippo. However, instead of the usual grey skin, the creature's body resembles a golden-brown, crispy waffle fresh off the griddle. The skin is textured with the familiar grid pattern of a waffle, each square filled with a glistening sheen of syrup. The environment combines the natural habitat of a hippo with elements of a breakfast table setting, a river of warm, melted butter, with oversized utensils or plates peeking out from the lush, pancake-like foliage in the background, a towering pepper mill standing in for a tree.  As the sun rises in this fantastical world, it casts a warm, buttery glow over the scene. The creature, content in its butter river, lets out a yawn. Nearby, a flock of birds take flight"

image = pipeline(
    prompt=prompt,
    num_inference_steps=40,
    guidance_scale=4.5,
    max_sequence_length=512,
).images[0]
image.save("whimsical.png")



Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Token indices sequence length is longer than the specified maximum sequence length for this model (200 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['. the skin is textured with the familiar grid pattern of a waffle, each square filled with a glistening sheen of syrup. the environment combines the natural habitat of a hippo with elements of a breakfast table setting, a river of warm, melted butter, with oversized utensils or plates peeking out from the lush, pancake - like foliage in the background, a towering pepper mill standing in for a tree. as the sun rises in this fantastical world, it casts a warm, buttery glow over the scene. the creature, content in its butter river, lets out a yawn. nearby, a flock of birds take flight']
Token indices sequence length is l

  0%|          | 0/40 [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA___slow_conv2d_forward)

## Stable diffusion 2

In [2]:
from diffusers import StableDiffusionPipeline
import torch
# Lade das Stable Diffusion v2.1 Modell von Stability AI
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
pipe.to("cuda:0")  # Sicherstellen, dass es auf der GPU läuft

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.33.1",
  "_name_or_path": "stabilityai/stable-diffusion-2",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "DDIMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

## Generate images

In [5]:
from diffusers import StableDiffusionPipeline
import torch
import os

# Modell laden (stable diffusion v2.1)

output_dir = "generated_backgrounds"
os.makedirs(output_dir, exist_ok=True)

# Liste von 10 verschiedenen Prompts
prompts = [
    "A neutral simple background with not to much structure"

]

# Batch-Größed: Hier generieren wir 8 Bilder pro Batch
batch_size = 1

# Zielauflösung für die generierten Bilder (z.B. 512x512, 768x768, etc.)
width = 256
height = 256

# 100 Bilder in Batches von 8 erzeugen
for batch_idx in range(0, 100, batch_size):
    # Nächste Batches von Prompts auswählen
    batch_prompts = prompts[(batch_idx // batch_size) % len(prompts)] * batch_size

    # Bilder für den aktuellen Batch generieren
    images = pipe(batch_prompts).images

    # Speichern der Bilder
    for i, image in enumerate(images):
        image.save(f"{output_dir}/background{batch_idx + i + 1}.png")

print("Fertig! 100 Bilder gespeichert in:", output_dir)


100%|██████████| 50/50 [03:57<00:00,  4.75s/it]
 28%|██▊       | 14/50 [01:11<03:03,  5.11s/it]


KeyboardInterrupt: 

## slighly transform image

In [7]:
import os
from rembg import remove
from PIL import Image
import io

# Erstelle einen neuen Ordner für die Bilder ohne Hintergrund
all_objects_output_folder = './all_objects_no_background'
os.makedirs(all_objects_output_folder, exist_ok=True)
objects_path = "./all_generated_objects"

# Durchlaufe alle Subordner im Ordner all_generated_objects
for subfolder in os.listdir(objects_path):
    subfolder_path = os.path.join(objects_path, subfolder)
    
    if os.path.isdir(subfolder_path):
        # Erstelle einen Output-Ordner für jedes Objekt, benannt nach dem Subfolder
        output_folder = os.path.join(all_objects_output_folder, subfolder)
        os.makedirs(output_folder, exist_ok=True)

        # Durchlaufe alle Objekte (Dateien) im Subfolder
        for input_file in os.listdir(subfolder_path):
            if input_file.endswith('.png'):  # Nur PNG-Dateien verarbeiten
                input_path = os.path.join(subfolder_path, input_file)
                output_path = os.path.join(output_folder, f'{input_file.replace(".png", "_no_background.png")}')

                # Öffne das Bild
                with open(input_path, 'rb') as input_file:
                    input_data = input_file.read()

                # Entferne den Hintergrund
                output_data = remove(input_data)

                # Lade das Bild und hole die Bounding Box des Objekts
                output_image = Image.open(io.BytesIO(output_data))
                bbox = output_image.getbbox()  # Bounding Box des Objekts

                # Schneide das Bild auf die Bounding Box zu
                cropped_image = output_image.crop(bbox)

                # Speichere das Ergebnis im entsprechenden Ordner
                cropped_image.save(output_path)



KeyboardInterrupt: 

In [1]:
import os
import random
from PIL import Image
import io

# Hauptordner, der die Objekte enthält
objects_folder = 'all_objects_no_background'
background_folder = 'generated_backgrounds'

# Dictionary zum Speichern der Objekte unter ihren Ordnernamen
objects_dict = {}
backgrounds = []

# Lade Objekte
for subfolder in os.listdir(objects_folder):
    subfolder_path = os.path.join(objects_folder, subfolder)
    
    if os.path.isdir(subfolder_path):
        object_files = [f for f in os.listdir(subfolder_path) if f.endswith('.png')]
        objects_dict[subfolder] = object_files

# Lade Hintergründe
for f in os.listdir(background_folder):
    if f.endswith('.png'):
        backgrounds.append(os.path.join(background_folder, f))

print("done loading images")

# Funktion, um das Objekt zu skalieren
def scale_object(object_image, max_width, max_height):
    obj_width, obj_height = object_image.size
    scale_factor_width = max_width / obj_width
    scale_factor_height =  max_height / obj_height
    new_width = int(obj_width * scale_factor_width)
    new_height = int(obj_height * scale_factor_height)
    return object_image.resize((new_width, new_height), Image.LANCZOS)

# Funktion, um zu überprüfen, ob das Objekt mit einem anderen kollidiert
def check_collision(objects, new_object, x, y):
    new_bbox = (x, y, x + new_object.size[0], y + new_object.size[1])
    for obj, (ox, oy) in objects:
        obj_bbox = (ox, oy, ox + obj.size[0], oy + obj.size[1])
        # Überprüfen, ob sich die Bounding Boxes überschneiden
        if (new_bbox[2] > obj_bbox[0] and new_bbox[0] < obj_bbox[2] and
            new_bbox[3] > obj_bbox[1] and new_bbox[1] < obj_bbox[3]):
            return True
    return False

# Funktion, um Objekte auf dem Hintergrund zu platzieren
def place_objects_on_background(background_image, object_images, object_counts):
    bg_width, bg_height = background_image.size
    
    # Bestimme die maximale Größe der Objekte, basierend auf der Hintergrundgröße
    max_object_width = bg_width // len(object_images)  # Berechne Platz für jedes Objekt
    max_object_height = bg_height // len(object_images)
    
    placed_objects = []
    
    for object_image, object_category in object_images:
        # Skaliere jedes Objekt auf die maximale Größe
        scaled_object = scale_object(object_image, max_object_width, max_object_height)
        
        placed = False
        attempt_count = 0  # Zähler für die Versuche
        while not placed and attempt_count < 4:
            # Zufällige Position für das Objekt auf dem Hintergrund
            x = random.randint(0, bg_width - scaled_object.size[0])
            y = random.randint(0, bg_height - scaled_object.size[1])
            
            # Überprüfen, ob das Objekt mit anderen kollidiert
            if not check_collision(placed_objects, scaled_object, x, y):
                placed_objects.append((scaled_object, (x, y)))
                placed = True
                object_counts[object_category] += 1  # Erhöhe die Anzahl des aktuellen Objekts
            else:
                attempt_count += 1  # Zähle den fehlgeschlagenen Versuch
        
        # Wenn das Objekt nach 4 Versuchen nicht platziert wurde, überspringe es
        if not placed:
            print(f"Object {object_category} could not be placed after 4 attempts, skipping object.")
    
    # Erstelle ein neues Bild mit allen Objekten
    for obj, (x, y) in placed_objects:
        background_image.paste(obj, (x, y), obj)  # Nur die sichtbaren Pixel werden eingefügt
    
    return background_image

# Ausgabeordner erstellen
output_folder = 'generated_images'
os.makedirs(output_folder, exist_ok=True)

# Generiere 1000 Bilder
for i in range(1000):
    # Lade zufälligen Hintergrund
    background_image = Image.open(random.choice(backgrounds))
    
    # Bestimme eine zufällige Anzahl von Objekten (zwischen 1 und 5)
    num_objects = random.randint(1, 5)
    
    # Wähle zufällige Objekte aus
    selected_objects = []
    object_counts = {key: 0 for key in objects_dict.keys()}  # Zähler für jede Objektkategorie
    
    for _ in range(num_objects):
        # Wähle zufälliges Objekt aus einem zufälligen Ordner
        object_category = random.choice(list(objects_dict.keys()))
        object_file = random.choice(objects_dict[object_category])
        object_image = Image.open(os.path.join(objects_folder, object_category, object_file))
        
        selected_objects.append((object_image, object_category))
        print("Selected an object")
    
    # Platziere die Objekte auf dem Hintergrund
    result_image = place_objects_on_background(background_image.convert("RGBA"), selected_objects, object_counts)
    
    # Speichere das generierte Bild
    result_image.save(os.path.join(output_folder, f'generated_image_{i+1}.png'))
    
    # Speichere die Label-Datei
    label_filename = os.path.join(output_folder, f'label_image_{i+1}.txt')
    with open(label_filename, 'w') as label_file:
        label_file.write(str([object_counts[key] for key in objects_dict.keys()]))
    
    # Optional: Bild anzeigen


done loading images
Selected an object
Selected an object
Object generated_apples could not be placed after 4 attempts, skipping object.
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Object generated_tomato_sauce could not be placed after 4 attempts, skipping object.
Selected an object
Selected an object
Selected an object
Selected an object
Object generated_banana could not be placed after 4 attempts, skipping object.
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Selected an object
Object generated_banana could not be placed after 4 attempts, skipping object.
Selected an object
Selected an object
Selected an object
Object generated_banana could not be placed after 4 attempts, skipping object.
Selected an object
Selected an object
Selected an object
Object generated_tomato_sauce could not be placed after 4 attempts, skipping object.
