From f842396367d78bf4bbe294c4940db6940c2494d1 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 26 Apr 2023 18:43:09 +0200
Subject: [PATCH 001/206] Post release for 0.16.0 (#3244)
* Post release
* fix more
---
examples/controlnet/train_controlnet.py | 2 +-
examples/controlnet/train_controlnet_flax.py | 2 +-
examples/custom_diffusion/train_custom_diffusion.py | 2 +-
examples/dreambooth/train_dreambooth.py | 2 +-
examples/dreambooth/train_dreambooth_flax.py | 2 +-
examples/dreambooth/train_dreambooth_lora.py | 2 +-
examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +-
examples/text_to_image/train_text_to_image.py | 2 +-
examples/text_to_image/train_text_to_image_flax.py | 2 +-
examples/text_to_image/train_text_to_image_lora.py | 2 +-
examples/textual_inversion/textual_inversion.py | 2 +-
examples/textual_inversion/textual_inversion_flax.py | 2 +-
.../unconditional_image_generation/train_unconditional.py | 2 +-
setup.py | 2 +-
src/diffusers/__init__.py | 2 +-
src/diffusers/pipelines/pipeline_utils.py | 6 +++---
src/diffusers/utils/hub_utils.py | 2 +-
17 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index 9b9ba5ab737f..d8f7b68a5444 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -55,7 +55,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py
index aff361cb6e01..18d97502c7c4 100644
--- a/examples/controlnet/train_controlnet_flax.py
+++ b/examples/controlnet/train_controlnet_flax.py
@@ -59,7 +59,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 0954f3d6e789..7060f8da4534 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -56,7 +56,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index a9449002ca80..593af005d6f4 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -56,7 +56,7 @@
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index 1a4ca9153c80..2a2200181d8a 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -36,7 +36,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
# Cache compiled models across invocations of this script.
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 805a8d1eea4d..3aa2fb0a8491 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -55,7 +55,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py
index dc5a1c3081c0..de555a50ba50 100644
--- a/examples/instruct_pix2pix/train_instruct_pix2pix.py
+++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py
@@ -51,7 +51,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 1d6db2a6f1da..1d62cb7f816d 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -50,7 +50,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index c5dc71f0536e..2b2255b46353 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -33,7 +33,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 39bdb4e59a52..c2a4e1aacdb7 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -47,7 +47,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 824759cc4ca9..c58f6b600629 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -77,7 +77,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__)
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index 19553ceb92ec..af167c53f275 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -56,7 +56,7 @@
# ------------------------------------------------------------------------------
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 836a38f96286..282f52101a3c 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -28,7 +28,7 @@
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.16.0")
+check_min_version("0.17.0.dev0")
logger = get_logger(__name__, log_level="INFO")
diff --git a/setup.py b/setup.py
index ea98b5d10277..c0df285dcffb 100644
--- a/setup.py
+++ b/setup.py
@@ -226,7 +226,7 @@ def run(self):
setup(
name="diffusers",
- version="0.16.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+ version="0.17.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="Diffusers",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d4dbf1145072..f21a550517eb 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.16.0"
+__version__ = "0.17.0.dev0"
from .configuration_utils import ConfigMixin
from .utils import (
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 8c028b64a8c8..5e4290e8db9f 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1075,8 +1075,8 @@ def load_module(name, value):
return_cached_folder = kwargs.pop("return_cached_folder", False)
if return_cached_folder:
- message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.17.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`."
- deprecate("return_cached_folder", "0.17.0", message)
+ message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.18.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`."
+ deprecate("return_cached_folder", "0.18.0", message)
return model, cached_folder
return model
@@ -1238,7 +1238,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
# if the whole pipeline is cached we don't have to ping the Hub
if revision in DEPRECATED_REVISION_ARGS and version.parse(
version.parse(__version__).base_version
- ) >= version.parse("0.17.0"):
+ ) >= version.parse("0.18.0"):
warn_deprecated_model_variant(
pretrained_model_name, use_auth_token, variant, revision, model_filenames
)
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index 9cfc649c8b86..6e44370a378a 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -280,7 +280,7 @@ def _get_model_file(
if (
revision in DEPRECATED_REVISION_ARGS
and (weights_name == WEIGHTS_NAME or weights_name == SAFETENSORS_WEIGHTS_NAME)
- and version.parse(version.parse(__version__).base_version) >= version.parse("0.17.0")
+ and version.parse(version.parse(__version__).base_version) >= version.parse("0.18.0")
):
try:
model_file = hf_hub_download(
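The deprecation guards touched in this patch all follow the same pattern: compare the installed diffusers version, with any `.dev0` suffix stripped via `base_version`, against the release in which the behaviour is scheduled for removal. A minimal sketch of that check using `packaging` (the version strings here are illustrative, not taken from the patch):
```python
# Sketch of the version-guard pattern from pipeline_utils.py / hub_utils.py.
# The version strings are illustrative; only the comparison logic is the point.
from packaging import version

__version__ = "0.17.0.dev0"  # what setup.py / __init__.py now report

# base_version drops the ".dev0" suffix, so a 0.17.0 dev build compares as 0.17.0
installed = version.parse(version.parse(__version__).base_version)

if installed >= version.parse("0.18.0"):
    print("deprecated revision arguments would now trigger a warning")
else:
    print("still below the 0.18.0 threshold, no warning yet")
```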
From c399de396dbb464be0935f910703eff9f11667ad Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Wed, 26 Apr 2023 21:06:50 +0200
Subject: [PATCH 002/206] [docs] only mention one stage (#3246)
* [docs] only mention one stage
* add blurb on auto accepting
---------
Co-authored-by: William Berman
---
docs/source/en/api/pipelines/if.mdx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/source/en/api/pipelines/if.mdx b/docs/source/en/api/pipelines/if.mdx
index 5d3b292587f6..921a68a29f76 100644
--- a/docs/source/en/api/pipelines/if.mdx
+++ b/docs/source/en/api/pipelines/if.mdx
@@ -28,8 +28,8 @@ Our work underscores the potential of larger UNet architectures in the first sta
## Usage
Before you can use IF, you need to accept its usage conditions. To do so:
-1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in
-2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0)
+1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be logged in
+2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models.
3. Make sure to login locally. Install `huggingface_hub`
```sh
pip install huggingface_hub --upgrade
From e0a2bd15f9a1eb0d48a69973a9c7ddb4eabb1a27 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Wed, 26 Apr 2023 21:22:27 +0200
Subject: [PATCH 003/206] Write model card in controlnet training script
(#3229)
Write model card in controlnet training script.
---
examples/controlnet/train_controlnet.py | 59 ++++++++++++++++++++++++-
1 file changed, 58 insertions(+), 1 deletion(-)
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index d8f7b68a5444..fc46c744cd8b 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -60,6 +60,17 @@
logger = get_logger(__name__)
+def image_grid(imgs, rows, cols):
+ assert len(imgs) == rows * cols
+
+ w, h = imgs[0].size
+ grid = Image.new("RGB", size=(cols * w, rows * h))
+
+ for i, img in enumerate(imgs):
+ grid.paste(img, box=(i % cols * w, i // cols * h))
+ return grid
+
+
def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args, accelerator, weight_dtype, step):
logger.info("Running validation... ")
@@ -156,6 +167,8 @@ def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args, acceler
else:
logger.warn(f"image logging not implemented for {tracker.name}")
+ return image_logs
+
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
text_encoder_config = PretrainedConfig.from_pretrained(
@@ -177,6 +190,43 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st
raise ValueError(f"{model_class} is not supported.")
+def save_model_card(repo_id: str, image_logs=None, base_model: str = None, repo_folder=None):
+ img_str = ""
+ if image_logs is not None:
+ img_str = "You can find some example images below.\n"
+ for i, log in enumerate(image_logs):
+ images = log["images"]
+ validation_prompt = log["validation_prompt"]
+ validation_image = log["validation_image"]
+ validation_image.save(os.path.join(repo_folder, "image_control.png"))
+ img_str += f"prompt: {validation_prompt}\n"
+ images = [validation_image] + images
+ image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+ img_str += f"\n"
+
+ yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- controlnet
+inference: true
+---
+ """
+ model_card = f"""
+# controlnet-{repo_id}
+
+These are controlnet weights trained on {base_model} with a new type of conditioning.
+{img_str}
+"""
+ with open(os.path.join(repo_folder, "README.md"), "w") as f:
+ f.write(yaml + model_card)
+
+
def parse_args(input_args=None):
parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
parser.add_argument(
@@ -943,6 +993,7 @@ def load_model_hook(models, input_dir):
disable=not accelerator.is_local_main_process,
)
+ image_logs = None
for epoch in range(first_epoch, args.num_train_epochs):
for step, batch in enumerate(train_dataloader):
with accelerator.accumulate(controlnet):
@@ -1014,7 +1065,7 @@ def load_model_hook(models, input_dir):
logger.info(f"Saved state to {save_path}")
if args.validation_prompt is not None and global_step % args.validation_steps == 0:
- log_validation(
+ image_logs = log_validation(
vae,
text_encoder,
tokenizer,
@@ -1040,6 +1091,12 @@ def load_model_hook(models, input_dir):
controlnet.save_pretrained(args.output_dir)
if args.push_to_hub:
+ save_model_card(
+ repo_id,
+ image_logs=image_logs,
+ base_model=args.pretrained_model_name_or_path,
+ repo_folder=args.output_dir,
+ )
upload_folder(
repo_id=repo_id,
folder_path=args.output_dir,
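The `image_grid` helper added in this patch is self-contained PIL code; a standalone sketch with solid-color placeholder images in place of real validation outputs shows how the grid layout works:
```python
# Standalone sketch of the image_grid helper introduced in this patch,
# exercised with solid-color placeholder images instead of validation outputs.
from PIL import Image


def image_grid(imgs, rows, cols):
    assert len(imgs) == rows * cols
    w, h = imgs[0].size
    grid = Image.new("RGB", size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        # place image i at column i % cols, row i // cols
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid


images = [Image.new("RGB", (64, 64), c) for c in ("red", "green", "blue")]
image_grid(images, rows=1, cols=3).save("images_0.png")
```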
From fd512d7461cc0bcd686f46a2c573aeb93e5d3cf3 Mon Sep 17 00:00:00 2001
From: Nipun Jindal
Date: Thu, 27 Apr 2023 11:18:38 +0530
Subject: [PATCH 004/206] [2064]: Add stochastic sampler (sample_dpmpp_sde)
(#3020)
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* [2064]: Add stochastic sampler
* Review comments
* [Review comment]: Add is_torchsde_available()
* [Review comment]: Test and docs
* [Review comment]
* [Review comment]
* [Review comment]
* [Review comment]
* [Review comment]
---------
Co-authored-by: njindal
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/api/schedulers/dpm_sde.mdx | 23 +
src/diffusers/__init__.py | 8 +
src/diffusers/schedulers/__init__.py | 16 +-
.../schedulers/scheduling_dpmsolver_sde.py | 447 ++++++++++++++++++
.../schedulers/scheduling_heun_discrete.py | 2 -
src/diffusers/schedulers/scheduling_utils.py | 1 +
src/diffusers/utils/__init__.py | 1 +
.../utils/dummy_torch_and_torchsde_objects.py | 17 +
src/diffusers/utils/import_utils.py | 17 +
src/diffusers/utils/testing_utils.py | 8 +
tests/schedulers/test_scheduler_dpm_sde.py | 156 ++++++
12 files changed, 695 insertions(+), 3 deletions(-)
create mode 100644 docs/source/en/api/schedulers/dpm_sde.mdx
create mode 100644 src/diffusers/schedulers/scheduling_dpmsolver_sde.py
create mode 100644 src/diffusers/utils/dummy_torch_and_torchsde_objects.py
create mode 100644 tests/schedulers/test_scheduler_dpm_sde.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index ccaaff7ca680..35c5fd78a1f6 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -266,6 +266,8 @@
title: VP-SDE
- local: api/schedulers/vq_diffusion
title: VQDiffusionScheduler
+ - local: api/schedulers/dpm_sde
+ title: DPMSolverSDEScheduler
title: Schedulers
- sections:
- local: api/experimental/rl
diff --git a/docs/source/en/api/schedulers/dpm_sde.mdx b/docs/source/en/api/schedulers/dpm_sde.mdx
new file mode 100644
index 000000000000..33ec514cef64
--- /dev/null
+++ b/docs/source/en/api/schedulers/dpm_sde.mdx
@@ -0,0 +1,23 @@
+
+
+# DPM Stochastic Scheduler inspired by the Karras et al. paper
+
+## Overview
+
+Inspired by the stochastic sampler from [Karras et al.](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library.
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+## DPMSolverSDEScheduler
+[[autodoc]] DPMSolverSDEScheduler
\ No newline at end of file
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index f21a550517eb..078d03eb8995 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -12,6 +12,7 @@
is_onnx_available,
is_scipy_available,
is_torch_available,
+ is_torchsde_available,
is_transformers_available,
is_transformers_version,
is_unidecode_available,
@@ -102,6 +103,13 @@
else:
from .schedulers import LMSDiscreteScheduler
+try:
+ if not (is_torch_available() and is_torchsde_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from .utils.dummy_torch_and_torchsde_objects import * # noqa F403
+else:
+ from .schedulers import DPMSolverSDEScheduler
try:
if not (is_torch_available() and is_transformers_available()):
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index e5d5bb40633f..c4b62c722257 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -13,7 +13,13 @@
# limitations under the License.
-from ..utils import OptionalDependencyNotAvailable, is_flax_available, is_scipy_available, is_torch_available
+from ..utils import (
+ OptionalDependencyNotAvailable,
+ is_flax_available,
+ is_scipy_available,
+ is_torch_available,
+ is_torchsde_available,
+)
try:
@@ -72,3 +78,11 @@
from ..utils.dummy_torch_and_scipy_objects import * # noqa F403
else:
from .scheduling_lms_discrete import LMSDiscreteScheduler
+
+try:
+ if not (is_torch_available() and is_torchsde_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ..utils.dummy_torch_and_torchsde_objects import * # noqa F403
+else:
+ from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
new file mode 100644
index 000000000000..ae9229981152
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
@@ -0,0 +1,447 @@
+# Copyright 2023 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torchsde
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+class BatchedBrownianTree:
+ """A wrapper around torchsde.BrownianTree that enables batches of entropy."""
+
+ def __init__(self, x, t0, t1, seed=None, **kwargs):
+ t0, t1, self.sign = self.sort(t0, t1)
+ w0 = kwargs.get("w0", torch.zeros_like(x))
+ if seed is None:
+ seed = torch.randint(0, 2**63 - 1, []).item()
+ self.batched = True
+ try:
+ assert len(seed) == x.shape[0]
+ w0 = w0[0]
+ except TypeError:
+ seed = [seed]
+ self.batched = False
+ self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]
+
+ @staticmethod
+ def sort(a, b):
+ return (a, b, 1) if a < b else (b, a, -1)
+
+ def __call__(self, t0, t1):
+ t0, t1, sign = self.sort(t0, t1)
+ w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
+ return w if self.batched else w[0]
+
+
+class BrownianTreeNoiseSampler:
+ """A noise sampler backed by a torchsde.BrownianTree.
+
+ Args:
+ x (Tensor): The tensor whose shape, device and dtype to use to generate
+ random samples.
+ sigma_min (float): The low end of the valid interval.
+ sigma_max (float): The high end of the valid interval.
+ seed (int or List[int]): The random seed. If a list of seeds is
+ supplied instead of a single integer, then the noise sampler will use one BrownianTree per batch item, each
+ with its own seed.
+ transform (callable): A function that maps sigma to the sampler's
+ internal timestep.
+ """
+
+ def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x):
+ self.transform = transform
+ t0, t1 = self.transform(torch.as_tensor(sigma_min)), self.transform(torch.as_tensor(sigma_max))
+ self.tree = BatchedBrownianTree(x, t0, t1, seed)
+
+ def __call__(self, sigma, sigma_next):
+ t0, t1 = self.transform(torch.as_tensor(sigma)), self.transform(torch.as_tensor(sigma_next))
+ return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Implements Stochastic Sampler (Algorithm 2) from Karras et al. (2022). Based on the original k-diffusion
+ implementation by Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/41b4cb6df0506694a7776af31349acf082bf6091/k_diffusion/sampling.py#L543
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+ [`~SchedulerMixin.from_pretrained`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
+ starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ prediction_type (`str`, default `epsilon`, optional):
+ prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+ process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+ https://imagen.research.google/video/paper.pdf)
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+ This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
+ noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
+ of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
+ noise_sampler_seed (`int`, *optional*, defaults to `None`):
+ The random seed to use for the noise sampler. If `None`, a random seed will be generated.
+ """
+
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+ order = 2
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.00085, # sensible defaults
+ beta_end: float = 0.012,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ use_karras_sigmas: Optional[bool] = False,
+ noise_sampler_seed: Optional[int] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ # set all values
+ self.set_timesteps(num_train_timesteps, None, num_train_timesteps)
+ self.use_karras_sigmas = use_karras_sigmas
+ self.noise_sampler = None
+ self.noise_sampler_seed = noise_sampler_seed
+
+ # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
+ if schedule_timesteps is None:
+ schedule_timesteps = self.timesteps
+
+ indices = (schedule_timesteps == timestep).nonzero()
+
+ if self.state_in_first_order:
+ pos = -1
+ else:
+ pos = 0
+ return indices[pos].item()
+
+ def scale_model_input(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[float, torch.FloatTensor],
+ ) -> torch.FloatTensor:
+ """
+ Args:
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+ sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ step_index = self.index_for_timestep(timestep)
+
+ sigma = self.sigmas[step_index]
+ sigma_input = sigma if self.state_in_first_order else self.mid_point_sigma
+ sample = sample / ((sigma_input**2 + 1) ** 0.5)
+ return sample
+
+ def set_timesteps(
+ self,
+ num_inference_steps: int,
+ device: Union[str, torch.device] = None,
+ num_train_timesteps: Optional[int] = None,
+ ):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps
+
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ log_sigmas = np.log(sigmas)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+ if self.use_karras_sigmas:
+ sigmas = self._convert_to_karras(in_sigmas=sigmas)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
+ second_order_timesteps = self._second_order_timesteps(sigmas, log_sigmas)
+
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+ sigmas = torch.from_numpy(sigmas).to(device=device)
+ self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]])
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ timesteps = torch.from_numpy(timesteps)
+ second_order_timesteps = torch.from_numpy(second_order_timesteps)
+ timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])
+ timesteps[1::2] = second_order_timesteps
+
+ if str(device).startswith("mps"):
+ # mps does not support float64
+ self.timesteps = timesteps.to(device, dtype=torch.float32)
+ else:
+ self.timesteps = timesteps.to(device=device)
+
+ # empty first order variables
+ self.sample = None
+ self.mid_point_sigma = None
+
+ def _second_order_timesteps(self, sigmas, log_sigmas):
+ def sigma_fn(_t):
+ return np.exp(-_t)
+
+ def t_fn(_sigma):
+ return -np.log(_sigma)
+
+ midpoint_ratio = 0.5
+ t = t_fn(sigmas)
+ delta_time = np.diff(t)
+ t_proposed = t[:-1] + delta_time * midpoint_ratio
+ sig_proposed = sigma_fn(t_proposed)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sig_proposed])
+ return timesteps
+
+ # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+ def _sigma_to_t(self, sigma, log_sigmas):
+ # get log sigma
+ log_sigma = np.log(sigma)
+
+ # get distribution
+ dists = log_sigma - log_sigmas[:, np.newaxis]
+
+ # get sigmas range
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+ high_idx = low_idx + 1
+
+ low = log_sigmas[low_idx]
+ high = log_sigmas[high_idx]
+
+ # interpolate sigmas
+ w = (low - log_sigma) / (low - high)
+ w = np.clip(w, 0, 1)
+
+ # transform interpolation to time range
+ t = (1 - w) * low_idx + w * high_idx
+ t = t.reshape(sigma.shape)
+ return t
+
+ # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor:
+ """Constructs the noise schedule of Karras et al. (2022)."""
+
+ sigma_min: float = in_sigmas[-1].item()
+ sigma_max: float = in_sigmas[0].item()
+
+ rho = 7.0 # 7.0 is the value used in the paper
+ ramp = np.linspace(0, 1, self.num_inference_steps)
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+ return sigmas
+
+ @property
+ def state_in_first_order(self):
+ return self.sample is None
+
+ def step(
+ self,
+ model_output: Union[torch.FloatTensor, np.ndarray],
+ timestep: Union[float, torch.FloatTensor],
+ sample: Union[torch.FloatTensor, np.ndarray],
+ return_dict: bool = True,
+ s_noise: float = 1.0,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Args:
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+ model_output (Union[torch.FloatTensor, np.ndarray]): Direct output from learned diffusion model.
+ timestep (Union[float, torch.FloatTensor]): Current discrete timestep in the diffusion chain.
+ sample (Union[torch.FloatTensor, np.ndarray]): Current instance of sample being created by diffusion process.
+ return_dict (bool, optional): Option for returning tuple rather than SchedulerOutput class. Defaults to True.
+ s_noise (float, optional): Scaling factor for the noise added to the sample. Defaults to 1.0.
+ Returns:
+ [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+ """
+ step_index = self.index_for_timestep(timestep)
+
+ # Create a noise sampler if it hasn't been created yet
+ if self.noise_sampler is None:
+ min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max()
+ self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed)
+
+ # Define functions to compute sigma and t from each other
+ def sigma_fn(_t: torch.FloatTensor) -> torch.FloatTensor:
+ return _t.neg().exp()
+
+ def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor:
+ return _sigma.log().neg()
+
+ if self.state_in_first_order:
+ sigma = self.sigmas[step_index]
+ sigma_next = self.sigmas[step_index + 1]
+ else:
+ # 2nd order
+ sigma = self.sigmas[step_index - 1]
+ sigma_next = self.sigmas[step_index]
+
+ # Set the midpoint and step size for the current step
+ midpoint_ratio = 0.5
+ t, t_next = t_fn(sigma), t_fn(sigma_next)
+ delta_time = t_next - t
+ t_proposed = t + delta_time * midpoint_ratio
+
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ if self.config.prediction_type == "epsilon":
+ sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed)
+ pred_original_sample = sample - sigma_input * model_output
+ elif self.config.prediction_type == "v_prediction":
+ sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed)
+ pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
+ sample / (sigma_input**2 + 1)
+ )
+ elif self.config.prediction_type == "sample":
+ raise NotImplementedError("prediction_type not implemented yet: sample")
+ else:
+ raise ValueError(
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+ )
+
+ if sigma_next == 0:
+ derivative = (sample - pred_original_sample) / sigma
+ dt = sigma_next - sigma
+ prev_sample = sample + derivative * dt
+ else:
+ if self.state_in_first_order:
+ t_next = t_proposed
+ else:
+ sample = self.sample
+
+ sigma_from = sigma_fn(t)
+ sigma_to = sigma_fn(t_next)
+ sigma_up = min(sigma_to, (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5)
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+ ancestral_t = t_fn(sigma_down)
+ prev_sample = (sigma_fn(ancestral_t) / sigma_fn(t)) * sample - (
+ t - ancestral_t
+ ).expm1() * pred_original_sample
+ prev_sample = prev_sample + self.noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * sigma_up
+
+ if self.state_in_first_order:
+ # store for 2nd order step
+ self.sample = sample
+ self.mid_point_sigma = sigma_fn(t_next)
+ else:
+ # free for "first order mode"
+ self.sample = None
+ self.mid_point_sigma = None
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample)
+
+ # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.add_noise
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
+ sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+ else:
+ schedule_timesteps = self.timesteps.to(original_samples.device)
+ timesteps = timesteps.to(original_samples.device)
+
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+
+ sigma = sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py
index 2b32cad39925..100e2012ea20 100644
--- a/src/diffusers/schedulers/scheduling_heun_discrete.py
+++ b/src/diffusers/schedulers/scheduling_heun_discrete.py
@@ -70,8 +70,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
`linear` or `scaled_linear`.
trained_betas (`np.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
- options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
- `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
prediction_type (`str`, default `epsilon`, optional):
prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py
index a4121f75d850..0f95beb022ac 100644
--- a/src/diffusers/schedulers/scheduling_utils.py
+++ b/src/diffusers/schedulers/scheduling_utils.py
@@ -43,6 +43,7 @@ class KarrasDiffusionSchedulers(Enum):
KDPM2AncestralDiscreteScheduler = 11
DEISMultistepScheduler = 12
UniPCMultistepScheduler = 13
+ DPMSolverSDEScheduler = 14
@dataclass
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 1b8eca050c9e..f3e4c9d1d0ec 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -70,6 +70,7 @@
is_tf_available,
is_torch_available,
is_torch_version,
+ is_torchsde_available,
is_transformers_available,
is_transformers_version,
is_unidecode_available,
diff --git a/src/diffusers/utils/dummy_torch_and_torchsde_objects.py b/src/diffusers/utils/dummy_torch_and_torchsde_objects.py
new file mode 100644
index 000000000000..a81bbb316f32
--- /dev/null
+++ b/src/diffusers/utils/dummy_torch_and_torchsde_objects.py
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class DPMSolverSDEScheduler(metaclass=DummyObject):
+ _backends = ["torch", "torchsde"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "torchsde"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "torchsde"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "torchsde"])
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index 2d90cb9747a7..4ded0f272462 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -287,6 +287,13 @@
except importlib_metadata.PackageNotFoundError:
_bs4_available = False
+_torchsde_available = importlib.util.find_spec("torchsde") is not None
+try:
+ _torchsde_version = importlib_metadata.version("torchsde")
+ logger.debug(f"Successfully imported torchsde version {_torchsde_version}")
+except importlib_metadata.PackageNotFoundError:
+ _torchsde_available = False
+
def is_torch_available():
return _torch_available
@@ -372,6 +379,10 @@ def is_bs4_available():
return _bs4_available
+def is_torchsde_available():
+ return _torchsde_available
+
+
# docstyle-ignore
FLAX_IMPORT_ERROR = """
{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
@@ -475,6 +486,11 @@ def is_bs4_available():
that match your environment. Please note that you may need to restart your runtime after installation.
"""
+# docstyle-ignore
+TORCHSDE_IMPORT_ERROR = """
+{0} requires the torchsde library but it was not found in your environment. You can install it with pip: `pip install torchsde`
+"""
+
BACKENDS_MAPPING = OrderedDict(
[
@@ -495,6 +511,7 @@ def is_bs4_available():
("tensorboard", (_tensorboard_available, TENSORBOARD_IMPORT_ERROR)),
("compel", (_compel_available, COMPEL_IMPORT_ERROR)),
("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
+ ("torchsde", (_torchsde_available, TORCHSDE_IMPORT_ERROR)),
]
)
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index d8fed5dec1c8..4ad7d97b4462 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -26,6 +26,7 @@
is_opencv_available,
is_torch_available,
is_torch_version,
+ is_torchsde_available,
)
from .logging import get_logger
@@ -216,6 +217,13 @@ def require_note_seq(test_case):
return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case)
+def require_torchsde(test_case):
+ """
+ Decorator marking a test that requires torchsde. These tests are skipped when torchsde isn't installed.
+ """
+ return unittest.skipUnless(is_torchsde_available(), "test requires torchsde")(test_case)
+
+
def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray:
if isinstance(arry, str):
# local_path = "/home/patrick_huggingface_co/"
diff --git a/tests/schedulers/test_scheduler_dpm_sde.py b/tests/schedulers/test_scheduler_dpm_sde.py
new file mode 100644
index 000000000000..010c4bdb1196
--- /dev/null
+++ b/tests/schedulers/test_scheduler_dpm_sde.py
@@ -0,0 +1,156 @@
+import torch
+
+from diffusers import DPMSolverSDEScheduler
+from diffusers.utils import torch_device
+from diffusers.utils.testing_utils import require_torchsde
+
+from .test_schedulers import SchedulerCommonTest
+
+
+@require_torchsde
+class DPMSolverSDESchedulerTest(SchedulerCommonTest):
+ scheduler_classes = (DPMSolverSDEScheduler,)
+ num_inference_steps = 10
+
+ def get_scheduler_config(self, **kwargs):
+ config = {
+ "num_train_timesteps": 1100,
+ "beta_start": 0.0001,
+ "beta_end": 0.02,
+ "beta_schedule": "linear",
+ "noise_sampler_seed": 0,
+ }
+
+ config.update(**kwargs)
+ return config
+
+ def test_timesteps(self):
+ for timesteps in [10, 50, 100, 1000]:
+ self.check_over_configs(num_train_timesteps=timesteps)
+
+ def test_betas(self):
+ for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+ self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+ def test_schedules(self):
+ for schedule in ["linear", "scaled_linear"]:
+ self.check_over_configs(beta_schedule=schedule)
+
+ def test_prediction_type(self):
+ for prediction_type in ["epsilon", "v_prediction"]:
+ self.check_over_configs(prediction_type=prediction_type)
+
+ def test_full_loop_no_noise(self):
+ scheduler_class = self.scheduler_classes[0]
+ scheduler_config = self.get_scheduler_config()
+ scheduler = scheduler_class(**scheduler_config)
+
+ scheduler.set_timesteps(self.num_inference_steps)
+
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+ sample = sample.to(torch_device)
+
+ for i, t in enumerate(scheduler.timesteps):
+ sample = scheduler.scale_model_input(sample, t)
+
+ model_output = model(sample, t)
+
+ output = scheduler.step(model_output, t, sample)
+ sample = output.prev_sample
+
+ result_sum = torch.sum(torch.abs(sample))
+ result_mean = torch.mean(torch.abs(sample))
+
+ if torch_device in ["mps"]:
+ assert abs(result_sum.item() - 167.47821044921875) < 1e-2
+ assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+ else:
+ assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+ assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+ def test_full_loop_with_v_prediction(self):
+ scheduler_class = self.scheduler_classes[0]
+ scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+ scheduler = scheduler_class(**scheduler_config)
+
+ scheduler.set_timesteps(self.num_inference_steps)
+
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+ sample = sample.to(torch_device)
+
+ for i, t in enumerate(scheduler.timesteps):
+ sample = scheduler.scale_model_input(sample, t)
+
+ model_output = model(sample, t)
+
+ output = scheduler.step(model_output, t, sample)
+ sample = output.prev_sample
+
+ result_sum = torch.sum(torch.abs(sample))
+ result_mean = torch.mean(torch.abs(sample))
+
+ if torch_device in ["mps"]:
+ assert abs(result_sum.item() - 124.77149200439453) < 1e-2
+ assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+ else:
+ assert abs(result_sum.item() - 119.8487548828125) < 1e-2
+ assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
+
+ def test_full_loop_device(self):
+ scheduler_class = self.scheduler_classes[0]
+ scheduler_config = self.get_scheduler_config()
+ scheduler = scheduler_class(**scheduler_config)
+
+ scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+ for t in scheduler.timesteps:
+ sample = scheduler.scale_model_input(sample, t)
+
+ model_output = model(sample, t)
+
+ output = scheduler.step(model_output, t, sample)
+ sample = output.prev_sample
+
+ result_sum = torch.sum(torch.abs(sample))
+ result_mean = torch.mean(torch.abs(sample))
+
+ if torch_device in ["mps"]:
+ assert abs(result_sum.item() - 167.46957397460938) < 1e-2
+ assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+ else:
+ assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+ assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+ def test_full_loop_device_karras_sigmas(self):
+ scheduler_class = self.scheduler_classes[0]
+ scheduler_config = self.get_scheduler_config()
+ scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+ scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+ sample = sample.to(torch_device)
+
+ for t in scheduler.timesteps:
+ sample = scheduler.scale_model_input(sample, t)
+
+ model_output = model(sample, t)
+
+ output = scheduler.step(model_output, t, sample)
+ sample = output.prev_sample
+
+ result_sum = torch.sum(torch.abs(sample))
+ result_mean = torch.mean(torch.abs(sample))
+
+ if torch_device in ["mps"]:
+ assert abs(result_sum.item() - 176.66974135742188) < 1e-2
+ assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+ else:
+ assert abs(result_sum.item() - 170.3135223388672) < 1e-2
+ assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
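Because `DPMSolverSDEScheduler` is registered in `KarrasDiffusionSchedulers`, it can be dropped into an existing Stable Diffusion pipeline via `from_config`. A minimal usage sketch, assuming a standard checkpoint and that `torchsde` is installed:
```python
# Sketch of swapping the new scheduler into a pipeline. The checkpoint name is
# an assumption for illustration; torchsde must be installed for the import to work.
import torch

from diffusers import DiffusionPipeline, DPMSolverSDEScheduler

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# reuse the pipeline's scheduler config so beta schedule, timesteps, etc. carry over
pipe.scheduler = DPMSolverSDEScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
image.save("dpmsolver_sde_sample.png")
```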
From 0b64c2c6c318d52c08592b456694d8ae15f820fa Mon Sep 17 00:00:00 2001
From: Nipun Jindal
Date: Thu, 27 Apr 2023 14:52:38 +0530
Subject: [PATCH 005/206] [Stochastic Sampler][Slow Test]: Cuda test fixes
(#3257)
[Slow Test]: Cuda test fixes
Co-authored-by: njindal
---
tests/schedulers/test_scheduler_dpm_sde.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/tests/schedulers/test_scheduler_dpm_sde.py b/tests/schedulers/test_scheduler_dpm_sde.py
index 010c4bdb1196..7906c8d5d4e9 100644
--- a/tests/schedulers/test_scheduler_dpm_sde.py
+++ b/tests/schedulers/test_scheduler_dpm_sde.py
@@ -65,6 +65,9 @@ def test_full_loop_no_noise(self):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.47821044921875) < 1e-2
assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+ elif torch_device in ["cuda"]:
+ assert abs(result_sum.item() - 171.59352111816406) < 1e-2
+ assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
else:
assert abs(result_sum.item() - 162.52383422851562) < 1e-2
assert abs(result_mean.item() - 0.211619570851326) < 1e-3
@@ -94,6 +97,9 @@ def test_full_loop_with_v_prediction(self):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 124.77149200439453) < 1e-2
assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+ elif torch_device in ["cuda"]:
+ assert abs(result_sum.item() - 128.1663360595703) < 1e-2
+ assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
else:
assert abs(result_sum.item() - 119.8487548828125) < 1e-2
assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
@@ -122,6 +128,9 @@ def test_full_loop_device(self):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.46957397460938) < 1e-2
assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+ elif torch_device in ["cuda"]:
+ assert abs(result_sum.item() - 171.59353637695312) < 1e-2
+ assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
else:
assert abs(result_sum.item() - 162.52383422851562) < 1e-2
assert abs(result_mean.item() - 0.211619570851326) < 1e-3
@@ -151,6 +160,9 @@ def test_full_loop_device_karras_sigmas(self):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 176.66974135742188) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+ elif torch_device in ["cuda"]:
+ assert abs(result_sum.item() - 177.63653564453125) < 1e-2
+ assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
else:
assert abs(result_sum.item() - 170.3135223388672) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
From 70ef774fa0f7016dce962d6f36788ce58847a1b3 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Thu, 27 Apr 2023 13:29:18 +0200
Subject: [PATCH 006/206] Remove required from tracker_project_name (#3260)
Remove required from tracker_project_name.
As observed by https://github.com/off99555 in https://github.com/huggingface/diffusers/issues/2695#issuecomment-1470755050, it already has a default value.
---
examples/controlnet/train_controlnet.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index fc46c744cd8b..9754c25b81e9 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -536,7 +536,6 @@ def parse_args(input_args=None):
"--tracker_project_name",
type=str,
default="train_controlnet",
- required=True,
help=(
"The `project_name` argument passed to Accelerator.init_trackers for"
" more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
From 77bfb562414e93f35b5d6cea8431d5163fe46191 Mon Sep 17 00:00:00 2001
From: Isaac <34376531+init-22@users.noreply.github.com>
Date: Thu, 27 Apr 2023 17:01:43 +0530
Subject: [PATCH 007/206] adding required parameters while calling the
get_up_block and get_down_block (#3210)
* removed unnecessary parameters from get_up_block and get_down_block functions
* adding resnet_skip_time_act, resnet_out_scale_factor and cross_attention_norm to get_up_block and get_down_block functions
---------
Co-authored-by: Sayak Paul
---
.../pipelines/versatile_diffusion/modeling_text_unet.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 57e1abc7315b..0959e2bb3a8b 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -42,6 +42,9 @@ def get_down_block(
only_cross_attention=False,
upcast_attention=False,
resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
):
down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
if down_block_type == "DownBlockFlat":
@@ -98,6 +101,9 @@ def get_up_block(
only_cross_attention=False,
upcast_attention=False,
resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
):
up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
if up_block_type == "UpBlockFlat":
From fa31da29e591ed2e64a7c6ba9153c0b2e5a0ddc2 Mon Sep 17 00:00:00 2001
From: Ernie Chu <51432514+ernestchu@users.noreply.github.com>
Date: Thu, 27 Apr 2023 20:24:51 +0800
Subject: [PATCH 008/206] [docs] Update interface in repaint.mdx (#3119)
Update repaint.mdx
Accommodate #1701
---
docs/source/en/api/pipelines/repaint.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/api/pipelines/repaint.mdx b/docs/source/en/api/pipelines/repaint.mdx
index 927398d0bf54..895d3011883c 100644
--- a/docs/source/en/api/pipelines/repaint.mdx
+++ b/docs/source/en/api/pipelines/repaint.mdx
@@ -60,7 +60,7 @@ pipe = pipe.to("cuda")
generator = torch.Generator(device="cuda").manual_seed(0)
output = pipe(
- original_image=original_image,
+ image=original_image,
mask_image=mask_image,
num_inference_steps=250,
eta=0.0,
From eade4308dabc7f7ba75eab508d386b66b3764513 Mon Sep 17 00:00:00 2001
From: apolinário
Date: Thu, 27 Apr 2023 15:26:58 +0200
Subject: [PATCH 009/206] Update IF name to XL (#3262)
Co-authored-by: multimodalart
---
docs/source/en/api/pipelines/if.mdx | 32 +++++++++----------
.../pipelines/deepfloyd_if/pipeline_if.py | 2 +-
.../deepfloyd_if/pipeline_if_img2img.py | 2 +-
.../pipeline_if_img2img_superresolution.py | 2 +-
.../deepfloyd_if/pipeline_if_inpainting.py | 2 +-
.../pipeline_if_inpainting_superresolution.py | 2 +-
.../pipeline_if_superresolution.py | 2 +-
tests/pipelines/deepfloyd_if/test_if.py | 2 +-
8 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/docs/source/en/api/pipelines/if.mdx b/docs/source/en/api/pipelines/if.mdx
index 921a68a29f76..d79c7035fb75 100644
--- a/docs/source/en/api/pipelines/if.mdx
+++ b/docs/source/en/api/pipelines/if.mdx
@@ -29,7 +29,7 @@ Our work underscores the potential of larger UNet architectures in the first sta
Before you can use IF, you need to accept its usage conditions. To do so:
1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be logged in
-2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models.
+2. Accept the license on the model card of [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models.
3. Make sure to login locally. Install `huggingface_hub`
```sh
pip install huggingface_hub --upgrade
@@ -62,7 +62,7 @@ The following sections give more in-detail examples of how to use IF. Specifical
**Available checkpoints**
- *Stage-1*
- - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0)
+ - [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0)
- [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0)
- [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0)
@@ -90,7 +90,7 @@ from diffusers.utils import pt_to_pil
import torch
# stage 1
-stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
@@ -162,7 +162,7 @@ original_image = Image.open(BytesIO(response.content)).convert("RGB")
original_image = original_image.resize((768, 512))
# stage 1
-stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
@@ -244,7 +244,7 @@ mask_image = Image.open(BytesIO(response.content))
mask_image = mask_image
# stage 1
-stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
@@ -305,7 +305,7 @@ In addition to being loaded with `from_pretrained`, Pipelines can also be loaded
```python
from diffusers import IFPipeline, IFSuperResolutionPipeline
-pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0")
+pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")
pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0")
@@ -326,7 +326,7 @@ pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components)
The simplest optimization to run IF faster is to move all model components to the GPU.
```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
```
@@ -352,7 +352,7 @@ the input image which also determines how many steps to run in the denoising pro
A smaller number will vary the image less but run faster.
```py
-pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
image = pipe(image=image, prompt="", strength=0.3).images
@@ -364,7 +364,7 @@ with IF and it might not give expected results.
```py
import torch
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
pipe.text_encoder = torch.compile(pipe.text_encoder)
@@ -378,14 +378,14 @@ When optimizing for GPU memory, we can use the standard diffusers cpu offloading
Either the model based CPU offloading,
```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
```
or the more aggressive layer based CPU offloading.
```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_sequential_cpu_offload()
```
@@ -395,13 +395,13 @@ Additionally, T5 can be loaded in 8bit precision
from transformers import T5EncoderModel
text_encoder = T5EncoderModel.from_pretrained(
- "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
+ "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
)
from diffusers import DiffusionPipeline
pipe = DiffusionPipeline.from_pretrained(
- "DeepFloyd/IF-I-IF-v1.0",
+ "DeepFloyd/IF-I-XL-v1.0",
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
unet=None,
device_map="auto",
@@ -422,13 +422,13 @@ from transformers import T5EncoderModel
from diffusers.utils import pt_to_pil
text_encoder = T5EncoderModel.from_pretrained(
- "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
+ "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
)
# text to image
pipe = DiffusionPipeline.from_pretrained(
- "DeepFloyd/IF-I-IF-v1.0",
+ "DeepFloyd/IF-I-XL-v1.0",
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
unet=None,
device_map="auto",
@@ -444,7 +444,7 @@ gc.collect()
torch.cuda.empty_cache()
pipe = IFPipeline.from_pretrained(
- "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
+ "DeepFloyd/IF-I-XL-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
)
generator = torch.Generator().manual_seed(0)
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
index a76e51a3ffe9..479ffa9e6635 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
@@ -41,7 +41,7 @@
>>> from diffusers.utils import pt_to_pil
>>> import torch
- >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+ >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
>>> pipe.enable_model_cpu_offload()
>>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
index a31748450d4b..fac4adeea463 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
@@ -70,7 +70,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
>>> original_image = original_image.resize((768, 512))
>>> pipe = IFImg2ImgPipeline.from_pretrained(
- ... "DeepFloyd/IF-I-IF-v1.0",
+ ... "DeepFloyd/IF-I-XL-v1.0",
... variant="fp16",
... torch_dtype=torch.float16,
... )
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
index 21e280654cf5..eed1bb43e5d8 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
@@ -73,7 +73,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
>>> original_image = original_image.resize((768, 512))
>>> pipe = IFImg2ImgPipeline.from_pretrained(
- ... "DeepFloyd/IF-I-IF-v1.0",
+ ... "DeepFloyd/IF-I-XL-v1.0",
... variant="fp16",
... torch_dtype=torch.float16,
... )
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
index 95eba1cc7d24..d3651f5169c1 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
@@ -76,7 +76,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
>>> mask_image = mask_image
>>> pipe = IFInpaintingPipeline.from_pretrained(
- ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16
+ ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
... )
>>> pipe.enable_model_cpu_offload()
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
index 4eb0bf300fa5..5ea6a47082ae 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
@@ -78,7 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
>>> mask_image = mask_image
>>> pipe = IFInpaintingPipeline.from_pretrained(
- ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16
+ ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
... )
>>> pipe.enable_model_cpu_offload()
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
index bb1d4ee4ba66..a62a51b0972f 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -45,7 +45,7 @@
>>> from diffusers.utils import pt_to_pil
>>> import torch
- >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+ >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
>>> pipe.enable_model_cpu_offload()
>>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py
index e2204cb601a6..bf01c2350d22 100644
--- a/tests/pipelines/deepfloyd_if/test_if.py
+++ b/tests/pipelines/deepfloyd_if/test_if.py
@@ -94,7 +94,7 @@ def tearDown(self):
def test_all(self):
# if
- pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+ pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe_2 = IFSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None
From d92c4d5ab703746cba2e7fff4fa4441066eee9c8 Mon Sep 17 00:00:00 2001
From: Xie Zejian
Date: Thu, 27 Apr 2023 22:39:14 +0800
Subject: [PATCH 010/206] fix typo in score sde pipeline (#3132)
---
src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
index 60a6f1e70f4a..3ff7b8ee460b 100644
--- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
+++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -33,7 +33,7 @@ class ScoreSdeVePipeline(DiffusionPipeline):
unet: UNet2DModel
scheduler: ScoreSdeVeScheduler
- def __init__(self, unet: UNet2DModel, scheduler: DiffusionPipeline):
+ def __init__(self, unet: UNet2DModel, scheduler: ScoreSdeVeScheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
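With the annotation corrected, the declared scheduler type matches what the pipeline actually receives. A minimal sketch, assuming the `google/ncsnpp-celebahq-256` checkpoint referenced in the library's score-SDE docs:

```python
from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler

# Checkpoint id is an assumption; any VE score-SDE checkpoint with a matching scheduler works.
pipe = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-celebahq-256")

# After the fix, the annotated type and the runtime type agree.
assert isinstance(pipe.scheduler, ScoreSdeVeScheduler)

image = pipe(num_inference_steps=10).images[0]
```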
From eb29dbad1753f1e76a8b171e8dcc677dc05398e5 Mon Sep 17 00:00:00 2001
From: Jair Trejo
Date: Thu, 27 Apr 2023 08:24:12 -0700
Subject: [PATCH 011/206] Fix typo in textual inversion JAX training script
(#3123)
The pipeline is built as `pipe` but then used as `pipeline`.
---
docs/source/en/training/text_inversion.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx
index 4cbab9886045..76e7f0dcc8f2 100644
--- a/docs/source/en/training/text_inversion.mdx
+++ b/docs/source/en/training/text_inversion.mdx
@@ -245,7 +245,7 @@ from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline
model_path = "path-to-your-trained-model"
-pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
+pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
prompt = "A backpack"
prng_seed = jax.random.PRNGKey(0)
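The cells that follow in that guide reuse the same variable, so the rename keeps the snippet self-consistent end to end. A minimal sketch of the downstream usage, with the model path and prompt as placeholders:

```python
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard

from diffusers import FlaxStableDiffusionPipeline

model_path = "path-to-your-trained-model"  # placeholder
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)

prompt = "A <cat-toy> backpack"  # placeholder prompt using the learned token
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50

num_samples = jax.device_count()
prompt_ids = pipeline.prepare_inputs([prompt] * num_samples)

# shard the inputs and the RNG across devices
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)

# the same `pipeline` name is used here, which is what the typo fix restores
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
```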
From b63419a28a93ae4f4e6ced0ffff07c042d3af474 Mon Sep 17 00:00:00 2001
From: Robert Dargavel Smith
Date: Thu, 27 Apr 2023 16:27:41 +0100
Subject: [PATCH 012/206] AudioDiffusionPipeline - fix encode method after
config changes (#3114)
* config fixes
* deprecate get_input_dims
---
.../pipeline_audio_diffusion.py | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
index 1df76ed6c52c..629a2e7d32ca 100644
--- a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
+++ b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
@@ -51,21 +51,6 @@ def __init__(
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae)
- def get_input_dims(self) -> Tuple:
- """Returns dimension of input image
-
- Returns:
- `Tuple`: (height, width)
- """
- input_module = self.vqvae if self.vqvae is not None else self.unet
- # For backwards compatibility
- sample_size = (
- (input_module.config.sample_size, input_module.config.sample_size)
- if type(input_module.config.sample_size) == int
- else input_module.config.sample_size
- )
- return sample_size
-
def get_default_steps(self) -> int:
"""Returns default number of steps recommended for inference
@@ -123,8 +108,6 @@ def __call__(
# For backwards compatibility
if type(self.unet.config.sample_size) == int:
self.unet.config.sample_size = (self.unet.config.sample_size, self.unet.config.sample_size)
- input_dims = self.get_input_dims()
- self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0])
if noise is None:
noise = randn_tensor(
(
@@ -234,7 +217,7 @@ def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray:
sample = torch.Tensor(sample).to(self.device)
for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))):
- prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps
+ prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
alpha_prod_t = self.scheduler.alphas_cumprod[t]
alpha_prod_t_prev = (
self.scheduler.alphas_cumprod[prev_timestep]
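The second hunk follows the library's config refactor: registered scheduler hyperparameters are read from `scheduler.config` rather than as bare attributes. A minimal sketch of the pattern, using `DDIMScheduler` as a stand-in for the pipeline's scheduler:

```python
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)

# `num_train_timesteps` is a registered config value, so it lives on `.config`;
# `num_inference_steps` is runtime state set by `set_timesteps`, so it stays a plain attribute.
step_ratio = scheduler.config.num_train_timesteps // scheduler.num_inference_steps
prev_timestep = scheduler.timesteps[0] - step_ratio
print(step_ratio, int(prev_timestep))
```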
From 2ced899cc7cff5c37f2186819c90538ce301908c Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 27 Apr 2023 17:45:37 +0200
Subject: [PATCH 013/206] Revert "Revert "[Community Pipelines] Update
lpw_stable_diffusion pipeline"" (#3265)
Revert "Revert "[Community Pipelines] Update lpw_stable_diffusion pipeline" (#3201)"
This reverts commit 91a2a80eb2f98a9f64b9e287715add244dc6f2f3.
---
examples/community/lpw_stable_diffusion.py | 749 +++++++++++++++------
1 file changed, 533 insertions(+), 216 deletions(-)
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index e912ad5244be..56fb903c7106 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -1,6 +1,6 @@
import inspect
import re
-from typing import Callable, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL
@@ -8,32 +8,23 @@
from packaging import version
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-import diffusers
-from diffusers import SchedulerMixin, StableDiffusionPipeline
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
-from diffusers.utils import logging
-
-
-try:
- from diffusers.utils import PIL_INTERPOLATION
-except ImportError:
- if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
- PIL_INTERPOLATION = {
- "linear": PIL.Image.Resampling.BILINEAR,
- "bilinear": PIL.Image.Resampling.BILINEAR,
- "bicubic": PIL.Image.Resampling.BICUBIC,
- "lanczos": PIL.Image.Resampling.LANCZOS,
- "nearest": PIL.Image.Resampling.NEAREST,
- }
- else:
- PIL_INTERPOLATION = {
- "linear": PIL.Image.LINEAR,
- "bilinear": PIL.Image.BILINEAR,
- "bicubic": PIL.Image.BICUBIC,
- "lanczos": PIL.Image.LANCZOS,
- "nearest": PIL.Image.NEAREST,
- }
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+ PIL_INTERPOLATION,
+ deprecate,
+ is_accelerate_available,
+ is_accelerate_version,
+ logging,
+ randn_tensor,
+)
+
+
# ------------------------------------------------------------------------------
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -144,7 +135,7 @@ def multiply_range(start_position, multiplier):
return res
-def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
+def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
r"""
Tokenize a list of prompts and return its tokens with weights of each token.
@@ -205,7 +196,7 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos
def get_unweighted_text_embeddings(
- pipe: StableDiffusionPipeline,
+ pipe: DiffusionPipeline,
text_input: torch.Tensor,
chunk_length: int,
no_boseos_middle: Optional[bool] = True,
@@ -245,7 +236,7 @@ def get_unweighted_text_embeddings(
def get_weighted_text_embeddings(
- pipe: StableDiffusionPipeline,
+ pipe: DiffusionPipeline,
prompt: Union[str, List[str]],
uncond_prompt: Optional[Union[str, List[str]]] = None,
max_embeddings_multiples: Optional[int] = 3,
@@ -261,7 +252,7 @@ def get_weighted_text_embeddings(
Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
Args:
- pipe (`StableDiffusionPipeline`):
+ pipe (`DiffusionPipeline`):
Pipe to provide access to the tokenizer and the text encoder.
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
@@ -349,7 +340,7 @@ def get_weighted_text_embeddings(
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
)
- prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
@@ -357,7 +348,7 @@ def get_weighted_text_embeddings(
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
)
- uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
+ uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or in a whole (current implementation)?
@@ -377,30 +368,50 @@ def get_weighted_text_embeddings(
return text_embeddings, None
-def preprocess_image(image):
+def preprocess_image(image, batch_size):
w, h = image.size
- w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
image = np.array(image).astype(np.float32) / 255.0
- image = image[None].transpose(0, 3, 1, 2)
+ image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size)
image = torch.from_numpy(image)
return 2.0 * image - 1.0
-def preprocess_mask(mask, scale_factor=8):
- mask = mask.convert("L")
- w, h = mask.size
- w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32
- mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
- mask = np.array(mask).astype(np.float32) / 255.0
- mask = np.tile(mask, (4, 1, 1))
- mask = mask[None].transpose(0, 1, 2, 3) # what does this step do?
- mask = 1 - mask # repaint white, keep black
- mask = torch.from_numpy(mask)
- return mask
+def preprocess_mask(mask, batch_size, scale_factor=8):
+ if not isinstance(mask, torch.FloatTensor):
+ mask = mask.convert("L")
+ w, h = mask.size
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+ mask = np.array(mask).astype(np.float32) / 255.0
+ mask = np.tile(mask, (4, 1, 1))
+ mask = np.vstack([mask[None]] * batch_size)
+ mask = 1 - mask # repaint white, keep black
+ mask = torch.from_numpy(mask)
+ return mask
+
+ else:
+ valid_mask_channel_sizes = [1, 3]
+ # if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W)
+ if mask.shape[3] in valid_mask_channel_sizes:
+ mask = mask.permute(0, 3, 1, 2)
+ elif mask.shape[1] not in valid_mask_channel_sizes:
+ raise ValueError(
+ f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension,"
+ f" but received mask of shape {tuple(mask.shape)}"
+ )
+ # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape
+ mask = mask.mean(dim=1, keepdim=True)
+ h, w = mask.shape[-2:]
+ h, w = (x - x % 8 for x in (h, w)) # resize to integer multiple of 8
+ mask = torch.nn.functional.interpolate(mask, (h // scale_factor, w // scale_factor))
+ return mask
-class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
+class StableDiffusionLongPromptWeightingPipeline(
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin
+):
r"""
Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
weighting in prompt.
@@ -429,66 +440,196 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
- if version.parse(version.parse(diffusers.__version__).base_version) >= version.parse("0.9.0"):
-
- def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: SchedulerMixin,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool = True,
- ):
- super().__init__(
- vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- unet=unet,
- scheduler=scheduler,
- safety_checker=safety_checker,
- feature_extractor=feature_extractor,
- requires_safety_checker=requires_safety_checker,
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
- self.__init__additional__()
- else:
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ )
- def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: SchedulerMixin,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- ):
- super().__init__(
- vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- unet=unet,
- scheduler=scheduler,
- safety_checker=safety_checker,
- feature_extractor=feature_extractor,
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+ version.parse(unet.config._diffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+ deprecation_message = (
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+ " the `unet/config.json` file"
)
- self.__init__additional__()
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(unet.config)
+ new_config["sample_size"] = 64
+ unet._internal_dict = FrozenDict(new_config)
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.register_to_config(
+ requires_safety_checker=requires_safety_checker,
+ )
+
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
- def __init__additional__(self):
- if not hasattr(self, "vae_scale_factor"):
- setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1))
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
@property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
- if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
@@ -505,8 +646,10 @@ def _encode_prompt(
device,
num_images_per_prompt,
do_classifier_free_guidance,
- negative_prompt,
- max_embeddings_multiples,
+ negative_prompt=None,
+ max_embeddings_multiples=3,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
@@ -526,47 +669,71 @@ def _encode_prompt(
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
The max multiple length of prompt embeddings compared to the max output length of text encoder.
"""
- batch_size = len(prompt) if isinstance(prompt, list) else 1
-
- if negative_prompt is None:
- negative_prompt = [""] * batch_size
- elif isinstance(negative_prompt, str):
- negative_prompt = [negative_prompt] * batch_size
- if batch_size != len(negative_prompt):
- raise ValueError(
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`."
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if negative_prompt_embeds is None:
+ if negative_prompt is None:
+ negative_prompt = [""] * batch_size
+ elif isinstance(negative_prompt, str):
+ negative_prompt = [negative_prompt] * batch_size
+ if batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ if prompt_embeds is None or negative_prompt_embeds is None:
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, self.tokenizer)
+
+ prompt_embeds1, negative_prompt_embeds1 = get_weighted_text_embeddings(
+ pipe=self,
+ prompt=prompt,
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+ max_embeddings_multiples=max_embeddings_multiples,
)
+ if prompt_embeds is None:
+ prompt_embeds = prompt_embeds1
+ if negative_prompt_embeds is None:
+ negative_prompt_embeds = negative_prompt_embeds1
- text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
- pipe=self,
- prompt=prompt,
- uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
- max_embeddings_multiples=max_embeddings_multiples,
- )
- bs_embed, seq_len, _ = text_embeddings.shape
- text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
- text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
if do_classifier_free_guidance:
- bs_embed, seq_len, _ = uncond_embeddings.shape
- uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
- uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
- text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+ bs_embed, seq_len, _ = negative_prompt_embeds.shape
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
- return text_embeddings
+ return prompt_embeds
- def check_inputs(self, prompt, height, width, strength, callback_steps):
- if not isinstance(prompt, str) and not isinstance(prompt, list):
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ def check_inputs(
+ self,
+ prompt,
+ height,
+ width,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if strength < 0 or strength > 1:
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
- if height % 8 != 0 or width % 8 != 0:
- raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
if (callback_steps is None) or (
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
):
@@ -575,17 +742,42 @@ def check_inputs(self, prompt, height, width, strength, callback_steps):
f" {type(callback_steps)}."
)
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
if is_text2img:
return self.scheduler.timesteps.to(device), num_inference_steps
else:
# get the original timestep using init_timestep
- offset = self.scheduler.config.get("steps_offset", 0)
- init_timestep = int(num_inference_steps * strength) + offset
- init_timestep = min(init_timestep, num_inference_steps)
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
- t_start = max(num_inference_steps - init_timestep + offset, 0)
- timesteps = self.scheduler.timesteps[t_start:].to(device)
return timesteps, num_inference_steps - t_start
def run_safety_checker(self, image, device, dtype):
@@ -599,7 +791,7 @@ def run_safety_checker(self, image, device, dtype):
return image, has_nsfw_concept
def decode_latents(self, latents):
- latents = 1 / 0.18215 * latents
+ latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
@@ -623,43 +815,51 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- def prepare_latents(self, image, timestep, batch_size, height, width, dtype, device, generator, latents=None):
+ def prepare_latents(
+ self,
+ image,
+ timestep,
+ num_images_per_prompt,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ latents=None,
+ ):
if image is None:
- shape = (
- batch_size,
- self.unet.config.in_channels,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
+ batch_size = batch_size * num_images_per_prompt
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
if latents is None:
- if device.type == "mps":
- # randn does not work reproducibly on mps
- latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
- else:
- latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
- if latents.shape != shape:
- raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents.to(device)
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents, None, None
else:
+ image = image.to(device=self.device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
- init_latents = 0.18215 * init_latents
- init_latents = torch.cat([init_latents] * batch_size, dim=0)
+ init_latents = self.vae.config.scaling_factor * init_latents
+
+ # Expand init_latents for batch_size and num_images_per_prompt
+ init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
init_latents_orig = init_latents
- shape = init_latents.shape
# add noise to latents using the timesteps
- if device.type == "mps":
- noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
- else:
- noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
- latents = self.scheduler.add_noise(init_latents, noise, timestep)
+ noise = randn_tensor(init_latents.shape, generator=generator, device=self.device, dtype=dtype)
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+ latents = init_latents
return latents, init_latents_orig, noise
@torch.no_grad()
@@ -675,15 +875,19 @@ def __call__(
guidance_scale: float = 7.5,
strength: float = 0.8,
num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
eta: float = 0.0,
- generator: Optional[torch.Generator] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
max_embeddings_multiples: Optional[int] = 3,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -723,16 +927,26 @@ def __call__(
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
+ add_predicted_noise (`bool`, *optional*, defaults to True):
+ Use predicted noise instead of random noise when constructing noisy versions of the original image in
+ the reverse diffusion process
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`torch.Generator`, *optional*):
- A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
- deterministic.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
The max multiple length of prompt embeddings compared to the max output length of text encoder.
output_type (`str`, *optional*, defaults to `"pil"`):
@@ -750,6 +964,10 @@ def __call__(
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
Returns:
`None` if cancelled by `is_cancelled_callback`,
@@ -764,10 +982,18 @@ def __call__(
width = width or self.unet.config.sample_size * self.vae_scale_factor
# 1. Check inputs. Raise error if not correct
- self.check_inputs(prompt, height, width, strength, callback_steps)
+ self.check_inputs(
+ prompt, height, width, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+ )
# 2. Define call parameters
- batch_size = 1 if isinstance(prompt, str) else len(prompt)
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
device = self._execution_device
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -775,26 +1001,28 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
- text_embeddings = self._encode_prompt(
+ prompt_embeds = self._encode_prompt(
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
max_embeddings_multiples,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
)
- dtype = text_embeddings.dtype
+ dtype = prompt_embeds.dtype
# 4. Preprocess image and mask
if isinstance(image, PIL.Image.Image):
- image = preprocess_image(image)
+ image = preprocess_image(image, batch_size)
if image is not None:
image = image.to(device=self.device, dtype=dtype)
if isinstance(mask_image, PIL.Image.Image):
- mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
+ mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor)
if mask_image is not None:
mask = mask_image.to(device=self.device, dtype=dtype)
- mask = torch.cat([mask] * batch_size * num_images_per_prompt)
+ mask = torch.cat([mask] * num_images_per_prompt)
else:
mask = None
@@ -807,7 +1035,9 @@ def __call__(
latents, init_latents_orig, noise = self.prepare_latents(
image,
latent_timestep,
- batch_size * num_images_per_prompt,
+ num_images_per_prompt,
+ batch_size,
+ self.unet.config.in_channels,
height,
width,
dtype,
@@ -820,43 +1050,70 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8. Denoising loop
- for i, t in enumerate(self.progress_bar(timesteps)):
- # expand the latents if we are doing classifier free guidance
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
- # predict the noise residual
- noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
- # perform guidance
- if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
- # compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
- if mask is not None:
- # masking
- init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
- latents = (init_latents_proper * mask) + (latents * (1 - mask))
-
- # call the callback, if provided
- if i % callback_steps == 0:
- if callback is not None:
- callback(i, t, latents)
- if is_cancelled_callback is not None and is_cancelled_callback():
- return None
-
- # 9. Post-processing
- image = self.decode_latents(latents)
-
- # 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
-
- # 11. Convert to PIL
- if output_type == "pil":
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ if mask is not None:
+ # masking
+ if add_predicted_noise:
+ init_latents_proper = self.scheduler.add_noise(
+ init_latents_orig, noise_pred_uncond, torch.tensor([t])
+ )
+ else:
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if i % callback_steps == 0:
+ if callback is not None:
+ callback(i, t, latents)
+ if is_cancelled_callback is not None and is_cancelled_callback():
+ return None
+
+ if output_type == "latent":
+ image = latents
+ has_nsfw_concept = None
+ elif output_type == "pil":
+ # 9. Post-processing
+ image = self.decode_latents(latents)
+
+ # 10. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # 11. Convert to PIL
image = self.numpy_to_pil(image)
+ else:
+ # 9. Post-processing
+ image = self.decode_latents(latents)
+
+ # 10. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
if not return_dict:
return image, has_nsfw_concept
@@ -873,14 +1130,17 @@ def text2img(
guidance_scale: float = 7.5,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
- generator: Optional[torch.Generator] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
max_embeddings_multiples: Optional[int] = 3,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function for text-to-image generation.
@@ -908,13 +1168,20 @@ def text2img(
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`torch.Generator`, *optional*):
- A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
- deterministic.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
The max multiple length of prompt embeddings compared to the max output length of text encoder.
output_type (`str`, *optional*, defaults to `"pil"`):
@@ -932,7 +1199,13 @@ def text2img(
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
Returns:
+ `None` if cancelled by `is_cancelled_callback`,
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
When returning a tuple, the first element is a list with the generated images, and the second element is a
@@ -950,12 +1223,15 @@ def text2img(
eta=eta,
generator=generator,
latents=latents,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
max_embeddings_multiples=max_embeddings_multiples,
output_type=output_type,
return_dict=return_dict,
callback=callback,
is_cancelled_callback=is_cancelled_callback,
callback_steps=callback_steps,
+ cross_attention_kwargs=cross_attention_kwargs,
)
def img2img(
@@ -968,13 +1244,16 @@ def img2img(
guidance_scale: Optional[float] = 7.5,
num_images_per_prompt: Optional[int] = 1,
eta: Optional[float] = 0.0,
- generator: Optional[torch.Generator] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
max_embeddings_multiples: Optional[int] = 3,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function for image-to-image generation.
@@ -1007,9 +1286,16 @@ def img2img(
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`torch.Generator`, *optional*):
- A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
- deterministic.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from the `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, `negative_prompt_embeds` will be generated from the `negative_prompt` input
+ argument.
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
The maximum allowed length of the prompt embeddings, expressed as a multiple of the text encoder's maximum output length.
output_type (`str`, *optional*, defaults to `"pil"`):
@@ -1027,8 +1313,13 @@ def img2img(
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
Returns:
- [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ `None` if cancelled by `is_cancelled_callback`. Otherwise:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
When returning a tuple, the first element is a list with the generated images, and the second element is a
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
@@ -1044,12 +1335,15 @@ def img2img(
num_images_per_prompt=num_images_per_prompt,
eta=eta,
generator=generator,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
max_embeddings_multiples=max_embeddings_multiples,
output_type=output_type,
return_dict=return_dict,
callback=callback,
is_cancelled_callback=is_cancelled_callback,
callback_steps=callback_steps,
+ cross_attention_kwargs=cross_attention_kwargs,
)
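
The `generator` argument now also accepts a list. A minimal sketch (not part of the patch) of seeding one generator per image so a batch stays reproducible image-by-image; `pipe` is assumed to be a Stable Diffusion pipeline:

```py
import torch

num_images = 4
generators = [torch.Generator(device="cuda").manual_seed(seed) for seed in range(num_images)]

# Re-running this call with the same seeds reproduces the same four images.
images = pipe(
    "a watercolor painting of a fox",
    num_images_per_prompt=num_images,
    generator=generators,
).images
```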
def inpaint(
@@ -1062,14 +1356,18 @@ def inpaint(
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
num_images_per_prompt: Optional[int] = 1,
+ add_predicted_noise: Optional[bool] = False,
eta: Optional[float] = 0.0,
- generator: Optional[torch.Generator] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
max_embeddings_multiples: Optional[int] = 3,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function for inpainting.
@@ -1103,12 +1401,22 @@ def inpaint(
usually at the expense of lower image quality.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
+ add_predicted_noise (`bool`, *optional*, defaults to False):
+ Use predicted noise instead of random noise when constructing the noisy versions of the original image in
+ the reverse diffusion process.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`torch.Generator`, *optional*):
- A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
- deterministic.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from the `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, `negative_prompt_embeds` will be generated from the `negative_prompt` input
+ argument.
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
The maximum allowed length of the prompt embeddings, expressed as a multiple of the text encoder's maximum output length.
output_type (`str`, *optional*, defaults to `"pil"`):
@@ -1126,8 +1434,13 @@ def inpaint(
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
Returns:
- [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ `None` if cancelled by `is_cancelled_callback`. Otherwise:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
When returning a tuple, the first element is a list with the generated images, and the second element is a
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
@@ -1142,12 +1455,16 @@ def inpaint(
guidance_scale=guidance_scale,
strength=strength,
num_images_per_prompt=num_images_per_prompt,
+ add_predicted_noise=add_predicted_noise,
eta=eta,
generator=generator,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
max_embeddings_multiples=max_embeddings_multiples,
output_type=output_type,
return_dict=return_dict,
callback=callback,
is_cancelled_callback=is_cancelled_callback,
callback_steps=callback_steps,
+ cross_attention_kwargs=cross_attention_kwargs,
)
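
The Returns sections above note that these methods return `None` when cancelled via `is_cancelled_callback`. A minimal sketch (not part of the patch) of that behaviour; `pipe` is assumed to be an instance of the community pipeline edited above:

```py
import time

deadline = time.monotonic() + 30  # cancel if generation takes longer than 30 seconds

def cancelled():
    # Called between denoising steps; returning True stops the run.
    return time.monotonic() > deadline

result = pipe.text2img("a castle in the clouds", is_cancelled_callback=cancelled)
if result is None:
    print("Generation was cancelled before it finished.")
```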
From 364d59d13b64762c3a0e6ce9ebbe4226b8008ed3 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 27 Apr 2023 18:12:08 +0200
Subject: [PATCH 014/206] Fix community pipelines (#3266)
---
src/diffusers/utils/dynamic_modules_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py
index 1951c4fa2623..aa6c9c657a87 100644
--- a/src/diffusers/utils/dynamic_modules_utils.py
+++ b/src/diffusers/utils/dynamic_modules_utils.py
@@ -267,7 +267,7 @@ def get_cached_module_file(
# retrieve github version that matches
if revision is None:
- revision = latest_version if latest_version in available_versions else "main"
+ revision = latest_version if latest_version[1:] in available_versions else "main"
logger.info(f"Defaulting to latest_version: {revision}.")
elif revision in available_versions:
revision = f"v{revision}"
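
A sketch (with assumed example values, not taken from the patch) of the mismatch this one-character fix addresses: the latest-version string appears to carry a leading `v`, while `available_versions` holds plain version strings, so the membership test must strip the prefix.

```py
# Assumed illustrative values.
latest_version = "v0.16.1"
available_versions = ["0.15.0", "0.16.0", "0.16.1"]

revision = latest_version if latest_version[1:] in available_versions else "main"
print(revision)  # "v0.16.1" instead of incorrectly falling back to "main"
```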
From 329d1df8f2266e5e718212a97f4ad60200157272 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Thu, 27 Apr 2023 07:03:56 -1000
Subject: [PATCH 015/206] update notebook (#3259)
Co-authored-by: yiyixuxu
---
docs/source/en/quicktour.mdx | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx
index d494b79dccd5..2a2a5a3ad903 100644
--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@@ -33,7 +33,7 @@ The quicktour is a simplified version of the introductory 🧨 Diffusers [notebo
Before you begin, make sure you have all the necessary libraries installed:
```bash
-pip install --upgrade diffusers accelerate transformers
+!pip install --upgrade diffusers accelerate transformers
```
- [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training.
@@ -121,9 +121,9 @@ Save the image by calling `save`:
You can also use the pipeline locally. The only difference is you need to download the weights first:
-```
-git lfs install
-git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
+```bash
+!git lfs install
+!git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
```
Then load the saved weights into the pipeline:
From 256e6960cbe8a6379ee396ca6317503a991b9bbe Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Thu, 27 Apr 2023 11:05:08 -0700
Subject: [PATCH 016/206] [docs] add notes for stateful model changes (#3252)
* [docs] add notes for stateful model changes
* Update docs/source/en/optimization/fp16.mdx
Co-authored-by: Pedro Cuenca
* link to accelerate docs for discarding hooks
---------
Co-authored-by: Pedro Cuenca
---
docs/source/en/optimization/fp16.mdx | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/docs/source/en/optimization/fp16.mdx b/docs/source/en/optimization/fp16.mdx
index d05c5aabea2b..596312a0ffe0 100644
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -202,6 +202,8 @@ image = pipe(prompt).images[0]
**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information.
+**Note**: `enable_sequential_cpu_offload()` is a stateful operation that installs hooks on the models.
+
## Model offloading for fast inference and memory savings
@@ -251,6 +253,11 @@ image = pipe(prompt).images[0]
This feature requires `accelerate` version 0.17.0 or higher.
+**Note**: `enable_model_cpu_offload()` is a stateful operation that installs hooks on the models and state on the pipeline. In order to properly offload
+models after they are called, the entire pipeline must be run and the models must be called in the order the pipeline expects them. Exercise caution
+if models are reused outside the context of the pipeline after hooks have been installed. See the [accelerate](https://huggingface.co/docs/accelerate/v0.18.0/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module)
+docs for removing hooks.
+
## Using Channels Last memory format
Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
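
For reference, a minimal sketch (not part of the patch) of discarding the offload hooks mentioned in the notes above, using the accelerate API linked there. `pipe` is assumed to be a Stable Diffusion pipeline on which `enable_model_cpu_offload()` or `enable_sequential_cpu_offload()` was called:

```py
from accelerate.hooks import remove_hook_from_module

# Recursively remove the hooks accelerate installed on each offloaded component.
for component in (pipe.unet, pipe.vae, pipe.text_encoder):
    remove_hook_from_module(component, recurse=True)
```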
From 71de5b705184d074925dee9cd9f70154c84f1e1e Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 28 Apr 2023 11:36:49 +0530
Subject: [PATCH 017/206] [LoRA] quality of life improvements in the loading
semantics and docs (#3180)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* 👽 qol improvements for LoRA.
* better function name?
* fix: LoRA weight loading with the new format.
* address Patrick's comments.
* Apply suggestions from code review
Co-authored-by: Patrick von Platen
* change wording around encouraging the use of load_lora_weights().
* fix: function name.
---------
Co-authored-by: Patrick von Platen
---
docs/source/en/_toctree.yml | 6 +--
docs/source/en/training/lora.mdx | 36 +++++++++++--
examples/dreambooth/README.md | 29 ++++++++++-
examples/dreambooth/train_dreambooth_lora.py | 2 +-
examples/test_examples.py | 8 ++-
examples/text_to_image/README.md | 15 ++++++
src/diffusers/loaders.py | 54 ++++++++++++++------
7 files changed, 123 insertions(+), 27 deletions(-)
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 35c5fd78a1f6..26d3dbcf4e83 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -171,7 +171,7 @@
- local: api/pipelines/semantic_stable_diffusion
title: Semantic Guidance
- local: api/pipelines/spectrogram_diffusion
- title: "Spectrogram Diffusion"
+ title: Spectrogram Diffusion
- sections:
- local: api/pipelines/stable_diffusion/overview
title: Overview
@@ -238,6 +238,8 @@
title: DPM Discrete Scheduler
- local: api/schedulers/dpm_discrete_ancestral
title: DPM Discrete Scheduler with ancestral sampling
+ - local: api/schedulers/dpm_sde
+ title: DPMSolverSDEScheduler
- local: api/schedulers/euler_ancestral
title: Euler Ancestral Scheduler
- local: api/schedulers/euler
@@ -266,8 +268,6 @@
title: VP-SDE
- local: api/schedulers/vq_diffusion
title: VQDiffusionScheduler
- - local: api/schedulers/dpm_sde
- title: DPMSolverSDEScheduler
title: Schedulers
- sections:
- local: api/experimental/rl
diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx
index 7e3c3c0b2b68..3c7cc7ebfeec 100644
--- a/docs/source/en/training/lora.mdx
+++ b/docs/source/en/training/lora.mdx
@@ -115,7 +115,7 @@ Load the LoRA weights from your finetuned model *on top of the base model weight
```py
->>> pipe.unet.load_attn_procs(model_path)
+>>> pipe.unet.load_attn_procs(lora_model_path)
>>> pipe.to("cuda")
# use half the weights from the LoRA finetuned model and half the weights from the base model
@@ -128,6 +128,25 @@ Load the LoRA weights from your finetuned model *on top of the base model weight
>>> image.save("blue_pokemon.png")
```
+
+
+If you are loading the LoRA parameters from the Hub and if the Hub repository has
+a `base_model` tag (such as [this](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/README.md?code=true#L4)), then
+you can do:
+
+```py
+from huggingface_hub.repocard import RepoCard
+
+lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+...
+```
+
+
+
## DreamBooth
[DreamBooth](https://arxiv.org/abs/2208.12242) is a finetuning technique for personalizing a text-to-image model like Stable Diffusion to generate photorealistic images of a subject in different contexts, given a few images of the subject. However, DreamBooth is very sensitive to hyperparameters and it is easy to overfit. Some important hyperparameters to consider include those that affect the training time (learning rate, number of training steps), and inference time (number of steps, scheduler type).
@@ -208,7 +227,7 @@ Load the LoRA weights from your finetuned DreamBooth model *on top of the base m
```py
->>> pipe.unet.load_attn_procs(model_path)
+>>> pipe.unet.load_attn_procs(lora_model_path)
>>> pipe.to("cuda")
# use half the weights from the LoRA finetuned model and half the weights from the base model
@@ -222,4 +241,15 @@ Load the LoRA weights from your finetuned DreamBooth model *on top of the base m
>>> image = pipe("A picture of a sks dog in a bucket.", num_inference_steps=25, guidance_scale=7.5).images[0]
>>> image.save("bucket-dog.png")
-```
\ No newline at end of file
+```
+
+Note that the use of [`LoraLoaderMixin.load_lora_weights`] is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`] for loading LoRA parameters. This is because
+[`LoraLoaderMixin.load_lora_weights`] can handle the following situations:
+
+* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
+
+ ```py
+ pipe.load_lora_weights(lora_model_path)
+ ```
+
+* LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).
\ No newline at end of file
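
A sketch (not part of the patch) tying the snippets above together: resolve the base model from the repo card, load the LoRA weights with the preferred `load_lora_weights()` API, and control the LoRA contribution via `cross_attention_kwargs={"scale": ...}` as in the half-and-half example above:

```py
import torch
from huggingface_hub.repocard import RepoCard
from diffusers import StableDiffusionPipeline

lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4"
base_model_id = RepoCard.load(lora_model_id).data.to_dict()["base_model"]

pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
pipe.load_lora_weights(lora_model_id)

# scale=0.5 uses half LoRA weights and half base-model weights.
image = pipe("A pokemon with blue eyes.", cross_attention_kwargs={"scale": 0.5}).images[0]
```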
diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 8447c7560720..e1eb8a06b0ff 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -355,7 +355,7 @@ The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dr
The training results are summarized [here](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5).
You can use the `Step` slider to see how the model learned the features of our subject while the model trained.
-Optionally, we can also train additional LoRA layers for the text encoder. Specify the `train_text_encoder` argument above for that. If you're interested to know more about how we
+Optionally, we can also train additional LoRA layers for the text encoder. Specify the `--train_text_encoder` argument above for that. If you're interested in learning more about how we
enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918).
With the default hyperparameters from the above, the training seems to go in a positive direction. Check out [this panel](https://wandb.ai/sayakpaul/dreambooth-lora/reports/test-23-04-17-17-00-13---Vmlldzo0MDkwNjMy). The trained LoRA layers are available [here](https://huggingface.co/sayakpaul/dreambooth).
@@ -387,6 +387,33 @@ Finally, we can run the model in inference.
image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
```
+If you are loading the LoRA parameters from the Hub and if the Hub repository has
+a `base_model` tag (such as [this](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example/blob/main/README.md?code=true#L4)), then
+you can do:
+
+```py
+from huggingface_hub.repocard import RepoCard
+
+lora_model_id = "patrickvonplaten/lora_dreambooth_dog_example"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+...
+```
+
+**Note** that we will gradually be deprecating the use of [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) since we now have a more general
+method to load the LoRA parameters -- [`LoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights). This is because
+[`LoraLoaderMixin.load_lora_weights`] can handle the following situations:
+
+* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
+
+ ```py
+ pipe.load_lora_weights(lora_model_path)
+ ```
+
+* LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).
+
## Training with Flax/JAX
For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 3aa2fb0a8491..5cefc57c614d 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -1045,7 +1045,7 @@ def main(args):
pipeline = pipeline.to(accelerator.device)
# load attention processors
- pipeline.load_attn_procs(args.output_dir)
+ pipeline.load_lora_weights(args.output_dir)
# run inference
if args.validation_prompt and args.num_validation_images > 0:
diff --git a/examples/test_examples.py b/examples/test_examples.py
index d4a5ef5046f0..648c2cb8a1b7 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -281,10 +281,14 @@ def test_dreambooth_lora_with_text_encoder(self):
# save_pretrained smoke test
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin")))
- # the names of the keys of the state dict should either start with `unet`
- # or `text_encoder`.
+ # check that `text_encoder` keys are present at all.
lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin"))
keys = lora_state_dict.keys()
+ is_text_encoder_present = any(k.startswith("text_encoder") for k in keys)
+ self.assertTrue(is_text_encoder_present)
+
+ # the names of the keys of the state dict should either start with `unet`
+ # or `text_encoder`.
is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys)
self.assertTrue(is_correct_naming)
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index 406a64b3759f..160e73fa02bb 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -229,6 +229,21 @@ image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
image.save("pokemon.png")
```
+If you are loading the LoRA parameters from the Hub and if the Hub repository has
+a `base_model` tag (such as [this](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/README.md?code=true#L4)), then
+you can do:
+
+```py
+from huggingface_hub.repocard import RepoCard
+
+lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+...
+```
+
## Training with Flax/JAX
For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 0db716c012d8..b4b0f4bb3bd6 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
+import warnings
from collections import defaultdict
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
@@ -45,6 +46,8 @@
logger = logging.get_logger(__name__)
+TEXT_ENCODER_NAME = "text_encoder"
+UNET_NAME = "unet"
LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
@@ -87,6 +90,9 @@ def map_from(module, state_dict, *args, **kwargs):
class UNet2DConditionLoadersMixin:
+ text_encoder_name = TEXT_ENCODER_NAME
+ unet_name = UNET_NAME
+
def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
r"""
Load pretrained attention processor layers into `UNet2DConditionModel`. Attention processor layers have to be
@@ -225,6 +231,18 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
if is_lora:
+ is_new_lora_format = all(
+ key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+ )
+ if is_new_lora_format:
+ # Strip the `"unet"` prefix.
+ is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
+ if is_text_encoder_present:
+ warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
+ warnings.warn(warn_message)
+ unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
+ state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
+
lora_grouped_dict = defaultdict(dict)
for key, value in state_dict.items():
attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
@@ -672,8 +690,8 @@ class LoraLoaderMixin:
"""
- text_encoder_name = "text_encoder"
- unet_name = "unet"
+ text_encoder_name = TEXT_ENCODER_NAME
+ unet_name = UNET_NAME
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
r"""
@@ -810,21 +828,24 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
# then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
# their prefixes.
keys = list(state_dict.keys())
-
- # Load the layers corresponding to UNet.
- if all(key.startswith(self.unet_name) for key in keys):
+ if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys):
+ # Load the layers corresponding to UNet.
+ unet_keys = [k for k in keys if k.startswith(self.unet_name)]
logger.info(f"Loading {self.unet_name}.")
- unet_lora_state_dict = {k: v for k, v in state_dict.items() if k.startswith(self.unet_name)}
+ unet_lora_state_dict = {
+ k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys
+ }
self.unet.load_attn_procs(unet_lora_state_dict)
- # Load the layers corresponding to text encoder and make necessary adjustments.
- elif all(key.startswith(self.text_encoder_name) for key in keys):
+ # Load the layers corresponding to text encoder and make necessary adjustments.
+ text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)]
logger.info(f"Loading {self.text_encoder_name}.")
text_encoder_lora_state_dict = {
- k: v for k, v in state_dict.items() if k.startswith(self.text_encoder_name)
+ k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
}
- attn_procs_text_encoder = self.load_attn_procs(text_encoder_lora_state_dict)
- self._modify_text_encoder(attn_procs_text_encoder)
+ if len(text_encoder_lora_state_dict) > 0:
+ attn_procs_text_encoder = self._load_text_encoder_attn_procs(text_encoder_lora_state_dict)
+ self._modify_text_encoder(attn_procs_text_encoder)
# Otherwise, we're dealing with the old format. This means the `state_dict` should only
# contain the module names of the `unet` as its keys WITHOUT any prefix.
@@ -832,11 +853,8 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
):
self.unet.load_attn_procs(state_dict)
- deprecation_message = "You have saved the LoRA weights using the old format. This will be"
- " deprecated soon. To convert the old LoRA weights to the new format, you can first load them"
- " in a dictionary and then create a new dictionary like the following:"
- " `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`."
- deprecate("legacy LoRA weights", "1.0.0", deprecation_message, standard_warn=False)
+ warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`."
+ warnings.warn(warn_message)
def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]):
r"""
@@ -872,7 +890,9 @@ def _get_lora_layer_attribute(self, name: str) -> str:
else:
return "to_out_lora"
- def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
+ def _load_text_encoder_attn_procs(
+ self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs
+ ):
r"""
Load pretrained attention processor layers for
[`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
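
A standalone sketch (with hypothetical keys, not taken from the patch) of the prefix handling the new loader code performs: new-format state dicts prefix every key with `unet.` or `text_encoder.`, and the loaders strip the prefix before dispatching to the UNet or text encoder:

```py
import torch

# Hypothetical new-format LoRA state dict.
state_dict = {
    "unet.mid_block.attentions.0.processor.to_q_lora.down.weight": torch.zeros(4, 1280),
    "text_encoder.text_model.encoder.layers.0.self_attn.to_q_lora.down.weight": torch.zeros(4, 768),
}

unet_keys = [k for k in state_dict if k.startswith("unet")]
unet_lora_state_dict = {k.replace("unet.", "", 1): v for k, v in state_dict.items() if k in unet_keys}

text_encoder_keys = [k for k in state_dict if k.startswith("text_encoder")]
text_encoder_lora_state_dict = {
    k.replace("text_encoder.", "", 1): v for k, v in state_dict.items() if k in text_encoder_keys
}

print(list(unet_lora_state_dict))          # keys without the "unet." prefix
print(list(text_encoder_lora_state_dict))  # keys without the "text_encoder." prefix
```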
From 462b4edd31c8bf1f5be7f9f7c9b88b25fb83e0b0 Mon Sep 17 00:00:00 2001
From: Joqsan <6027118+Joqsan@users.noreply.github.com>
Date: Fri, 28 Apr 2023 12:11:29 +0300
Subject: [PATCH 018/206] [Community Pipelines] EDICT pipeline implementation
(#3153)
* EDICT pipeline initial commit
- Starting point taking from https://github.com/Joqsan/edict-diffusion
* refactor __init__() method
* minor refactoring
* refactor scheduler code
- remove scheduler and move its methods to the EDICTPipeline class
* make CFG optional
- refactor encode_prompt().
- include optional generator for sampling with vae.
- minor variable renaming
* add EDICT pipeline description to README.md
* replace preprocess() with VaeImageProcessor
* run make style and make quality commands
---------
Co-authored-by: Patrick von Platen
---
examples/community/README.md | 86 +++++++++
examples/community/edict_pipeline.py | 264 +++++++++++++++++++++++++++
2 files changed, 350 insertions(+)
create mode 100644 examples/community/edict_pipeline.py
diff --git a/examples/community/README.md b/examples/community/README.md
index 8b5b1743203d..91528eac1e85 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -32,6 +32,8 @@ MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt
| DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - |[Aengus (Duc-Anh)](https://github.com/aengusng8) |
| CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) |
| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - |[Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
+
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
@@ -1161,3 +1163,87 @@ prompt = "a beautiful photograph of Mt. Fuji during cherry blossom"
image = pipe(prompt).images[0]
image.save('tensorrt_mt_fuji.png')
```
+
+### EDICT Image Editing Pipeline
+
+This pipeline implements the text-guided image editing approach from the paper [EDICT: Exact Diffusion Inversion via Coupled Transformations](https://arxiv.org/abs/2211.12446). You have to pass:
+- (`PIL`) `image` you want to edit.
+- `base_prompt`: the text prompt describing the current image (before editing).
+- `target_prompt`: the text prompt describing the desired edits.
+
+```python
+from diffusers import DiffusionPipeline, DDIMScheduler
+from transformers import CLIPTextModel
+import torch, PIL, requests
+from io import BytesIO
+from IPython.display import display
+
+def center_crop_and_resize(im):
+
+ width, height = im.size
+ d = min(width, height)
+ left = (width - d) / 2
+ upper = (height - d) / 2
+ right = (width + d) / 2
+ lower = (height + d) / 2
+
+ return im.crop((left, upper, right, lower)).resize((512, 512))
+
+torch_dtype = torch.float16
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# scheduler and text_encoder param values as in the paper
+scheduler = DDIMScheduler(
+ num_train_timesteps=1000,
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ set_alpha_to_one=False,
+ clip_sample=False,
+)
+
+text_encoder = CLIPTextModel.from_pretrained(
+ pretrained_model_name_or_path="openai/clip-vit-large-patch14",
+ torch_dtype=torch_dtype,
+)
+
+# initialize pipeline
+pipeline = DiffusionPipeline.from_pretrained(
+ pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4",
+ custom_pipeline="edict_pipeline",
+ revision="fp16",
+ scheduler=scheduler,
+ text_encoder=text_encoder,
+ leapfrog_steps=True,
+ torch_dtype=torch_dtype,
+).to(device)
+
+# download image
+image_url = "https://huggingface.co/datasets/Joqsan/images/resolve/main/imagenet_dog_1.jpeg"
+response = requests.get(image_url)
+image = PIL.Image.open(BytesIO(response.content))
+
+# preprocess it
+cropped_image = center_crop_and_resize(image)
+
+# define the prompts
+base_prompt = "A dog"
+target_prompt = "A golden retriever"
+
+# run the pipeline
+result_image = pipeline(
+ base_prompt=base_prompt,
+ target_prompt=target_prompt,
+ image=cropped_image,
+)
+
+display(result_image)
+```
+
+Init Image
+
+
+
+Output Image
+
+
diff --git a/examples/community/edict_pipeline.py b/examples/community/edict_pipeline.py
new file mode 100644
index 000000000000..ac977f79abec
--- /dev/null
+++ b/examples/community/edict_pipeline.py
@@ -0,0 +1,264 @@
+from typing import Optional
+
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils import (
+ deprecate,
+)
+
+
+class EDICTPipeline(DiffusionPipeline):
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: DDIMScheduler,
+ mixing_coeff: float = 0.93,
+ leapfrog_steps: bool = True,
+ ):
+ self.mixing_coeff = mixing_coeff
+ self.leapfrog_steps = leapfrog_steps
+
+ super().__init__()
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ )
+
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+ def _encode_prompt(
+ self, prompt: str, negative_prompt: Optional[str] = None, do_classifier_free_guidance: bool = False
+ ):
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ prompt_embeds = self.text_encoder(text_inputs.input_ids.to(self.device)).last_hidden_state
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=self.device)
+
+ if do_classifier_free_guidance:
+ uncond_tokens = "" if negative_prompt is None else negative_prompt
+
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(self.device)).last_hidden_state
+
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
+ def denoise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
+ x = self.mixing_coeff * x + (1 - self.mixing_coeff) * y
+ y = self.mixing_coeff * y + (1 - self.mixing_coeff) * x
+
+ return [x, y]
+
+ def noise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
+ y = (y - (1 - self.mixing_coeff) * x) / self.mixing_coeff
+ x = (x - (1 - self.mixing_coeff) * y) / self.mixing_coeff
+
+ return [x, y]
+
+ def _get_alpha_and_beta(self, t: torch.Tensor):
+ # self.scheduler.alphas_cumprod always lives on the CPU, so index it with a plain Python int
+ t = int(t)
+
+ alpha_prod = self.scheduler.alphas_cumprod[t] if t >= 0 else self.scheduler.final_alpha_cumprod
+
+ return alpha_prod, 1 - alpha_prod
+
+ def noise_step(
+ self,
+ base: torch.Tensor,
+ model_input: torch.Tensor,
+ model_output: torch.Tensor,
+ timestep: torch.Tensor,
+ ):
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps
+
+ alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
+ alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)
+
+ a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
+ b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5
+
+ next_model_input = (base - b_t * model_output) / a_t
+
+ return model_input, next_model_input.to(base.dtype)
+
+ def denoise_step(
+ self,
+ base: torch.Tensor,
+ model_input: torch.Tensor,
+ model_output: torch.Tensor,
+ timestep: torch.Tensor,
+ ):
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps
+
+ alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
+ alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)
+
+ a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
+ b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5
+ next_model_input = a_t * base + b_t * model_output
+
+ return model_input, next_model_input.to(base.dtype)
+
+ @torch.no_grad()
+ def decode_latents(self, latents: torch.Tensor):
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents).sample
+ image = (image / 2 + 0.5).clamp(0, 1)
+ return image
+
+ @torch.no_grad()
+ def prepare_latents(
+ self,
+ image: Image.Image,
+ text_embeds: torch.Tensor,
+ timesteps: torch.Tensor,
+ guidance_scale: float,
+ generator: Optional[torch.Generator] = None,
+ ):
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ image = image.to(device=self.device, dtype=text_embeds.dtype)
+ latent = self.vae.encode(image).latent_dist.sample(generator)
+
+ latent = self.vae.config.scaling_factor * latent
+
+ coupled_latents = [latent.clone(), latent.clone()]
+
+ for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
+ coupled_latents = self.noise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])
+
+ # j - model_input index, k - base index
+ for j in range(2):
+ k = j ^ 1
+
+ if self.leapfrog_steps:
+ if i % 2 == 0:
+ k, j = j, k
+
+ model_input = coupled_latents[j]
+ base = coupled_latents[k]
+
+ latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds).sample
+
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ base, model_input = self.noise_step(
+ base=base,
+ model_input=model_input,
+ model_output=noise_pred,
+ timestep=t,
+ )
+
+ coupled_latents[k] = model_input
+
+ return coupled_latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ base_prompt: str,
+ target_prompt: str,
+ image: Image.Image,
+ guidance_scale: float = 3.0,
+ num_inference_steps: int = 50,
+ strength: float = 0.8,
+ negative_prompt: Optional[str] = None,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ ):
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ image = self.image_processor.preprocess(image)
+
+ base_embeds = self._encode_prompt(base_prompt, negative_prompt, do_classifier_free_guidance)
+ target_embeds = self._encode_prompt(target_prompt, negative_prompt, do_classifier_free_guidance)
+
+ self.scheduler.set_timesteps(num_inference_steps, self.device)
+
+ t_limit = num_inference_steps - int(num_inference_steps * strength)
+ fwd_timesteps = self.scheduler.timesteps[t_limit:]
+ bwd_timesteps = fwd_timesteps.flip(0)
+
+ coupled_latents = self.prepare_latents(image, base_embeds, bwd_timesteps, guidance_scale, generator)
+
+ for i, t in tqdm(enumerate(fwd_timesteps), total=len(fwd_timesteps)):
+ # j - model_input index, k - base index
+ for k in range(2):
+ j = k ^ 1
+
+ if self.leapfrog_steps:
+ if i % 2 == 1:
+ k, j = j, k
+
+ model_input = coupled_latents[j]
+ base = coupled_latents[k]
+
+ latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input
+
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=target_embeds).sample
+
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ base, model_input = self.denoise_step(
+ base=base,
+ model_input=model_input,
+ model_output=noise_pred,
+ timestep=t,
+ )
+
+ coupled_latents[k] = model_input
+
+ coupled_latents = self.denoise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])
+
+ # either one is fine
+ final_latent = coupled_latents[0]
+
+ if output_type not in ["latent", "pt", "np", "pil"]:
+ deprecation_message = (
+ f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
+ "`pil`, `np`, `pt`, `latent`"
+ )
+ deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+ output_type = "np"
+
+ if output_type == "latent":
+ image = final_latent
+ else:
+ image = self.decode_latents(final_latent)
+ image = self.image_processor.postprocess(image, output_type=output_type)
+
+ return image
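
A quick numerical check (not part of the patch) that `noise_mixing_layer` exactly inverts `denoise_mixing_layer` for the same `mixing_coeff`, which is what lets EDICT recover the coupled latents during inversion:

```py
import torch

p = 0.93  # default mixing_coeff above
x0, y0 = torch.randn(4, 64), torch.randn(4, 64)

# denoise_mixing_layer
x1 = p * x0 + (1 - p) * y0
y1 = p * y0 + (1 - p) * x1

# noise_mixing_layer recovers the originals
y_rec = (y1 - (1 - p) * x1) / p
x_rec = (x1 - (1 - p) * y_rec) / p

assert torch.allclose(x_rec, x0, atol=1e-5) and torch.allclose(y_rec, y0, atol=1e-5)
```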
From 0614fd2038e07ad284dc2d3815b1c65729ae7760 Mon Sep 17 00:00:00 2001
From: NimenDavid <312648004@qq.com>
Date: Fri, 28 Apr 2023 17:23:02 +0800
Subject: [PATCH 019/206] [Docs]zh translated docs update (#3245)
* zh translated docs update
* update _toctree
---
docs/source/zh/_toctree.yml | 102 +++++++++++++++++---------
docs/source/zh/index.mdx | 125 +++++++++++++++++++-------------
docs/source/zh/installation.mdx | 43 ++++++-----
3 files changed, 164 insertions(+), 106 deletions(-)
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
index 2d67d9c4a025..58f6ac09faef 100644
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -4,51 +4,79 @@
- local: quicktour
title: 快速入门
- local: stable_diffusion
- title: Stable Diffusion
+ title: Effective and efficient diffusion
- local: installation
title: 安装
title: 开始
- sections:
+ - local: tutorials/tutorial_overview
+ title: Overview
+ - local: using-diffusers/write_own_pipeline
+ title: Understanding models and schedulers
- local: tutorials/basic_training
title: Train a diffusion model
title: Tutorials
- sections:
- sections:
+ - local: using-diffusers/loading_overview
+ title: Overview
- local: using-diffusers/loading
- title: Loading Pipelines, Models, and Schedulers
+ title: Load pipelines, models, and schedulers
- local: using-diffusers/schedulers
- title: Using different Schedulers
- - local: using-diffusers/configuration
- title: Configuring Pipelines, Models, and Schedulers
+ title: Load and compare different schedulers
- local: using-diffusers/custom_pipeline_overview
- title: Loading and Adding Custom Pipelines
+ title: Load community pipelines
- local: using-diffusers/kerascv
- title: Using KerasCV Stable Diffusion Checkpoints in Diffusers
+ title: Load KerasCV Stable Diffusion checkpoints
title: Loading & Hub
- sections:
+ - local: using-diffusers/pipeline_overview
+ title: Overview
- local: using-diffusers/unconditional_image_generation
- title: Unconditional Image Generation
+ title: Unconditional image generation
- local: using-diffusers/conditional_image_generation
- title: Text-to-Image Generation
+ title: Text-to-image generation
- local: using-diffusers/img2img
- title: Text-Guided Image-to-Image
+ title: Text-guided image-to-image
- local: using-diffusers/inpaint
- title: Text-Guided Image-Inpainting
+ title: Text-guided image-inpainting
- local: using-diffusers/depth2img
- title: Text-Guided Depth-to-Image
- - local: using-diffusers/controlling_generation
- title: Controlling generation
+ title: Text-guided depth-to-image
- local: using-diffusers/reusing_seeds
- title: Reusing seeds for deterministic generation
+ title: Improve image quality with deterministic generation
- local: using-diffusers/reproducibility
- title: Reproducibility
+ title: Create reproducible pipelines
- local: using-diffusers/custom_pipeline_examples
- title: Community Pipelines
+ title: Community pipelines
- local: using-diffusers/contribute_pipeline
- title: How to contribute a Pipeline
+ title: How to contribute a community pipeline
- local: using-diffusers/using_safetensors
title: Using safetensors
+ - local: using-diffusers/stable_diffusion_jax_how_to
+ title: Stable Diffusion in JAX/Flax
+ - local: using-diffusers/weighted_prompts
+ title: Weighting Prompts
title: Pipelines for Inference
+ - sections:
+ - local: training/overview
+ title: Overview
+ - local: training/unconditional_training
+ title: Unconditional image generation
+ - local: training/text_inversion
+ title: Textual Inversion
+ - local: training/dreambooth
+ title: DreamBooth
+ - local: training/text2image
+ title: Text-to-image
+ - local: training/lora
+ title: Low-Rank Adaptation of Large Language Models (LoRA)
+ - local: training/controlnet
+ title: ControlNet
+ - local: training/instructpix2pix
+ title: InstructPix2Pix Training
+ - local: training/custom_diffusion
+ title: Custom Diffusion
+ title: Training
- sections:
- local: using-diffusers/rl
title: Reinforcement Learning
@@ -59,6 +87,8 @@
title: Taking Diffusers Beyond Images
title: Using Diffusers
- sections:
+ - local: optimization/opt_overview
+ title: Overview
- local: optimization/fp16
title: Memory and Speed
- local: optimization/torch2.0
@@ -69,32 +99,26 @@
title: ONNX
- local: optimization/open_vino
title: OpenVINO
+ - local: optimization/coreml
+ title: Core ML
- local: optimization/mps
title: MPS
- local: optimization/habana
title: Habana Gaudi
+ - local: optimization/tome
+ title: Token Merging
title: Optimization/Special Hardware
-- sections:
- - local: training/overview
- title: Overview
- - local: training/unconditional_training
- title: Unconditional Image Generation
- - local: training/text_inversion
- title: Textual Inversion
- - local: training/dreambooth
- title: DreamBooth
- - local: training/text2image
- title: Text-to-image
- - local: training/lora
- title: Low-Rank Adaptation of Large Language Models (LoRA)
- title: Training
- sections:
- local: conceptual/philosophy
title: Philosophy
+ - local: using-diffusers/controlling_generation
+ title: Controlled generation
- local: conceptual/contribution
title: How to contribute?
- local: conceptual/ethical_guidelines
title: Diffusers' Ethical Guidelines
+ - local: conceptual/evaluation
+ title: Evaluating Diffusion Models
title: Conceptual Guides
- sections:
- sections:
@@ -118,6 +142,8 @@
title: AltDiffusion
- local: api/pipelines/audio_diffusion
title: Audio Diffusion
+ - local: api/pipelines/audioldm
+ title: AudioLDM
- local: api/pipelines/cycle_diffusion
title: Cycle Diffusion
- local: api/pipelines/dance_diffusion
@@ -128,6 +154,8 @@
title: DDPM
- local: api/pipelines/dit
title: DiT
+ - local: api/pipelines/if
+ title: IF
- local: api/pipelines/latent_diffusion
title: Latent Diffusion
- local: api/pipelines/paint_by_example
@@ -142,6 +170,8 @@
title: Score SDE VE
- local: api/pipelines/semantic_stable_diffusion
title: Semantic Guidance
+ - local: api/pipelines/spectrogram_diffusion
+ title: "Spectrogram Diffusion"
- sections:
- local: api/pipelines/stable_diffusion/overview
title: Overview
@@ -171,6 +201,8 @@
title: MultiDiffusion Panorama
- local: api/pipelines/stable_diffusion/controlnet
title: Text-to-Image Generation with ControlNet Conditioning
+ - local: api/pipelines/stable_diffusion/model_editing
+ title: Text-to-Image Model Editing
title: Stable Diffusion
- local: api/pipelines/stable_diffusion_2
title: Stable Diffusion 2
@@ -178,6 +210,10 @@
title: Stable unCLIP
- local: api/pipelines/stochastic_karras_ve
title: Stochastic Karras VE
+ - local: api/pipelines/text_to_video
+ title: Text-to-Video
+ - local: api/pipelines/text_to_video_zero
+ title: Text-to-Video Zero
- local: api/pipelines/unclip
title: UnCLIP
- local: api/pipelines/latent_diffusion_uncond
@@ -235,4 +271,4 @@
- local: api/experimental/rl
title: RL Planning
title: Experimental Features
- title: API
+ title: API
\ No newline at end of file
diff --git a/docs/source/zh/index.mdx b/docs/source/zh/index.mdx
index 4f952c5db79c..e1a2a3971d87 100644
--- a/docs/source/zh/index.mdx
+++ b/docs/source/zh/index.mdx
@@ -18,61 +18,84 @@ specific language governing permissions and limitations under the License.
# 🧨 Diffusers
-🤗Diffusers提供了预训练好的视觉和音频扩散模型,并可以作为推理和训练的模块化工具箱。
+🤗 Diffusers 是一个值得首选用于生成图像、音频甚至 3D 分子结构的,最先进的预训练扩散模型库。
+无论您是在寻找简单的推理解决方案,还是想训练自己的扩散模型,🤗 Diffusers 这一模块化工具箱都能对其提供支持。
+本库的设计更偏重于[可用而非高性能](conceptual/philosophy#usability-over-performance)、[简明而非简单](conceptual/philosophy#simple-over-easy)以及[易用而非抽象](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction)。
-更准确地说,🤗Diffusers提供了:
-- 最先进的扩散管道,可以在推理中仅用几行代码运行(详情看[**Using Diffusers**](./using-diffusers/conditional_image_generation))或看[**管道**](#pipelines) 以获取所有支持的管道及其对应的论文的概述。
-- 可以在推理中交替使用的各种噪声调度程序,以便在推理过程中权衡如何选择速度和质量。有关更多信息,可以看[**Schedulers**](./api/schedulers/overview)。
-- 多种类型的模型,如U-Net,可用作端到端扩散系统中的构建模块。有关更多详细信息,可以看 [**Models**](./api/models) 。
-- 训练示例,展示如何训练最流行的扩散模型任务。更多相关信息,可以看[**Training**](./training/overview)。
+本库包含三个主要组件:
+- 最先进的扩散管道 [diffusion pipelines](api/pipelines/overview),只需几行代码即可进行推理。
+- 可交替使用的各种噪声调度器 [noise schedulers](api/schedulers/overview),用于平衡生成速度和质量。
+- 预训练模型 [models](api/models),可作为构建模块,并与调度程序结合使用,来创建您自己的端到端扩散系统。
-## 🧨 Diffusers pipelines
-
-下表总结了所有官方支持的pipelines及其对应的论文,部分提供了colab,可以直接尝试一下。
+
+## 🧨 Diffusers pipelines
-| 管道 | 论文 | 任务 | Colab
-|---|---|:---:|:---:|
-| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
-| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb)
-| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
-| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
-| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
-| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
-| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
-| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
-| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
-| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
-| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
-| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb)
-| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
-| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing|
-| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
-| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation |
-| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation |
-| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
-| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
-| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
-| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
-| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
-| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
-| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
-
+下表汇总了当前所有官方支持的pipelines及其对应的论文.
-**注意**: 管道是如何使用相应论文中提出的扩散模型的简单示例。
\ No newline at end of file
+| 管道 | 论文/仓库 | 任务 |
+|---|---|:---:|
+| [alt_diffusion](./api/pipelines/alt_diffusion) | [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
+| [audio_diffusion](./api/pipelines/audio_diffusion) | [Audio Diffusion](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation |
+| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
+| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
+| [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
+| [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./api/pipelines/ddim) | [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
+| [if](./if) | [**IF**](./api/pipelines/if) | Image Generation |
+| [if_img2img](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation |
+| [if_inpainting](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation |
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
+| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
+| [paint_by_example](./api/pipelines/paint_by_example) | [Paint by Example: Exemplar-based Image Editing with Diffusion Models](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
+| [pndm](./api/pipelines/pndm) | [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
+| [score_sde_ve](./api/pipelines/score_sde_ve) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
+| [score_sde_vp](./api/pipelines/score_sde_vp) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
+| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [Semantic Guidance](https://arxiv.org/abs/2301.12247) | Text-Guided Generation |
+| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation |
+| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation |
+| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting |
+| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [MultiDiffusion](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
+| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) | Text-Guided Image Editing|
+| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [Zero-shot Image-to-Image Translation](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
+| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation |
+| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [Improving Sample Quality of Diffusion Models Using Self-Attention Guidance](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation, Unconditional Image Generation |
+| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
+| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [Stable Diffusion Latent Upscaler](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_model_editing](./api/pipelines/stable_diffusion/model_editing) | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://time-diffusion.github.io/) | Text-to-Image Model Editing |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Depth-Conditional Stable Diffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [Safe Stable Diffusion](https://arxiv.org/abs/2211.05105) | Text-Guided Generation |
+| [stable_unclip](./stable_unclip) | Stable unCLIP | Text-to-Image Generation |
+| [stable_unclip](./stable_unclip) | Stable unCLIP | Image-to-Image Text-Guided Generation |
+| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
+| [text_to_video_sd](./api/pipelines/text_to_video) | [Modelscope's Text-to-video-synthesis Model in Open Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) | Text-to-Video Generation |
+| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125)(implementation by [kakaobrain](https://github.com/kakaobrain/karlo)) | Text-to-Image Generation |
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
+| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
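
Every entry in the table above is exposed through the same `DiffusionPipeline.from_pretrained` interface. Below is a minimal sketch; the `runwayml/stable-diffusion-v1-5` checkpoint and the CUDA device are illustrative choices, not part of this patch:

```py
import torch
from diffusers import DiffusionPipeline

# Load one of the supported pipelines listed above by its Hub checkpoint id.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Run a simple text-to-image generation and save the result.
image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```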
diff --git a/docs/source/zh/installation.mdx b/docs/source/zh/installation.mdx
index cda91df8a6cd..8cd3ad97cc21 100644
--- a/docs/source/zh/installation.mdx
+++ b/docs/source/zh/installation.mdx
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
# 安装
-安装🤗 Diffusers 到你正在使用的任何深度学习框架中。
+在你正在使用的任意深度学习框架中安装 🤗 Diffusers 。
🤗 Diffusers已在Python 3.7+、PyTorch 1.7.0+和Flax上进行了测试。按照下面的安装说明,针对你正在使用的深度学习框架进行安装:
@@ -21,11 +21,11 @@ specific language governing permissions and limitations under the License.
## 使用pip安装
-你需要在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装🤗 Diffusers 。
+你需要在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Diffusers 。
如果你对 Python 虚拟环境不熟悉,可以看看这个[教程](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-使用虚拟环境你可以轻松管理不同的项目,避免了依赖项之间的兼容性问题。
+在虚拟环境中,你可以轻松管理不同的项目,避免依赖项之间的兼容性问题。
首先,在你的项目目录下创建一个虚拟环境:
@@ -39,7 +39,7 @@ python -m venv .env
source .env/bin/activate
```
-现在你就可以安装 🤗 Diffusers了!使用下边这个命令:
+现在,你就可以安装 🤗 Diffusers了!使用下边这个命令:
**PyTorch**
@@ -55,7 +55,7 @@ pip install diffusers["flax"]
## 从源代码安装
-在从源代码安装 `diffusers` 之前,你先确定你已经安装了 `torch` 和 `accelerate`。
+在从源代码安装 `diffusers` 之前,确保你已经安装了 `torch` 和 `accelerate`。
`torch`的安装教程可以看 `torch` [文档](https://pytorch.org/get-started/locally/#start-locally).
@@ -65,17 +65,17 @@ pip install diffusers["flax"]
pip install accelerate
```
-从源码安装 🤗 Diffusers 使用以下命令:
+从源码安装 🤗 Diffusers 需要使用以下命令:
```bash
pip install git+https://github.com/huggingface/diffusers
```
这个命令安装的是最新的 `main`版本,而不是最近的`stable`版。
-`main`是一直和最新进展保持一致的。比如,上次正式版发布了,有bug,新的正式版还没推出,但是`main`中可以看到这个bug被修复了。
-但是这也意味着 `main`版本并不总是稳定的。
+`main`是一直和最新进展保持一致的。比如,上次发布的正式版中有bug,在`main`中可以看到这个bug被修复了,但是新的正式版此时尚未推出。
+但是这也意味着 `main`版本不保证是稳定的。
-我们努力保持`main`版本正常运行,大多数问题都能在几个小时或一天之内解决
+我们努力保持`main`版本正常运行,大多数问题都能在几个小时或一天之内解决
如果你遇到了问题,可以提 [Issue](https://github.com/huggingface/transformers/issues),这样我们就能更快修复问题了。
@@ -105,8 +105,8 @@ pip install -e ".[torch]"
pip install -e ".[flax]"
```
-这些命令将连接你克隆的版本库和你的 Python 库路径。
-现在,除了正常的库路径外,Python 还会在你克隆的文件夹内寻找。
+这些命令将连接到你克隆的版本库和你的 Python 库路径。
+现在,不只是在通常的库路径,Python 还会在你克隆的文件夹内寻找包。
例如,如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.7/Site-packages/`,Python 也会搜索你克隆到的文件夹。`~/diffusers/`。
@@ -116,32 +116,31 @@ pip install -e ".[flax]"
-现在你可以用下面的命令轻松地将你克隆的🤗Diffusers仓库更新到最新版本。
+现在你可以用下面的命令轻松地将你克隆的 🤗 Diffusers 库更新到最新版本。
```bash
cd ~/diffusers/
git pull
```
-你的Python环境将在下次运行时找到`main`版本的🤗 Diffusers。
+你的Python环境将在下次运行时找到`main`版本的 🤗 Diffusers。
-## 注意遥测日志
+## 注意 Telemetry 日志
-我们的库会在使用`from_pretrained()`请求期间收集信息。这些数据包括Diffusers和PyTorch/Flax的版本,请求的模型或管道,以及预训练检查点的路径(如果它被托管在Hub上)。
+我们的库会在使用`from_pretrained()`请求期间收集 telemetry 信息。这些数据包括Diffusers和PyTorch/Flax的版本,请求的模型或管道类,以及预训练检查点的路径(如果它被托管在Hub上的话)。
+这些使用数据有助于我们调试问题并确定新功能的开发优先级。
+Telemetry 数据仅在从 HuggingFace Hub 中加载模型和管道时发送,而不会在本地使用期间收集。
-这些使用数据有助于我们调试问题并优先考虑新功能。
-当从HuggingFace Hub加载模型和管道时才会发送遥测数据,并且在本地使用时不会收集数据。
+我们知道,并不是每个人都想分享这些信息,我们尊重您的隐私,
+因此您可以通过在终端中设置 `DISABLE_TELEMETRY` 环境变量从而禁用 Telemetry 数据收集:
-我们知道并不是每个人都想分享这些的信息,我们尊重您的隐私,
-因此您可以通过在终端中设置“DISABLE_TELEMETRY”环境变量来禁用遥测数据的收集:
-
-在Linux/MacOS中:
+Linux/MacOS :
```bash
export DISABLE_TELEMETRY=YES
```
-在Windows中:
+Windows :
```bash
set DISABLE_TELEMETRY=YES
```
\ No newline at end of file
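
The same switch can be flipped from Python instead of the shell; a minimal sketch that sets the `DISABLE_TELEMETRY` variable named above in the process environment before any models are loaded:

```py
import os

# Equivalent to the shell commands above; set it before any model or pipeline is downloaded/loaded.
os.environ["DISABLE_TELEMETRY"] = "YES"

from diffusers import DiffusionPipeline  # subsequent from_pretrained calls send no telemetry
```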
From 73cc43109b62a744f49eb803fef4c6d4e5211b7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?=
<46008593+standardAI@users.noreply.github.com>
Date: Fri, 28 Apr 2023 12:57:27 +0300
Subject: [PATCH 020/206] Update logging.mdx (#2863)
Fix typos
---
docs/source/en/api/logging.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/api/logging.mdx b/docs/source/en/api/logging.mdx
index b52c0434f42d..bb973db781ea 100644
--- a/docs/source/en/api/logging.mdx
+++ b/docs/source/en/api/logging.mdx
@@ -61,7 +61,7 @@ verbose to the most verbose), those levels (with their corresponding int values
critical errors.
- `diffusers.logging.ERROR` (int value, 40): only report errors.
- `diffusers.logging.WARNING` or `diffusers.logging.WARN` (int value, 30): only reports error and
- warnings. This the default level used by the library.
+ warnings. This is the default level used by the library.
- `diffusers.logging.INFO` (int value, 20): reports error, warnings and basic information.
- `diffusers.logging.DEBUG` (int value, 10): report all information.
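
The level list above maps directly onto the setter helpers exposed by `diffusers.logging`; a minimal sketch of switching the library from the default WARNING level to INFO:

```py
import diffusers

# The library defaults to WARNING; lower the threshold so basic progress information is reported too.
diffusers.logging.set_verbosity_info()

# Equivalent explicit form using the integer level described in the list above.
diffusers.logging.set_verbosity(diffusers.logging.INFO)
```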
From 6290668254f421496c968e39d7de4e07e6bc394d Mon Sep 17 00:00:00 2001
From: timegate
Date: Fri, 28 Apr 2023 18:58:10 +0900
Subject: [PATCH 021/206] Add multiple conditions to
StableDiffusionControlNetInpaintPipeline (#3125)
* try multi controlnet inpaint
* multi controlnet inpaint
* multi controlnet inpaint
---
.../stable_diffusion_controlnet_inpaint.py | 184 ++++++++++++------
1 file changed, 123 insertions(+), 61 deletions(-)
diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py
index c47f4c3194e8..aae199f91b9e 100644
--- a/examples/community/stable_diffusion_controlnet_inpaint.py
+++ b/examples/community/stable_diffusion_controlnet_inpaint.py
@@ -1,7 +1,7 @@
# Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import PIL.Image
@@ -11,6 +11,7 @@
from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
@@ -184,7 +185,14 @@ def prepare_mask_image(mask_image):
def prepare_controlnet_conditioning_image(
- controlnet_conditioning_image, width, height, batch_size, num_images_per_prompt, device, dtype
+ controlnet_conditioning_image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance,
):
if not isinstance(controlnet_conditioning_image, torch.Tensor):
if isinstance(controlnet_conditioning_image, PIL.Image.Image):
@@ -214,6 +222,9 @@ def prepare_controlnet_conditioning_image(
controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
+ if do_classifier_free_guidance:
+ controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
+
return controlnet_conditioning_image
@@ -230,7 +241,7 @@ def __init__(
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
- controlnet: ControlNetModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
@@ -254,6 +265,9 @@ def __init__(
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
self.register_modules(
vae=vae,
text_encoder=text_encoder,
@@ -264,6 +278,7 @@ def __init__(
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
@@ -522,6 +537,42 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
+ def check_controlnet_conditioning_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, torch.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+ raise TypeError(
+ "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ elif image_is_tensor:
+ image_batch_size = image.shape[0]
+ elif image_is_pil_list:
+ image_batch_size = len(image)
+ elif image_is_tensor_list:
+ image_batch_size = len(image)
+ else:
+ raise ValueError("controlnet condition image is not valid")
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+ else:
+ raise ValueError("prompt or prompt_embeds are not valid")
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
def check_inputs(
self,
prompt,
@@ -534,6 +585,7 @@ def check_inputs(
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
+ controlnet_conditioning_scale=None,
):
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -572,45 +624,35 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
- controlnet_cond_image_is_pil = isinstance(controlnet_conditioning_image, PIL.Image.Image)
- controlnet_cond_image_is_tensor = isinstance(controlnet_conditioning_image, torch.Tensor)
- controlnet_cond_image_is_pil_list = isinstance(controlnet_conditioning_image, list) and isinstance(
- controlnet_conditioning_image[0], PIL.Image.Image
- )
- controlnet_cond_image_is_tensor_list = isinstance(controlnet_conditioning_image, list) and isinstance(
- controlnet_conditioning_image[0], torch.Tensor
- )
-
- if (
- not controlnet_cond_image_is_pil
- and not controlnet_cond_image_is_tensor
- and not controlnet_cond_image_is_pil_list
- and not controlnet_cond_image_is_tensor_list
- ):
- raise TypeError(
- "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
- )
-
- if controlnet_cond_image_is_pil:
- controlnet_cond_image_batch_size = 1
- elif controlnet_cond_image_is_tensor:
- controlnet_cond_image_batch_size = controlnet_conditioning_image.shape[0]
- elif controlnet_cond_image_is_pil_list:
- controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
- elif controlnet_cond_image_is_tensor_list:
- controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
-
- if prompt is not None and isinstance(prompt, str):
- prompt_batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- prompt_batch_size = len(prompt)
- elif prompt_embeds is not None:
- prompt_batch_size = prompt_embeds.shape[0]
-
- if controlnet_cond_image_batch_size != 1 and controlnet_cond_image_batch_size != prompt_batch_size:
- raise ValueError(
- f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {controlnet_cond_image_batch_size}, prompt batch size: {prompt_batch_size}"
- )
+ # check controlnet condition image
+ if isinstance(self.controlnet, ControlNetModel):
+ self.check_controlnet_conditioning_image(controlnet_conditioning_image, prompt, prompt_embeds)
+ elif isinstance(self.controlnet, MultiControlNetModel):
+ if not isinstance(controlnet_conditioning_image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+ if len(controlnet_conditioning_image) != len(self.controlnet.nets):
+ raise ValueError(
+ "For multiple controlnets: `image` must have the same length as the number of controlnets."
+ )
+ for image_ in controlnet_conditioning_image:
+ self.check_controlnet_conditioning_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if isinstance(self.controlnet, ControlNetModel):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
if isinstance(image, torch.Tensor) and not isinstance(mask_image, torch.Tensor):
raise TypeError("if `image` is a tensor, `mask_image` must also be a tensor")
@@ -630,6 +672,8 @@ def check_inputs(
image_channels, image_height, image_width = image.shape
elif image.ndim == 4:
image_batch_size, image_channels, image_height, image_width = image.shape
+ else:
+ assert False
if mask_image.ndim == 2:
mask_image_batch_size = 1
@@ -797,7 +841,7 @@ def __call__(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- controlnet_conditioning_scale: float = 1.0,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -897,6 +941,7 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
+ controlnet_conditioning_scale,
)
# 2. Define call parameters
@@ -913,6 +958,9 @@ def __call__(
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
+ if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
prompt,
@@ -929,15 +977,37 @@ def __call__(
mask_image = prepare_mask_image(mask_image)
- controlnet_conditioning_image = prepare_controlnet_conditioning_image(
- controlnet_conditioning_image,
- width,
- height,
- batch_size * num_images_per_prompt,
- num_images_per_prompt,
- device,
- self.controlnet.dtype,
- )
+ # condition image(s)
+ if isinstance(self.controlnet, ControlNetModel):
+ controlnet_conditioning_image = prepare_controlnet_conditioning_image(
+ controlnet_conditioning_image=controlnet_conditioning_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=self.controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
+ elif isinstance(self.controlnet, MultiControlNetModel):
+ controlnet_conditioning_images = []
+
+ for image_ in controlnet_conditioning_image:
+ image_ = prepare_controlnet_conditioning_image(
+ controlnet_conditioning_image=image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=self.controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ )
+ controlnet_conditioning_images.append(image_)
+
+ controlnet_conditioning_image = controlnet_conditioning_images
+ else:
+ assert False
masked_image = image * (mask_image < 0.5)
@@ -979,9 +1049,6 @@ def __call__(
do_classifier_free_guidance,
)
- if do_classifier_free_guidance:
- controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
-
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -1007,15 +1074,10 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
controlnet_cond=controlnet_conditioning_image,
+ conditioning_scale=controlnet_conditioning_scale,
return_dict=False,
)
- down_block_res_samples = [
- down_block_res_sample * controlnet_conditioning_scale
- for down_block_res_sample in down_block_res_samples
- ]
- mid_block_res_sample *= controlnet_conditioning_scale
-
# predict the noise residual
noise_pred = self.unet(
inpainting_latent_model_input,
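
With this change the community pipeline accepts a list of ControlNets, one conditioning image per net, and per-net scales. A minimal usage sketch follows; the checkpoint ids and the dummy PIL inputs are illustrative placeholders, not part of this patch:

```py
import torch
from PIL import Image
from diffusers import ControlNetModel, DiffusionPipeline

# Two ControlNets conditioning the same inpainting run (checkpoint ids are illustrative).
controlnets = [
    ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16),
    ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
]

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    controlnet=controlnets,
    custom_pipeline="stable_diffusion_controlnet_inpaint",
    torch_dtype=torch.float16,
).to("cuda")

# Placeholder inputs; in practice these are the image to repaint, its mask, and the conditioning maps.
init_image = Image.new("RGB", (512, 512), "gray")
mask_image = Image.new("L", (512, 512), 255)
canny_image = Image.new("RGB", (512, 512), "black")
pose_image = Image.new("RGB", (512, 512), "black")

result = pipe(
    prompt="a red couch in a bright living room",
    image=init_image,
    mask_image=mask_image,
    controlnet_conditioning_image=[canny_image, pose_image],  # one conditioning image per ControlNet
    controlnet_conditioning_scale=[1.0, 0.8],  # per-net scales, enabled by this patch
    num_inference_steps=30,
).images[0]
result.save("multi_controlnet_inpaint.png")
```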
From d46421446437511c931afd38ba3aa4908a00bdd9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 28 Apr 2023 12:39:50 +0200
Subject: [PATCH 022/206] Let's make sure that dreambooth always uploads to the
Hub (#3272)
* Update Dreambooth README
* Adapt all docs as well
* automatically write model card
* fix
* make style
---
docs/source/en/training/dreambooth.mdx | 18 ++++++----
examples/dreambooth/README.md | 18 ++++++----
examples/dreambooth/train_dreambooth.py | 48 ++++++++++++++++++++++++-
3 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx
index c5a5a047d114..09b877c7d0cc 100644
--- a/docs/source/en/training/dreambooth.mdx
+++ b/docs/source/en/training/dreambooth.mdx
@@ -98,7 +98,8 @@ accelerate launch train_dreambooth.py \
--learning_rate=5e-6 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --max_train_steps=400
+ --max_train_steps=400 \
+ --push_to_hub
```
@@ -161,7 +162,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
@@ -225,7 +227,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
@@ -387,7 +390,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
### 12GB GPU
@@ -418,7 +422,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
### 8 GB GPU
@@ -464,7 +469,8 @@ accelerate launch train_dreambooth.py \
--lr_warmup_steps=0 \
--num_class_images=200 \
--max_train_steps=800 \
- --mixed_precision=fp16
+ --mixed_precision=fp16 \
+ --push_to_hub
```
## Inference
diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index e1eb8a06b0ff..490e31458988 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -80,7 +80,8 @@ accelerate launch train_dreambooth.py \
--learning_rate=5e-6 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --max_train_steps=400
+ --max_train_steps=400 \
+ --push_to_hub
```
### Training with prior-preservation loss
@@ -109,7 +110,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
@@ -141,7 +143,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
@@ -176,7 +179,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
@@ -218,7 +222,8 @@ accelerate launch --mixed_precision="fp16" train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
### Fine-tune text encoder with the UNet.
@@ -251,7 +256,8 @@ accelerate launch train_dreambooth.py \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
- --max_train_steps=800
+ --max_train_steps=800 \
+ --push_to_hub
```
### Using DreamBooth for pipelines other than Stable Diffusion
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 593af005d6f4..190f4625a16c 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -61,6 +61,39 @@
logger = get_logger(__name__)
+def save_model_card(repo_id: str, images=None, base_model: str = None, train_text_encoder=False, prompt: str = None, repo_folder=None):
+ img_str = ""
+ for i, image in enumerate(images):
+ image.save(os.path.join(repo_folder, f"image_{i}.png"))
+ img_str += f"\n"
+
+ yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+instance_prompt: {prompt}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- dreambooth
+inference: true
+---
+ """
+ model_card = f"""
+# DreamBooth - {repo_id}
+
+This is a dreambooth model derived from {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/).
+You can find some example images in the following. \n
+{img_str}
+
+DreamBooth for the text encoder was enabled: {train_text_encoder}.
+"""
+ with open(os.path.join(repo_folder, "README.md"), "w") as f:
+ f.write(yaml + model_card)
+
+
def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch):
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
@@ -104,6 +137,8 @@ def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight
del pipeline
torch.cuda.empty_cache()
+ return images
+
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
text_encoder_config = PretrainedConfig.from_pretrained(
@@ -997,13 +1032,16 @@ def load_model_hook(models, input_dir):
global_step += 1
if accelerator.is_main_process:
+ images = []
if global_step % args.checkpointing_steps == 0:
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
accelerator.save_state(save_path)
logger.info(f"Saved state to {save_path}")
if args.validation_prompt is not None and global_step % args.validation_steps == 0:
- log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch)
+ images = log_validation(
+ text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch
+ )
logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
progress_bar.set_postfix(**logs)
@@ -1024,6 +1062,14 @@ def load_model_hook(models, input_dir):
pipeline.save_pretrained(args.output_dir)
if args.push_to_hub:
+ save_model_card(
+ repo_id,
+ images=images,
+ base_model=args.pretrained_model_name_or_path,
+ train_text_encoder=args.train_text_encoder,
+ prompt=args.instance_prompt,
+ repo_folder=args.output_dir,
+ )
upload_folder(
repo_id=repo_id,
folder_path=args.output_dir,
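
After `--push_to_hub` uploads the trained pipeline together with this auto-generated model card, the result can be pulled back down for inference. A minimal sketch; the repo id and prompt are placeholders for your own Hub repository and `--instance_prompt`:

```py
import torch
from diffusers import DiffusionPipeline

# "your-username/dreambooth-model" is a placeholder for the repo created by --push_to_hub.
pipe = DiffusionPipeline.from_pretrained("your-username/dreambooth-model", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Use the instance prompt recorded in the model card's `instance_prompt` field.
image = pipe("a photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0]
image.save("dreambooth-sample.png")
```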
From be0bfcec4dacf5f98de9b12967472a3a5b5fcde2 Mon Sep 17 00:00:00 2001
From: clarencechen
Date: Fri, 28 Apr 2023 03:58:26 -0700
Subject: [PATCH 023/206] Diffedit Zero-Shot Inpainting Pipeline (#2837)
* Update Pix2PixZero Auto-correlation Loss
* Add Stable Diffusion DiffEdit pipeline
* Add draft documentation and import code
* Bugfixes and refactoring
* Add option to not decode latents in the inversion process
* Harmonize preprocessing
* Revert "Update Pix2PixZero Auto-correlation Loss"
This reverts commit b218062fed08d6cc164206d6cb852b2b7b00847a.
* Update annotations
* rename `compute_mask` to `generate_mask`
* Update documentation
* Update docs
* Update Docs
* Fix copy
* Change shape of output latents to batch first
* Update docs
* Add first draft for tests
* Bugfix and update tests
* Add `cross_attention_kwargs` support for all pipeline methods
* Fix Copies
* Add support for PIL image latents
Add support for mask broadcasting
Update docs and tests
Align `mask` argument to `mask_image`
Remove height and width arguments
* Enable MPS Tests
* Move example docstrings
* Fix test
* Fix test
* fix pipeline inheritance
* Harmonize `prepare_image_latents` with StableDiffusionPix2PixZeroPipeline
* Register modules set to `None` in config for `test_save_load_optional_components`
* Move fixed logic to specific test class
* Clean changes to other pipelines
* Update new tests to coordinate with #2953
* Update slow tests for better results
* Safety to avoid potential problems with torch.inference_mode
* Add reference in SD Pipeline Overview
* Fix tests again
* Enforce determinism in noise for generate_mask
* Fix copies
* Widen test tolerance for fp16 based on `test_stable_diffusion_upscale_pipeline_fp16`
* Add LoraLoaderMixin and update `prepare_image_latents`
* clean up repeat and reg
* bugfix
* Remove invalid args from docs
Suppress spurious warning by repeating image before latent to mask gen
---
docs/source/en/_toctree.yml | 2 +
.../pipelines/stable_diffusion/diffedit.mdx | 360 ++++
.../pipelines/stable_diffusion/overview.mdx | 1 +
src/diffusers/__init__.py | 1 +
src/diffusers/pipelines/__init__.py | 1 +
.../pipelines/stable_diffusion/__init__.py | 2 +
.../pipeline_stable_diffusion_diffedit.py | 1530 +++++++++++++++++
.../dummy_torch_and_transformers_objects.py | 15 +
.../test_stable_diffusion_diffedit.py | 315 ++++
9 files changed, 2227 insertions(+)
create mode 100644 docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx
create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
create mode 100644 tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 26d3dbcf4e83..fc101347a6e9 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -203,6 +203,8 @@
title: Text-to-Image Generation with ControlNet Conditioning
- local: api/pipelines/stable_diffusion/model_editing
title: Text-to-Image Model Editing
+ - local: api/pipelines/stable_diffusion/diffedit
+ title: DiffEdit
title: Stable Diffusion
- local: api/pipelines/stable_diffusion_2
title: Stable Diffusion 2
diff --git a/docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx b/docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx
new file mode 100644
index 000000000000..a7cd906e0e77
--- /dev/null
+++ b/docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx
@@ -0,0 +1,360 @@
+
+
+# Zero-shot Diffusion-based Semantic Image Editing with Mask Guidance
+
+## Overview
+
+[DiffEdit: Diffusion-based semantic image editing with mask guidance](https://arxiv.org/abs/2210.11427) by Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord.
+
+The abstract of the paper is the following:
+
+*Image generation has recently seen tremendous advances, with diffusion models allowing to synthesize convincing images for a large variety of text prompts. In this article, we propose DiffEdit, a method to take advantage of text-conditioned diffusion models for the task of semantic image editing, where the goal is to edit an image based on a text query. Semantic image editing is an extension of image generation, with the additional constraint that the generated image should be as similar as possible to a given input image. Current editing methods based on diffusion models usually require to provide a mask, making the task much easier by treating it as a conditional inpainting task. In contrast, our main contribution is able to automatically generate a mask highlighting regions of the input image that need to be edited, by contrasting predictions of a diffusion model conditioned on different text prompts. Moreover, we rely on latent inference to preserve content in those regions of interest and show excellent synergies with mask-based diffusion. DiffEdit achieves state-of-the-art editing performance on ImageNet. In addition, we evaluate semantic image editing in more challenging settings, using images from the COCO dataset as well as text-based generated images.*
+
+Resources:
+
+* [Paper](https://arxiv.org/abs/2210.11427).
+* [Blog Post with Demo](https://blog.problemsolversguild.com/technical/research/2022/11/02/DiffEdit-Implementation.html).
+* [Implementation on Github](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/).
+
+## Tips
+
+* The pipeline can generate masks that can be fed into other inpainting pipelines. Check out the code examples below to learn more.
+* In order to generate an image using this pipeline, both an image mask (manually specified or generated using `generate_mask`)
+and a set of partially inverted latents (generated using `invert`) _must_ be provided as arguments when calling the pipeline to generate the final edited image.
+Refer to the code examples below for more details.
+* The function `generate_mask` exposes two prompt arguments, `source_prompt` and `target_prompt`,
+that let you control the locations of the semantic edits in the final image to be generated. Let's say
+you want to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". To reflect
+this in the generated mask, you simply have to set the embeddings related to the phrases including "cat" to
+`source_prompt_embeds` and "dog" to `target_prompt_embeds`. Refer to the code example below for more details.
+* When generating partially inverted latents using `invert`, assign a caption or text embedding describing the
+overall image to the `prompt` argument to help guide the inverse latent sampling process. In most cases, the
+source concept is sufficiently descriptive to yield good results, but feel free to explore alternatives.
+Please refer to [this code example](#generating-image-captions-for-inversion) for more details.
+* When calling the pipeline to generate the final edited image, assign the source concept to `negative_prompt`
+and the target concept to `prompt`. Taking the above example, you simply have to set the embeddings related to
+the phrases including "cat" to `negative_prompt_embeds` and "dog" to `prompt_embeds`. Refer to the code example
+below for more details.
+* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to:
+ * Swap the `source_prompt` and `target_prompt` in the arguments to `generate_mask`.
+ * Change the input prompt for `invert` to include "dog".
+ * Swap the `prompt` and `negative_prompt` in the arguments to call the pipeline to generate the final edited image.
+* Note that the source and target prompts, or their corresponding embeddings, can also be automatically generated. Please, refer to [this discussion](#generating-source-and-target-embeddings) for more details.
+
+## Available Pipelines:
+
+| Pipeline | Tasks
+|---|---|
+| [StableDiffusionDiffEditPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py) | *Text-Based Image Editing*
+
+
+
+## Usage example
+
+### Based on an input image with a caption
+
+When the pipeline is conditioned on an input image, we first obtain partially inverted latents from the input image using a
+`DDIMInverseScheduler` with the help of a caption. Then we generate an editing mask to identify relevant regions in the image using the source and target prompts. Finally,
+the inverted latents and the generated mask are used to start the generation process.
+
+First, let's load our pipeline:
+
+```py
+import torch
+from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionDiffEditPipeline
+
+sd_model_ckpt = "stabilityai/stable-diffusion-2-1"
+pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
+ sd_model_ckpt,
+ torch_dtype=torch.float16,
+ safety_checker=None,
+)
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+pipeline.enable_model_cpu_offload()
+pipeline.enable_vae_slicing()
+generator = torch.manual_seed(0)
+```
+
+Then, we load an input image to edit using our method:
+
+```py
+from diffusers.utils import load_image
+
+img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+raw_image = load_image(img_url).convert("RGB").resize((768, 768))
+```
+
+Then, we employ the source and target prompts to generate the editing mask:
+
+```py
+# See the "Generating source and target embeddings" section below to
+# automate the generation of these prompts with a pre-trained model like Flan-T5.
+
+source_prompt = "a bowl of fruits"
+target_prompt = "a basket of fruits"
+mask_image = pipeline.generate_mask(
+ image=raw_image,
+ source_prompt=source_prompt,
+ target_prompt=target_prompt,
+ generator=generator,
+)
+```
+
+Then, we employ the caption and the input image to get the inverted latents:
+
+```py
+inv_latents = pipeline.invert(prompt=source_prompt, image=raw_image, generator=generator).latents
+```
+
+Now, generate the image with the inverted latents and semantically generated mask:
+
+```py
+image = pipeline(
+ prompt=target_prompt,
+ mask_image=mask_image,
+ image_latents=inv_latents,
+ generator=generator,
+ negative_prompt=source_prompt,
+).images[0]
+image.save("edited_image.png")
+```
+
+## Generating image captions for inversion
+
+The authors originally used the source concept prompt as the caption for generating the partially inverted latents. However, we can also leverage open source and public image captioning models for the same purpose.
+Below, we provide an end-to-end example with the [BLIP](https://huggingface.co/docs/transformers/model_doc/blip) model
+for generating captions.
+
+First, let's load our automatic image captioning model:
+
+```py
+import torch
+from transformers import BlipForConditionalGeneration, BlipProcessor
+
+captioner_id = "Salesforce/blip-image-captioning-base"
+processor = BlipProcessor.from_pretrained(captioner_id)
+model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+```
+
+Then, we define a utility to generate captions from an input image using the model:
+
+```py
+@torch.no_grad()
+def generate_caption(images, caption_generator, caption_processor):
+ text = "a photograph of"
+
+ inputs = caption_processor(images, text, return_tensors="pt").to(device="cuda", dtype=caption_generator.dtype)
+ caption_generator.to("cuda")
+ outputs = caption_generator.generate(**inputs, max_new_tokens=128)
+
+ # offload caption generator
+ caption_generator.to("cpu")
+
+ caption = caption_processor.batch_decode(outputs, skip_special_tokens=True)[0]
+ return caption
+```
+
+Then, we load an input image for conditioning and obtain a suitable caption for it:
+
+```py
+from diffusers.utils import load_image
+
+img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+raw_image = load_image(img_url).convert("RGB").resize((768, 768))
+caption = generate_caption(raw_image, model, processor)
+```
+
+Then, we employ the generated caption and the input image to get the inverted latents:
+
+```py
+from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionDiffEditPipeline
+
+pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+)
+pipeline = pipeline.to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_vae_slicing()
+
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.manual_seed(0)
+inv_latents = pipeline.invert(prompt=caption, image=raw_image, generator=generator).latents
+```
+
+Now, generate the image with the inverted latents and semantically generated mask from our source and target prompts:
+
+```py
+source_prompt = "a bowl of fruits"
+target_prompt = "a basket of fruits"
+
+mask_image = pipeline.generate_mask(
+ image=raw_image,
+ source_prompt=source_prompt,
+ target_prompt=target_prompt,
+ generator=generator,
+)
+
+image = pipeline(
+ prompt=target_prompt,
+ mask_image=mask_image,
+ image_latents=inv_latents,
+ generator=generator,
+ negative_prompt=source_prompt,
+).images[0]
+image.save("edited_image.png")
+```
+
+## Generating source and target embeddings
+
+The authors originally required the user to manually provide the source and target prompts for discovering
+edit directions. However, we can also leverage open source and public models for the same purpose.
+Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model
+for generating source and target embeddings.
+
+**1. Load the generation model**:
+
+```py
+import torch
+from transformers import AutoTokenizer, T5ForConditionalGeneration
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
+```
+
+**2. Construct a starting prompt**:
+
+```py
+source_concept = "bowl"
+target_concept = "basket"
+
+source_text = f"Provide a caption for images containing a {source_concept}. "
+"The captions should be in English and should be no longer than 150 characters."
+
+target_text = f"Provide a caption for images containing a {target_concept}. "
+"The captions should be in English and should be no longer than 150 characters."
+```
+
+Here, we're interested in the "bowl -> basket" direction.
+
+**3. Generate prompts**:
+
+We can use a utility like so for this purpose.
+
+```py
+@torch.no_grad()
+def generate_prompts(input_prompt):
+ input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda")
+
+ outputs = model.generate(
+ input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10
+ )
+ return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+```
+
+And then we just call it to generate our prompts:
+
+```py
+source_prompts = generate_prompts(source_text)
+target_prompts = generate_prompts(target_text)
+```
+
+We encourage you to play around with the different parameters supported by the
+`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate)) for the generation quality you are looking for.
+
+**4. Load the embedding model**:
+
+Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model.
+
+```py
+from diffusers import StableDiffusionDiffEditPipeline
+
+pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+)
+pipeline = pipeline.to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_vae_slicing()
+
+generator = torch.manual_seed(0)
+```
+
+**5. Compute embeddings**:
+
+```py
+import torch
+
+@torch.no_grad()
+def embed_prompts(sentences, tokenizer, text_encoder, device="cuda"):
+ embeddings = []
+ for sent in sentences:
+ text_inputs = tokenizer(
+ sent,
+ padding="max_length",
+ max_length=tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0]
+ embeddings.append(prompt_embeds)
+ return torch.cat(embeddings, dim=0).mean(dim=0).unsqueeze(0)
+
+source_embeddings = embed_prompts(source_prompts, pipeline.tokenizer, pipeline.text_encoder)
+target_embeddings = embed_prompts(target_prompts, pipeline.tokenizer, pipeline.text_encoder)
+```
+
+And you're done! Now, you can use these embeddings directly while calling the pipeline:
+
+```py
+from diffusers import DDIMInverseScheduler, DDIMScheduler
+from diffusers.utils import load_image
+
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+
+img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+raw_image = load_image(img_url).convert("RGB").resize((768, 768))
+
+
+mask_image = pipeline.generate_mask(
+ image=raw_image,
+ source_prompt_embeds=source_embeddings,
+ target_prompt_embeds=target_embeddings,
+ generator=generator,
+)
+
+inv_latents = pipeline.invert(
+ prompt_embeds=source_embeddings,
+ image=raw_image,
+ generator=generator,
+).latents
+
+images = pipeline(
+ mask_image=mask_image,
+ image_latents=inv_latents,
+ prompt_embeds=target_embeddings,
+ negative_prompt_embeds=source_embeddings,
+ generator=generator,
+).images
+images[0].save("edited_image.png")
+```
+
+## StableDiffusionDiffEditPipeline
+[[autodoc]] StableDiffusionDiffEditPipeline
+ - all
+ - generate_mask
+ - invert
+ - __call__
\ No newline at end of file
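
The tips in this new document describe how to reverse an edit direction. Below is a minimal sketch of the reversed "basket -> bowl" direction, reusing the `pipeline` and `generator` objects from the usage example above and assuming `raw_image` now holds the (basket) image you want to edit back; only the argument roles are swapped, nothing here goes beyond the APIs shown in the file:

```py
# Reverse of the "bowl -> basket" example above: swap the roles of the two concepts.
source_prompt = "a basket of fruits"  # what the image currently shows
target_prompt = "a bowl of fruits"    # what we want instead

# 1. Swap source/target when generating the mask.
mask_image = pipeline.generate_mask(
    image=raw_image,
    source_prompt=source_prompt,
    target_prompt=target_prompt,
    generator=generator,
)

# 2. Invert with a caption describing the current content.
inv_latents = pipeline.invert(prompt=source_prompt, image=raw_image, generator=generator).latents

# 3. Swap prompt / negative_prompt in the final call.
image = pipeline(
    prompt=target_prompt,
    mask_image=mask_image,
    image_latents=inv_latents,
    negative_prompt=source_prompt,
    generator=generator,
).images[0]
image.save("reversed_edit.png")
```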
diff --git a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
index 70731fd294b9..a163b57f2a84 100644
--- a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
@@ -36,6 +36,7 @@ For more details about how Stable Diffusion works and how it differs from the ba
| [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
| [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027)
| [StableDiffusionModelEditingPipeline](./model_editing) | **Experimental** – *Text-to-Image Model Editing * | | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084)
+| [StableDiffusionDiffEditPipeline](./diffedit) | **Experimental** – *Text-Based Image Editing * | | [DiffEdit: Diffusion-based semantic image editing with mask guidance](https://arxiv.org/abs/2210.11427)
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 078d03eb8995..a8293ea77fef 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -134,6 +134,7 @@
StableDiffusionAttendAndExcitePipeline,
StableDiffusionControlNetPipeline,
StableDiffusionDepth2ImgPipeline,
+ StableDiffusionDiffEditPipeline,
StableDiffusionImageVariationPipeline,
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 10da653a1377..3cddad4a6b26 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -60,6 +60,7 @@
StableDiffusionAttendAndExcitePipeline,
StableDiffusionControlNetPipeline,
StableDiffusionDepth2ImgPipeline,
+ StableDiffusionDiffEditPipeline,
StableDiffusionImageVariationPipeline,
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 6bc2b58b5fef..b89dde319cb3 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -75,10 +75,12 @@ class StableDiffusionPipelineOutput(BaseOutput):
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import (
StableDiffusionDepth2ImgPipeline,
+ StableDiffusionDiffEditPipeline,
StableDiffusionPix2PixZeroPipeline,
)
else:
from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline
+ from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
new file mode 100644
index 000000000000..9bef5269fa07
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
@@ -0,0 +1,1530 @@
+# Copyright 2023 DiffEdit Authors and Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
+from ...utils import (
+ PIL_INTERPOLATION,
+ BaseOutput,
+ deprecate,
+ is_accelerate_available,
+ is_accelerate_version,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+class DiffEditInversionPipelineOutput(BaseOutput):
+ """
+ Output class for Stable Diffusion pipelines.
+
+ Args:
+ latents (`torch.FloatTensor`)
+ inverted latents tensor
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
+ List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps,
+ batch_size, height, width, num_channels)`. PIL images or numpy array represent the denoised images of the
+ diffusion pipeline.
+ """
+
+ latents: torch.FloatTensor
+ images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+EXAMPLE_DOC_STRING = """
+
+ ```py
+ >>> import PIL
+ >>> import requests
+ >>> import torch
+ >>> from io import BytesIO
+
+ >>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionDiffEditPipeline
+
+
+ >>> def download_image(url):
+ ... response = requests.get(url)
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+ >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+
+ >>> init_image = download_image(img_url).resize((768, 768))
+
+ >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+ ... "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+ ... )
+ >>> pipe = pipe.to("cuda")
+
+ >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ >>> pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
+ >>> pipe.enable_model_cpu_offload()
+
+ >>> mask_prompt = "A bowl of fruits"
+ >>> prompt = "A bowl of pears"
+
+ >>> mask_image = pipe.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
+ >>> image_latents = pipe.invert(image=init_image, prompt=mask_prompt).latents
+ >>> image = pipe(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
+ ```
+"""
+
+EXAMPLE_INVERT_DOC_STRING = """
+ ```py
+ >>> import PIL
+ >>> import requests
+ >>> import torch
+ >>> from io import BytesIO
+
+ >>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionDiffEditPipeline
+
+
+ >>> def download_image(url):
+ ... response = requests.get(url)
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+ >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"
+
+ >>> init_image = download_image(img_url).resize((768, 768))
+
+ >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+ ... "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+ ... )
+ >>> pipe = pipe.to("cuda")
+
+ >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ >>> pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
+ >>> pipe.enable_model_cpu_offload()
+
+ >>> prompt = "A bowl of fruits"
+
+ >>> inverted_latents = pipe.invert(image=init_image, prompt=prompt).latents
+ ```
+"""
+
+
+def auto_corr_loss(hidden_states, generator=None):
+ reg_loss = 0.0
+ for i in range(hidden_states.shape[0]):
+ for j in range(hidden_states.shape[1]):
+ noise = hidden_states[i : i + 1, j : j + 1, :, :]
+ while True:
+ roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
+ reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2
+ reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2
+
+ if noise.shape[2] <= 8:
+ break
+ noise = torch.nn.functional.avg_pool2d(noise, kernel_size=2)
+ return reg_loss
+
+
+def kl_divergence(hidden_states):
+ return hidden_states.var() + hidden_states.mean() ** 2 - 1 - torch.log(hidden_states.var() + 1e-7)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
+def preprocess(image):
+ if isinstance(image, torch.Tensor):
+ return image
+ elif isinstance(image, PIL.Image.Image):
+ image = [image]
+
+ if isinstance(image[0], PIL.Image.Image):
+ w, h = image[0].size
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
+
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+ image = np.concatenate(image, axis=0)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image.transpose(0, 3, 1, 2)
+ image = 2.0 * image - 1.0
+ image = torch.from_numpy(image)
+ elif isinstance(image[0], torch.Tensor):
+ image = torch.cat(image, dim=0)
+ return image
+
+
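+ # Normalizes `mask` inputs (PIL, numpy, or torch tensors; single or batched) to a float tensor of shape
+ # (batch_size, 1, H, W) with binary {0, 1} values, broadcasting a single mask to `batch_size` when needed.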
+def preprocess_mask(mask, batch_size: int = 1):
+ if not isinstance(mask, torch.Tensor):
+ # preprocess mask
+ if isinstance(mask, PIL.Image.Image) or isinstance(mask, np.ndarray):
+ mask = [mask]
+
+ if isinstance(mask, list):
+ if isinstance(mask[0], PIL.Image.Image):
+ mask = [np.array(m.convert("L")).astype(np.float32) / 255.0 for m in mask]
+ if isinstance(mask[0], np.ndarray):
+ mask = np.stack(mask, axis=0) if mask[0].ndim < 3 else np.concatenate(mask, axis=0)
+ mask = torch.from_numpy(mask)
+ elif isinstance(mask[0], torch.Tensor):
+ mask = torch.stack(mask, dim=0) if mask[0].ndim < 3 else torch.cat(mask, dim=0)
+
+ # Batch and add channel dim for single mask
+ if mask.ndim == 2:
+ mask = mask.unsqueeze(0).unsqueeze(0)
+
+ # Batch single mask or add channel dim
+ if mask.ndim == 3:
+ # Single batched mask, no channel dim or single mask not batched but channel dim
+ if mask.shape[0] == 1:
+ mask = mask.unsqueeze(0)
+
+ # Batched masks no channel dim
+ else:
+ mask = mask.unsqueeze(1)
+
+ # Check mask shape
+ if batch_size > 1:
+ if mask.shape[0] == 1:
+ mask = torch.cat([mask] * batch_size)
+ elif mask.shape[0] > 1 and mask.shape[0] != batch_size:
+ raise ValueError(
+ f"`mask_image` with batch size {mask.shape[0]} cannot be broadcasted to batch size {batch_size} "
+ f"inferred by prompt inputs"
+ )
+
+ if mask.shape[1] != 1:
+ raise ValueError(f"`mask_image` must have 1 channel, but has {mask.shape[1]} channels")
+
+ # Check mask is in [0, 1]
+ if mask.min() < 0 or mask.max() > 1:
+ raise ValueError("`mask_image` should be in [0, 1] range")
+
+ # Binarize mask
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+
+ return mask
+
+
+class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion and DiffEdit. *This is an experimental feature*.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ In addition, the pipeline inherits the following loading methods:
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
+
+ as well as the following saving methods:
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+ inverse_scheduler ([`DDIMInverseScheduler`]):
+ A scheduler to be used in combination with `unet` to invert the latents of the input image.
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPImageProcessor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ inverse_scheduler: DDIMInverseScheduler,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+ " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+ " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+ " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+ " Hub, it would be very nice if you could open a Pull request for the"
+ " `scheduler/scheduler_config.json` file"
+ )
+ deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["skip_prk_steps"] = True
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ )
+
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+ version.parse(unet.config._diffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+ deprecation_message = (
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(unet.config)
+ new_config["sample_size"] = 64
+ unet._internal_dict = FrozenDict(new_config)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ inverse_scheduler=inverse_scheduler,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ else:
+ has_nsfw_concept = None
+ return image, has_nsfw_concept
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+ def decode_latents(self, latents):
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents).sample
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
+
+ def check_inputs(
+ self,
+ prompt,
+ strength,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
+ if (strength is None) or (strength is not None and (strength < 0 or strength > 1)):
+ raise ValueError(
+ f"The value of `strength` should in [0.0, 1.0] but is, but is {strength} of type {type(strength)}."
+ )
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ def check_source_inputs(
+ self,
+ source_prompt=None,
+ source_negative_prompt=None,
+ source_prompt_embeds=None,
+ source_negative_prompt_embeds=None,
+ ):
+ if source_prompt is not None and source_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `source_prompt`: {source_prompt} and `source_prompt_embeds`: {source_prompt_embeds}."
+ " Please make sure to only forward one of the two."
+ )
+ elif source_prompt is None and source_prompt_embeds is None:
+ raise ValueError(
+ "Provide either `source_image` or `source_prompt_embeds`. Cannot leave all both of the arguments undefined."
+ )
+ elif source_prompt is not None and (
+ not isinstance(source_prompt, str) and not isinstance(source_prompt, list)
+ ):
+ raise ValueError(f"`source_prompt` has to be of type `str` or `list` but is {type(source_prompt)}")
+
+ if source_negative_prompt is not None and source_negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `source_negative_prompt`: {source_negative_prompt} and `source_negative_prompt_embeds`:"
+ f" {source_negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if source_prompt_embeds is not None and source_negative_prompt_embeds is not None:
+ if source_prompt_embeds.shape != source_negative_prompt_embeds.shape:
+ raise ValueError(
+ "`source_prompt_embeds` and `source_negative_prompt_embeds` must have the same shape when passed"
+ f" directly, but got: `source_prompt_embeds` {source_prompt_embeds.shape} !="
+ f" `source_negative_prompt_embeds` {source_negative_prompt_embeds.shape}."
+ )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+ def get_timesteps(self, num_inference_steps, strength, device):
+ # get the original timestep using init_timestep
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+ return timesteps, num_inference_steps - t_start
+
+ def get_inverse_timesteps(self, num_inference_steps, strength, device):
+ # get the original timestep using init_timestep
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+
+ # safety for t_start overflow to prevent an empty timesteps slice
+ if t_start == 0:
+ return self.inverse_scheduler.timesteps, num_inference_steps
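+ # the inverse scheduler's timesteps run from low to high noise, so a partial-strength inversion
+ # drops steps from the end of the schedule rather than the beginning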
+ timesteps = self.inverse_scheduler.timesteps[:-t_start]
+
+ return timesteps, num_inference_steps - t_start
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.StableDiffusionPix2PixZeroPipeline.prepare_image_latents
+ def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+ raise ValueError(
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+ )
+
+ image = image.to(device=device, dtype=dtype)
+
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if isinstance(generator, list):
+ latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)]
+ latents = torch.cat(latents, dim=0)
+ else:
+ latents = self.vae.encode(image).latent_dist.sample(generator)
+
+ latents = self.vae.config.scaling_factor * latents
+
+ if batch_size != latents.shape[0]:
+ if batch_size % latents.shape[0] == 0:
+ # expand image_latents for batch_size
+ deprecation_message = (
+ f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial"
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+ " your script to pass as many initial images as text prompts to suppress this warning."
+ )
+ deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+ additional_latents_per_image = batch_size // latents.shape[0]
+ latents = torch.cat([latents] * additional_latents_per_image, dim=0)
+ else:
+ raise ValueError(
+ f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts."
+ )
+ else:
+ latents = torch.cat([latents], dim=0)
+
+ return latents
+
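+ # Converts the raw model output into an epsilon (noise) estimate regardless of the inverse scheduler's
+ # `prediction_type`, so the regularization losses in `invert` always operate on a noise-like tensor.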
+ def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int):
+ pred_type = self.inverse_scheduler.config.prediction_type
+ alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]
+
+ beta_prod_t = 1 - alpha_prod_t
+
+ if pred_type == "epsilon":
+ return model_output
+ elif pred_type == "sample":
+ return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
+ elif pred_type == "v_prediction":
+ return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+ else:
+ raise ValueError(
+ f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
+ )
+
+ @torch.no_grad()
+ def generate_mask(
+ self,
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ target_prompt: Optional[Union[str, List[str]]] = None,
+ target_negative_prompt: Optional[Union[str, List[str]]] = None,
+ target_prompt_embeds: Optional[torch.FloatTensor] = None,
+ target_negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ source_prompt: Optional[Union[str, List[str]]] = None,
+ source_negative_prompt: Optional[Union[str, List[str]]] = None,
+ source_prompt_embeds: Optional[torch.FloatTensor] = None,
+ source_negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ num_maps_per_mask: Optional[int] = 10,
+ mask_encode_strength: Optional[float] = 0.5,
+ mask_thresholding_ratio: Optional[float] = 3.0,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ output_type: Optional[str] = "np",
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ r"""
+ Function used to generate a latent mask given a mask prompt, a target prompt, and an image.
+
+ Args:
+ image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch which will be used for computing the mask.
+ target_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the semantic mask generation. If not defined, one has to pass
+ `target_prompt_embeds` instead.
+ target_negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the semantic mask generation. If not defined, one has to pass
+ `target_negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
+ `guidance_scale` is less than `1`).
+ target_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ target_negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ source_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the semantic mask generation using the method in [DiffEdit:
+ Diffusion-Based Semantic Image Editing with Mask Guidance](https://arxiv.org/pdf/2210.11427.pdf). If
+ not defined, one has to pass `source_prompt_embeds` or `source_image` instead.
+ source_negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the semantic mask generation away from using the method in [DiffEdit:
+ Diffusion-Based Semantic Image Editing with Mask Guidance](https://arxiv.org/pdf/2210.11427.pdf). If
+ not defined, one has to pass `source_negative_prompt_embeds` or `source_image` instead.
+ source_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text
+ inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from
+ `source_prompt` input argument.
+ source_negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily
+ tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from
+ `source_negative_prompt` input argument.
+ num_maps_per_mask (`int`, *optional*, defaults to 10):
+ The number of noise maps sampled to generate the semantic mask using the method in [DiffEdit:
+ Diffusion-Based Semantic Image Editing with Mask Guidance](https://arxiv.org/pdf/2210.11427.pdf).
+ mask_encode_strength (`float`, *optional*, defaults to 0.5):
+ Conceptually, the strength of the noise maps sampled to generate the semantic mask using the method in
+ [DiffEdit: Diffusion-Based Semantic Image Editing with Mask Guidance](
+ https://arxiv.org/pdf/2210.11427.pdf). Must be between 0 and 1.
+ mask_thresholding_ratio (`float`, *optional*, defaults to 3.0):
+ The maximum multiple of the mean absolute difference used to clamp the semantic guidance map before
+ mask binarization.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ output_type (`str`, *optional*, defaults to `"np"`):
+ The output format of the generated mask. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+ Examples:
+
+ Returns:
+ `List[PIL.Image.Image]` or `np.array`: `List[PIL.Image.Image]` if `output_type` is `"pil"`, otherwise a
+ `np.array`. When returning a `List[PIL.Image.Image]`, the list will consist of a batch of single-channel
+ binary images with dimensions `(height // self.vae_scale_factor, width // self.vae_scale_factor)`, otherwise
+ the `np.array` will have shape `(batch_size, height // self.vae_scale_factor, width //
+ self.vae_scale_factor)`.
+ """
+
+ # 1. Check inputs (Provide dummy argument for callback_steps)
+ self.check_inputs(
+ target_prompt,
+ mask_encode_strength,
+ 1,
+ target_negative_prompt,
+ target_prompt_embeds,
+ target_negative_prompt_embeds,
+ )
+
+ self.check_source_inputs(
+ source_prompt,
+ source_negative_prompt,
+ source_prompt_embeds,
+ source_negative_prompt_embeds,
+ )
+
+ if (num_maps_per_mask is None) or (
+ num_maps_per_mask is not None and (not isinstance(num_maps_per_mask, int) or num_maps_per_mask <= 0)
+ ):
+ raise ValueError(
+ f"`num_maps_per_mask` has to be a positive integer but is {num_maps_per_mask} of type"
+ f" {type(num_maps_per_mask)}."
+ )
+
+ if mask_thresholding_ratio is None or mask_thresholding_ratio <= 0:
+ raise ValueError(
+ f"`mask_thresholding_ratio` has to be positive but is {mask_thresholding_ratio} of type"
+ f" {type(mask_thresholding_ratio)}."
+ )
+
+ # 2. Define call parameters
+ if target_prompt is not None and isinstance(target_prompt, str):
+ batch_size = 1
+ elif target_prompt is not None and isinstance(target_prompt, list):
+ batch_size = len(target_prompt)
+ else:
+ batch_size = target_prompt_embeds.shape[0]
+ if cross_attention_kwargs is None:
+ cross_attention_kwargs = {}
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompts
+ target_prompt_embeds = self._encode_prompt(
+ target_prompt,
+ device,
+ num_maps_per_mask,
+ do_classifier_free_guidance,
+ target_negative_prompt,
+ prompt_embeds=target_prompt_embeds,
+ negative_prompt_embeds=target_negative_prompt_embeds,
+ )
+
+ source_prompt_embeds = self._encode_prompt(
+ source_prompt,
+ device,
+ num_maps_per_mask,
+ do_classifier_free_guidance,
+ source_negative_prompt,
+ prompt_embeds=source_prompt_embeds,
+ negative_prompt_embeds=source_negative_prompt_embeds,
+ )
+
+ # 4. Preprocess image
+ image = preprocess(image).repeat_interleave(num_maps_per_mask, dim=0)
+
+ # 5. Set timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, _ = self.get_timesteps(num_inference_steps, mask_encode_strength, device)
+ encode_timestep = timesteps[0]
+
+ # 6. Prepare image latents and add noise with specified strength
+ image_latents = self.prepare_image_latents(
+ image, batch_size * num_maps_per_mask, self.vae.dtype, device, generator
+ )
+ noise = randn_tensor(image_latents.shape, generator=generator, device=device, dtype=self.vae.dtype)
+ image_latents = self.scheduler.add_noise(image_latents, noise, encode_timestep)
+
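+ # With classifier-free guidance a single U-Net forward pass covers four conditionings
+ # ([neg-source, source, neg-target (uncond), target]); without it only the two conditional branches are needed.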
+ latent_model_input = torch.cat([image_latents] * (4 if do_classifier_free_guidance else 2))
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, encode_timestep)
+
+ # 7. Predict the noise residual
+ prompt_embeds = torch.cat([source_prompt_embeds, target_prompt_embeds])
+ noise_pred = self.unet(
+ latent_model_input,
+ encode_timestep,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
+
+ if do_classifier_free_guidance:
+ noise_pred_neg_src, noise_pred_source, noise_pred_uncond, noise_pred_target = noise_pred.chunk(4)
+ noise_pred_source = noise_pred_neg_src + guidance_scale * (noise_pred_source - noise_pred_neg_src)
+ noise_pred_target = noise_pred_uncond + guidance_scale * (noise_pred_target - noise_pred_uncond)
+ else:
+ noise_pred_source, noise_pred_target = noise_pred.chunk(2)
+
+ # 8. Compute the mask from the absolute difference of predicted noise residuals
+ # TODO: Consider smoothing mask guidance map
+ mask_guidance_map = (
+ torch.abs(noise_pred_target - noise_pred_source)
+ .reshape(batch_size, num_maps_per_mask, *noise_pred_target.shape[-3:])
+ .mean([1, 2])
+ )
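+ # clamp outliers to `mask_thresholding_ratio` times the mean difference, rescale to [0, 1],
+ # then binarize at 0.5 to obtain the editing mask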
+ clamp_magnitude = mask_guidance_map.mean() * mask_thresholding_ratio
+ semantic_mask_image = mask_guidance_map.clamp(0, clamp_magnitude) / clamp_magnitude
+ semantic_mask_image = torch.where(semantic_mask_image <= 0.5, 0, 1)
+ mask_image = semantic_mask_image.cpu().numpy()
+
+ # 9. Convert to Numpy array or PIL.
+ if output_type == "pil":
+ mask_image = self.numpy_to_pil(mask_image)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ return mask_image
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
+ def invert(
+ self,
+ prompt: Optional[Union[str, List[str]]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ num_inference_steps: int = 50,
+ inpaint_strength: float = 0.8,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ decode_latents: bool = False,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ lambda_auto_corr: float = 20.0,
+ lambda_kl: float = 20.0,
+ num_reg_steps: int = 0,
+ num_auto_corr_rolls: int = 5,
+ ):
+ r"""
+ Function used to generate inverted latents given a prompt and image.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch to produce the inverted latents, guided by `prompt`.
+ inpaint_strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how far into the noising process to run latent inversion. Must be between 0 and
+ 1. When `inpaint_strength` is 1, the inversion process is run for the full number of iterations specified
+ in `num_inference_steps`. `image` is used as a reference for the inversion process, with more noise added
+ the larger the `inpaint_strength`. If `inpaint_strength` is 0, no inversion is performed.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+ is less than `1`).
+ generator (`torch.Generator`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ decode_latents (`bool`, *optional*, defaults to `False`):
+ Whether or not to decode the inverted latents into a generated image. Setting this argument to `True`
+ will decode all inverted latents for each timestep into a list of generated images.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.DiffEditInversionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ lambda_auto_corr (`float`, *optional*, defaults to 20.0):
+ Lambda parameter to control the strength of the auto-correlation regularization.
+ lambda_kl (`float`, *optional*, defaults to 20.0):
+ Lambda parameter to control the strength of the Kullback-Leibler divergence regularization.
+ num_reg_steps (`int`, *optional*, defaults to 0):
+ Number of regularization loss steps
+ num_auto_corr_rolls (`int`, *optional*, defaults to 5):
+ Number of rolls applied per regularization step of the auto-correlation loss.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`] or
+ `tuple`: [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`]
+ if `return_dict` is `True`, otherwise a `tuple`. When returning a tuple, the first element is the inverted
+ latents tensor, ordered from the most-noised to the least-noised latents, and the second is the corresponding decoded images if
+ `decode_latents` is `True`, otherwise `None`.
+ """
+
+ # 1. Check inputs
+ self.check_inputs(
+ prompt,
+ inpaint_strength,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ )
+
+ if image is None:
+ raise ValueError("`image` input cannot be undefined.")
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+ if cross_attention_kwargs is None:
+ cross_attention_kwargs = {}
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Preprocess image
+ image = preprocess(image)
+
+ # 4. Prepare latent variables
+ num_images_per_prompt = 1
+ latents = self.prepare_image_latents(
+ image, batch_size * num_images_per_prompt, self.vae.dtype, device, generator
+ )
+
+ # 5. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 6. Prepare timesteps
+ self.inverse_scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = self.get_inverse_timesteps(num_inference_steps, inpaint_strength, device)
+
+ # 7. Noising loop where we obtain the intermediate noised latent image for each timestep.
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
+ inverted_latents = [latents.detach().clone()]
+ with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
+ for i, t in enumerate(timesteps[:-1]):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # regularization of the noise prediction (not in original code or paper but borrowed from Pix2PixZero)
+ if num_reg_steps > 0:
+ with torch.enable_grad():
+ for _ in range(num_reg_steps):
+ if lambda_auto_corr > 0:
+ for _ in range(num_auto_corr_rolls):
+ var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+ # Derive epsilon from model output before regularizing to IID standard normal
+ var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+ l_ac = auto_corr_loss(var_epsilon, generator=generator)
+ l_ac.backward()
+
+ grad = var.grad.detach() / num_auto_corr_rolls
+ noise_pred = noise_pred - lambda_auto_corr * grad
+
+ if lambda_kl > 0:
+ var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)
+
+ # Derive epsilon from model output before regularizing to IID standard normal
+ var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)
+
+ l_kld = kl_divergence(var_epsilon)
+ l_kld.backward()
+
+ grad = var.grad.detach()
+ noise_pred = noise_pred - lambda_kl * grad
+
+ noise_pred = noise_pred.detach()
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample
+ inverted_latents.append(latents.detach().clone())
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or (
+ (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
+ ):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
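+ # keep every intermediate latent (starting from the clean image latents) and reverse the list so that
+ # index 0 along the timestep axis holds the most-noised latents, which is where `__call__` starts denoising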
+ assert len(inverted_latents) == len(timesteps)
+ latents = torch.stack(list(reversed(inverted_latents)), 1)
+
+ # 8. Post-processing
+ image = None
+ if decode_latents:
+ image = self.decode_latents(latents.flatten(0, 1).detach())
+
+ # 9. Convert to PIL.
+ if decode_latents and output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (latents, image)
+
+ return DiffEditInversionPipelineOutput(latents=latents, images=image)
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Optional[Union[str, List[str]]] = None,
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ image_latents: torch.FloatTensor = None,
+ inpaint_strength: Optional[float] = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ mask_image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, to mask the generated image. White pixels in the mask
+ will be repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be
+ converted to a single channel (luminance) before use. If it's a tensor, it should contain one color
+ channel (L) instead of 3, so the expected shape would be `(B, 1, H, W)`.
+ image_latents (`PIL.Image.Image` or `torch.FloatTensor`):
+ Partially noised image latents from the inversion process to be used as inputs for image generation.
+ inpaint_strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength`
+ is 1, the denoising process is run on the masked area for the full number of iterations specified
+ in `num_inference_steps`. `image_latents` is used as a reference for the masked area, with more
+ noise added to that region the larger the `inpaint_strength`. If `inpaint_strength` is 0, no inpainting occurs.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+ is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is `True`, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+
+ # 1. Check inputs
+ self.check_inputs(
+ prompt,
+ inpaint_strength,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ )
+
+ if mask_image is None:
+ raise ValueError(
+ "`mask_image` input cannot be undefined. Use `generate_mask()` to compute `mask_image` from text prompts."
+ )
+ if image_latents is None:
+ raise ValueError(
+ "`image_latents` input cannot be undefined. Use `invert()` to compute `image_latents` from input images."
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+ if cross_attention_kwargs is None:
+ cross_attention_kwargs = {}
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Preprocess mask
+ mask_image = preprocess_mask(mask_image, batch_size)
+ latent_height, latent_width = mask_image.shape[-2:]
+ mask_image = torch.cat([mask_image] * num_images_per_prompt)
+ mask_image = mask_image.to(device=device, dtype=prompt_embeds.dtype)
+
+ # 5. Set timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, inpaint_strength, device)
+
+ # 6. Preprocess image latents
+ image_latents = preprocess(image_latents)
+ latent_shape = (self.vae.config.latent_channels, latent_height, latent_width)
+ if image_latents.shape[-3:] != latent_shape:
+ raise ValueError(
+ f"Each latent image in `image_latents` must have shape {latent_shape}, "
+ f"but has shape {image_latents.shape[-3:]}"
+ )
+ if image_latents.ndim == 4:
+ image_latents = image_latents.reshape(batch_size, len(timesteps), *latent_shape)
+ if image_latents.shape[:2] != (batch_size, len(timesteps)):
+ raise ValueError(
+ f"`image_latents` must have batch size {batch_size} with latent images from {len(timesteps)} timesteps, "
+ f"but has batch size {image_latents.shape[0]} with latent images from {image_latents.shape[1]} timesteps."
+ )
+ image_latents = image_latents.transpose(0, 1).repeat_interleave(num_images_per_prompt, dim=1)
+ image_latents = image_latents.to(device=device, dtype=prompt_embeds.dtype)
+
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 8. Denoising loop
+ latents = image_latents[0].detach().clone()
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ ).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ # mask with inverted latents from appropriate timestep - use original image latent for last step
+ latents = latents * mask_image + image_latents[i] * (1 - mask_image)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ # 9. Post-processing
+ image = self.decode_latents(latents)
+
+ # 10. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # 11. Convert to PIL
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
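For readers skimming the patch, here is a minimal end-to-end sketch of the DiffEdit workflow this `__call__` implements: `generate_mask()` produces the edit mask, `invert()` produces the `image_latents`, and the call above denoises with the target prompt inside the masked region. It mirrors the slow integration test added below; the checkpoint, image URL, and prompts are illustrative, not requirements of the pipeline.

```py
import torch
from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionDiffEditPipeline
from diffusers.utils import load_image

pipe = StableDiffusionDiffEditPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
).convert("RGB").resize((768, 768))

# 1. Compute an editing mask from the source/target prompt pair.
mask_image = pipe.generate_mask(image=image, source_prompt="a bowl of fruit", target_prompt="a bowl of pears")

# 2. Invert the input image into partially noised latents, conditioned on the source prompt.
image_latents = pipe.invert(prompt="a bowl of fruit", image=image, inpaint_strength=0.7).latents

# 3. Denoise with the target prompt, editing only the masked region.
edited = pipe(
    prompt="a bowl of pears",
    mask_image=mask_image,
    image_latents=image_latents,
    negative_prompt="a bowl of fruit",
    inpaint_strength=0.7,
).images[0]
edited.save("pears.png")
```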
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index bf4fe8d87ff9..f3708107e82a 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -242,6 +242,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class StableDiffusionDiffEditPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class StableDiffusionImageVariationPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
new file mode 100644
index 000000000000..c20bc3b47d7b
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -0,0 +1,315 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ DDIMInverseScheduler,
+ DDIMScheduler,
+ StableDiffusionDiffEditPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.utils import load_image, slow
+from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, torch_device
+
+from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusionDiffEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = StableDiffusionDiffEditPipeline
+ params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"}
+ batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"}
+
+ def get_dummy_components(self):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ # SD2-specific config below
+ attention_head_dim=(2, 4),
+ use_linear_projection=True,
+ )
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ inverse_scheduler = DDIMInverseScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_zero=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ sample_size=128,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ # SD2-specific config below
+ hidden_act="gelu",
+ projection_dim=512,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "inverse_scheduler": inverse_scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ mask = floats_tensor((1, 16, 16), rng=random.Random(seed)).to(device)
+ latents = floats_tensor((1, 2, 4, 16, 16), rng=random.Random(seed)).to(device)
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+ inputs = {
+ "prompt": "a dog and a newt",
+ "mask_image": mask,
+ "image_latents": latents,
+ "generator": generator,
+ "num_inference_steps": 2,
+ "inpaint_strength": 1.0,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ }
+
+ return inputs
+
+ def get_dummy_mask_inputs(self, device, seed=0):
+ image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ image = image.cpu().permute(0, 2, 3, 1)[0]
+ image = Image.fromarray(np.uint8(image)).convert("RGB")
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+ inputs = {
+ "image": image,
+ "source_prompt": "a cat and a frog",
+ "target_prompt": "a dog and a newt",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "num_maps_per_mask": 2,
+ "mask_encode_strength": 1.0,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ }
+
+ return inputs
+
+ def get_dummy_inversion_inputs(self, device, seed=0):
+ image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ image = image.cpu().permute(0, 2, 3, 1)[0]
+ image = Image.fromarray(np.uint8(image)).convert("RGB")
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+ inputs = {
+ "image": image,
+ "prompt": "a cat and a frog",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "inpaint_strength": 1.0,
+ "guidance_scale": 6.0,
+ "decode_latents": True,
+ "output_type": "numpy",
+ }
+ return inputs
+
+ def test_save_load_optional_components(self):
+ if not hasattr(self.pipeline_class, "_optional_components"):
+ return
+
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ # set all optional components to None and update pipeline config accordingly
+ for optional_component in pipe._optional_components:
+ setattr(pipe, optional_component, None)
+ pipe.register_modules(**{optional_component: None for optional_component in pipe._optional_components})
+
+ inputs = self.get_dummy_inputs(torch_device)
+ output = pipe(**inputs)[0]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ pipe.save_pretrained(tmpdir)
+ pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+ pipe_loaded.to(torch_device)
+ pipe_loaded.set_progress_bar_config(disable=None)
+
+ for optional_component in pipe._optional_components:
+ self.assertTrue(
+ getattr(pipe_loaded, optional_component) is None,
+ f"`{optional_component}` did not stay set to None after loading.",
+ )
+
+ inputs = self.get_dummy_inputs(torch_device)
+ output_loaded = pipe_loaded(**inputs)[0]
+
+ max_diff = np.abs(output - output_loaded).max()
+ self.assertLess(max_diff, 1e-4)
+
+ def test_mask(self):
+ device = "cpu"
+
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_mask_inputs(device)
+ mask = pipe.generate_mask(**inputs)
+ mask_slice = mask[0, -3:, -3:]
+
+ self.assertEqual(mask.shape, (1, 16, 16))
+ expected_slice = np.array([0] * 9)
+ max_diff = np.abs(mask_slice.flatten() - expected_slice).max()
+ self.assertLessEqual(max_diff, 1e-3)
+ self.assertEqual(mask[0, -3, -4], 0)
+
+ def test_inversion(self):
+ device = "cpu"
+
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inversion_inputs(device)
+ image = pipe.invert(**inputs).images
+ image_slice = image[0, -1, -3:, -3:]
+
+ self.assertEqual(image.shape, (2, 32, 32, 3))
+ expected_slice = np.array(
+ [0.5150, 0.5134, 0.5043, 0.5376, 0.4694, 0.51050, 0.5015, 0.4407, 0.4799],
+ )
+ max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+ self.assertLessEqual(max_diff, 1e-3)
+
+
+@require_torch_gpu
+@slow
+class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
+ def tearDown(self):
+ super().tearDown()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ @classmethod
+ def setUpClass(cls):
+ raw_image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
+ )
+
+ raw_image = raw_image.convert("RGB").resize((768, 768))
+
+ cls.raw_image = raw_image
+
+ def test_stable_diffusion_diffedit_full(self):
+ generator = torch.manual_seed(0)
+
+ pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
+ )
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ source_prompt = "a bowl of fruit"
+ target_prompt = "a bowl of pears"
+
+ mask_image = pipe.generate_mask(
+ image=self.raw_image,
+ source_prompt=source_prompt,
+ target_prompt=target_prompt,
+ generator=generator,
+ )
+
+ inv_latents = pipe.invert(
+ prompt=source_prompt, image=self.raw_image, inpaint_strength=0.7, generator=generator
+ ).latents
+
+ image = pipe(
+ prompt=target_prompt,
+ mask_image=mask_image,
+ image_latents=inv_latents,
+ generator=generator,
+ negative_prompt=source_prompt,
+ inpaint_strength=0.7,
+ output_type="numpy",
+ ).images[0]
+
+ expected_image = (
+ np.array(
+ load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+ "/diffedit/pears.png"
+ ).resize((768, 768))
+ )
+ / 255
+ )
+ assert np.abs((expected_image - image).max()) < 5e-1
From a7b0671c07dd82229dc49ae7c6ab8a111e7ba8ff Mon Sep 17 00:00:00 2001
From: Jason Kuan
Date: Fri, 28 Apr 2023 18:59:56 +0800
Subject: [PATCH 024/206] add constant learning rate with custom rule (#3133)
* add constant lr with rules
* add constant with rules in TYPE_TO_SCHEDULER_FUNCTION
* add constant lr rate with rule
* hotfix code quality
* fix doc style
* change name constant_with_rules to piecewise constant
---
src/diffusers/optimization.py | 50 +++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/src/diffusers/optimization.py b/src/diffusers/optimization.py
index 657e085062e0..78d68b7978a9 100644
--- a/src/diffusers/optimization.py
+++ b/src/diffusers/optimization.py
@@ -34,6 +34,7 @@ class SchedulerType(Enum):
POLYNOMIAL = "polynomial"
CONSTANT = "constant"
CONSTANT_WITH_WARMUP = "constant_with_warmup"
+ PIECEWISE_CONSTANT = "piecewise_constant"
def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
@@ -77,6 +78,48 @@ def lr_lambda(current_step: int):
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+def get_piecewise_constant_schedule(optimizer: Optimizer, step_rules: str, last_epoch: int = -1):
+ """
+ Create a schedule with a piecewise constant learning rate, scaling the learning rate set in the optimizer according to `step_rules`.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ step_rules (`str`):
+ The rules that define the learning-rate multiplier. For example, `step_rules="1:10,0.1:20,0.01:30,0.005"`
+ means the learning rate is multiplied by 1 until step 10, by 0.1 until step 20, by 0.01 until step 30,
+ and by 0.005 for all remaining steps.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+
+ rules_dict = {}
+ rule_list = step_rules.split(",")
+ for rule_str in rule_list[:-1]:
+ value_str, steps_str = rule_str.split(":")
+ steps = int(steps_str)
+ value = float(value_str)
+ rules_dict[steps] = value
+ last_lr_multiple = float(rule_list[-1])
+
+ def create_rules_function(rules_dict, last_lr_multiple):
+ def rule_func(steps: int) -> float:
+ sorted_steps = sorted(rules_dict.keys())
+ for i, sorted_step in enumerate(sorted_steps):
+ if steps < sorted_step:
+ return rules_dict[sorted_steps[i]]
+ return last_lr_multiple
+
+ return rule_func
+
+ rules_func = create_rules_function(rules_dict, last_lr_multiple)
+
+ return LambdaLR(optimizer, rules_func, last_epoch=last_epoch)
+
+
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
@@ -232,12 +275,14 @@ def lr_lambda(current_step: int):
SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
SchedulerType.CONSTANT: get_constant_schedule,
SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
+ SchedulerType.PIECEWISE_CONSTANT: get_piecewise_constant_schedule,
}
def get_scheduler(
name: Union[str, SchedulerType],
optimizer: Optimizer,
+ step_rules: Optional[str] = None,
num_warmup_steps: Optional[int] = None,
num_training_steps: Optional[int] = None,
num_cycles: int = 1,
@@ -252,6 +297,8 @@ def get_scheduler(
The name of the scheduler to use.
optimizer (`torch.optim.Optimizer`):
The optimizer that will be used during training.
+ step_rules (`str`, *optional*):
+ A string representing the step rules to use. This is only used by the `PIECEWISE_CONSTANT` scheduler.
num_warmup_steps (`int`, *optional*):
The number of warmup steps to do. This is not required by all schedulers (hence the argument being
optional), the function will raise an error if it's unset and the scheduler type requires it.
@@ -270,6 +317,9 @@ def get_scheduler(
if name == SchedulerType.CONSTANT:
return schedule_func(optimizer, last_epoch=last_epoch)
+ if name == SchedulerType.PIECEWISE_CONSTANT:
+ return schedule_func(optimizer, step_rules=step_rules, last_epoch=last_epoch)
+
# All other schedulers require `num_warmup_steps`
if num_warmup_steps is None:
raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
From 4d35d7fea3208ddf1599e90b23ee95095b280646 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 28 Apr 2023 13:31:11 +0200
Subject: [PATCH 025/206] Allow disabling torch 2_0 attention (#3273)
* Allow disabling torch 2_0 attention
* make style
* Update src/diffusers/models/attention.py
---
src/diffusers/models/attention.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 8e537c6f3680..fb5f6f48b324 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -71,6 +71,7 @@ def __init__(
self.proj_attn = nn.Linear(channels, channels, bias=True)
self._use_memory_efficient_attention_xformers = False
+ self._use_2_0_attn = True
self._attention_op = None
def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True):
@@ -142,9 +143,8 @@ def forward(self, hidden_states):
scale = 1 / math.sqrt(self.channels / self.num_heads)
- use_torch_2_0_attn = (
- hasattr(F, "scaled_dot_product_attention") and not self._use_memory_efficient_attention_xformers
- )
+ _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers
+ use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn
query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn)
key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn)
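A short sketch of how the new flag might be used: walk a loaded model and force the pre-2.0 attention path on every block that exposes `_use_2_0_attn`. Whether a given model contains such blocks depends on its architecture (at this point in the codebase the VAE's `AttentionBlock` does); the checkpoint below is only an example.

```py
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")

# Disable the torch 2.0 scaled_dot_product_attention fast path wherever the flag is exposed,
# falling back to the manual attention implementation.
for module in vae.modules():
    if hasattr(module, "_use_2_0_attn"):
        module._use_2_0_attn = False
```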
From 14b460614b101ea6c9c37f89c4be68ba3ece9754 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Fri, 28 Apr 2023 07:14:30 -1000
Subject: [PATCH 026/206] [doc] add link to training script (#3271)
add link to training script
Co-authored-by: yiyixuxu
---
docs/source/en/training/controlnet.mdx | 7 ++++++-
docs/source/en/training/custom_diffusion.mdx | 8 +++++++-
docs/source/en/training/instructpix2pix.mdx | 9 +++++++--
3 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx
index 94e3d969b80a..1c91298477c7 100644
--- a/docs/source/en/training/controlnet.mdx
+++ b/docs/source/en/training/controlnet.mdx
@@ -33,7 +33,12 @@ cd diffusers
pip install -e .
```
-Then navigate into the example folder and run:
+Then navigate into the [example folder](https://github.com/huggingface/diffusers/tree/main/examples/controlnet)
+```bash
+cd examples/controlnet
+```
+
+Now run:
```bash
pip install -r requirements.txt
```
diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
index 08604f101ea2..ee8fb19bd18c 100644
--- a/docs/source/en/training/custom_diffusion.mdx
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -33,7 +33,13 @@ cd diffusers
pip install -e .
```
-Then cd in the example folder and run
+Then cd into the [example folder](https://github.com/huggingface/diffusers/tree/main/examples/custom_diffusion)
+
+```
+cd examples/custom_diffusion
+```
+
+Now run
```bash
pip install -r requirements.txt
diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx
index ff34ec335656..6b6d4d908673 100644
--- a/docs/source/en/training/instructpix2pix.mdx
+++ b/docs/source/en/training/instructpix2pix.mdx
@@ -24,7 +24,7 @@ The output is an "edited" image that reflects the edit instruction applied on th
-The `train_instruct_pix2pix.py` script shows how to implement the training procedure and adapt it for Stable Diffusion.
+The `train_instruct_pix2pix.py` script (you can find it [here](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py)) shows how to implement the training procedure and adapt it for Stable Diffusion.
***Disclaimer: Even though `train_instruct_pix2pix.py` implements the InstructPix2Pix
training procedure while being faithful to the [original implementation](https://github.com/timothybrooks/instruct-pix2pix) we have only tested it on a [small-scale dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples). This can impact the end results. For better results, we recommend longer training runs with a larger dataset. [Here](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) you can find a large dataset for InstructPix2Pix training.***
@@ -44,7 +44,12 @@ cd diffusers
pip install -e .
```
-Then cd in the example folder and run
+Then cd into the example folder
+```bash
+cd examples/instruct_pix2pix
+```
+
+Now run
```bash
pip install -r requirements.txt
```
From 384c83aa9a1f268e5587d5ea1ea9f4c040845167 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Fri, 28 Apr 2023 12:05:53 -0700
Subject: [PATCH 027/206] temp disable spectrogram diffusion tests (#3278)
The note-seq package throws an error on import because the default installed version of IPython
is not compatible with Python 3.8, which we run in the CI.
https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:9
---
setup.py | 2 --
src/diffusers/dependency_versions_table.py | 1 -
.../spectrogram_diffusion/test_spectrogram_diffusion.py | 4 ++++
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/setup.py b/setup.py
index c0df285dcffb..13c93dcae3c0 100644
--- a/setup.py
+++ b/setup.py
@@ -95,7 +95,6 @@
"Jinja2",
"k-diffusion>=0.0.12",
"librosa",
- "note-seq",
"numpy",
"parameterized",
"protobuf>=3.20.3,<4",
@@ -191,7 +190,6 @@ def run(self):
"Jinja2",
"k-diffusion",
"librosa",
- "note-seq",
"parameterized",
"pytest",
"pytest-timeout",
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 1269cf1578a6..0e714accacd6 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -19,7 +19,6 @@
"Jinja2": "Jinja2",
"k-diffusion": "k-diffusion>=0.0.12",
"librosa": "librosa",
- "note-seq": "note-seq",
"numpy": "numpy",
"parameterized": "parameterized",
"protobuf": "protobuf>=3.20.3,<4",
diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
index 3b64ea2d2fc1..3ec6f681be79 100644
--- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
+++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
@@ -34,6 +34,10 @@
MIDI_FILE = "./tests/fixtures/elise_format0.mid"
+# The note-seq package throws an error on import because the default installed version of IPython
+# is not compatible with Python 3.8, which we run in the CI.
+# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98
+@unittest.skip("The note-seq package currently throws an error on import")
class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = SpectrogramDiffusionPipeline
required_optional_params = PipelineTesterMixin.required_optional_params - {
From 536684eb2fe29605ea95e53ca6d581858db17c42 Mon Sep 17 00:00:00 2001
From: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com>
Date: Mon, 1 May 2023 20:33:51 +0700
Subject: [PATCH 028/206] Changed sample[0] to images[0] (#3304)
A pipeline object stores its results in `images`, not in `sample`,
so the current code blocks don't work.
---
src/diffusers/pipelines/stable_diffusion/README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md
index be4c5d942b2e..66df9a811afb 100644
--- a/src/diffusers/pipelines/stable_diffusion/README.md
+++ b/src/diffusers/pipelines/stable_diffusion/README.md
@@ -61,7 +61,7 @@ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe = pipe.to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).sample[0]
+image = pipe(prompt).images[0]
image.save("astronaut_rides_horse.png")
```
@@ -80,7 +80,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
).to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).sample[0]
+image = pipe(prompt).images[0]
image.save("astronaut_rides_horse.png")
```
@@ -99,7 +99,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
).to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).sample[0]
+image = pipe(prompt).images[0]
image.save("astronaut_rides_horse.png")
```
From 709cf554f69cd40c310a9bdb52a8d85dfc64c274 Mon Sep 17 00:00:00 2001
From: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com>
Date: Mon, 1 May 2023 20:44:30 +0700
Subject: [PATCH 029/206] Typo in tutorial (#3295)
---
docs/source/en/using-diffusers/write_own_pipeline.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/using-diffusers/write_own_pipeline.mdx b/docs/source/en/using-diffusers/write_own_pipeline.mdx
index 3c993ed53a2a..fa47878e1b9b 100644
--- a/docs/source/en/using-diffusers/write_own_pipeline.mdx
+++ b/docs/source/en/using-diffusers/write_own_pipeline.mdx
@@ -96,7 +96,7 @@ To recreate the pipeline with the model and scheduler separately, let's write ou
>>> image = (input / 2 + 0.5).clamp(0, 1)
>>> image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
- >>> image = Image.fromarray((image * 255)).round().astype("uint8")
+ >>> image = Image.fromarray((image * 255).round().astype("uint8"))
>>> image
```
From 0e82fb19e16bd2d45ade31c9a4b871de56e7e80a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 1 May 2023 16:45:43 +0200
Subject: [PATCH 030/206] Torch compile graph fix (#3286)
* fix more
* Fix more
* fix more
* Apply suggestions from code review
* fix
* make style
* make fix-copies
* fix
* make sure torch compile
* Clean
* fix test
---
src/diffusers/models/attention.py | 2 ++
src/diffusers/models/attention_processor.py | 3 ++-
src/diffusers/models/modeling_utils.py | 10 ++++++--
src/diffusers/models/unet_2d_blocks.py | 25 +++++++++++--------
src/diffusers/models/unet_2d_condition.py | 4 +--
.../alt_diffusion/pipeline_alt_diffusion.py | 7 +++---
.../pipelines/deepfloyd_if/pipeline_if.py | 9 ++++---
.../pipeline_paint_by_example.py | 2 +-
.../pipeline_semantic_stable_diffusion.py | 2 +-
.../pipeline_cycle_diffusion.py | 2 +-
.../pipeline_stable_diffusion.py | 7 +++---
...line_stable_diffusion_attend_and_excite.py | 2 +-
.../pipeline_stable_diffusion_controlnet.py | 2 +-
.../pipeline_stable_diffusion_depth2img.py | 2 +-
.../pipeline_stable_diffusion_diffedit.py | 2 +-
...peline_stable_diffusion_image_variation.py | 2 +-
.../pipeline_stable_diffusion_inpaint.py | 2 +-
...ipeline_stable_diffusion_inpaint_legacy.py | 2 +-
...eline_stable_diffusion_instruct_pix2pix.py | 2 +-
.../pipeline_stable_diffusion_k_diffusion.py | 2 +-
...ipeline_stable_diffusion_latent_upscale.py | 2 +-
...pipeline_stable_diffusion_model_editing.py | 2 +-
.../pipeline_stable_diffusion_panorama.py | 2 +-
.../pipeline_stable_diffusion_pix2pix_zero.py | 2 +-
.../pipeline_stable_diffusion_sag.py | 4 +--
.../pipeline_stable_diffusion_upscale.py | 2 +-
.../pipeline_stable_unclip.py | 2 +-
.../pipeline_stable_unclip_img2img.py | 2 +-
.../pipeline_stable_diffusion_safe.py | 2 +-
.../versatile_diffusion/modeling_text_unet.py | 21 +++++++++-------
...ipeline_versatile_diffusion_dual_guided.py | 2 +-
...ine_versatile_diffusion_image_variation.py | 2 +-
...eline_versatile_diffusion_text_to_image.py | 2 +-
src/diffusers/utils/__init__.py | 1 +
src/diffusers/utils/torch_utils.py | 7 ++++++
.../stable_diffusion/test_stable_diffusion.py | 23 +++++++++++++++++
36 files changed, 109 insertions(+), 60 deletions(-)
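The changes in this patch (returning plain tuples via `return_dict=False`, replacing in-place tuple `+=` with re-binding, and casting embeddings to the sample dtype) are all aimed at keeping the UNet forward pass traceable by `torch.compile`. A minimal sketch of the intended usage, assuming torch>=2.0, a CUDA GPU, and an illustrative checkpoint:

```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# The graph breaks fixed in this patch occur inside the UNet forward pass,
# so compiling the UNet is where the changes matter.
pipe.unet = torch.compile(pipe.unet)

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
image.save("astronaut_rides_horse.png")
```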
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index fb5f6f48b324..134f84fc9d50 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -18,6 +18,7 @@
import torch.nn.functional as F
from torch import nn
+from ..utils import maybe_allow_in_graph
from ..utils.import_utils import is_xformers_available
from .attention_processor import Attention
from .embeddings import CombinedTimestepLabelEmbeddings
@@ -193,6 +194,7 @@ def forward(self, hidden_states):
return hidden_states
+@maybe_allow_in_graph
class BasicTransformerBlock(nn.Module):
r"""
A basic Transformer block.
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index b8787aed91f2..7ac88b17999a 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -17,7 +17,7 @@
import torch.nn.functional as F
from torch import nn
-from ..utils import deprecate, logging
+from ..utils import deprecate, logging, maybe_allow_in_graph
from ..utils.import_utils import is_xformers_available
@@ -31,6 +31,7 @@
xformers = None
+@maybe_allow_in_graph
class Attention(nn.Module):
r"""
A cross attention layer.
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 521e99fdd69c..6644042077d2 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -77,8 +77,14 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
def get_parameter_dtype(parameter: torch.nn.Module):
try:
- parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
- return next(parameters_and_buffers).dtype
+ params = tuple(parameter.parameters())
+ if len(params) > 0:
+ return params[0].dtype
+
+ buffers = tuple(parameter.buffers())
+ if len(buffers) > 0:
+ return buffers[0].dtype
+
except StopIteration:
# For torch.nn.DataParallel compatibility in PyTorch 1.5
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 439c5c34b601..57153fa39807 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -560,7 +560,8 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
hidden_states = resnet(hidden_states, temb)
return hidden_states
@@ -868,15 +869,16 @@ def custom_forward(*inputs):
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -949,13 +951,13 @@ def custom_forward(*inputs):
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -1342,13 +1344,13 @@ def custom_forward(*inputs):
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, temb)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -1466,13 +1468,13 @@ def forward(
**cross_attention_kwargs,
)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, temb)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -1859,7 +1861,8 @@ def custom_forward(*inputs):
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
if self.upsamplers is not None:
for upsampler in self.upsamplers:
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 38e0fa3b5b2e..83169455fa3e 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -682,7 +682,7 @@ def forward(
# `Timesteps` does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
- t_emb = t_emb.to(dtype=self.dtype)
+ t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
@@ -697,7 +697,7 @@ def forward(
# there might be better ways to encapsulate this.
class_labels = class_labels.to(dtype=sample.dtype)
- class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
if self.config.class_embeddings_concat:
emb = torch.cat([emb, class_emb], dim=-1)
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index ff9474ffd43a..b61703a2146d 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -437,7 +437,7 @@ def run_safety_checker(self, image, device, dtype):
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
@@ -683,7 +683,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -691,7 +692,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
index 479ffa9e6635..448389b9f1f6 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
@@ -793,7 +793,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -805,8 +806,8 @@ def __call__(
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -829,7 +830,7 @@ def __call__(
# 11. Apply watermark
if self.watermarker is not None:
- self.watermarker.apply_watermark(image, self.unet.config.sample_size)
+ image = self.watermarker.apply_watermark(image, self.unet.config.sample_size)
elif output_type == "pt":
nsfw_detected = None
watermark_detected = None
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index ca0a90a5b5ca..d6c069bbb7d0 100644
--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -256,7 +256,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index 3d5374875d12..fbe436ec9666 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -134,7 +134,7 @@ def __init__(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index e2accb6d2d2a..a40ba75d04bd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -516,7 +516,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 7347d70c4023..4168dc7e9788 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -440,7 +440,7 @@ def run_safety_checker(self, image, device, dtype):
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
@@ -686,7 +686,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -694,7 +695,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
index fba2a4e32f88..eec7debc38b7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
@@ -454,7 +454,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index 3bd7f82d7eb6..e36b0bcdf759 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -496,7 +496,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index c4f9ae59a4e9..378eb927ca52 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -326,7 +326,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
index 9bef5269fa07..adada63b83f7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
@@ -648,7 +648,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index d543593fdbf5..2dc762d62529 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -195,7 +195,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index fb2e5dc424e3..cac7465298cc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -525,7 +525,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 3ad1d5e92273..6d93fba2425e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -446,7 +446,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index 49944cdcd636..225e3719b98f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -656,7 +656,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 99aca66db809..5a21bcafccbc 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -358,7 +358,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
index 822bd49ce31c..fcda8d526c99 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -221,7 +221,7 @@ def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_p
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
index b7ded03d529b..3926a4e70ad0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
@@ -385,7 +385,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index 392b2a72a76f..facffd7a852a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -349,7 +349,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index 6444ec7c8506..b60987edfaca 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -590,7 +590,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
index ebac58e18f62..27ba46c8b3e7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -366,7 +366,7 @@ def run_safety_checker(self, image, device, dtype):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
@@ -619,7 +619,7 @@ def __call__(
def get_map_size(module, input, output):
nonlocal map_size
- map_size = output.sample.shape[-2:]
+ map_size = output[0].shape[-2:]
with self.unet.mid_block.attentions[0].register_forward_hook(get_map_size):
with self.progress_bar(total=num_inference_steps) as progress_bar:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 87014f52dfc2..a8c29f32e9e5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -373,7 +373,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index fafb8d1d2800..3e34dcb98132 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -475,7 +475,7 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index 22b7280f3679..9d6a6c8332fb 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -430,7 +430,7 @@ def _encode_image(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index 87e7b3e6c9eb..f4f7eefcd07a 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -364,7 +364,7 @@ def run_safety_checker(self, image, device, dtype, enable_safety_guidance):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 0959e2bb3a8b..e9e31d67905b 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -785,7 +785,7 @@ def forward(
# `Timesteps` does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
- t_emb = t_emb.to(dtype=self.dtype)
+ t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
@@ -800,7 +800,7 @@ def forward(
# there might be better ways to encapsulate this.
class_labels = class_labels.to(dtype=sample.dtype)
- class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
if self.config.class_embeddings_concat:
emb = torch.cat([emb, class_emb], dim=-1)
@@ -1081,13 +1081,13 @@ def custom_forward(*inputs):
else:
hidden_states = resnet(hidden_states, temb)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -1211,15 +1211,16 @@ def custom_forward(*inputs):
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states)
- output_states += (hidden_states,)
+ output_states = output_states + (hidden_states,)
return hidden_states, output_states
@@ -1424,7 +1425,8 @@ def custom_forward(*inputs):
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -1528,7 +1530,8 @@ def forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
hidden_states = resnet(hidden_states, temb)
return hidden_states
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index 661a1bd3cf73..2827ed4a7378 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -330,7 +330,7 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index e3a2ee370362..46eee27bcbfc 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -190,7 +190,7 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index 26b9be2bfa76..cd5dd70a2cdc 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -247,7 +247,7 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index f3e4c9d1d0ec..cd3a1b8f3dd4 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -101,6 +101,7 @@
torch_all_close,
torch_device,
)
+ from .torch_utils import maybe_allow_in_graph
from .testing_utils import export_to_video
diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index b9815cbceede..2b626a3b425a 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -25,6 +25,13 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+try:
+ from torch._dynamo import allow_in_graph as maybe_allow_in_graph
+except (ImportError, ModuleNotFoundError):
+
+ def maybe_allow_in_graph(cls):
+ return cls
+
def randn_tensor(
shape: Union[Tuple, List],
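The `maybe_allow_in_graph` shim added above exposes `torch._dynamo.allow_in_graph` when it exists and otherwise degrades to a no-op decorator, so a module class can be marked as allowed inside a compiled graph without requiring PyTorch 2.0. A small sketch of how the shim would be applied; `TinyBlock` is an illustrative class, not code from the patch:

    import torch

    try:
        from torch._dynamo import allow_in_graph as maybe_allow_in_graph
    except (ImportError, ModuleNotFoundError):
        # older PyTorch: fall back to a no-op decorator, mirroring the patch
        def maybe_allow_in_graph(cls):
            return cls

    @maybe_allow_in_graph
    class TinyBlock(torch.nn.Module):
        # dynamo treats the decorated callable as allowed in the graph instead of tracing into it
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(8, 8)

        def forward(self, x):
            return self.proj(x)

    block = TinyBlock()
    print(block(torch.randn(2, 8)).shape)  # torch.Size([2, 8])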
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index fcfcd84c5d48..e1334e1ddd3b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -22,6 +22,7 @@
import numpy as np
import torch
from huggingface_hub import hf_hub_download
+from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -921,6 +922,28 @@ def test_download_ckpt_diff_format_is_same(self):
assert np.max(np.abs(image - image_ckpt)) < 1e-4
+ def test_stable_diffusion_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.0"):
+ print(f"Test `test_stable_diffusion_compile` is skipped because {torch.__version__} is < 2.0")
+ return
+
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+ sd_pipe = sd_pipe.to(torch_device)
+
+ sd_pipe.unet.to(memory_format=torch.channels_last)
+ sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_inputs(torch_device)
+ image = sd_pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1].flatten()
+
+ assert image.shape == (1, 512, 512, 3)
+ expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+ assert np.abs(image_slice - expected_slice).max() < 1e-4
+
@nightly
@require_torch_gpu
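The new test exercises the pipeline under `torch.compile`; the same recipe can be reproduced outside the test harness. A rough sketch, assuming PyTorch >= 2.0, a CUDA device, and access to the `CompVis/stable-diffusion-v1-4` checkpoint used by the test (the prompt is illustrative):

    import torch
    from diffusers import DDIMScheduler, StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")

    # channels_last + reduce-overhead compile, as in the test above
    pipe.unet.to(memory_format=torch.channels_last)
    pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

    image = pipe("a photo of an astronaut riding a horse", num_inference_steps=30).images[0]
    image.save("astronaut.png")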
From a7f25b4a881c0835980e2a3db5fe09bfc4ab2dc0 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Mon, 1 May 2023 07:54:09 -1000
Subject: [PATCH 031/206] Postprocessing refactor img2img (#3268)
* refactor img2img VaeImageProcessor.postprocess
* remove copy from for init, run_safety_checker, decode_latents
Co-authored-by: Sayak Paul
---------
Co-authored-by: yiyixuxu
Co-authored-by: Sayak Paul
---
src/diffusers/image_processor.py | 45 ++++++++--
.../pipeline_alt_diffusion_img2img.py | 56 ++++++------
.../pipeline_stable_diffusion_img2img.py | 54 ++++++------
tests/others/test_image_processor.py | 6 +-
tests/pipelines/pipeline_params.py | 4 +
.../test_stable_diffusion_img2img.py | 73 +++++-----------
tests/pipelines/test_pipelines_common.py | 85 ++++++++++++++++---
7 files changed, 198 insertions(+), 125 deletions(-)
diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 4598e1b4288c..68782d1f5f79 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -13,7 +13,7 @@
# limitations under the License.
import warnings
-from typing import Union
+from typing import List, Optional, Union
import numpy as np
import PIL
@@ -21,7 +21,7 @@
from PIL import Image
from .configuration_utils import ConfigMixin, register_to_config
-from .utils import CONFIG_NAME, PIL_INTERPOLATION
+from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
class VaeImageProcessor(ConfigMixin):
@@ -82,7 +82,7 @@ def numpy_to_pt(images):
@staticmethod
def pt_to_numpy(images):
"""
- Convert a numpy image to a pytorch tensor
+ Convert a pytorch tensor to a numpy image
"""
images = images.cpu().permute(0, 2, 3, 1).float().numpy()
return images
@@ -94,6 +94,13 @@ def normalize(images):
"""
return 2.0 * images - 1.0
+ @staticmethod
+ def denormalize(images):
+ """
+ Denormalize an image array from [-1,1] to [0,1]
+ """
+ return (images / 2 + 0.5).clamp(0, 1)
+
def resize(self, images: PIL.Image.Image) -> PIL.Image.Image:
"""
Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor`
@@ -165,17 +172,39 @@ def preprocess(
def postprocess(
self,
- image,
+ image: torch.FloatTensor,
output_type: str = "pil",
+ do_denormalize: Optional[List[bool]] = None,
):
- if isinstance(image, torch.Tensor) and output_type == "pt":
+ if not isinstance(image, torch.Tensor):
+ raise ValueError(
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
+ )
+ if output_type not in ["latent", "pt", "np", "pil"]:
+ deprecation_message = (
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
+ "`pil`, `np`, `pt`, `latent`"
+ )
+ deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+ output_type = "np"
+
+ if output_type == "latent":
+ return image
+
+ if do_denormalize is None:
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
+
+ image = torch.stack(
+ [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
+ )
+
+ if output_type == "pt":
return image
image = self.pt_to_numpy(image)
if output_type == "np":
return image
- elif output_type == "pil":
+
+ if output_type == "pil":
return self.numpy_to_pil(image)
- else:
- raise ValueError(f"Unsupported output_type {output_type}.")
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index dee4a91924f7..5df9bab3ae41 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
@@ -202,6 +203,7 @@ def __init__(
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
+
self.register_modules(
vae=vae,
text_encoder=text_encoder,
@@ -212,11 +214,8 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- self.register_to_config(
- requires_safety_checker=requires_safety_checker,
- )
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
@@ -436,17 +435,32 @@ def _encode_prompt(
return prompt_embeds
def run_safety_checker(self, image, device, dtype):
- feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
- safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
return image, has_nsfw_concept
def decode_latents(self, latents):
+ warnings.warn(
+ (
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead"
+ ),
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def prepare_extra_step_kwargs(self, generator, eta):
@@ -730,27 +744,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- if output_type not in ["latent", "pt", "np", "pil"]:
- deprecation_message = (
- f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
- "`pil`, `np`, `pt`, `latent`"
- )
- deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
- output_type = "np"
-
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- image = self.decode_latents(latents)
-
- if self.safety_checker is not None:
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
- else:
- has_nsfw_concept = False
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(image, output_type=output_type)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index c26ddf06cadc..5e9a0f9e350b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
@@ -205,6 +206,7 @@ def __init__(
new_config = dict(unet.config)
new_config["sample_size"] = 64
unet._internal_dict = FrozenDict(new_config)
+
self.register_modules(
vae=vae,
text_encoder=text_encoder,
@@ -215,11 +217,8 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- self.register_to_config(
- requires_safety_checker=requires_safety_checker,
- )
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
@@ -443,17 +442,30 @@ def _encode_prompt(
return prompt_embeds
def run_safety_checker(self, image, device, dtype):
- feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
- safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
return image, has_nsfw_concept
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -738,27 +750,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- if output_type not in ["latent", "pt", "np", "pil"]:
- deprecation_message = (
- f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
- "`pil`, `np`, `pt`, `latent`"
- )
- deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
- output_type = "np"
-
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- image = self.decode_latents(latents)
-
- if self.safety_checker is not None:
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
- else:
- has_nsfw_concept = False
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- image = self.image_processor.postprocess(image, output_type=output_type)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/tests/others/test_image_processor.py b/tests/others/test_image_processor.py
index 4f0e2c5aecfd..c2cd6f4a04f4 100644
--- a/tests/others/test_image_processor.py
+++ b/tests/others/test_image_processor.py
@@ -42,7 +42,7 @@ def to_np(self, image):
return image
def test_vae_image_processor_pt(self):
- image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+ image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
input_pt = self.dummy_sample
input_np = self.to_np(input_pt)
@@ -59,7 +59,7 @@ def test_vae_image_processor_pt(self):
), f"decoded output does not match input for output_type {output_type}"
def test_vae_image_processor_np(self):
- image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+ image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
for output_type in ["pt", "np", "pil"]:
@@ -72,7 +72,7 @@ def test_vae_image_processor_np(self):
), f"decoded output does not match input for output_type {output_type}"
def test_vae_image_processor_pil(self):
- image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+ image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
input_pil = image_processor.numpy_to_pil(input_np)
diff --git a/tests/pipelines/pipeline_params.py b/tests/pipelines/pipeline_params.py
index a0ac6c641c0b..7c5ffa2ca24b 100644
--- a/tests/pipelines/pipeline_params.py
+++ b/tests/pipelines/pipeline_params.py
@@ -22,6 +22,10 @@
TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
+TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([])
+
+IMAGE_TO_IMAGE_IMAGE_PARAMS = frozenset(["image"])
+
IMAGE_VARIATION_PARAMS = frozenset(
[
"image",
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 4262114c78eb..123f5464dfaa 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -35,18 +35,23 @@
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
-from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import (
+ IMAGE_TO_IMAGE_IMAGE_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
@@ -96,33 +101,19 @@ def get_dummy_components(self):
}
return components
- def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"):
+ def get_dummy_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
-
- if input_image_type == "pt":
- input_image = image
- elif input_image_type == "np":
- input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
- elif input_image_type == "pil":
- input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
- input_image = VaeImageProcessor.numpy_to_pil(input_image)
- else:
- raise ValueError(f"unsupported input_image_type {input_image_type}.")
-
- if output_type not in ["pt", "np", "pil"]:
- raise ValueError(f"unsupported output_type {output_type}")
-
inputs = {
"prompt": "A painting of a squirrel eating a burger",
- "image": input_image,
+ "image": image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
- "output_type": output_type,
+ "output_type": "numpy",
}
return inputs
@@ -130,11 +121,12 @@ def test_stable_diffusion_img2img_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
+ sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
+ inputs["image"] = inputs["image"] / 2 + 0.5
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
@@ -147,11 +139,12 @@ def test_stable_diffusion_img2img_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
+ sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
+ inputs["image"] = inputs["image"] / 2 + 0.5
negative_prompt = "french fries"
output = sd_pipe(**inputs, negative_prompt=negative_prompt)
image = output.images
@@ -166,13 +159,14 @@ def test_stable_diffusion_img2img_multiple_init_images(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
+ sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["prompt"] = [inputs["prompt"]] * 2
inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+ inputs["image"] = inputs["image"] / 2 + 0.5
image = sd_pipe(**inputs).images
image_slice = image[-1, -3:, -3:, -1]
@@ -188,11 +182,12 @@ def test_stable_diffusion_img2img_k_lms(self):
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
)
sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
+ sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
+ inputs["image"] = inputs["image"] / 2 + 0.5
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
@@ -217,36 +212,6 @@ def test_save_load_optional_components(self):
def test_attention_slicing_forward_pass(self):
return super().test_attention_slicing_forward_pass()
- @skip_mps
- def test_pt_np_pil_outputs_equivalent(self):
- device = "cpu"
- components = self.get_dummy_components()
- sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe = sd_pipe.to(device)
- sd_pipe.set_progress_bar_config(disable=None)
-
- output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0]
- output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0]
- output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0]
-
- assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4
- assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4
-
- @skip_mps
- def test_image_types_consistent(self):
- device = "cpu"
- components = self.get_dummy_components()
- sd_pipe = StableDiffusionImg2ImgPipeline(**components)
- sd_pipe = sd_pipe.to(device)
- sd_pipe.set_progress_bar_config(disable=None)
-
- output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0]
- output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0]
- output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0]
-
- assert np.abs(output_pt - output_np).max() <= 1e-4
- assert np.abs(output_pil - output_np).max() <= 1e-2
-
@slow
@require_torch_gpu
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0278092282ba..aedda7bae026 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -12,6 +12,7 @@
import diffusers
from diffusers import DiffusionPipeline
+from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import logging
from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
from diffusers.utils.testing_utils import require_torch, torch_device
@@ -27,6 +28,78 @@ def to_np(tensor):
return tensor
+class PipelineLatentTesterMixin:
+ """
+ This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+ It provides a set of common tests for PyTorch pipelines that have a VAE, e.g.
+ equivalence of different input and output types.
+ """
+
+ @property
+ def image_params(self) -> frozenset:
+ raise NotImplementedError(
+ "You need to set the attribute `image_params` in the child test class. "
+ "`image_params` are tested for if all accepted input image types (i.e. `pt`,`pil`,`np`) are producing same results"
+ )
+
+ def get_dummy_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"):
+ inputs = self.get_dummy_inputs(device, seed)
+
+ def convert_pt_to_type(image, input_image_type):
+ if input_image_type == "pt":
+ input_image = image
+ elif input_image_type == "np":
+ input_image = VaeImageProcessor.pt_to_numpy(image)
+ elif input_image_type == "pil":
+ input_image = VaeImageProcessor.pt_to_numpy(image)
+ input_image = VaeImageProcessor.numpy_to_pil(input_image)
+ else:
+ raise ValueError(f"unsupported input_image_type {input_image_type}.")
+ return input_image
+
+ for image_param in self.image_params:
+ if image_param in inputs.keys():
+ inputs[image_param] = convert_pt_to_type(inputs[image_param], input_image_type)
+
+ inputs["output_type"] = output_type
+
+ return inputs
+
+ def test_pt_np_pil_outputs_equivalent(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ output_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pt"))[0]
+ output_np = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="np"))[0]
+ output_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pil"))[0]
+
+ max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
+ self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generates different results from `output_type=='np'`")
+
+ max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
+ self.assertLess(max_diff, 1e-4, "`output_type=='pil'` generates different results from `output_type=='np'`")
+
+ def test_pt_np_pil_inputs_equivalent(self):
+ if len(self.image_params) == 0:
+ return
+
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ out_input_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+ out_input_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+ out_input_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pil"))[0]
+
+ max_diff = np.abs(out_input_pt - out_input_np).max()
+ self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generates different results from `input_type=='np'`")
+ max_diff = np.abs(out_input_pil - out_input_np).max()
+ self.assertLess(max_diff, 1e-2, "`input_type=='pil'` generates different results from `input_type=='np'`")
+
+
@require_torch
class PipelineTesterMixin:
"""
@@ -339,9 +412,6 @@ def test_components_function(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_float16_inference(self):
- self._test_float16_inference()
-
- def _test_float16_inference(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
@@ -355,13 +425,10 @@ def _test_float16_inference(self, expected_max_diff=1e-2):
output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0]
max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
- self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
+ self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.")
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
- self._test_save_load_float16()
-
- def _test_save_load_float16(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
for name, module in components.items():
if hasattr(module, "half"):
@@ -390,9 +457,7 @@ def _test_save_load_float16(self, expected_max_diff=1e-2):
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
- self.assertLess(
- max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
- )
+ self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.")
def test_save_load_optional_components(self):
if not hasattr(self.pipeline_class, "_optional_components"):
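`get_dummy_inputs_by_type` above converts the `pt` dummy image into the other accepted formats via the `VaeImageProcessor` static helpers before handing the inputs to the pipeline. A small standalone illustration of those conversions:

    import torch
    from diffusers.image_processor import VaeImageProcessor

    image_pt = torch.rand(1, 3, 32, 32)                    # "pt": NCHW float in [0, 1]
    image_np = VaeImageProcessor.pt_to_numpy(image_pt)     # "np": NHWC float32 array
    image_pil = VaeImageProcessor.numpy_to_pil(image_np)   # "pil": list of PIL images

    print(image_np.shape, image_pil[0].size)  # (1, 32, 32, 3) (32, 32)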
From 5c7a35a25915f29aa79e5b69d831fd0f7d7d8d41 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 2 May 2023 19:51:00 +0200
Subject: [PATCH 032/206] [Torch 2.0 compile] Fix more torch compile breaks
(#3313)
* Fix more torch compile breaks
* add tests
* Fix all
* fix controlnet
* fix more
* Add Horace He as co-author.
Co-authored-by: Horace He
* Add Horace He as co-author.
Co-authored-by: Horace He
---------
Co-authored-by: Horace He
---
src/diffusers/models/controlnet.py | 15 +++---
src/diffusers/models/unet_2d_condition.py | 2 +-
.../pipeline_alt_diffusion_img2img.py | 9 ++--
.../deepfloyd_if/pipeline_if_img2img.py | 7 +--
.../pipeline_if_img2img_superresolution.py | 7 +--
.../deepfloyd_if/pipeline_if_inpainting.py | 7 +--
.../pipeline_if_inpainting_superresolution.py | 7 +--
.../pipeline_if_superresolution.py | 7 +--
.../pipeline_stable_diffusion_controlnet.py | 48 +++++++++++++++----
.../pipeline_stable_diffusion_depth2img.py | 6 ++-
.../pipeline_stable_diffusion_img2img.py | 9 ++--
.../pipeline_stable_diffusion_inpaint.py | 6 ++-
...ipeline_stable_diffusion_inpaint_legacy.py | 6 ++-
...eline_stable_diffusion_instruct_pix2pix.py | 6 ++-
.../pipeline_stable_diffusion_upscale.py | 10 ++--
.../pipeline_stable_unclip.py | 8 ++--
.../pipeline_stable_unclip_img2img.py | 5 +-
.../versatile_diffusion/modeling_text_unet.py | 2 +-
.../stable_diffusion/test_stable_diffusion.py | 44 ++++++++---------
.../test_stable_diffusion_controlnet.py | 37 ++++++++++++++
.../test_stable_diffusion_img2img.py | 23 +++++++++
.../test_stable_diffusion_inpaint.py | 26 ++++++++++
22 files changed, 219 insertions(+), 78 deletions(-)
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index 3ffbb04eb222..7b36d2eed96a 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -498,7 +498,7 @@ def forward(
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
- t_emb = t_emb.to(dtype=self.dtype)
+ t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
@@ -517,7 +517,7 @@ def forward(
controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
- sample += controlnet_cond
+ sample = sample + controlnet_cond
# 3. down
down_block_res_samples = (sample,)
@@ -551,7 +551,7 @@ def forward(
for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
down_block_res_sample = controlnet_block(down_block_res_sample)
- controlnet_down_block_res_samples += (down_block_res_sample,)
+ controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
down_block_res_samples = controlnet_down_block_res_samples
@@ -559,13 +559,14 @@ def forward(
# 6. scaling
if guess_mode:
- scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0
- scales *= conditioning_scale
+ scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
+
+ scales = scales * conditioning_scale
down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
- mid_block_res_sample *= scales[-1] # last one
+ mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
else:
down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
- mid_block_res_sample *= conditioning_scale
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale
if self.config.global_pool_conditions:
down_block_res_samples = [
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 83169455fa3e..2a4c9fd72c1b 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -740,7 +740,7 @@ def forward(
down_block_res_samples, down_block_additional_residuals
):
down_block_res_sample = down_block_res_sample + down_block_additional_residual
- new_down_block_res_samples += (down_block_res_sample,)
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
down_block_res_samples = new_down_block_res_samples
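Both hunks above rewrite augmented assignments (`+=`, `*=`) into explicit out-of-place forms; the commit title attributes the original pattern to torch.compile breaks, since in-place updates on traced tensors (and tuple rebinding via `+=`) trip up dynamo more easily than building a new value. A toy sketch of the same rewrite with illustrative names:

    import torch

    sample = torch.randn(1, 4, 8, 8)
    cond = torch.randn(1, 4, 8, 8)

    # before: sample += cond   (in-place on the tensor)
    sample = sample + cond     # after: out-of-place, as in the patch

    # the tuple of residuals is grown the same way
    output_states = ()
    for hidden in (torch.randn(1, 4, 8, 8), torch.randn(1, 4, 4, 4)):
        output_states = output_states + (hidden,)
    print(len(output_states))  # 2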
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 5df9bab3ae41..cabed8f017ce 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -457,7 +457,7 @@ def decode_latents(self, latents):
FutureWarning,
)
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
@@ -728,7 +728,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -736,7 +737,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -745,7 +746,7 @@ def __call__(
callback(i, t, latents)
if not output_type == "latent":
- image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
else:
image = latents
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
index fac4adeea463..231ee02b1bb8 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
@@ -918,7 +918,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -930,8 +931,8 @@ def __call__(
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
index eed1bb43e5d8..770676c15984 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
@@ -1036,7 +1036,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -1048,8 +1049,8 @@ def __call__(
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
index d3651f5169c1..6986387ca995 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
@@ -1033,7 +1033,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -1047,8 +1048,8 @@ def __call__(
prev_intermediate_images = intermediate_images
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
index 5ea6a47082ae..2b42d3992ed8 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
@@ -1143,7 +1143,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -1157,8 +1158,8 @@ def __call__(
prev_intermediate_images = intermediate_images
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
index a62a51b0972f..4729cec3e4d7 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -886,7 +886,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=noise_level,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -898,8 +899,8 @@ def __call__(
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
- noise_pred, t, intermediate_images, **extra_step_kwargs
- ).prev_sample
+ noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
+ )[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index e36b0bcdf759..5e8e68823b34 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -20,6 +20,7 @@
import numpy as np
import PIL.Image
import torch
+import torch.nn.functional as F
from torch import nn
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
@@ -579,9 +580,20 @@ def check_inputs(
)
# Check `image`
- if isinstance(self.controlnet, ControlNetModel):
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
self.check_image(image, prompt, prompt_embeds)
- elif isinstance(self.controlnet, MultiControlNetModel):
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
if not isinstance(image, list):
raise TypeError("For multiple controlnets: `image` must be type `list`")
@@ -600,10 +612,18 @@ def check_inputs(
assert False
# Check `controlnet_conditioning_scale`
- if isinstance(self.controlnet, ControlNetModel):
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
if not isinstance(controlnet_conditioning_scale, float):
raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
- elif isinstance(self.controlnet, MultiControlNetModel):
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
if isinstance(controlnet_conditioning_scale, list):
if any(isinstance(i, list) for i in controlnet_conditioning_scale):
raise ValueError("A single batch of multiple conditionings are supported at the moment.")
@@ -910,7 +930,14 @@ def __call__(
)
# 4. Prepare image
- if isinstance(self.controlnet, ControlNetModel):
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
image = self.prepare_image(
image=image,
width=width,
@@ -922,7 +949,11 @@ def __call__(
do_classifier_free_guidance=do_classifier_free_guidance,
guess_mode=guess_mode,
)
- elif isinstance(self.controlnet, MultiControlNetModel):
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
images = []
for image_ in image:
@@ -1006,7 +1037,8 @@ def __call__(
cross_attention_kwargs=cross_attention_kwargs,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -1014,7 +1046,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
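Once the controlnet has been run through `torch.compile`, the object the pipeline holds is a `torch._dynamo.eval_frame.OptimizedModule` rather than a `ControlNetModel`, so the type checks above also look at `controlnet._orig_mod`. A compact sketch of the unwrapping idea; `unwrap_compiled` is a made-up helper name, the guard mirrors the patch:

    import torch
    import torch.nn.functional as F

    def unwrap_compiled(module: torch.nn.Module) -> torch.nn.Module:
        # PyTorch >= 2.0: torch.compile returns an OptimizedModule that keeps the
        # original module under `_orig_mod`; on older torch the check is skipped.
        if hasattr(F, "scaled_dot_product_attention") and isinstance(
            module, torch._dynamo.eval_frame.OptimizedModule
        ):
            return module._orig_mod
        return module

    net = torch.nn.Linear(4, 4)
    net = torch.compile(net) if hasattr(torch, "compile") else net
    print(type(unwrap_compiled(net)).__name__)  # Linear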
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index 378eb927ca52..16f96bbc2fd5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -677,7 +677,9 @@ def __call__(
latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1)
# predict the noise residual
- noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[
+ 0
+ ]
# perform guidance
if do_classifier_free_guidance:
@@ -685,7 +687,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 5e9a0f9e350b..2dfa730549ab 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -462,7 +462,7 @@ def decode_latents(self, latents):
FutureWarning,
)
latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents).sample
+ image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
@@ -734,7 +734,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -742,7 +743,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -751,7 +752,7 @@ def __call__(
callback(i, t, latents)
if not output_type == "latent":
- image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
else:
image = latents
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index cac7465298cc..859a34677317 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -878,7 +878,9 @@ def __call__(
latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
# predict the noise residual
- noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[
+ 0
+ ]
# perform guidance
if do_classifier_free_guidance:
@@ -886,7 +888,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 6d93fba2425e..990c0e838f35 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -690,7 +690,9 @@ def __call__(
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
- noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[
+ 0
+ ]
# perform guidance
if do_classifier_free_guidance:
@@ -698,7 +700,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# masking
if add_predicted_noise:
init_latents_proper = self.scheduler.add_noise(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index 225e3719b98f..b9dd3aa24b11 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -346,7 +346,9 @@ def __call__(
scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1)
# predict the noise residual
- noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+ noise_pred = self.unet(
+ scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False
+ )[0]
# Hack:
# For karras style schedulers the model does classifer free guidance using the
@@ -376,7 +378,7 @@ def __call__(
noise_pred = (noise_pred - latents) / (-sigma)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index a8c29f32e9e5..da1575289c8e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -678,8 +678,12 @@ def __call__(
# predict the noise residual
noise_pred = self.unet(
- latent_model_input, t, encoder_hidden_states=prompt_embeds, class_labels=noise_level
- ).sample
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ class_labels=noise_level,
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -687,7 +691,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index 3e34dcb98132..51ba24c65873 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -830,7 +830,8 @@ def __call__(
timestep=t,
sample=prior_latents,
**prior_extra_step_kwargs,
- ).prev_sample
+ return_dict=False,
+ )[0]
if callback is not None and i % callback_steps == 0:
callback(i, t, prior_latents)
@@ -903,7 +904,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=image_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -911,7 +913,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index 9d6a6c8332fb..fce82a5bb61f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -799,7 +799,8 @@ def __call__(
encoder_hidden_states=prompt_embeds,
class_labels=image_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- ).sample
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
@@ -807,7 +808,7 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
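Taken together, the pipeline hunks above all make the same substitution: attribute access on the output dataclasses (`.sample`, `.prev_sample`) is replaced by `return_dict=False` plus tuple indexing, presumably to keep the denoising loop simple for graph capture (the `torch.compile` tests added further down exercise exactly these paths). A minimal sketch of the two equivalent call styles, using a tiny randomly initialized UNet; every config value below is illustrative and not taken from the patch:

```python
import torch
from diffusers import DDIMScheduler, UNet2DConditionModel

# Tiny dummy UNet purely for illustration (weights are random, config is arbitrary).
unet = UNet2DConditionModel(
    sample_size=16,
    in_channels=4,
    out_channels=4,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=32,
)
scheduler = DDIMScheduler()
scheduler.set_timesteps(10)

latents = torch.randn(1, 4, 16, 16)
encoder_hidden_states = torch.randn(1, 77, 32)
t = scheduler.timesteps[0]

# Old style: the forward call returns an output dataclass, read via attributes.
noise_pred = unet(latents, t, encoder_hidden_states=encoder_hidden_states).sample
prev_latents = scheduler.step(noise_pred, t, latents).prev_sample

# New style used throughout this patch: ask for a plain tuple and index it.
noise_pred = unet(latents, t, encoder_hidden_states=encoder_hidden_states, return_dict=False)[0]
prev_latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
```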
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index e9e31d67905b..f0a210339c46 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -843,7 +843,7 @@ def forward(
down_block_res_samples, down_block_additional_residuals
):
down_block_res_sample = down_block_res_sample + down_block_additional_residual
- new_down_block_res_samples += (down_block_res_sample,)
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
down_block_res_samples = new_down_block_res_samples
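The `modeling_text_unet.py` hunk swaps an augmented tuple assignment for an explicit rebind. For tuples the two spellings are semantically identical (tuples are immutable, so `+=` already builds a new object and rebinds the name); the explicit form is presumably just friendlier to graph tracers such as torch.fx / `torch.compile`, which the tests added below rely on. A tiny sketch of the equivalence:

```python
# Neither form mutates the original tuple; both rebind the local name to a new one.
down_block_res_samples = (1, 2)

a = down_block_res_samples
a += (3,)        # spelling used before this patch

b = down_block_res_samples
b = b + (3,)     # spelling used after this patch

assert a == b == (1, 2, 3)
assert down_block_res_samples == (1, 2)  # the original tuple is unchanged either way
```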
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index e1334e1ddd3b..4583cc42e6f1 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -866,6 +866,28 @@ def test_stable_diffusion_textual_inversion(self):
max_diff = np.abs(expected_image - image).max()
assert max_diff < 5e-2
+ def test_stable_diffusion_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.0"):
+            print(f"Test `test_stable_diffusion_compile` is skipped because {torch.__version__} is < 2.0")
+ return
+
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+ sd_pipe = sd_pipe.to(torch_device)
+
+ sd_pipe.unet.to(memory_format=torch.channels_last)
+ sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_inputs(torch_device)
+ image = sd_pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1].flatten()
+
+ assert image.shape == (1, 512, 512, 3)
+ expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+ assert np.abs(image_slice - expected_slice).max() < 5e-3
+
@slow
@require_torch_gpu
@@ -922,28 +944,6 @@ def test_download_ckpt_diff_format_is_same(self):
assert np.max(np.abs(image - image_ckpt)) < 1e-4
- def test_stable_diffusion_compile(self):
- if version.parse(torch.__version__) >= version.parse("2.0"):
- print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0")
- return
-
- sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
- sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
- sd_pipe = sd_pipe.to(torch_device)
-
- sd_pipe.unet.to(memory_format=torch.channels_last)
- sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
-
- sd_pipe.set_progress_bar_config(disable=None)
-
- inputs = self.get_inputs(torch_device)
- image = sd_pipe(**inputs).images
- image_slice = image[0, -3:, -3:, -1].flatten()
-
- assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
-
@nightly
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index 70b3652fce77..279df4a32b29 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -19,6 +19,7 @@
import numpy as np
import torch
+from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -585,6 +586,42 @@ def test_canny_guess_mode(self):
expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+ def test_stable_diffusion_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.0"):
+            print(f"Test `test_stable_diffusion_compile` is skipped because {torch.__version__} is < 2.0")
+ return
+
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
+ pipe.to("cuda")
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ pipe.controlnet.to(memory_format=torch.channels_last)
+ pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+ generator = torch.Generator(device="cpu").manual_seed(0)
+ prompt = "bird"
+ image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+ )
+
+ output = pipe(prompt, image, generator=generator, output_type="np")
+ image = output.images[0]
+
+ assert image.shape == (768, 512, 3)
+
+ expected_image = load_numpy(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+ )
+
+ assert np.abs(expected_image - image).max() < 1e-1
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 123f5464dfaa..2f63371c1a0d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -19,6 +19,7 @@
import numpy as np
import torch
+from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -460,6 +461,28 @@ def test_img2img_safety_checker_works(self):
assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros
+ def test_img2img_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.0"):
+            print(f"Test `test_img2img_compile` is skipped because {torch.__version__} is < 2.0")
+ return
+
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ inputs = self.get_inputs(torch_device)
+ image = pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1].flatten()
+
+ assert image.shape == (1, 512, 768, 3)
+ expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])
+
+ assert np.abs(expected_slice - image_slice).max() < 1e-3
+
@nightly
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 290d9b0a9134..20977c346ecc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -19,6 +19,7 @@
import numpy as np
import torch
+from packaging import version
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -274,6 +275,31 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
+ def test_inpaint_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.0"):
+            print(f"Test `test_inpaint_compile` is skipped because {torch.__version__} is < 2.0")
+ return
+
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
+ pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ inputs = self.get_inputs(torch_device)
+ image = pipe(**inputs).images
+ image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+ assert image.shape == (1, 512, 512, 3)
+ expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+
@nightly
@require_torch_gpu
From efc48da23bd342ca79a79def02649b1975393ea0 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 3 May 2023 10:13:05 +0530
Subject: [PATCH 033/206] fix: scale_lr and sync example readme and docs.
(#3299)
* fix: scale_lr and sync example readme and docs.
* fix doc link.
---
docs/source/en/training/lora.mdx | 22 +++++++++++++++++--
examples/dreambooth/README.md | 23 +++++++++++++++++---
examples/dreambooth/train_dreambooth_lora.py | 5 -----
3 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx
index 3c7cc7ebfeec..8e41aab5e2d8 100644
--- a/docs/source/en/training/lora.mdx
+++ b/docs/source/en/training/lora.mdx
@@ -243,8 +243,26 @@ Load the LoRA weights from your finetuned DreamBooth model *on top of the base m
>>> image.save("bucket-dog.png")
```
-Note that the use of [`LoraLoaderMixin.load_lora_weights`] is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`] for loading LoRA parameters. This is because
-[`LoraLoaderMixin.load_lora_weights`] can handle the following situations:
+If you used `--train_text_encoder` during training, then use `pipe.load_lora_weights()` to load the LoRA
+weights. For example:
+
+```python
+from huggingface_hub.repocard import RepoCard
+from diffusers import StableDiffusionPipeline
+import torch
+
+lora_model_id = "sayakpaul/dreambooth-text-encoder-test"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.load_lora_weights(lora_model_id)
+image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
+```
+
+Note that the use of [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] is preferred to [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`] for loading LoRA parameters. This is because
+[`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] can handle the following situations:
* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
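(The docs continue with the full snippet after this bullet; for reference, a minimal sketch of that one-liner is shown here. The base checkpoint id is an assumption, not something stated in the patch.)

```python
import torch
from diffusers import StableDiffusionPipeline

# Assumed base checkpoint; the LoRA repo is the one named in the bullet above.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.load_lora_weights("patrickvonplaten/lora_dreambooth_dog_example")
image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
```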
diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 490e31458988..75d705f89e02 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -408,9 +408,26 @@ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.
...
```
-**Note** that we will gradually be depcrecating the use of [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) since we now have a more general
-method to load the LoRA parameters -- [`LoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights). This is because
-[`LoraLoaderMixin.load_lora_weights`] can handle the following situations:
+If you used `--train_text_encoder` during training, then use `pipe.load_lora_weights()` to load the LoRA
+weights. For example:
+
+```python
+from huggingface_hub.repocard import RepoCard
+from diffusers import StableDiffusionPipeline
+import torch
+
+lora_model_id = "sayakpaul/dreambooth-text-encoder-test"
+card = RepoCard.load(lora_model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.load_lora_weights(lora_model_id)
+image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
+```
+
+Note that the use of [`LoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights) is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) for loading LoRA parameters. This is because
+`LoraLoaderMixin.load_lora_weights` can handle the following situations:
* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 5cefc57c614d..9af81aa5a95d 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -746,11 +746,6 @@ def main(args):
accelerator.register_for_checkpointing(text_encoder_lora_layers)
del temp_pipeline
- if args.scale_lr:
- args.learning_rate = (
- args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
- )
-
# Enable TF32 for faster training on Ampere GPUs,
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
if args.allow_tf32:
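For context, the block removed above is the usual linear learning-rate scaling; a sketch of what `--scale_lr` computes, with placeholder values that are not the script's defaults:

```python
# Placeholder values purely for illustration.
learning_rate = 1e-4
gradient_accumulation_steps = 1
train_batch_size = 4
num_processes = 2  # e.g. accelerator.num_processes on a two-GPU machine

scale_lr = True
if scale_lr:
    # Linear scaling rule: grow the base LR with the global batch size
    # (per-device batch * gradient accumulation steps * number of processes).
    learning_rate = learning_rate * gradient_accumulation_steps * train_batch_size * num_processes

print(learning_rate)  # ~8e-4
```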
From 0ccad2ad2d80f3d58780c4503e3ea4dfd1ff3c0e Mon Sep 17 00:00:00 2001
From: Umar <55330742+mu94-csl@users.noreply.github.com>
Date: Wed, 3 May 2023 10:53:14 -0400
Subject: [PATCH 034/206] Update stable_diffusion.mdx (#3310)
fixed import statement
---
docs/source/en/stable_diffusion.mdx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/source/en/stable_diffusion.mdx b/docs/source/en/stable_diffusion.mdx
index eebe0ec660f2..0cec07834507 100644
--- a/docs/source/en/stable_diffusion.mdx
+++ b/docs/source/en/stable_diffusion.mdx
@@ -153,7 +153,7 @@ def get_inputs(batch_size=1):
You'll also need a function that'll display each batch of images:
```python
-from PIL import image
+from PIL import Image
def image_grid(imgs, rows=2, cols=2):
@@ -268,4 +268,4 @@ In this tutorial, you learned how to optimize a [`DiffusionPipeline`] for comput
- Enable [xFormers](./optimization/xformers) memory efficient attention mechanism for faster speed and reduced memory consumption.
- Learn how in [PyTorch 2.0](./optimization/torch2.0), [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 2-9% faster inference speed.
-- Many optimization techniques for inference are also included in this memory and speed [guide](./optimization/fp16), such as memory offloading.
\ No newline at end of file
+- Many optimization techniques for inference are also included in this memory and speed [guide](./optimization/fp16), such as memory offloading.
From 63a8ef7b7334589ba4a092e4b805e1956c8b5093 Mon Sep 17 00:00:00 2001
From: Mylo <36931363+gitmylo@users.noreply.github.com>
Date: Wed, 3 May 2023 18:31:04 +0200
Subject: [PATCH 035/206] Fix missing variable assignment in DeepFloyd-IF-II
(#3315)
Fix missing variable assignment
lol
---
.../pipelines/deepfloyd_if/pipeline_if_superresolution.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
index 4729cec3e4d7..1ba8f888a8e3 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -667,7 +667,7 @@ def preprocess_image(self, image, num_images_per_prompt, device):
image = [np.array(i).astype(np.float32) / 255.0 for i in image]
image = np.stack(image, axis=0) # to np
- torch.from_numpy(image.transpose(0, 3, 1, 2))
+ image = torch.from_numpy(image.transpose(0, 3, 1, 2))
elif isinstance(image[0], np.ndarray):
image = np.stack(image, axis=0) # to np
if image.ndim == 5:
From 79bd909dbddfa710bce38b9aa4e4644d16b5bb6c Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 3 May 2023 18:33:41 +0200
Subject: [PATCH 036/206] Correct doc build for patch releases (#3316)
Update build_documentation.yml
---
.github/workflows/build_documentation.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index c202cc628542..c833bc0319e1 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -5,7 +5,7 @@ on:
branches:
- main
- doc-builder*
- - v*-release
+ - v*-patch
jobs:
build:
From 2dd408504a252feed7de603d5c1293ab2af3df72 Mon Sep 17 00:00:00 2001
From: Markus Pobitzer
Date: Wed, 3 May 2023 18:59:49 +0200
Subject: [PATCH 037/206] Add Stable Diffusion RePaint to community pipelines
(#3320)
* Add Stable Diffusion RePaint to community pipelines
- Adds Stable Diffusion RePaint to community pipelines
- Add Readme entry for pipeline
* Fix: Remove wrong import
- Remove wrong import
- Minor change in comments
* Fix: Code formatting of stable_diffusion_repaint
* Fix: ruff errors in stable_diffusion_repaint
---
examples/community/README.md | 89 +-
.../community/stable_diffusion_repaint.py | 956 ++++++++++++++++++
2 files changed, 1018 insertions(+), 27 deletions(-)
create mode 100644 examples/community/stable_diffusion_repaint.py
diff --git a/examples/community/README.md b/examples/community/README.md
index 91528eac1e85..14f15fd2215e 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -6,33 +6,34 @@
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
-| Example | Description | Code Example | Colab | Author |
-|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:|
-| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) |
-| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
-| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
-| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
-| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | - | [SkyTNT](https://github.com/SkyTNT) |
-| Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech)
-| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) |
-| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
-| Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) |
-| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
-| Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | - | [Juan Carlos Piñeros](https://github.com/juancopi81) |
-| Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Alex McKinney](https://github.com/vvvm23) |
-| Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting| [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Dhruv Karan](https://github.com/unography) |
-| Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | - |[Stuti R.](https://github.com/kingstut) |
-| K-Diffusion Stable Diffusion | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py) | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
-| Checkpoint Merger Pipeline | Diffusion Pipeline that enables merging of saved model checkpoints | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
-Stable Diffusion v1.1-1.4 Comparison | Run all 4 model checkpoints for Stable Diffusion and compare their results together | [Stable Diffusion Comparison](#stable-diffusion-comparisons) | - | [Suvaditya Mukherjee](https://github.com/suvadityamuk) |
-MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | [MagicMix](#magic-mix) | - | [Partho Das](https://github.com/daspartho) |
-| Stable UnCLIP | Diffusion Pipeline for combining prior model (generate clip image embedding from text, UnCLIPPipeline `"kakaobrain/karlo-v1-alpha"`) and decoder pipeline (decode clip image embedding to image, StableDiffusionImageVariationPipeline `"lambdalabs/sd-image-variations-diffusers"` ). | [Stable UnCLIP](#stable-unclip) | - |[Ray Wang](https://wrong.wang) |
-| UnCLIP Text Interpolation Pipeline | Diffusion Pipeline that allows passing two prompts and produces images while interpolating between the text-embeddings of the two prompts | [UnCLIP Text Interpolation Pipeline](#unclip-text-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
-| UnCLIP Image Interpolation Pipeline | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
-| DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - |[Aengus (Duc-Anh)](https://github.com/aengusng8) |
-| CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) |
-| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - |[Asfiya Baig](https://github.com/asfiyab-nvidia) |
-| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
+| Example | Description | Code Example | Colab | Author |
+|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
+| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) |
+| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
+| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | - | [SkyTNT](https://github.com/SkyTNT) |
+| Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech)
+| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) |
+| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
+| Seed Resizing Stable Diffusion | Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) |
+| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image | [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
+| Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | - | [Juan Carlos Piñeros](https://github.com/juancopi81) |
+| Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting | [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Alex McKinney](https://github.com/vvvm23) |
+| Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting | [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Dhruv Karan](https://github.com/unography) |
+| Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | - | [Stuti R.](https://github.com/kingstut) |
+| K-Diffusion Stable Diffusion | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py) | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Checkpoint Merger Pipeline | Diffusion Pipeline that enables merging of saved model checkpoints | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+ Stable Diffusion v1.1-1.4 Comparison | Run all 4 model checkpoints for Stable Diffusion and compare their results together | [Stable Diffusion Comparison](#stable-diffusion-comparisons) | - | [Suvaditya Mukherjee](https://github.com/suvadityamuk) |
+ MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | [MagicMix](#magic-mix) | - | [Partho Das](https://github.com/daspartho) |
+| Stable UnCLIP | Diffusion Pipeline for combining prior model (generate clip image embedding from text, UnCLIPPipeline `"kakaobrain/karlo-v1-alpha"`) and decoder pipeline (decode clip image embedding to image, StableDiffusionImageVariationPipeline `"lambdalabs/sd-image-variations-diffusers"` ). | [Stable UnCLIP](#stable-unclip) | - | [Ray Wang](https://wrong.wang) |
+| UnCLIP Text Interpolation Pipeline | Diffusion Pipeline that allows passing two prompts and produces images while interpolating between the text-embeddings of the two prompts | [UnCLIP Text Interpolation Pipeline](#unclip-text-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+| UnCLIP Image Interpolation Pipeline | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
+| DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - | [Aengus (Duc-Anh)](https://github.com/aengusng8) |
+| CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) |
+| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
+| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
@@ -1247,3 +1248,37 @@ Init Image
Output Image

+
+### Stable Diffusion RePaint
+
+This pipeline uses the [RePaint](https://arxiv.org/abs/2201.09865) logic on the latent space of stable diffusion. It can
+be used similarly to other image inpainting pipelines but does not rely on a specific inpainting model. This means you can use
+models that are not specifically created for inpainting.
+
+Make sure to use the `RePaintScheduler` as shown in the example below.
+
+Disclaimer: the mask is transferred into latent space, which may lead to unexpected changes at the edges of the masked region.
+Inference is also considerably slower.
+
+```py
+import PIL
+import requests
+import torch
+from io import BytesIO
+from diffusers import DiffusionPipeline, RePaintScheduler
+def download_image(url):
+ response = requests.get(url)
+ return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+mask_image = PIL.ImageOps.invert(mask_image)
+pipe = DiffusionPipeline.from_pretrained(
+ "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, custom_pipeline="stable_diffusion_repaint",
+)
+pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
\ No newline at end of file
diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py
new file mode 100644
index 000000000000..3fd63d4b213a
--- /dev/null
+++ b/examples/community/stable_diffusion_repaint.py
@@ -0,0 +1,956 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+ StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+ is_accelerate_available,
+ is_accelerate_version,
+ logging,
+ randn_tensor,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def prepare_mask_and_masked_image(image, mask):
+ """
+ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+ converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+ ``image`` and ``1`` for the ``mask``.
+ The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+ binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+ Args:
+ image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+ It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+ ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
+ It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+ ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+ Raises:
+ ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+ should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+ TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (or the other way around).
+ Returns:
+ tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+ dimensions: ``batch x channels x height x width``.
+ """
+ if isinstance(image, torch.Tensor):
+ if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)}) is not")
+
+ # Batch single image
+ if image.ndim == 3:
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+ image = image.unsqueeze(0)
+
+ # Batch and add channel dim for single mask
+ if mask.ndim == 2:
+ mask = mask.unsqueeze(0).unsqueeze(0)
+
+ # Batch single mask or add channel dim
+ if mask.ndim == 3:
+ # Single batched mask, no channel dim or single mask not batched but channel dim
+ if mask.shape[0] == 1:
+ mask = mask.unsqueeze(0)
+
+ # Batched masks no channel dim
+ else:
+ mask = mask.unsqueeze(1)
+
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+ # Check image is in [-1, 1]
+ if image.min() < -1 or image.max() > 1:
+ raise ValueError("Image should be in [-1, 1] range")
+
+ # Check mask is in [0, 1]
+ if mask.min() < 0 or mask.max() > 1:
+ raise ValueError("Mask should be in [0, 1] range")
+
+ # Binarize mask
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+
+ # Image as float32
+ image = image.to(dtype=torch.float32)
+ elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)}) is not")
+ else:
+ # preprocess image
+ if isinstance(image, (PIL.Image.Image, np.ndarray)):
+ image = [image]
+
+ if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+ image = [np.array(i.convert("RGB"))[None, :] for i in image]
+ image = np.concatenate(image, axis=0)
+ elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+ image = np.concatenate([i[None, :] for i in image], axis=0)
+
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+ # preprocess mask
+ if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+ mask = [mask]
+
+ if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ mask = mask.astype(np.float32) / 255.0
+ elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+ mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+ mask = torch.from_numpy(mask)
+
+ # masked_image = image * (mask >= 0.5)
+ masked_image = image
+
+ return mask, masked_image
+
+
+class StableDiffusionRepaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+ In addition the pipeline inherits the following loading methods:
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
+ as well as the following saving methods:
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPImageProcessor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+ " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+ " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+ " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+ " Hub, it would be very nice if you could open a Pull request for the"
+ " `scheduler/scheduler_config.json` file"
+ )
+ deprecate(
+ "skip_prk_steps not set",
+ "1.0.0",
+ deprecation_message,
+ standard_warn=False,
+ )
+ new_config = dict(scheduler.config)
+ new_config["skip_prk_steps"] = True
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ )
+
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+ version.parse(unet.config._diffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+ deprecation_message = (
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(unet.config)
+ new_config["sample_size"] = 64
+ unet._internal_dict = FrozenDict(new_config)
+ # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
+ if unet.config.in_channels != 4:
+ logger.warning(
+ f"You have loaded a UNet with {unet.config.in_channels} input channels, whereas by default,"
+                f" {self.__class__} assumes that `pipeline.unet` has 4 input channels: 4 for `num_channels_latents`."
+                " If you did not intend to modify"
+ " this behavior, please check whether you have loaded the right checkpoint."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ else:
+ has_nsfw_concept = None
+ return image, has_nsfw_concept
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+ def decode_latents(self, latents):
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents).sample
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+ def check_inputs(
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+ def prepare_latents(
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ latents=None,
+ ):
+ shape = (
+ batch_size,
+ num_channels_latents,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ def prepare_mask_latents(
+ self,
+ mask,
+ masked_image,
+ batch_size,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ do_classifier_free_guidance,
+ ):
+ # resize the mask to latents shape as we concatenate the mask to the latents
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+ # and half precision
+ mask = torch.nn.functional.interpolate(
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+ )
+ mask = mask.to(device=device, dtype=dtype)
+
+ masked_image = masked_image.to(device=device, dtype=dtype)
+
+ # encode the mask image into latents space so we can concatenate it to the latents
+ if isinstance(generator, list):
+ masked_image_latents = [
+ self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ masked_image_latents = torch.cat(masked_image_latents, dim=0)
+ else:
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+ masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+ if mask.shape[0] < batch_size:
+ if not batch_size % mask.shape[0] == 0:
+ raise ValueError(
+ "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+ f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+ " of masks that you pass is divisible by the total requested batch size."
+ )
+ mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+ if masked_image_latents.shape[0] < batch_size:
+ if not batch_size % masked_image_latents.shape[0] == 0:
+ raise ValueError(
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+ f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
+ )
+ masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
+
+ # aligning device to prevent device errors when concating it with the latent model input
+ masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+ return mask, masked_image_latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ jump_length: Optional[int] = 10,
+ jump_n_sample: Optional[int] = 10,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+ be masked out with `mask_image` and repainted according to `prompt`.
+ mask_image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+ repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+ to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+ instead of 3, so the expected shape would be `(B, H, W, 1)`.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ jump_length (`int`, *optional*, defaults to 10):
+ The number of steps taken forward in time before going backward in time for a single jump ("j" in
+ RePaint paper). Take a look at Figures 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+ jump_n_sample (`int`, *optional*, defaults to 10):
+ The number of times we will make a forward time jump for a given chosen time sample. Take a look at
+ Figures 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+ is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ Examples:
+ ```py
+ >>> import PIL
+ >>> import requests
+ >>> import torch
+ >>> from io import BytesIO
+ >>> from diffusers import DiffusionPipeline, RePaintScheduler
+ >>> def download_image(url):
+ ... response = requests.get(url)
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+ >>> base_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/"
+ >>> img_url = base_url + "overture-creations-5sI6fQgYIuo.png"
+ >>> mask_url = base_url + "overture-creations-5sI6fQgYIuo_mask.png"
+ >>> init_image = download_image(img_url).resize((512, 512))
+ >>> mask_image = download_image(mask_url).resize((512, 512))
+ >>> pipe = DiffusionPipeline.from_pretrained(
+ ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, custom_pipeline="stable_diffusion_repaint",
+ ... )
+ >>> pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
+ >>> pipe = pipe.to("cuda")
+ >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+ >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+ ```
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+ # 1. Check inputs
+ self.check_inputs(
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ )
+
+ if image is None:
+ raise ValueError("`image` input cannot be undefined.")
+
+ if mask_image is None:
+ raise ValueError("`mask_image` input cannot be undefined.")
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Preprocess mask and image
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+
+ # 5. set timesteps
+ self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, device)
+ self.scheduler.eta = eta
+
+ timesteps = self.scheduler.timesteps
+ # latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+ # 6. Prepare latent variables
+ num_channels_latents = self.vae.config.latent_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 7. Prepare mask latent variables
+ mask, masked_image_latents = self.prepare_mask_latents(
+ mask,
+ masked_image,
+ batch_size * num_images_per_prompt,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ do_classifier_free_guidance=False, # we do not need to duplicate the mask and image latents here
+ )
+
+ # 8. Check that sizes of mask, masked image and latents match
+ # num_channels_mask = mask.shape[1]
+ # num_channels_masked_image = masked_image_latents.shape[1]
+ if num_channels_latents != self.unet.config.in_channels:
+ raise ValueError(
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} "
+ f" = Please verify the config of"
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
+
+ # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ t_last = timesteps[0] + 1
+
+ # 10. Denoising loop
+ with self.progress_bar(total=len(timesteps)) as progress_bar:
+ for i, t in enumerate(timesteps):
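+ # RePaint interleaves reverse-diffusion steps with forward "jumps": whenever the schedule
+ # returns a timestep that is not smaller than the previous one, the latents are re-noised
+ # with `undo_step` instead of being denoised (controlled by `jump_length` and `jump_n_sample`).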
+ if t >= t_last:
+ # compute the reverse: x_t-1 -> x_t
+ latents = self.scheduler.undo_step(latents, t_last, generator)
+ progress_bar.update()
+ t_last = t
+ continue
+
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+ # concat latents, mask, masked_image_latents in the channel dimension
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ # latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
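+ # unlike the Stable Diffusion inpainting pipeline, the mask and masked-image latents are not
+ # concatenated to the UNet input (the UNet keeps its 4 input channels); they are instead
+ # passed to the scheduler's `step` call below.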
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(
+ noise_pred,
+ t,
+ latents,
+ masked_image_latents,
+ mask,
+ **extra_step_kwargs,
+ ).prev_sample
+
+ # call the callback, if provided
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ t_last = t
+
+ # 11. Post-processing
+ image = self.decode_latents(latents)
+
+ # 12. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # 13. Convert to PIL
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
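A rough usage sketch for the pipeline above (not part of the patch): it loads the file as a community pipeline and exercises the CPU-offloading helpers it defines. The checkpoint id and `custom_pipeline` name mirror the docstring example and are illustrative assumptions.

```python
import torch
from diffusers import DiffusionPipeline, RePaintScheduler

# Load the file above as a community pipeline (the checkpoint id is illustrative).
pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16,
    custom_pipeline="stable_diffusion_repaint",
)
pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)

# Keep whole sub-models on the GPU only while they run (requires accelerate>=0.17.0)...
pipe.enable_model_cpu_offload()
# ...or, for maximum memory savings at a larger speed cost, offload per submodule instead:
# pipe.enable_sequential_cpu_offload()
```

Generation then proceeds exactly as in the docstring example, passing `prompt`, `image`, and `mask_image` to the pipeline call.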
From 022479416f8667c25d71c336fedb9b6a4ed8a89c Mon Sep 17 00:00:00 2001
From: Cheng Lu
Date: Thu, 4 May 2023 01:00:59 +0800
Subject: [PATCH 038/206] Fix multistep dpmsolver for cosine schedule (suitable
for deepfloyd-if) (#3314)
* fix multistep dpmsolver for cosine schedule (deepfloyd-if)
* fix a typo
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* update all dpmsolver (singlestep, multistep, dpm, dpm++) for cosine noise schedule
* add test, fix style
---------
Co-authored-by: Patrick von Platen
---
.../scheduling_dpmsolver_multistep.py | 27 +++++++++++++++++--
.../scheduling_dpmsolver_singlestep.py | 24 ++++++++++++++++-
tests/schedulers/test_scheduler_dpm_multi.py | 10 +++++++
tests/schedulers/test_scheduler_dpm_single.py | 10 +++++++
4 files changed, 68 insertions(+), 3 deletions(-)
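A hedged sketch of what this change enables (not part of the patch itself): a variance-predicting model trained on a cosine (`squaredcos_cap_v2`) schedule, such as DeepFloyd IF, can be paired with the DPM-Solver schedulers by overriding the two new config options. The checkpoint id and the clipping value below are illustrative assumptions.

```python
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Illustrative checkpoint; any cosine-schedule model that predicts the variance
# ("learned_range") would be configured the same way.
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", torch_dtype=torch.float16)

pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config,
    lambda_min_clipped=-5.1,        # clip lambda(t) for numerical stability on cosine schedules
    variance_type="learned_range",  # the solver keeps only the "mean" channels of the model output
)
```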
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index 3399ee2c54cb..337c6603fe75 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -118,6 +118,17 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
+ lambda_min_clipped (`float`, default `-inf`):
+ the clipping threshold for the minimum value of lambda(t) for numerical stability. This is critical for
+ the cosine (`squaredcos_cap_v2`) noise schedule.
+ variance_type (`str`, *optional*):
+ Set to "learned" or "learned_range" for diffusion models that predict variance. For example, OpenAI's
+ guided-diffusion (https://github.com/openai/guided-diffusion) predicts both the mean and the variance of the
+ Gaussian distribution in the model's output. DPM-Solver only needs the "mean" output because it is based on
+ diffusion ODEs.
"""
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -140,6 +151,8 @@ def __init__(
solver_type: str = "midpoint",
lower_order_final: bool = True,
use_karras_sigmas: Optional[bool] = False,
+ lambda_min_clipped: float = -float("inf"),
+ variance_type: Optional[str] = None,
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -187,7 +200,7 @@ def __init__(
self.lower_order_nums = 0
self.use_karras_sigmas = use_karras_sigmas
- def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -197,8 +210,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
device (`str` or `torch.device`, optional):
the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
"""
+ # Clipping the minimum of all lambda(t) for numerical stability.
+ # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped)
timesteps = (
- np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1)
.round()[::-1][:-1]
.copy()
.astype(np.int64)
@@ -320,9 +336,13 @@ def convert_model_output(
Returns:
`torch.FloatTensor`: the converted model output.
"""
+
# DPM-Solver++ needs to solve an integral of the data prediction model.
if self.config.algorithm_type == "dpmsolver++":
if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned_range"]:
+ model_output = model_output[:, :3]
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
@@ -343,6 +363,9 @@ def convert_model_output(
# DPM-Solver needs to solve an integral of the noise prediction model.
elif self.config.algorithm_type == "dpmsolver":
if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned_range"]:
+ model_output = model_output[:, :3]
return model_output
elif self.config.prediction_type == "sample":
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 049e2b1dbd4d..1d34977d4a57 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -113,6 +113,17 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
lower_order_final (`bool`, default `True`):
whether to use lower-order solvers in the final steps. For singlestep schedulers, we recommend to enable
this to use up all the function evaluations.
+ lambda_min_clipped (`float`, default `-inf`):
+ the clipping threshold for the minimum value of lambda(t) for numerical stability. This is critical for
+ the cosine (`squaredcos_cap_v2`) noise schedule.
+ variance_type (`str`, *optional*):
+ Set to "learned" or "learned_range" for diffusion models that predict variance. For example, OpenAI's
+ guided-diffusion (https://github.com/openai/guided-diffusion) predicts both the mean and the variance of the
+ Gaussian distribution in the model's output. DPM-Solver only needs the "mean" output because it is based on
+ diffusion ODEs.
"""
@@ -135,6 +146,8 @@ def __init__(
algorithm_type: str = "dpmsolver++",
solver_type: str = "midpoint",
lower_order_final: bool = True,
+ lambda_min_clipped: float = -float("inf"),
+ variance_type: Optional[str] = None,
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -226,8 +239,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
"""
self.num_inference_steps = num_inference_steps
+ # Clipping the minimum of all lambda(t) for numerical stability.
+ # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped)
timesteps = (
- np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1)
.round()[::-1][:-1]
.copy()
.astype(np.int64)
@@ -297,6 +313,9 @@ def convert_model_output(
# DPM-Solver++ needs to solve an integral of the data prediction model.
if self.config.algorithm_type == "dpmsolver++":
if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned_range"]:
+ model_output = model_output[:, :3]
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
@@ -317,6 +336,9 @@ def convert_model_output(
# DPM-Solver needs to solve an integral of the noise prediction model.
elif self.config.algorithm_type == "dpmsolver":
if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned_range"]:
+ model_output = model_output[:, :3]
return model_output
elif self.config.prediction_type == "sample":
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
diff --git a/tests/schedulers/test_scheduler_dpm_multi.py b/tests/schedulers/test_scheduler_dpm_multi.py
index c1593bae3908..02a2a3882e94 100644
--- a/tests/schedulers/test_scheduler_dpm_multi.py
+++ b/tests/schedulers/test_scheduler_dpm_multi.py
@@ -29,6 +29,8 @@ def get_scheduler_config(self, **kwargs):
"algorithm_type": "dpmsolver++",
"solver_type": "midpoint",
"lower_order_final": False,
+ "lambda_min_clipped": -float("inf"),
+ "variance_type": None,
}
config.update(**kwargs)
@@ -187,6 +189,14 @@ def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
self.check_over_configs(lower_order_final=False)
+ def test_lambda_min_clipped(self):
+ self.check_over_configs(lambda_min_clipped=-float("inf"))
+ self.check_over_configs(lambda_min_clipped=-5.1)
+
+ def test_variance_type(self):
+ self.check_over_configs(variance_type=None)
+ self.check_over_configs(variance_type="learned_range")
+
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py
index 9dff04e7c998..fd7395e794c7 100644
--- a/tests/schedulers/test_scheduler_dpm_single.py
+++ b/tests/schedulers/test_scheduler_dpm_single.py
@@ -28,6 +28,8 @@ def get_scheduler_config(self, **kwargs):
"sample_max_value": 1.0,
"algorithm_type": "dpmsolver++",
"solver_type": "midpoint",
+ "lambda_min_clipped": -float("inf"),
+ "variance_type": None,
}
config.update(**kwargs)
@@ -179,6 +181,14 @@ def test_lower_order_final(self):
self.check_over_configs(lower_order_final=True)
self.check_over_configs(lower_order_final=False)
+ def test_lambda_min_clipped(self):
+ self.check_over_configs(lambda_min_clipped=-float("inf"))
+ self.check_over_configs(lambda_min_clipped=-5.1)
+
+ def test_variance_type(self):
+ self.check_over_configs(variance_type=None)
+ self.check_over_configs(variance_type="learned_range")
+
def test_inference_steps(self):
for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
From 4bae76e4539c30f68fa4e39c4e492a2155cf81d0 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Wed, 3 May 2023 22:58:44 -0700
Subject: [PATCH 039/206] [docs] Improve LoRA docs (#3311)
* update docs
* add to toctree
* apply feedback
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/training/controlnet.mdx | 6 +-
docs/source/en/training/create_dataset.mdx | 90 +++++++++++++++++++
docs/source/en/training/custom_diffusion.mdx | 4 +-
docs/source/en/training/dreambooth.mdx | 6 +-
docs/source/en/training/instructpix2pix.mdx | 6 +-
docs/source/en/training/lora.mdx | 24 +++--
docs/source/en/training/text2image.mdx | 4 +-
docs/source/en/training/text_inversion.mdx | 8 +-
.../en/training/unconditional_training.mdx | 84 +----------------
10 files changed, 128 insertions(+), 106 deletions(-)
create mode 100644 docs/source/en/training/create_dataset.mdx
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index fc101347a6e9..f205046ffc90 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -60,6 +60,8 @@
- sections:
- local: training/overview
title: Overview
+ - local: training/create_dataset
+ title: Create a dataset for training
- local: training/unconditional_training
title: Unconditional image generation
- local: training/text_inversion
diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx
index 1c91298477c7..476081c88704 100644
--- a/docs/source/en/training/controlnet.mdx
+++ b/docs/source/en/training/controlnet.mdx
@@ -69,6 +69,8 @@ The original dataset is hosted in the ControlNet [repo](https://huggingface.co/l
Our training examples use [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) because that is what the original set of ControlNet models was trained on. However, ControlNet can be trained to augment any compatible Stable Diffusion model (such as [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4)) or [`stabilityai/stable-diffusion-2-1`](https://huggingface.co/stabilityai/stable-diffusion-2-1).
+To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
+
## Training
Download the following images to condition our training with:
@@ -79,7 +81,9 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
```
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument.
+
+The training script creates and saves a `diffusion_pytorch_model.bin` file in your repository.
```bash
export MODEL_DIR="runwayml/stable-diffusion-v1-5"
diff --git a/docs/source/en/training/create_dataset.mdx b/docs/source/en/training/create_dataset.mdx
new file mode 100644
index 000000000000..9c4f4de53904
--- /dev/null
+++ b/docs/source/en/training/create_dataset.mdx
@@ -0,0 +1,90 @@
+# Create a dataset for training
+
+There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](https://huggingface.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation.
+
+This guide will show you two ways to create a dataset to finetune on:
+
+- provide a folder of images to the `--train_data_dir` argument
+- upload a dataset to the Hub and pass the dataset repository id to the `--dataset_name` argument
+
+
+
+💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide.
+
+
+
+## Provide a dataset as a folder
+
+For unconditional generation, you can provide your own dataset as a folder of images. The training script uses the [`ImageFolder`](https://huggingface.co/docs/datasets/en/image_dataset#imagefolder) builder from 🤗 Datasets to automatically build a dataset from the folder. Your directory structure should look like:
+
+```bash
+data_dir/xxx.png
+data_dir/xxy.png
+data_dir/[...]/xxz.png
+```
+
+Pass the path to the dataset directory to the `--train_data_dir` argument, and then you can start training:
+
+```bash
+accelerate launch train_unconditional.py \
+ --train_data_dir <path-to-train-directory> \
+ <other-arguments>
+```
+
+## Upload your data to the Hub
+
+
+
+💡 For more details and context about creating and uploading a dataset to the Hub, take a look at the [Image search with 🤗 Datasets](https://huggingface.co/blog/image-search-datasets) post.
+
+
+
+Start by creating a dataset with the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) feature, which creates an `image` column containing the PIL-encoded images.
+
+You can use the `data_dir` or `data_files` parameters to specify the location of the dataset. The `data_files` parameter supports mapping specific files to dataset splits like `train` or `test`:
+
+```python
+from datasets import load_dataset
+
+# example 1: local folder
+dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
+
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
+
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset(
+ "imagefolder",
+ data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip",
+)
+
+# example 4: providing several splits
+dataset = load_dataset(
+ "imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]}
+)
+```
+
+Then use the [`~datasets.Dataset.push_to_hub`] method to upload the dataset to the Hub:
+
+```python
+# assuming you have run the huggingface-cli login command in a terminal
+dataset.push_to_hub("name_of_your_dataset")
+
+# if you want to push to a private repo, simply pass private=True:
+dataset.push_to_hub("name_of_your_dataset", private=True)
+```
+
+Now the dataset is available for training by passing the dataset name to the `--dataset_name` argument:
+
+```bash
+accelerate launch --mixed_precision="fp16" train_text_to_image.py \
+ --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
+ --dataset_name="name_of_your_dataset" \
+ <other-arguments>
+```
+
+## Next steps
+
+Now that you've created a dataset, you can plug it into the `train_data_dir` (if your dataset is local) or `dataset_name` (if your dataset is on the Hub) arguments of a training script.
+
+For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)!
\ No newline at end of file
diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
index ee8fb19bd18c..dda9c17c7ebc 100644
--- a/docs/source/en/training/custom_diffusion.mdx
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -67,7 +67,7 @@ write_basic_config()
```
### Cat example 😺
-Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it.
+Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
We also collect 200 real images using `clip-retrieval`, which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`.
The `class_prompt` should be the same category name as the target image. The collected real images have text captions similar to the `class_prompt`. The retrieved images are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images, use this command first before training.
@@ -79,6 +79,8 @@ python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --nu
**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+The script creates and saves model checkpoints and a `pytorch_custom_diffusion_weights.bin` file in your repository.
+
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export OUTPUT_DIR="path-to-save-model"
diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx
index 09b877c7d0cc..38a3adf9c4f1 100644
--- a/docs/source/en/training/dreambooth.mdx
+++ b/docs/source/en/training/dreambooth.mdx
@@ -64,6 +64,8 @@ snapshot_download(
)
```
+To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
+
## Finetuning
@@ -76,7 +78,7 @@ DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit.
Set the `INSTANCE_DIR` environment variable to the path of the directory containing the dog images.
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`] argument. The `instance_prompt` argument is a text prompt that contains a unique identifier, such as `sks`, and the class the image belongs to, which in this example is `a photo of a sks dog`.
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -111,7 +113,7 @@ Before running the script, make sure you have the requirements installed:
pip install -U -r requirements.txt
```
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`] argument. The `instance_prompt` argument is a text prompt that contains a unique identifier, such as `sks`, and the class the image belongs to, which in this example is `a photo of a sks dog`.
Now you can launch the training script with the following command:
diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx
index 6b6d4d908673..2a9e99cda1f2 100644
--- a/docs/source/en/training/instructpix2pix.mdx
+++ b/docs/source/en/training/instructpix2pix.mdx
@@ -77,16 +77,16 @@ write_basic_config()
### Toy example
As mentioned before, we'll use a [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) for training. The dataset
-is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper.
+is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to specify the dataset name in `DATASET_ID`:
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument. You'll also need to specify the dataset name in `DATASET_ID`:
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
export DATASET_ID="fusing/instructpix2pix-1000-samples"
```
-Now, we can launch training:
+Now, we can launch training. The script saves all the components (`feature_extractor`, `scheduler`, `text_encoder`, `unet`, etc) in a subfolder in your repository.
```bash
accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \
diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx
index 8e41aab5e2d8..04eff7af11f8 100644
--- a/docs/source/en/training/lora.mdx
+++ b/docs/source/en/training/lora.mdx
@@ -17,8 +17,7 @@ specific language governing permissions and limitations under the License.
Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. We also
-support LoRA fine-tuning of the text encoder for DreamBooth in a limited capacity. For more details on how we support
-LoRA fine-tuning of the text encoder, refer to the discussion on [this PR](https://github.com/huggingface/diffusers/pull/2918).
+support fine-tuning the text encoder for DreamBooth with LoRA in a limited capacity. Fine-tuning the text encoder for DreamBooth generally yields better results, but it can increase compute usage.
@@ -52,7 +51,7 @@ Finetuning a model like Stable Diffusion, which has billions of parameters, can
Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon.
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set the `DATASET_NAME` environment variable to the name of the dataset you want to train on.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument. You'll also need to set the `DATASET_NAME` environment variable to the name of the dataset you want to train on. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub:
@@ -69,7 +68,7 @@ There are some flags to be aware of before you start training:
* `--report_to=wandb` reports and logs the training results to your Weights & Biases dashboard (as an example, take a look at this [report](https://wandb.ai/pcuenq/text2image-fine-tune/runs/b4k1w0tn?workspace=user-pcuenq)).
* `--learning_rate=1e-04`, you can afford to use a higher learning rate than you normally would with LoRA.
-Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)):
+Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)). Training takes about 5 hours on a 2080 Ti GPU with 11GB of RAM, and it'll create and save model checkpoints and the `pytorch_lora_weights` in your repository.
```bash
accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
@@ -159,9 +158,9 @@ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.
### Training[[dreambooth-training]]
-Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) with DreamBooth and LoRA with some 🐶 [dog images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ). Download and save these images to a directory.
+Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) with DreamBooth and LoRA with some 🐶 [dog images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ). Download and save these images to a directory. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
-To start, specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set `INSTANCE_DIR` to the path of the directory containing the images.
+To start, specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument. You'll also need to set `INSTANCE_DIR` to the path of the directory containing the images.
The `OUTPUT_DIR` variable is optional and specifies where to save the model on the Hub:
@@ -177,7 +176,11 @@ There are some flags to be aware of before you start training:
* `--report_to=wandb` reports and logs the training results to your Weights & Biases dashboard (as an example, take a look at this [report](https://wandb.ai/pcuenq/text2image-fine-tune/runs/b4k1w0tn?workspace=user-pcuenq)).
* `--learning_rate=1e-04`, you can afford to use a higher learning rate than you normally would with LoRA.
-Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py)):
+Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py)). The script creates and saves model checkpoints and the `pytorch_lora_weights.bin` file in your repository.
+
+It's also possible to fine-tune the text encoder with LoRA. In most cases this leads
+to better results at the cost of a slight increase in compute. To fine-tune the text encoder with LoRA,
+pass the `--train_text_encoder` flag when launching the `train_dreambooth_lora.py` script.
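
After training, the saved `pytorch_lora_weights.bin` file (which also contains the text-encoder LoRA layers when `--train_text_encoder` is used) can be loaded back into a pipeline in one call. A minimal sketch, assuming a local output path (placeholder) and the pipeline's LoRA loader:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# load_lora_weights attaches the UNet (and, if present, text encoder) LoRA layers.
pipe.load_lora_weights("path/to/output_dir")

image = pipe("A photo of sks dog in a bucket", num_inference_steps=25).images[0]
image.save("dog.png")
```

The launch command itself follows below.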
```bash
accelerate launch train_dreambooth_lora.py \
@@ -198,12 +201,7 @@ accelerate launch train_dreambooth_lora.py \
--validation_epochs=50 \
--seed="0" \
--push_to_hub
-```
-
-It's also possible to additionally fine-tune the text encoder with LoRA. This, in most cases, leads
-to better results with a slight increase in the compute. To allow fine-tuning the text encoder with LoRA,
-specify the `--train_text_encoder` while launching the `train_dreambooth_lora.py` script.
-
+```
### Inference[[dreambooth-inference]]
diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx
index dabb68397f78..8535e6ffac70 100644
--- a/docs/source/en/training/text2image.mdx
+++ b/docs/source/en/training/text2image.mdx
@@ -74,7 +74,7 @@ To load a checkpoint to resume training, pass the argument `--resume_from_checkp
Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this.
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument.
{"path": "../../../../examples/text_to_image/README.md",
@@ -143,7 +143,7 @@ Before running the script, make sure you have the requirements installed:
pip install -U -r requirements_flax.txt
```
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument.
Now you can launch the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this:
diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx
index 76e7f0dcc8f2..1afecc7b71bb 100644
--- a/docs/source/en/training/text_inversion.mdx
+++ b/docs/source/en/training/text_inversion.mdx
@@ -81,7 +81,7 @@ To resume training from a saved checkpoint, pass the following argument to the t
## Finetuning
-For your training dataset, download these [images of a cat toy](https://huggingface.co/datasets/diffusers/cat_toy_example) and store them in a directory:
+For your training dataset, download these [images of a cat toy](https://huggingface.co/datasets/diffusers/cat_toy_example) and store them in a directory. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
```py
from huggingface_hub import snapshot_download
@@ -92,9 +92,9 @@ snapshot_download(
)
```
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument, and the `DATA_DIR` environment variable to the path of the directory containing the images.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument, and the `DATA_DIR` environment variable to the path of the directory containing the images.
-Now you can launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py):
+Now you can launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py). The script creates and saves the following files to your repository: `learned_embeds.bin`, `token_identifier.txt`, and `type_of_concept.txt`.
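
Once training completes, the learned embedding can be loaded into a pipeline and triggered with the placeholder token. A rough sketch, assuming a `<cat-toy>` placeholder token and a local output directory (both are illustrative assumptions):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Reads learned_embeds.bin and registers the new token with the tokenizer and text encoder.
pipe.load_textual_inversion("path/to/output_dir")

image = pipe("A <cat-toy> backpack", num_inference_steps=50).images[0]
image.save("cat-toy-backpack.png")
```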
@@ -144,7 +144,7 @@ Before you begin, make sure you install the Flax specific dependencies:
pip install -U -r requirements_flax.txt
```
-Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument.
+Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument.
Then you can launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion_flax.py):
diff --git a/docs/source/en/training/unconditional_training.mdx b/docs/source/en/training/unconditional_training.mdx
index 514932d4b22d..164b4f599f1e 100644
--- a/docs/source/en/training/unconditional_training.mdx
+++ b/docs/source/en/training/unconditional_training.mdx
@@ -74,7 +74,9 @@ The full training state is saved in a subfolder in the `output_dir` every 500 st
## Finetuning
-You're ready to launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) now! Specify the dataset name to finetune on with the `--dataset_name` argument and then save it to the path in `--output_dir`.
+You're ready to launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) now! Specify the dataset name to finetune on with the `--dataset_name` argument and then save it to the path in `--output_dir`. To use your own dataset, take a look at the [Create a dataset for training](create_dataset) guide.
+
+The training script creates and saves a `diffusion_pytorch_model.bin` file in your repository.
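
The saved weights can then be reloaded for sampling. A minimal sketch, assuming the default DDPM setup and a placeholder output path:

```python
from diffusers import DDPMPipeline

# from_pretrained picks up the UNet weights (diffusion_pytorch_model.bin) and the
# scheduler config written by the training script.
pipeline = DDPMPipeline.from_pretrained("path/to/output_dir").to("cuda")

image = pipeline(num_inference_steps=1000).images[0]
image.save("sample.png")
```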
@@ -140,82 +142,4 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
--lr_warmup_steps=500 \
--mixed_precision="fp16" \
--logger="wandb"
-```
-
-## Finetuning with your own data
-
-There are two ways to finetune a model on your own dataset:
-
-- provide your own folder of images to the `--train_data_dir` argument
-- upload your dataset to the Hub and pass the dataset repository id to the `--dataset_name` argument.
-
-
-
-💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide.
-
-
-
-Below, we explain both in more detail.
-
-### Provide the dataset as a folder
-
-If you provide your own dataset as a folder, the script expects the following directory structure:
-
-```bash
-data_dir/xxx.png
-data_dir/xxy.png
-data_dir/[...]/xxz.png
-```
-
-Pass the path to the folder containing the images to the `--train_data_dir` argument and launch the training:
-
-```bash
-accelerate launch train_unconditional.py \
- --train_data_dir \
-
-```
-
-Internally, the script uses the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) to automatically build a dataset from the folder.
-
-### Upload your data to the Hub
-
-
-
-💡 For more details and context about creating and uploading a dataset to the Hub, take a look at the [Image search with 🤗 Datasets](https://huggingface.co/blog/image-search-datasets) post.
-
-
-
-To upload your dataset to the Hub, you can start by creating one with the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) feature, which creates an `image` column containing the PIL-encoded images, from 🤗 Datasets:
-
-```python
-from datasets import load_dataset
-
-# example 1: local folder
-dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
-
-# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
-dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
-
-# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
-dataset = load_dataset(
- "imagefolder",
- data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip",
-)
-
-# example 4: providing several splits
-dataset = load_dataset(
- "imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]}
-)
-```
-
-Then you can use the [`~datasets.Dataset.push_to_hub`] method to upload it to the Hub:
-
-```python
-# assuming you have ran the huggingface-cli login command in a terminal
-dataset.push_to_hub("name_of_your_dataset")
-
-# if you want to push to a private repo, simply pass private=True:
-dataset.push_to_hub("name_of_your_dataset", private=True)
-```
-
-Now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the Hub.
\ No newline at end of file
+```
\ No newline at end of file
From fa9e35fca4f32436f4c6bb890a1b3dfcefa465f7 Mon Sep 17 00:00:00 2001
From: Isamu Isozaki
Date: Thu, 4 May 2023 21:42:32 +0900
Subject: [PATCH 040/206] Added input perturbation (#3292)
* Added input perturbation
* Fixed spelling
---
examples/text_to_image/train_text_to_image.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 1d62cb7f816d..f9592e5adca3 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -112,6 +112,9 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight
def parse_args():
parser = argparse.ArgumentParser(description="Simple example of a training script.")
+ parser.add_argument(
+ "--input_pertubation", type=float, default=0, help="The scale of input pretubation. Recommended 0.1."
+ )
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
@@ -801,7 +804,8 @@ def collate_fn(examples):
noise += args.noise_offset * torch.randn(
(latents.shape[0], latents.shape[1], 1, 1), device=latents.device
)
-
+ if args.input_pertubation:
+ new_noise = noise + args.input_pertubation * torch.randn_like(noise)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
@@ -809,7 +813,10 @@ def collate_fn(examples):
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
- noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+ if args.input_pertubation:
+ noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)
+ else:
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
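For readers unfamiliar with the technique, the core of this patch is that extra Gaussian noise is mixed into the noise used to construct the model input, while the original noise remains the prediction target. A minimal, self-contained sketch of that idea (the shapes, scheduler, and scale are illustrative assumptions, not the training script itself):

```python
import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)

latents = torch.randn(4, 4, 64, 64)   # stand-in for VAE latents of a training batch
noise = torch.randn_like(latents)     # target the model is trained to predict
input_perturbation = 0.1              # the scale recommended by the new argument

# Perturb only the noise used to build the noisy model input; keep `noise` as the loss target.
new_noise = noise + input_perturbation * torch.randn_like(noise)
timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (latents.shape[0],))
noisy_latents = scheduler.add_noise(latents, new_noise, timesteps)
# ...the UNet would then be run on noisy_latents and its output compared against `noise`.
```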
From 79c0e24a1442741c59c9b1d2764ba4dbfe56ac71 Mon Sep 17 00:00:00 2001
From: Cesar Aybar
Date: Thu, 4 May 2023 19:58:27 +0200
Subject: [PATCH 041/206] Update write_own_pipeline.mdx (#3323)
---
docs/source/en/using-diffusers/write_own_pipeline.mdx | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/source/en/using-diffusers/write_own_pipeline.mdx b/docs/source/en/using-diffusers/write_own_pipeline.mdx
index fa47878e1b9b..3dca40dff714 100644
--- a/docs/source/en/using-diffusers/write_own_pipeline.mdx
+++ b/docs/source/en/using-diffusers/write_own_pipeline.mdx
@@ -82,8 +82,8 @@ To recreate the pipeline with the model and scheduler separately, let's write ou
>>> for t in scheduler.timesteps:
... with torch.no_grad():
... noisy_residual = model(input, t).sample
- >>> previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
- >>> input = previous_noisy_sample
+ ... previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
+ ... input = previous_noisy_sample
```
This is the entire denoising process, and you can use this same pattern to write any diffusion system.
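
For context, here is a self-contained version of the corrected loop above. The checkpoint id is an assumption rather than something shown in this diff; any UNet/scheduler pair with matching shapes would work the same way:

```python
import torch
from diffusers import DDPMScheduler, UNet2DModel

scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
scheduler.set_timesteps(50)

sample_size = model.config.sample_size
input = torch.randn((1, 3, sample_size, sample_size), device="cuda")

for t in scheduler.timesteps:
    with torch.no_grad():
        noisy_residual = model(input, t).sample
    # Each step replaces the input with the slightly less noisy previous sample.
    input = scheduler.step(noisy_residual, t, input).prev_sample
```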
@@ -287,4 +287,4 @@ This is really what 🧨 Diffusers is designed for: to make it intuitive and eas
For your next steps, feel free to:
* Learn how to [build and contribute a pipeline](using-diffusers/#contribute_pipeline) to 🧨 Diffusers. We can't wait and see what you'll come up with!
-* Explore [existing pipelines](./api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.
\ No newline at end of file
+* Explore [existing pipelines](./api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.
From 379197a2f059bc8e2fd1296c018d89693dddfad5 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 5 May 2023 11:22:29 +0530
Subject: [PATCH 042/206] update controlling generation doc with latest
goodies. (#3321)
---
.../controlling_generation.mdx | 55 ++++++++++++++++++-
1 file changed, 53 insertions(+), 2 deletions(-)
diff --git a/docs/source/en/using-diffusers/controlling_generation.mdx b/docs/source/en/using-diffusers/controlling_generation.mdx
index b1ba17cd2c67..57b5640ffcd5 100644
--- a/docs/source/en/using-diffusers/controlling_generation.mdx
+++ b/docs/source/en/using-diffusers/controlling_generation.mdx
@@ -37,6 +37,28 @@ Unless otherwise mentioned, these are techniques that work with existing models
9. [Textual Inversion](#textual-inversion)
10. [ControlNet](#controlnet)
11. [Prompt Weighting](#prompt-weighting)
+12. [Custom Diffusion](#custom-diffusion)
+13. [Model Editing](#model-editing)
+14. [DiffEdit](#diffedit)
+
+For convenience, we provide a table to denote which methods are inference-only and which require fine-tuning/training.
+
+| **Method** | **Inference only** | **Requires training / fine-tuning** | **Comments** |
+|:---:|:---:|:---:|:---:|
+| [Instruct Pix2Pix](#instruct-pix2pix) | ✅ | ❌ | Can additionally be fine-tuned for better performance on specific edit instructions. |
+| [Pix2Pix Zero](#pix2pixzero) | ✅ | ❌ | |
+| [Attend and Excite](#attend-and-excite) | ✅ | ❌ | |
+| [Semantic Guidance](#semantic-guidance) | ✅ | ❌ | |
+| [Self-attention Guidance](#self-attention-guidance) | ✅ | ❌ | |
+| [Depth2Image](#depth2image) | ✅ | ❌ | |
+| [MultiDiffusion Panorama](#multidiffusion-panorama) | ✅ | ❌ | |
+| [DreamBooth](#dreambooth) | ❌ | ✅ | |
+| [Textual Inversion](#textual-inversion) | ❌ | ✅ | |
+| [ControlNet](#controlnet) | ✅ | ❌ | A ControlNet can be trained/fine-tuned on a custom conditioning. |
+| [Prompt Weighting](#prompt-weighting) | ✅ | ❌ | |
+| [Custom Diffusion](#custom-diffusion) | ❌ | ✅ | |
+| [Model Editing](#model-editing) | ✅ | ❌ | |
+| [DiffEdit](#diffedit) | ✅ | ❌ | |
## Instruct Pix2Pix
@@ -137,13 +159,13 @@ See [here](../api/pipelines/stable_diffusion/panorama) for more information on h
In addition to pre-trained models, Diffusers has training scripts for fine-tuning models on user-provided data.
-### DreamBooth
+## DreamBooth
[DreamBooth](../training/dreambooth) fine-tunes a model to teach it about a new subject. For example, a few pictures of a person can be used to generate images of that person in different styles.
See [here](../training/dreambooth) for more information on how to use it.
-### Textual Inversion
+## Textual Inversion
[Textual Inversion](../training/text_inversion) fine-tunes a model to teach it about a new concept. For example, a few pictures of a style of artwork can be used to generate images in that style.
@@ -165,3 +187,32 @@ Prompt weighting is a simple technique that puts more attention weight on certai
input.
For a more in-detail explanation and examples, see [here](../using-diffusers/weighted_prompts).
+
+## Custom Diffusion
+
+[Custom Diffusion](../training/custom_diffusion) only fine-tunes the cross-attention maps of a pre-trained
+text-to-image diffusion model. It also allows for performing textual inversion and supports
+multi-concept training by design. Like DreamBooth and Textual Inversion, Custom Diffusion is used to
+teach a pre-trained text-to-image diffusion model about new concepts to generate outputs involving the
+concept(s) of interest.
+
+For more details, check out our [official doc](../training/custom_diffusion).
+
+## Model Editing
+
+[Paper](https://arxiv.org/abs/2303.08084)
+
+The [text-to-image model editing pipeline](../api/pipelines/stable_diffusion/model_editing) helps you mitigate some of the incorrect implicit assumptions a pre-trained text-to-image
+diffusion model might make about the subjects present in the input prompt. For example, if you prompt Stable Diffusion to generate images for "A pack of roses", the roses in the generated images
+are more likely to be red. This pipeline helps you change that assumption.
+
+For more details, check out the [official doc](../api/pipelines/stable_diffusion/model_editing).
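
A rough usage sketch for this pipeline, following the rose example above (the checkpoint id and the `edit_model` method name are assumptions based on the pipeline's documented API, so double-check against the linked doc):

```python
import torch
from diffusers import StableDiffusionModelEditingPipeline

pipe = StableDiffusionModelEditingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to("cuda")

# Teach the model that "A pack of roses" should no longer imply red roses.
pipe.edit_model("A pack of roses", "A pack of blue roses")

image = pipe("A pack of roses").images[0]
image.save("blue-roses.png")
```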
+
+## DiffEdit
+
+[Paper](https://arxiv.org/abs/2210.11427)
+
+[DiffEdit](../api/pipelines/stable_diffusion/diffedit) allows for semantic editing of input images along with
+input prompts while preserving the original input images as much as possible.
+
+For more details, check out the [official doc](../api/pipelines/stable_diffusion/diffedit).
\ No newline at end of file
From 29ad75dc3b415d21bbe9e5edc0639dc1b7869554 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 5 May 2023 11:06:09 +0200
Subject: [PATCH 043/206] [Quality] Make style (#3341)
---
setup.py | 3 ++-
src/diffusers/dependency_versions_table.py | 1 +
.../deepfloyd_if/pipeline_if_img2img_superresolution.py | 2 +-
.../deepfloyd_if/pipeline_if_inpainting_superresolution.py | 2 +-
4 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/setup.py b/setup.py
index 13c93dcae3c0..e8c80c492f88 100644
--- a/setup.py
+++ b/setup.py
@@ -111,6 +111,7 @@
"torch>=1.4",
"torchvision",
"transformers>=4.25.1",
+ "urllib3<=2.0.0",
]
# this is a lookup table with items like:
@@ -181,7 +182,7 @@ def run(self):
extras = {}
-extras["quality"] = deps_list("black", "isort", "ruff", "hf-doc-builder")
+extras["quality"] = deps_list("urllib3", "black", "isort", "ruff", "hf-doc-builder")
extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2")
extras["test"] = deps_list(
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 0e714accacd6..0b6f544b9817 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -35,4 +35,5 @@
"torch": "torch>=1.4",
"torchvision": "torchvision",
"transformers": "transformers>=4.25.1",
+ "urllib3": "urllib3<=2.0.0",
}
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
index 770676c15984..94ea1015862d 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py
@@ -762,7 +762,7 @@ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device
image = [np.array(i).astype(np.float32) / 255.0 for i in image]
image = np.stack(image, axis=0) # to np
- torch.from_numpy(image.transpose(0, 3, 1, 2))
+ image = torch.from_numpy(image.transpose(0, 3, 1, 2))
elif isinstance(image[0], np.ndarray):
image = np.stack(image, axis=0) # to np
if image.ndim == 5:
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
index 2b42d3992ed8..77a3b2594bf1 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py
@@ -798,7 +798,7 @@ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device
image = [np.array(i).astype(np.float32) / 255.0 for i in image]
image = np.stack(image, axis=0) # to np
- torch.from_numpy(image.transpose(0, 3, 1, 2))
+ image = torch.from_numpy(image.transpose(0, 3, 1, 2))
elif isinstance(image[0], np.ndarray):
image = np.stack(image, axis=0) # to np
if image.ndim == 5:
From 8d4c7d0ea0a5f732fae2b019ee30b41afd9ed412 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 5 May 2023 13:02:33 +0200
Subject: [PATCH 044/206] Fix config dpm (#3343)
---
src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 1d34977d4a57..9307db89d8d7 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -241,7 +241,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.num_inference_steps = num_inference_steps
# Clipping the minimum of all lambda(t) for numerical stability.
# This is critical for cosine (squaredcos_cap_v2) noise schedule.
- clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped)
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
timesteps = (
np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1)
.round()[::-1][:-1]
From 27522b585b4273221ee3cd7a09c0fb1f9d13b634 Mon Sep 17 00:00:00 2001
From: Cheng Lu
Date: Fri, 5 May 2023 23:03:47 +0800
Subject: [PATCH 045/206] Add the SDE variant of DPM-Solver and DPM-Solver++
(#3344)
* add SDE variant of DPM-Solver and DPM-Solver++
* add test
* fix typo
* fix typo
---
.../scheduling_dpmsolver_multistep.py | 103 +++++++++++++++---
tests/schedulers/test_scheduler_dpm_multi.py | 18 +--
2 files changed, 98 insertions(+), 23 deletions(-)
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index 337c6603fe75..e72b1bdc23b5 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -21,6 +21,7 @@
import torch
from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import randn_tensor
from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
@@ -70,6 +71,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
stable-diffusion).
+ We also support the SDE variant of DPM-Solver and DPM-Solver++, which is a fast SDE solver for the reverse
+ diffusion SDE. Currently we only support the first-order and second-order solvers. We recommend using the
+ second-order `sde-dpmsolver++`.
+
[`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
[`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
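
To make the recommendation above concrete, a hedged usage sketch for switching an existing pipeline to the new SDE variant (the model id and prompt are placeholders):

```python
import torch
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Reuse the existing scheduler config, but switch to the second-order SDE solver.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2
)

# The SDE solvers draw extra noise at each step, so pass a generator for reproducibility.
generator = torch.Generator("cuda").manual_seed(0)
image = pipe("an astronaut riding a horse", num_inference_steps=25, generator=generator).images[0]
```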
@@ -103,10 +108,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
`algorithm_type="dpmsolver++`.
algorithm_type (`str`, default `dpmsolver++`):
- the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
- algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
- https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided
- sampling (e.g. stable-diffusion).
+ the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++` or `sde-dpmsolver` or
+ `sde-dpmsolver++`. The `dpmsolver` type implements the algorithms in https://arxiv.org/abs/2206.00927, and
+ the `dpmsolver++` type implements the algorithms in https://arxiv.org/abs/2211.01095. We recommend using
+ `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling (e.g. stable-diffusion).
solver_type (`str`, default `midpoint`):
the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are
@@ -180,7 +185,7 @@ def __init__(
self.init_noise_sigma = 1.0
# settings for DPM-Solver
- if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+ if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
if algorithm_type == "deis":
self.register_to_config(algorithm_type="dpmsolver++")
else:
@@ -212,7 +217,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
"""
# Clipping the minimum of all lambda(t) for numerical stability.
# This is critical for cosine (squaredcos_cap_v2) noise schedule.
- clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped)
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
timesteps = (
np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1)
.round()[::-1][:-1]
@@ -338,10 +343,10 @@ def convert_model_output(
"""
# DPM-Solver++ needs to solve an integral of the data prediction model.
- if self.config.algorithm_type == "dpmsolver++":
+ if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
if self.config.prediction_type == "epsilon":
# DPM-Solver and DPM-Solver++ only need the "mean" output.
- if self.config.variance_type in ["learned_range"]:
+ if self.config.variance_type in ["learned", "learned_range"]:
model_output = model_output[:, :3]
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
x0_pred = (sample - sigma_t * model_output) / alpha_t
@@ -360,33 +365,42 @@ def convert_model_output(
x0_pred = self._threshold_sample(x0_pred)
return x0_pred
+
# DPM-Solver needs to solve an integral of the noise prediction model.
- elif self.config.algorithm_type == "dpmsolver":
+ elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
if self.config.prediction_type == "epsilon":
# DPM-Solver and DPM-Solver++ only need the "mean" output.
- if self.config.variance_type in ["learned_range"]:
- model_output = model_output[:, :3]
- return model_output
+ if self.config.variance_type in ["learned", "learned_range"]:
+ epsilon = model_output[:, :3]
+ else:
+ epsilon = model_output
elif self.config.prediction_type == "sample":
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = (sample - alpha_t * model_output) / sigma_t
- return epsilon
elif self.config.prediction_type == "v_prediction":
alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
epsilon = alpha_t * model_output + sigma_t * sample
- return epsilon
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
" `v_prediction` for the DPMSolverMultistepScheduler."
)
+ if self.config.thresholding:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = (sample - sigma_t * epsilon) / alpha_t
+ x0_pred = self._threshold_sample(x0_pred)
+ epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+ return epsilon
+
def dpm_solver_first_order_update(
self,
model_output: torch.FloatTensor,
timestep: int,
prev_timestep: int,
sample: torch.FloatTensor,
+ noise: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""
One step for the first-order DPM-Solver (equivalent to DDIM).
@@ -411,6 +425,20 @@ def dpm_solver_first_order_update(
x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
elif self.config.algorithm_type == "dpmsolver":
x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+ elif self.config.algorithm_type == "sde-dpmsolver++":
+ assert noise is not None
+ x_t = (
+ (sigma_t / sigma_s * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.algorithm_type == "sde-dpmsolver":
+ assert noise is not None
+ x_t = (
+ (alpha_t / alpha_s) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
return x_t
def multistep_dpm_solver_second_order_update(
@@ -419,6 +447,7 @@ def multistep_dpm_solver_second_order_update(
timestep_list: List[int],
prev_timestep: int,
sample: torch.FloatTensor,
+ noise: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""
One step for the second-order multistep DPM-Solver.
@@ -470,6 +499,38 @@ def multistep_dpm_solver_second_order_update(
- (sigma_t * (torch.exp(h) - 1.0)) * D0
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
)
+ elif self.config.algorithm_type == "sde-dpmsolver++":
+ assert noise is not None
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.algorithm_type == "sde-dpmsolver":
+ assert noise is not None
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * (torch.exp(h) - 1.0)) * D1
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
return x_t
def multistep_dpm_solver_third_order_update(
@@ -532,6 +593,7 @@ def step(
model_output: torch.FloatTensor,
timestep: int,
sample: torch.FloatTensor,
+ generator=None,
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
"""
@@ -574,12 +636,21 @@ def step(
self.model_outputs[i] = self.model_outputs[i + 1]
self.model_outputs[-1] = model_output
+ if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+ noise = randn_tensor(
+ model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+ )
+ else:
+ noise = None
+
if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
- prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample)
+ prev_sample = self.dpm_solver_first_order_update(
+ model_output, timestep, prev_timestep, sample, noise=noise
+ )
elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
timestep_list = [self.timesteps[step_index - 1], timestep]
prev_sample = self.multistep_dpm_solver_second_order_update(
- self.model_outputs, timestep_list, prev_timestep, sample
+ self.model_outputs, timestep_list, prev_timestep, sample, noise=noise
)
else:
timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
diff --git a/tests/schedulers/test_scheduler_dpm_multi.py b/tests/schedulers/test_scheduler_dpm_multi.py
index 02a2a3882e94..c9935780b983 100644
--- a/tests/schedulers/test_scheduler_dpm_multi.py
+++ b/tests/schedulers/test_scheduler_dpm_multi.py
@@ -167,16 +167,20 @@ def test_prediction_type(self):
self.check_over_configs(prediction_type=prediction_type)
def test_solver_order_and_type(self):
- for algorithm_type in ["dpmsolver", "dpmsolver++"]:
+ for algorithm_type in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
for solver_type in ["midpoint", "heun"]:
for order in [1, 2, 3]:
for prediction_type in ["epsilon", "sample"]:
- self.check_over_configs(
- solver_order=order,
- solver_type=solver_type,
- prediction_type=prediction_type,
- algorithm_type=algorithm_type,
- )
+ if algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+ if order == 3:
+ continue
+ else:
+ self.check_over_configs(
+ solver_order=order,
+ solver_type=solver_type,
+ prediction_type=prediction_type,
+ algorithm_type=algorithm_type,
+ )
sample = self.full_loop(
solver_order=order,
solver_type=solver_type,
From 36f43ea75ab7cdf9b04f72bced0b1ab22036c21c Mon Sep 17 00:00:00 2001
From: Will Rice
Date: Fri, 5 May 2023 14:50:41 -0400
Subject: [PATCH 046/206] Add upsample_size to AttnUpBlock2D, AttnDownBlock2D
(#3275)
The argument `upsample_size` needs to be added to these modules to allow compatibility with other blocks that require this argument.
---
src/diffusers/models/unet_2d_blocks.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 57153fa39807..2f7b19b7328a 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -734,7 +734,7 @@ def __init__(
else:
self.downsamplers = None
- def forward(self, hidden_states, temb=None):
+ def forward(self, hidden_states, temb=None, upsample_size=None):
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
@@ -1720,7 +1720,7 @@ def __init__(
else:
self.upsamplers = None
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
From e9aa0925a8e5783814cd1e0da6f601fd3eb88571 Mon Sep 17 00:00:00 2001
From: Adrià Arrufat <1671644+arrufat@users.noreply.github.com>
Date: Sat, 6 May 2023 20:00:30 +0900
Subject: [PATCH 047/206] Rename --only_save_embeds to --save_as_full_pipeline
(#3206)
* Set --only_save_embeds to False by default
Due to how the option is named, it makes more sense to behave like this.
* Refactor only_save_embeds to save_as_full_pipeline
---
examples/textual_inversion/textual_inversion.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index c58f6b600629..b66d117e90be 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -176,10 +176,9 @@ def parse_args():
help="Save learned_embeds.bin every X updates steps.",
)
parser.add_argument(
- "--only_save_embeds",
+ "--save_as_full_pipeline",
action="store_true",
- default=True,
- help="Save only the embeddings for the new concept.",
+ help="Save the complete stable diffusion pipeline.",
)
parser.add_argument(
"--num_vectors",
@@ -900,11 +899,11 @@ def main():
# Create the pipeline using the trained modules and save it.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
- if args.push_to_hub and args.only_save_embeds:
+ if args.push_to_hub and not args.save_as_full_pipeline:
logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
save_full_model = True
else:
- save_full_model = not args.only_save_embeds
+ save_full_model = args.save_as_full_pipeline
if save_full_model:
pipeline = StableDiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
From abd86d1c17c3bba1f59b4a11293a3a8dd5ca34cd Mon Sep 17 00:00:00 2001
From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Date: Sat, 6 May 2023 12:00:42 +0100
Subject: [PATCH 048/206] [AudioLDM] Generalise conversion script (#3328)
Co-authored-by: Patrick von Platen
---
.../convert_original_audioldm_to_diffusers.py | 71 ++++++++++++++-----
1 file changed, 54 insertions(+), 17 deletions(-)
diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py
index 189b165c0a01..a0d154d7e6ba 100644
--- a/scripts/convert_original_audioldm_to_diffusers.py
+++ b/scripts/convert_original_audioldm_to_diffusers.py
@@ -774,6 +774,8 @@ def load_pipeline_from_original_audioldm_ckpt(
extract_ema: bool = False,
scheduler_type: str = "ddim",
num_in_channels: int = None,
+ model_channels: int = None,
+ num_head_channels: int = None,
device: str = None,
from_safetensors: bool = False,
) -> AudioLDMPipeline:
@@ -784,23 +786,36 @@ def load_pipeline_from_original_audioldm_ckpt(
global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
recommended that you override the default values and/or supply an `original_config_file` wherever possible.
- :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file
- corresponding to the original architecture.
- If `None`, will be automatically instantiated based on default values.
- :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints. :param
- prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original
- AudioLDM checkpoints.
- :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically
- inferred.
- :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler",
- "euler-ancestral", "dpm", "ddim"]`.
- :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract
- the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually
- yield higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning.
- :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If
- `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors
- instead of PyTorch.
- :return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
+ Args:
+ checkpoint_path (`str`): Path to `.ckpt` file.
+ original_config_file (`str`):
+ Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
+ set to the audioldm-s-full-v2 config.
+ image_size (`int`, *optional*, defaults to 512):
+ The image size that the model was trained on.
+ prediction_type (`str`, *optional*):
+ The prediction type that the model was trained on. If `None`, will be automatically
+ inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
+ num_in_channels (`int`, *optional*, defaults to None):
+ The number of UNet input channels. If `None`, it will be automatically inferred from the config.
+ model_channels (`int`, *optional*, defaults to None):
+ The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override
+ to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.
+ num_head_channels (`int`, *optional*, defaults to None):
+ The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override
+ to 32 for the small and medium checkpoints, and 64 for the large.
+ scheduler_type (`str`, *optional*, defaults to 'ddim'):
+ Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+ "ddim"]`.
+ extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
+ checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
+ `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
+ inference. Non-EMA weights are usually better to continue fine-tuning.
+ device (`str`, *optional*, defaults to `None`):
+ The device to use. Pass `None` to determine automatically.
+ from_safetensors (`str`, *optional*, defaults to `False`):
+ If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
+ Returns: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
"""
if not is_omegaconf_available():
@@ -837,6 +852,12 @@ def load_pipeline_from_original_audioldm_ckpt(
if num_in_channels is not None:
original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+ if model_channels is not None:
+ original_config["model"]["params"]["unet_config"]["params"]["model_channels"] = model_channels
+
+ if num_head_channels is not None:
+ original_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = num_head_channels
+
if (
"parameterization" in original_config["model"]["params"]
and original_config["model"]["params"]["parameterization"] == "v"
@@ -960,6 +981,20 @@ def load_pipeline_from_original_audioldm_ckpt(
type=int,
help="The number of input channels. If `None` number of input channels will be automatically inferred.",
)
+ parser.add_argument(
+ "--model_channels",
+ default=None,
+ type=int,
+ help="The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override"
+ " to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.",
+ )
+ parser.add_argument(
+ "--num_head_channels",
+ default=None,
+ type=int,
+ help="The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override"
+ " to 32 for the small and medium checkpoints, and 64 for the large.",
+ )
parser.add_argument(
"--scheduler_type",
default="ddim",
@@ -1009,6 +1044,8 @@ def load_pipeline_from_original_audioldm_ckpt(
extract_ema=args.extract_ema,
scheduler_type=args.scheduler_type,
num_in_channels=args.num_in_channels,
+ model_channels=args.model_channels,
+ num_head_channels=args.num_head_channels,
from_safetensors=args.from_safetensors,
device=args.device,
)
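
A rough sketch of how the new `model_channels`/`num_head_channels` overrides would be used when converting a medium checkpoint, assuming the script is importable from the repository's `scripts/` directory and the checkpoint path is a placeholder:

```python
from convert_original_audioldm_to_diffusers import load_pipeline_from_original_audioldm_ckpt

# Medium checkpoints use 192 model channels and 32 head channels, per the docstring above.
pipeline = load_pipeline_from_original_audioldm_ckpt(
    checkpoint_path="audioldm-m-full.ckpt",
    model_channels=192,
    num_head_channels=32,
    scheduler_type="ddim",
)
pipeline.save_pretrained("audioldm-m-full-diffusers")
```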
From 7ce3fa010a5019ed35d5a89572d3f68646b2a8d9 Mon Sep 17 00:00:00 2001
From: At-sushi
Date: Sat, 6 May 2023 20:04:07 +0900
Subject: [PATCH 049/206] Fix TypeError when using prompt_embeds and
negative_prompt (#2982)
* test: Added test case
* fix: fixed type checking issue on _encode_prompt
* fix: fixed copies consistency
* fix: one copy was not sufficient
---
.../alt_diffusion/pipeline_alt_diffusion.py | 2 +-
.../pipeline_alt_diffusion_img2img.py | 2 +-
.../pipeline_cycle_diffusion.py | 2 +-
.../pipeline_stable_diffusion.py | 2 +-
...line_stable_diffusion_attend_and_excite.py | 2 +-
.../pipeline_stable_diffusion_controlnet.py | 2 +-
.../pipeline_stable_diffusion_depth2img.py | 2 +-
.../pipeline_stable_diffusion_img2img.py | 2 +-
.../pipeline_stable_diffusion_inpaint.py | 2 +-
...ipeline_stable_diffusion_inpaint_legacy.py | 2 +-
.../pipeline_stable_diffusion_k_diffusion.py | 2 +-
...pipeline_stable_diffusion_model_editing.py | 2 +-
.../pipeline_stable_diffusion_panorama.py | 2 +-
.../pipeline_stable_diffusion_pix2pix_zero.py | 2 +-
.../pipeline_stable_diffusion_sag.py | 2 +-
.../pipeline_stable_diffusion_upscale.py | 2 +-
.../pipeline_stable_unclip.py | 2 +-
.../pipeline_stable_unclip_img2img.py | 2 +-
.../pipeline_text_to_video_synth.py | 2 +-
.../stable_diffusion/test_stable_diffusion.py | 39 +++++++++++++++++++
20 files changed, 58 insertions(+), 19 deletions(-)
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index b61703a2146d..d8bae0a8df8a 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -369,7 +369,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index cabed8f017ce..b10d85f722eb 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -378,7 +378,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index a40ba75d04bd..f7748e6d6380 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -387,7 +387,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 4168dc7e9788..054dba150e63 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -372,7 +372,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
index eec7debc38b7..377795090b66 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
@@ -384,7 +384,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index 5e8e68823b34..db41f22ab4b7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -427,7 +427,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index 16f96bbc2fd5..3167881db3db 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -256,7 +256,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 2dfa730549ab..258c8000ba63 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -385,7 +385,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 859a34677317..266648ce7613 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -437,7 +437,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 990c0e838f35..3256ff2b831f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -376,7 +376,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 5a21bcafccbc..2a6e7edc1351 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -288,7 +288,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
index 3926a4e70ad0..3fe526418b4f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
@@ -315,7 +315,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index facffd7a852a..5d69d2071801 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -279,7 +279,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index b60987edfaca..c6e0a7620f77 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -520,7 +520,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
index 27ba46c8b3e7..9199eda37e04 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -296,7 +296,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index da1575289c8e..de73a92b07d3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -296,7 +296,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index 51ba24c65873..1867acdf0859 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -416,7 +416,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index fce82a5bb61f..705eb50795e0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -316,7 +316,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
index 6fc89e945604..a44b6f1d0744 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -305,7 +305,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 4583cc42e6f1..13b4d0dba827 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -251,6 +251,45 @@ def test_stable_diffusion_negative_prompt_embeds(self):
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+ def test_stable_diffusion_prompt_embeds_with_plain_negative_prompt_list(self):
+ components = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**components)
+ sd_pipe = sd_pipe.to(torch_device)
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ negative_prompt = 3 * ["this is a negative prompt"]
+ inputs["negative_prompt"] = negative_prompt
+ inputs["prompt"] = 3 * [inputs["prompt"]]
+
+ # forward
+ output = sd_pipe(**inputs)
+ image_slice_1 = output.images[0, -3:, -3:, -1]
+
+ inputs = self.get_dummy_inputs(torch_device)
+ inputs["negative_prompt"] = negative_prompt
+ prompt = 3 * [inputs.pop("prompt")]
+
+ text_inputs = sd_pipe.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=sd_pipe.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_inputs = text_inputs["input_ids"].to(torch_device)
+
+ prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]
+
+ inputs["prompt_embeds"] = prompt_embeds
+
+ # forward
+ output = sd_pipe(**inputs)
+ image_slice_2 = output.images[0, -3:, -3:, -1]
+
+ assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
def test_stable_diffusion_ddim_factor_8(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
From 0407c3e7d0ed844baf3c0b09d9b231d09445e5d8 Mon Sep 17 00:00:00 2001
From: Lucca Zenóbio
Date: Sat, 6 May 2023 08:06:52 -0300
Subject: [PATCH 050/206] Fix pipeline class on README (#3345)
Update README.md
---
examples/community/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/community/README.md b/examples/community/README.md
index 14f15fd2215e..3d034b30fcff 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -1274,11 +1274,11 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
init_image = download_image(img_url).resize((512, 512))
mask_image = download_image(mask_url).resize((512, 512))
mask_image = PIL.ImageOps.invert(mask_image)
-pipe = DiffusionPipeline.from_pretrained(
+pipe = StableDiffusionPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, custom_pipeline="stable_diffusion_repaint",
)
pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
-```
\ No newline at end of file
+```
From b0966f5801f4ffb9f008c915a3db64032dcd1edd Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Sat, 6 May 2023 13:13:33 +0200
Subject: [PATCH 051/206] Inpainting: typo in docs (#3331)
Typo in docs
Co-authored-by: Patrick von Platen
---
docs/source/en/using-diffusers/inpaint.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/using-diffusers/inpaint.mdx b/docs/source/en/using-diffusers/inpaint.mdx
index 41a6d4b7e1b2..228e14e84833 100644
--- a/docs/source/en/using-diffusers/inpaint.mdx
+++ b/docs/source/en/using-diffusers/inpaint.mdx
@@ -52,7 +52,7 @@ Now you can create a prompt to replace the mask with something else:
```python
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
```
`image` | `mask_image` | `prompt` | output |
From 0ffac97933d8ca1487d4ae9c374504a0979f0c5d Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Sat, 6 May 2023 19:19:27 +0800
Subject: [PATCH 052/206] Add `use_karras_sigmas` to LMSDiscreteScheduler
(#3351)
* add karras sigma to lms discrete scheduler
* add test for lms_scheduler karras
* reformat test lms
---
.../schedulers/scheduling_lms_discrete.py | 54 ++++++++++++++++++-
tests/schedulers/test_scheduler_lms.py | 25 +++++++++
2 files changed, 77 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 68a8e1bddc01..0656475c3093 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -94,6 +94,10 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
`linear` or `scaled_linear`.
trained_betas (`np.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+ This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
+ noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
+ of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
prediction_type (`str`, default `epsilon`, optional):
prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
@@ -111,6 +115,7 @@ def __init__(
beta_end: float = 0.02,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ use_karras_sigmas: Optional[bool] = False,
prediction_type: str = "epsilon",
):
if trained_betas is not None:
@@ -140,8 +145,8 @@ def __init__(
# setable values
self.num_inference_steps = None
- timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
- self.timesteps = torch.from_numpy(timesteps)
+ self.use_karras_sigmas = use_karras_sigmas
+ self.set_timesteps(num_train_timesteps, None)
self.derivatives = []
self.is_scale_input_called = False
@@ -201,8 +206,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.num_inference_steps = num_inference_steps
timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+
sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ log_sigmas = np.log(sigmas)
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+
+ if self.use_karras_sigmas:
+ sigmas = self._convert_to_karras(in_sigmas=sigmas)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
+
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
self.sigmas = torch.from_numpy(sigmas).to(device=device)
@@ -214,6 +226,44 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.derivatives = []
+ # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+ def _sigma_to_t(self, sigma, log_sigmas):
+ # get log sigma
+ log_sigma = np.log(sigma)
+
+ # get distribution
+ dists = log_sigma - log_sigmas[:, np.newaxis]
+
+ # get sigmas range
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+ high_idx = low_idx + 1
+
+ low = log_sigmas[low_idx]
+ high = log_sigmas[high_idx]
+
+ # interpolate sigmas
+ w = (low - log_sigma) / (low - high)
+ w = np.clip(w, 0, 1)
+
+ # transform interpolation to time range
+ t = (1 - w) * low_idx + w * high_idx
+ t = t.reshape(sigma.shape)
+ return t
+
+ # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor:
+ """Constructs the noise schedule of Karras et al. (2022)."""
+
+ sigma_min: float = in_sigmas[-1].item()
+ sigma_max: float = in_sigmas[0].item()
+
+ rho = 7.0 # 7.0 is the value used in the paper
+ ramp = np.linspace(0, 1, self.num_inference_steps)
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+ return sigmas
+
def step(
self,
model_output: torch.FloatTensor,
diff --git a/tests/schedulers/test_scheduler_lms.py b/tests/schedulers/test_scheduler_lms.py
index ca3574e9ee63..3f31f9696de2 100644
--- a/tests/schedulers/test_scheduler_lms.py
+++ b/tests/schedulers/test_scheduler_lms.py
@@ -113,3 +113,28 @@ def test_full_loop_device(self):
assert abs(result_sum.item() - 1006.388) < 1e-2
assert abs(result_mean.item() - 1.31) < 1e-3
+
+ def test_full_loop_device_karras_sigmas(self):
+ scheduler_class = self.scheduler_classes[0]
+ scheduler_config = self.get_scheduler_config()
+ scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+ scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+ sample = sample.to(torch_device)
+
+ for t in scheduler.timesteps:
+ sample = scheduler.scale_model_input(sample, t)
+
+ model_output = model(sample, t)
+
+ output = scheduler.step(model_output, t, sample)
+ sample = output.prev_sample
+
+ result_sum = torch.sum(torch.abs(sample))
+ result_mean = torch.mean(torch.abs(sample))
+
+ assert abs(result_sum.item() - 3812.9927) < 1e-2
+ assert abs(result_mean.item() - 4.9648) < 1e-3
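For reference, a minimal usage sketch of the new flag, not part of the patch; the checkpoint id is a placeholder. `from_config` keeps the existing beta schedule and only switches the sigma spacing to the Karras et al. (2022) schedule with rho = 7:

```python
import torch
from diffusers import LMSDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Re-create the scheduler from the pipeline's config with Karras sigmas enabled.
pipe.scheduler = LMSDiscreteScheduler.from_config(
    pipe.scheduler.config, use_karras_sigmas=True
)

image = pipe(
    "a photo of an astronaut riding a horse", num_inference_steps=30
).images[0]
```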
From 3d8b3d7cd87c069791809d31863ea17bba436c4b Mon Sep 17 00:00:00 2001
From: pdoane
Date: Mon, 8 May 2023 01:54:30 -0700
Subject: [PATCH 053/206] Batched load of textual inversions (#3277)
* Batched load of textual inversions
- Only call resize_token_embeddings once per batch as it is the most expensive operation
- Allow pretrained_model_name_or_path and token to be an optional list
- Remove Dict from type annotation pretrained_model_name_or_path as it was not supported in this function
- Add comment that single files (e.g. .pt/.safetensors) are supported
- Add comment for token parameter
- Convert token override log message from warning to info
* Update src/diffusers/loaders.py
Check for duplicate tokens
Co-authored-by: Patrick von Platen
* Update condition for None tokens
---------
Co-authored-by: Patrick von Platen
---
src/diffusers/loaders.py | 191 ++++++++++++++++++------------
tests/pipelines/test_pipelines.py | 25 ++++
2 files changed, 138 insertions(+), 78 deletions(-)
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index b4b0f4bb3bd6..f41d0ffe72e3 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -436,7 +436,10 @@ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):
return prompt
def load_textual_inversion(
- self, pretrained_model_name_or_path: Union[str, Dict[str, torch.Tensor]], token: Optional[str] = None, **kwargs
+ self,
+ pretrained_model_name_or_path: Union[str, List[str]],
+ token: Optional[Union[str, List[str]]] = None,
+ **kwargs,
):
r"""
Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and
@@ -449,7 +452,7 @@ def load_textual_inversion(
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]`):
Can be either:
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
@@ -457,6 +460,12 @@ def load_textual_inversion(
`"sd-concepts-library/low-poly-hd-logos-icons"`.
- A path to a *directory* containing textual inversion weights, e.g.
`./my_text_inversion_directory/`.
+ - A path to a *file* containing textual inversion weights, e.g. `./my_text_inversions.pt`.
+
+ Or a list of those elements.
+ token (`str` or `List[str]`, *optional*):
+ Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
+ list, then `token` must also be a list of equal length.
weight_name (`str`, *optional*):
Name of a custom weight file. This should be used in two cases:
@@ -576,16 +585,62 @@ def load_textual_inversion(
"framework": "pytorch",
}
- # 1. Load textual inversion file
- model_file = None
- # Let's first try to load .safetensors weights
- if (use_safetensors and weight_name is None) or (
- weight_name is not None and weight_name.endswith(".safetensors")
- ):
- try:
+ if isinstance(pretrained_model_name_or_path, str):
+ pretrained_model_name_or_paths = [pretrained_model_name_or_path]
+ else:
+ pretrained_model_name_or_paths = pretrained_model_name_or_path
+
+ if isinstance(token, str):
+ tokens = [token]
+ elif token is None:
+ tokens = [None] * len(pretrained_model_name_or_paths)
+ else:
+ tokens = token
+
+ if len(pretrained_model_name_or_paths) != len(tokens):
+ raise ValueError(
+ f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)}"
+ f"Make sure both lists have the same length."
+ )
+
+ valid_tokens = [t for t in tokens if t is not None]
+ if len(set(valid_tokens)) < len(valid_tokens):
+ raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}")
+
+ token_ids_and_embeddings = []
+
+ for pretrained_model_name_or_path, token in zip(pretrained_model_name_or_paths, tokens):
+ # 1. Load textual inversion file
+ model_file = None
+ # Let's first try to load .safetensors weights
+ if (use_safetensors and weight_name is None) or (
+ weight_name is not None and weight_name.endswith(".safetensors")
+ ):
+ try:
+ model_file = _get_model_file(
+ pretrained_model_name_or_path,
+ weights_name=weight_name or TEXT_INVERSION_NAME_SAFE,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ subfolder=subfolder,
+ user_agent=user_agent,
+ )
+ state_dict = safetensors.torch.load_file(model_file, device="cpu")
+ except Exception as e:
+ if not allow_pickle:
+ raise e
+
+ model_file = None
+
+ if model_file is None:
model_file = _get_model_file(
pretrained_model_name_or_path,
- weights_name=weight_name or TEXT_INVERSION_NAME_SAFE,
+ weights_name=weight_name or TEXT_INVERSION_NAME,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
@@ -596,88 +651,68 @@ def load_textual_inversion(
subfolder=subfolder,
user_agent=user_agent,
)
- state_dict = safetensors.torch.load_file(model_file, device="cpu")
- except Exception as e:
- if not allow_pickle:
- raise e
+ state_dict = torch.load(model_file, map_location="cpu")
- model_file = None
+ # 2. Load token and embedding correctly from file
+ if isinstance(state_dict, torch.Tensor):
+ if token is None:
+ raise ValueError(
+ "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`."
+ )
+ embedding = state_dict
+ elif len(state_dict) == 1:
+ # diffusers
+ loaded_token, embedding = next(iter(state_dict.items()))
+ elif "string_to_param" in state_dict:
+ # A1111
+ loaded_token = state_dict["name"]
+ embedding = state_dict["string_to_param"]["*"]
+
+ if token is not None and loaded_token != token:
+ logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
+ else:
+ token = loaded_token
- if model_file is None:
- model_file = _get_model_file(
- pretrained_model_name_or_path,
- weights_name=weight_name or TEXT_INVERSION_NAME,
- cache_dir=cache_dir,
- force_download=force_download,
- resume_download=resume_download,
- proxies=proxies,
- local_files_only=local_files_only,
- use_auth_token=use_auth_token,
- revision=revision,
- subfolder=subfolder,
- user_agent=user_agent,
- )
- state_dict = torch.load(model_file, map_location="cpu")
+ embedding = embedding.to(dtype=self.text_encoder.dtype, device=self.text_encoder.device)
- # 2. Load token and embedding correcly from file
- if isinstance(state_dict, torch.Tensor):
- if token is None:
+ # 3. Make sure we don't mess up the tokenizer or text encoder
+ vocab = self.tokenizer.get_vocab()
+ if token in vocab:
raise ValueError(
- "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`."
+ f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder."
)
- embedding = state_dict
- elif len(state_dict) == 1:
- # diffusers
- loaded_token, embedding = next(iter(state_dict.items()))
- elif "string_to_param" in state_dict:
- # A1111
- loaded_token = state_dict["name"]
- embedding = state_dict["string_to_param"]["*"]
-
- if token is not None and loaded_token != token:
- logger.warn(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
- else:
- token = loaded_token
-
- embedding = embedding.to(dtype=self.text_encoder.dtype, device=self.text_encoder.device)
+ elif f"{token}_1" in vocab:
+ multi_vector_tokens = [token]
+ i = 1
+ while f"{token}_{i}" in self.tokenizer.added_tokens_encoder:
+ multi_vector_tokens.append(f"{token}_{i}")
+ i += 1
- # 3. Make sure we don't mess up the tokenizer or text encoder
- vocab = self.tokenizer.get_vocab()
- if token in vocab:
- raise ValueError(
- f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder."
- )
- elif f"{token}_1" in vocab:
- multi_vector_tokens = [token]
- i = 1
- while f"{token}_{i}" in self.tokenizer.added_tokens_encoder:
- multi_vector_tokens.append(f"{token}_{i}")
- i += 1
+ raise ValueError(
+ f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder."
+ )
- raise ValueError(
- f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder."
- )
+ is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
- is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
+ if is_multi_vector:
+ tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
+ embeddings = [e for e in embedding] # noqa: C416
+ else:
+ tokens = [token]
+ embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding]
- if is_multi_vector:
- tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
- embeddings = [e for e in embedding] # noqa: C416
- else:
- tokens = [token]
- embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding]
+ # add tokens and get ids
+ self.tokenizer.add_tokens(tokens)
+ token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+ token_ids_and_embeddings += zip(token_ids, embeddings)
- # add tokens and get ids
- self.tokenizer.add_tokens(tokens)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+ logger.info(f"Loaded textual inversion embedding for {token}.")
- # resize token embeddings and set new embeddings
+ # resize token embeddings and set all new embeddings
self.text_encoder.resize_token_embeddings(len(self.tokenizer))
- for token_id, embedding in zip(token_ids, embeddings):
+ for token_id, embedding in token_ids_and_embeddings:
self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding
- logger.info(f"Loaded textual inversion embedding for {token}.")
-
class LoraLoaderMixin:
r"""
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 168ff8106c52..70b1431d630a 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -575,6 +575,31 @@ def test_text_inversion_download(self):
out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
assert out.shape == (1, 128, 128, 3)
+ # multi embedding load
+ with tempfile.TemporaryDirectory() as tmpdirname1:
+ with tempfile.TemporaryDirectory() as tmpdirname2:
+ ten = {"<*****>": torch.ones((32,))}
+ torch.save(ten, os.path.join(tmpdirname1, "learned_embeds.bin"))
+
+ ten = {"<******>": 2 * torch.ones((1, 32))}
+ torch.save(ten, os.path.join(tmpdirname2, "learned_embeds.bin"))
+
+ pipe.load_textual_inversion([tmpdirname1, tmpdirname2])
+
+ token = pipe.tokenizer.convert_tokens_to_ids("<*****>")
+ assert token == num_tokens + 8, "Added token must be at spot `num_tokens`"
+ assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 32
+ assert pipe._maybe_convert_prompt("<*****>", pipe.tokenizer) == "<*****>"
+
+ token = pipe.tokenizer.convert_tokens_to_ids("<******>")
+ assert token == num_tokens + 9, "Added token must be at spot `num_tokens`"
+ assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64
+ assert pipe._maybe_convert_prompt("<******>", pipe.tokenizer) == "<******>"
+
+ prompt = "hey <*****> <******>"
+ out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
+ assert out.shape == (1, 128, 128, 3)
+
def test_download_ignore_files(self):
# Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4
with tempfile.TemporaryDirectory() as tmpdirname:
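A minimal sketch of the new batched call, not part of the patch. The concepts-library repo id and the local `.pt` file come from the updated docstring; the stable-diffusion checkpoint, the `<my-style>` token, and the prompt are placeholders:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Both embeddings are added in a single call, so the text encoder's token
# embeddings are resized only once.
pipe.load_textual_inversion(
    ["sd-concepts-library/low-poly-hd-logos-icons", "./my_text_inversions.pt"],
    token=[None, "<my-style>"],  # None keeps whatever token the first file stores
)

image = pipe("a logo of a fox, high quality, in <my-style> style").images[0]
```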
From f381402ec851d2e02dadd2f8a433c90dca4a9bfd Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 8 May 2023 10:55:02 +0200
Subject: [PATCH 054/206] make fix-copies
---
.../stable_diffusion/pipeline_stable_diffusion_diffedit.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
index adada63b83f7..e48d8a46423e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
@@ -560,7 +560,7 @@ def _encode_prompt(
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
From 571bc1ea118297fb60e95be5e3e162839381aa48 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Mon, 8 May 2023 12:08:23 -0700
Subject: [PATCH 055/206] [docs] Fix docstring (#3334)
fix docstring
Co-authored-by: Patrick von Platen
---
src/diffusers/pipelines/pipeline_utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 5e4290e8db9f..82bcda54938d 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -531,7 +531,7 @@ def save_pretrained(
"""
Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to
a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading
- method. The pipeline can easily be re-loaded using the `[`~DiffusionPipeline.from_pretrained`]` class method.
+ method. The pipeline can easily be re-loaded using the [`~DiffusionPipeline.from_pretrained`] class method.
Arguments:
save_directory (`str` or `os.PathLike`):
@@ -1087,7 +1087,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
Download and cache a PyTorch diffusion pipeline from pre-trained pipeline weights.
Parameters:
- pretrained_model_name (`str` or `os.PathLike`, *optional*):
+ pretrained_model_name (`str` or `os.PathLike`, *optional*):
Should be a string, the *repo id* of a pretrained pipeline hosted inside a model repo on
https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like
`CompVis/ldm-text2im-large-256`.
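Since the two docstrings touched above describe the save and re-load round trip, here is a minimal sketch of it, not part of the patch; the local directory name is a placeholder:

```python
from diffusers import DiffusionPipeline

# The repo id is the one cited in the docstring above.
pipe = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
pipe.save_pretrained("./ldm-text2im-large-256-local")

# Re-load from the saved directory via the class method the docstring references.
pipe = DiffusionPipeline.from_pretrained("./ldm-text2im-large-256-local")
```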
From a757b2db6ef9b0b462cfee4dcaf697c5c6e0c63d Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Tue, 9 May 2023 10:24:36 -0700
Subject: [PATCH 056/206] if dreambooth lora (#3360)
* update IF stage I pipelines
add fixed variance schedulers and lora loading
* added kv lora attn processor
* allow loading into alternative lora attn processor
* make vae optional
* throw away predicted variance
* allow loading into added kv lora layer
* allow load T5
* allow pre compute text embeddings
* set new variance type in schedulers
* fix copies
* refactor all prompt embedding code
  - class prompts are now included in pre-encoding code
  - max tokenizer length is now configurable
  - embedding attention mask is now configurable
* fix for when variance type is not defined on scheduler
* do not pre compute validation prompt if not present
* add example test for if lora dreambooth
* add check for train text encoder and pre compute text embeddings
---
examples/dreambooth/train_dreambooth_lora.py | 285 +++++++++++++++---
examples/test_examples.py | 35 +++
src/diffusers/loaders.py | 20 +-
src/diffusers/models/attention_processor.py | 68 +++++
.../pipelines/deepfloyd_if/pipeline_if.py | 6 +-
.../deepfloyd_if/pipeline_if_img2img.py | 6 +-
.../deepfloyd_if/pipeline_if_inpainting.py | 6 +-
7 files changed, 382 insertions(+), 44 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 9af81aa5a95d..0bf3333a6209 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
import argparse
+import gc
import hashlib
import itertools
import logging
@@ -30,7 +31,7 @@
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
+from huggingface_hub import create_repo, model_info, upload_folder
from packaging import version
from PIL import Image
from torch.utils.data import Dataset
@@ -48,7 +49,13 @@
UNet2DConditionModel,
)
from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
-from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.models.attention_processor import (
+ AttnAddedKVProcessor,
+ AttnAddedKVProcessor2_0,
+ LoRAAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ SlicedAttnAddedKVProcessor,
+)
from diffusers.optimization import get_scheduler
from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available
@@ -108,6 +115,10 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st
from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
return RobertaSeriesModelWithTransformation
+ elif model_class == "T5EncoderModel":
+ from transformers import T5EncoderModel
+
+ return T5EncoderModel
else:
raise ValueError(f"{model_class} is not supported.")
@@ -387,6 +398,24 @@ def parse_args(input_args=None):
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
+ parser.add_argument(
+ "--pre_compute_text_embeddings",
+ action="store_true",
+ help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+ )
+ parser.add_argument(
+ "--tokenizer_max_length",
+ type=int,
+ default=None,
+ required=False,
+ help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+ )
+ parser.add_argument(
+ "--text_encoder_use_attention_mask",
+ action="store_true",
+ required=False,
+ help="Whether to use attention mask for the text encoder",
+ )
if input_args is not None:
args = parser.parse_args(input_args)
@@ -409,6 +438,9 @@ def parse_args(input_args=None):
if args.class_prompt is not None:
warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+ if args.train_text_encoder and args.pre_compute_text_embeddings:
+ raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
return args
@@ -428,10 +460,16 @@ def __init__(
class_num=None,
size=512,
center_crop=False,
+ encoder_hidden_states=None,
+ instance_prompt_encoder_hidden_states=None,
+ tokenizer_max_length=None,
):
self.size = size
self.center_crop = center_crop
self.tokenizer = tokenizer
+ self.encoder_hidden_states = encoder_hidden_states
+ self.instance_prompt_encoder_hidden_states = instance_prompt_encoder_hidden_states
+ self.tokenizer_max_length = tokenizer_max_length
self.instance_data_root = Path(instance_data_root)
if not self.instance_data_root.exists():
@@ -473,39 +511,50 @@ def __getitem__(self, index):
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
- example["instance_prompt_ids"] = self.tokenizer(
- self.instance_prompt,
- truncation=True,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- return_tensors="pt",
- ).input_ids
+
+ if self.encoder_hidden_states is not None:
+ example["instance_prompt_ids"] = self.encoder_hidden_states
+ else:
+ text_inputs = tokenize_prompt(
+ self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+ )
+ example["instance_prompt_ids"] = text_inputs.input_ids
+ example["instance_attention_mask"] = text_inputs.attention_mask
if self.class_data_root:
class_image = Image.open(self.class_images_path[index % self.num_class_images])
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
- example["class_prompt_ids"] = self.tokenizer(
- self.class_prompt,
- truncation=True,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- return_tensors="pt",
- ).input_ids
+
+ if self.instance_prompt_encoder_hidden_states is not None:
+ example["class_prompt_ids"] = self.instance_prompt_encoder_hidden_states
+ else:
+ class_text_inputs = tokenize_prompt(
+ self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+ )
+ example["class_prompt_ids"] = class_text_inputs.input_ids
+ example["class_attention_mask"] = class_text_inputs.attention_mask
return example
def collate_fn(examples, with_prior_preservation=False):
+ has_attention_mask = "instance_attention_mask" in examples[0]
+
input_ids = [example["instance_prompt_ids"] for example in examples]
pixel_values = [example["instance_images"] for example in examples]
+ if has_attention_mask:
+ attention_mask = [example["instance_attention_mask"] for example in examples]
+
# Concat class and instance examples for prior preservation.
# We do this to avoid doing two forward passes.
if with_prior_preservation:
input_ids += [example["class_prompt_ids"] for example in examples]
pixel_values += [example["class_images"] for example in examples]
+ if has_attention_mask:
+ attention_mask += [example["class_attention_mask"] for example in examples]
pixel_values = torch.stack(pixel_values)
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
@@ -516,6 +565,10 @@ def collate_fn(examples, with_prior_preservation=False):
"input_ids": input_ids,
"pixel_values": pixel_values,
}
+
+ if has_attention_mask:
+ batch["attention_mask"] = attention_mask
+
return batch
@@ -536,6 +589,50 @@ def __getitem__(self, index):
return example
+def model_has_vae(args):
+ config_file_name = os.path.join("vae", AutoencoderKL.config_name)
+ if os.path.isdir(args.pretrained_model_name_or_path):
+ config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
+ return os.path.isfile(config_file_name)
+ else:
+ files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings
+ return any(file.rfilename == config_file_name for file in files_in_repo)
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+ if tokenizer_max_length is not None:
+ max_length = tokenizer_max_length
+ else:
+ max_length = tokenizer.model_max_length
+
+ text_inputs = tokenizer(
+ prompt,
+ truncation=True,
+ padding="max_length",
+ max_length=max_length,
+ return_tensors="pt",
+ )
+
+ return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+ text_input_ids = input_ids.to(text_encoder.device)
+
+ if text_encoder_use_attention_mask:
+ attention_mask = attention_mask.to(text_encoder.device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = text_encoder(
+ text_input_ids,
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ return prompt_embeds
+
+
def main(args):
logging_dir = Path(args.output_dir, args.logging_dir)
@@ -656,13 +753,20 @@ def main(args):
text_encoder = text_encoder_cls.from_pretrained(
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
)
- vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+ if model_has_vae(args):
+ vae = AutoencoderKL.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+ )
+ else:
+ vae = None
+
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
)
# We only train the additional adapter LoRA layers
- vae.requires_grad_(False)
+ if vae is not None:
+ vae.requires_grad_(False)
text_encoder.requires_grad_(False)
unet.requires_grad_(False)
@@ -676,7 +780,8 @@ def main(args):
# Move unet, vae and text_encoder to device and cast to weight_dtype
unet.to(accelerator.device, dtype=weight_dtype)
- vae.to(accelerator.device, dtype=weight_dtype)
+ if vae is not None:
+ vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)
if args.enable_xformers_memory_efficient_attention:
@@ -707,7 +812,7 @@ def main(args):
# Set correct lora layers
unet_lora_attn_procs = {}
- for name in unet.attn_processors.keys():
+ for name, attn_processor in unet.attn_processors.items():
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
@@ -718,7 +823,12 @@ def main(args):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
- unet_lora_attn_procs[name] = LoRAAttnProcessor(
+ if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
+ lora_attn_processor_class = LoRAAttnAddedKVProcessor
+ else:
+ lora_attn_processor_class = LoRAAttnProcessor
+
+ unet_lora_attn_procs[name] = lora_attn_processor_class(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
)
@@ -783,6 +893,44 @@ def main(args):
eps=args.adam_epsilon,
)
+ if args.pre_compute_text_embeddings:
+
+ def compute_text_embeddings(prompt):
+ with torch.no_grad():
+ text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+ prompt_embeds = encode_prompt(
+ text_encoder,
+ text_inputs.input_ids,
+ text_inputs.attention_mask,
+ text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+ )
+
+ return prompt_embeds
+
+ pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+ validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+ if args.validation_prompt is not None:
+ validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+ else:
+ validation_prompt_encoder_hidden_states = None
+
+ if args.instance_prompt is not None:
+ pre_computed_instance_prompt_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+ else:
+ pre_computed_instance_prompt_encoder_hidden_states = None
+
+ text_encoder = None
+ tokenizer = None
+
+ gc.collect()
+ torch.cuda.empty_cache()
+ else:
+ pre_computed_encoder_hidden_states = None
+ validation_prompt_encoder_hidden_states = None
+ validation_prompt_negative_prompt_embeds = None
+ pre_computed_instance_prompt_encoder_hidden_states = None
+
# Dataset and DataLoaders creation:
train_dataset = DreamBoothDataset(
instance_data_root=args.instance_data_dir,
@@ -793,6 +941,9 @@ def main(args):
tokenizer=tokenizer,
size=args.resolution,
center_crop=args.center_crop,
+ encoder_hidden_states=pre_computed_encoder_hidden_states,
+ instance_prompt_encoder_hidden_states=pre_computed_instance_prompt_encoder_hidden_states,
+ tokenizer_max_length=args.tokenizer_max_length,
)
train_dataloader = torch.utils.data.DataLoader(
@@ -896,32 +1047,53 @@ def main(args):
continue
with accelerator.accumulate(unet):
- # Convert images to latent space
- latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
- latents = latents * vae.config.scaling_factor
+ pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+
+ if vae is not None:
+ # Convert images to latent space
+ model_input = vae.encode(pixel_values).latent_dist.sample()
+ model_input = model_input * vae.config.scaling_factor
+ else:
+ model_input = pixel_values
# Sample noise that we'll add to the latents
- noise = torch.randn_like(latents)
- bsz = latents.shape[0]
+ noise = torch.randn_like(model_input)
+ bsz = model_input.shape[0]
# Sample a random timestep for each image
- timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+ timesteps = torch.randint(
+ 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+ )
timesteps = timesteps.long()
- # Add noise to the latents according to the noise magnitude at each timestep
+ # Add noise to the model input according to the noise magnitude at each timestep
# (this is the forward diffusion process)
- noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
# Get the text embedding for conditioning
- encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+ if args.pre_compute_text_embeddings:
+ encoder_hidden_states = batch["input_ids"]
+ else:
+ encoder_hidden_states = encode_prompt(
+ text_encoder,
+ batch["input_ids"],
+ batch["attention_mask"],
+ text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+ )
# Predict the noise residual
- model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+ model_pred = unet(noisy_model_input, timesteps, encoder_hidden_states).sample
+
+ # if model predicts variance, throw away the prediction. we will only train on the
+ # simplified training objective. This means that all schedulers using the fine tuned
+ # model must be configured to use one of the fixed variance types.
+ if model_pred.shape[1] == 6:
+ model_pred, _ = torch.chunk(model_pred, 2, dim=1)
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise, timesteps)
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
@@ -988,19 +1160,40 @@ def main(args):
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=accelerator.unwrap_model(unet),
- text_encoder=accelerator.unwrap_model(text_encoder),
+ text_encoder=None if args.pre_compute_text_embeddings else accelerator.unwrap_model(text_encoder),
revision=args.revision,
torch_dtype=weight_dtype,
)
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+ scheduler_args = {}
+
+ if "variance_type" in pipeline.scheduler.config:
+ variance_type = pipeline.scheduler.config.variance_type
+
+ if variance_type in ["learned", "learned_range"]:
+ variance_type = "fixed_small"
+
+ scheduler_args["variance_type"] = variance_type
+
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+ pipeline.scheduler.config, **scheduler_args
+ )
+
pipeline = pipeline.to(accelerator.device)
pipeline.set_progress_bar_config(disable=True)
# run inference
- generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
+ if args.pre_compute_text_embeddings:
+ pipeline_args = {
+ "prompt_embeds": validation_prompt_encoder_hidden_states,
+ "negative_prompt_embeds": validation_prompt_negative_prompt_embeds,
+ }
+ else:
+ pipeline_args = {"prompt": args.validation_prompt}
images = [
- pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
- for _ in range(args.num_validation_images)
+ pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)
]
for tracker in accelerator.trackers:
@@ -1024,7 +1217,8 @@ def main(args):
accelerator.wait_for_everyone()
if accelerator.is_main_process:
unet = unet.to(torch.float32)
- text_encoder = text_encoder.to(torch.float32)
+ if text_encoder is not None:
+ text_encoder = text_encoder.to(torch.float32)
LoraLoaderMixin.save_lora_weights(
save_directory=args.output_dir,
unet_lora_layers=unet_lora_layers,
@@ -1036,7 +1230,20 @@ def main(args):
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
)
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+ scheduler_args = {}
+
+ if "variance_type" in pipeline.scheduler.config:
+ variance_type = pipeline.scheduler.config.variance_type
+
+ if variance_type in ["learned", "learned_range"]:
+ variance_type = "fixed_small"
+
+ scheduler_args["variance_type"] = variance_type
+
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
pipeline = pipeline.to(accelerator.device)
# load attention processors
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 648c2cb8a1b7..d9e7de717f47 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -292,6 +292,41 @@ def test_dreambooth_lora_with_text_encoder(self):
is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys)
self.assertTrue(is_correct_naming)
+ def test_dreambooth_lora_if_model(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ test_args = f"""
+ examples/dreambooth/train_dreambooth_lora.py
+ --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
+ --instance_data_dir docs/source/en/imgs
+ --instance_prompt photo
+ --resolution 64
+ --train_batch_size 1
+ --gradient_accumulation_steps 1
+ --max_train_steps 2
+ --learning_rate 5.0e-04
+ --scale_lr
+ --lr_scheduler constant
+ --lr_warmup_steps 0
+ --output_dir {tmpdir}
+ --pre_compute_text_embeddings
+ --tokenizer_max_length=77
+ --text_encoder_use_attention_mask
+ """.split()
+
+ run_command(self._launch_args + test_args)
+ # save_pretrained smoke test
+ self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin")))
+
+ # make sure the state_dict has the correct naming in the parameters.
+ lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin"))
+ is_lora = all("lora" in k for k in lora_state_dict.keys())
+ self.assertTrue(is_lora)
+
+ # when not training the text encoder, all the parameters in the state dict should start
+ # with `"unet"` in their names.
+ starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
+ self.assertTrue(starts_with_unet)
+
def test_custom_diffusion(self):
with tempfile.TemporaryDirectory() as tmpdir:
test_args = f"""
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index f41d0ffe72e3..17e24ff2f0b2 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -21,9 +21,13 @@
from huggingface_hub import hf_hub_download
from .models.attention_processor import (
+ AttnAddedKVProcessor,
+ AttnAddedKVProcessor2_0,
CustomDiffusionAttnProcessor,
CustomDiffusionXFormersAttnProcessor,
+ LoRAAttnAddedKVProcessor,
LoRAAttnProcessor,
+ SlicedAttnAddedKVProcessor,
)
from .utils import (
DIFFUSERS_CACHE,
@@ -250,10 +254,22 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
for key, value_dict in lora_grouped_dict.items():
rank = value_dict["to_k_lora.down.weight"].shape[0]
- cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
hidden_size = value_dict["to_k_lora.up.weight"].shape[0]
- attn_processors[key] = LoRAAttnProcessor(
+ attn_processor = self
+ for sub_key in key.split("."):
+ attn_processor = getattr(attn_processor, sub_key)
+
+ if isinstance(
+ attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)
+ ):
+ cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[1]
+ attn_processor_class = LoRAAttnAddedKVProcessor
+ else:
+ cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
+ attn_processor_class = LoRAAttnProcessor
+
+ attn_processors[key] = attn_processor_class(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank
)
attn_processors[key].load_state_dict(value_dict)
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 7ac88b17999a..6701122fc13b 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -669,6 +669,73 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
return hidden_states
+class LoRAAttnAddedKVProcessor(nn.Module):
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
+ super().__init__()
+
+ self.hidden_size = hidden_size
+ self.cross_attention_dim = cross_attention_dim
+ self.rank = rank
+
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+ self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+ self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+ self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+ self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
+ residual = hidden_states
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+ batch_size, sequence_length, _ = hidden_states.shape
+
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+ query = attn.head_to_batch_dim(query)
+
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
+ encoder_hidden_states
+ )
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
+ encoder_hidden_states
+ )
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+ if not attn.only_cross_attention:
+ key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
+ value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
+ key = attn.head_to_batch_dim(key)
+ value = attn.head_to_batch_dim(value)
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+ else:
+ key = encoder_hidden_states_key_proj
+ value = encoder_hidden_states_value_proj
+
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
+ hidden_states = torch.bmm(attention_probs, value)
+ hidden_states = attn.batch_to_head_dim(hidden_states)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+ hidden_states = hidden_states + residual
+
+ return hidden_states
+
+
class XFormersAttnProcessor:
def __init__(self, attention_op: Optional[Callable] = None):
self.attention_op = attention_op
@@ -1022,6 +1089,7 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None,
AttnAddedKVProcessor2_0,
LoRAAttnProcessor,
LoRAXFormersAttnProcessor,
+ LoRAAttnAddedKVProcessor,
CustomDiffusionAttnProcessor,
CustomDiffusionXFormersAttnProcessor,
]
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
index 448389b9f1f6..cd1015dc03bb 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py
@@ -7,6 +7,7 @@
import torch
from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from ...loaders import LoraLoaderMixin
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
from ...utils import (
@@ -85,7 +86,7 @@
"""
-class IFPipeline(DiffusionPipeline):
+class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
tokenizer: T5Tokenizer
text_encoder: T5EncoderModel
@@ -804,6 +805,9 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+ if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+ noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
index 231ee02b1bb8..6bae2071173b 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py
@@ -9,6 +9,7 @@
import torch
from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from ...loaders import LoraLoaderMixin
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
from ...utils import (
@@ -109,7 +110,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
"""
-class IFImg2ImgPipeline(DiffusionPipeline):
+class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
tokenizer: T5Tokenizer
text_encoder: T5EncoderModel
@@ -929,6 +930,9 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+ if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+ noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
# compute the previous noisy sample x_t -> x_t-1
intermediate_images = self.scheduler.step(
noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
index 6986387ca995..9c1f71126ac5 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py
@@ -9,6 +9,7 @@
import torch
from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from ...loaders import LoraLoaderMixin
from ...models import UNet2DConditionModel
from ...schedulers import DDPMScheduler
from ...utils import (
@@ -112,7 +113,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
"""
-class IFInpaintingPipeline(DiffusionPipeline):
+class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
tokenizer: T5Tokenizer
text_encoder: T5EncoderModel
@@ -1044,6 +1045,9 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+ if self.scheduler.config.variance_type not in ["learned", "learned_range"]:
+ noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1)
+
# compute the previous noisy sample x_t -> x_t-1
prev_intermediate_images = intermediate_images
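
[Editor's note — not part of the patch] Since the three IF pipelines above now inherit from `LoraLoaderMixin`, LoRA weights can be attached the same way as for the Stable Diffusion pipelines. A hedged usage sketch — the checkpoint path is a placeholder, and the IF repo requires accepting its license:

```python
import torch
from diffusers import IFPipeline

pipe = IFPipeline.from_pretrained(
    "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.load_lora_weights("path/to/if_lora")  # provided by LoraLoaderMixin
pipe.enable_model_cpu_offload()

image = pipe("a photo of a corgi wearing a top hat", num_inference_steps=50).images[0]
```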
From c5594795929c9c0274ae4a72cbffb2e03d128efe Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Tue, 9 May 2023 11:28:30 -1000
Subject: [PATCH 057/206] Postprocessing refactor all others (#3337)
* add text2img
* fix-copies
* add
* add all other pipelines
* add
* add
* add
* add
* add
* make style
* style + fix copies
---------
Co-authored-by: yiyixuxu
---
.../alt_diffusion/pipeline_alt_diffusion.py | 43 +++++++++++-------
.../pipeline_paint_by_example.py | 38 +++++++++++-----
.../pipeline_semantic_stable_diffusion.py | 44 ++++++++++++++-----
.../pipeline_cycle_diffusion.py | 38 +++++++++++-----
.../pipeline_stable_diffusion.py | 41 ++++++++++-------
...line_stable_diffusion_attend_and_excite.py | 37 +++++++++++-----
.../pipeline_stable_diffusion_controlnet.py | 41 ++++++++++-------
.../pipeline_stable_diffusion_depth2img.py | 30 +++++++++----
.../pipeline_stable_diffusion_diffedit.py | 42 ++++++++++++------
...peline_stable_diffusion_image_variation.py | 38 +++++++++++-----
.../pipeline_stable_diffusion_inpaint.py | 38 +++++++++++-----
...ipeline_stable_diffusion_inpaint_legacy.py | 38 +++++++++++-----
...eline_stable_diffusion_instruct_pix2pix.py | 38 +++++++++++-----
.../pipeline_stable_diffusion_k_diffusion.py | 38 +++++++++++-----
...ipeline_stable_diffusion_latent_upscale.py | 19 +++++---
...pipeline_stable_diffusion_model_editing.py | 41 ++++++++++-------
.../pipeline_stable_diffusion_panorama.py | 38 +++++++++++-----
.../pipeline_stable_diffusion_pix2pix_zero.py | 44 +++++++++++++------
.../pipeline_stable_diffusion_sag.py | 38 +++++++++++-----
.../pipeline_stable_diffusion_upscale.py | 6 +++
.../pipeline_stable_unclip.py | 20 ++++++---
.../pipeline_stable_unclip_img2img.py | 19 +++++---
.../pipeline_stable_diffusion_safe.py | 5 +++
...ipeline_versatile_diffusion_dual_guided.py | 18 +++++---
...ine_versatile_diffusion_image_variation.py | 18 +++++---
...eline_versatile_diffusion_text_to_image.py | 18 +++++---
.../altdiffusion/test_alt_diffusion.py | 7 +--
.../test_alt_diffusion_img2img.py | 3 +-
.../paint_by_example/test_paint_by_example.py | 1 +
.../stable_diffusion/test_cycle_diffusion.py | 5 ++-
.../stable_diffusion/test_stable_diffusion.py | 7 +--
.../test_stable_diffusion_controlnet.py | 5 ++-
.../test_stable_diffusion_image_variation.py | 9 +++-
.../test_stable_diffusion_inpaint.py | 7 ++-
...st_stable_diffusion_instruction_pix2pix.py | 9 +++-
.../test_stable_diffusion_model_editing.py | 7 +--
.../test_stable_diffusion_panorama.py | 7 +--
.../test_stable_diffusion_pix2pix_zero.py | 7 ++-
.../test_stable_diffusion_sag.py | 7 +--
.../test_stable_diffusion.py | 7 +--
...test_stable_diffusion_attend_and_excite.py | 9 ++--
.../test_stable_diffusion_depth.py | 9 ++--
.../test_stable_diffusion_diffedit.py | 7 ++-
.../test_stable_diffusion_inpaint.py | 7 ++-
.../test_stable_diffusion_latent_upscale.py | 8 +++-
.../stable_unclip/test_stable_unclip.py | 7 +--
.../test_stable_unclip_img2img.py | 6 ++-
tests/pipelines/test_pipelines_common.py | 2 +-
48 files changed, 669 insertions(+), 302 deletions(-)
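
[Editor's note — not part of the patch] Every pipeline touched by this commit follows the same post-processing shape: decode the latents with the VAE, run the safety checker, then hand the tensor to `VaeImageProcessor.postprocess` with a per-image `do_denormalize` mask so images the safety checker already replaced are not shifted again by denormalization. A minimal standalone sketch with dummy tensors standing in for the real VAE and safety-checker outputs:

```python
import torch
from diffusers.image_processor import VaeImageProcessor

vae_scale_factor = 8  # 2 ** (len(vae.config.block_out_channels) - 1) for SD VAEs
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

# Stand-ins for `vae.decode(...)` output and the safety-checker result.
image = torch.rand(2, 3, 64, 64) * 2 - 1   # decoded images in [-1, 1]
has_nsfw_concept = [False, True]

if has_nsfw_concept is None:
    do_denormalize = [True] * image.shape[0]
else:
    do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

pil_images = image_processor.postprocess(image, output_type="pil", do_denormalize=do_denormalize)
print(len(pil_images), pil_images[0].size)
```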
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index d8bae0a8df8a..8507684cf9b4 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import torch
@@ -22,6 +23,7 @@
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -174,6 +176,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_vae_slicing(self):
@@ -426,16 +429,27 @@ def _encode_prompt(
return prompt_embeds
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
def decode_latents(self, latents):
+ warnings.warn(
+ (
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead"
+ ),
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -700,24 +714,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
- elif output_type == "pil":
- # 8. Post-processing
- image = self.decode_latents(latents)
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
- # 10. Convert to PIL
- image = self.numpy_to_pil(image)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- # 8. Post-processing
- image = self.decode_latents(latents)
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index d6c069bbb7d0..24b05f36f913 100644
--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -22,6 +23,7 @@
from diffusers.utils import is_accelerate_available
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ...utils import logging, randn_tensor
@@ -184,6 +186,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_sequential_cpu_offload(self, gpu_id=0):
@@ -226,13 +229,17 @@ def _execution_device(self):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -255,6 +262,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -560,15 +572,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 11. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 12. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 13. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index fbe436ec9666..e3fe20e196d8 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -1,10 +1,12 @@
import inspect
+import warnings
from itertools import repeat
from typing import Callable, List, Optional, Union
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
from ...pipeline_utils import DiffusionPipeline
from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -129,10 +131,31 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ return image, has_nsfw_concept
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -681,20 +704,19 @@ def __call__(
callback(i, t, latents)
# 8. Post-processing
- image = self.decode_latents(latents)
-
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
- self.device
- )
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
- )
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
else:
+ image = latents
has_nsfw_concept = None
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index f7748e6d6380..8babc6ab0d11 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -24,6 +25,7 @@
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import DDIMScheduler
@@ -220,6 +222,8 @@ def __init__(
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
@@ -504,17 +508,26 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -770,14 +783,19 @@ def __call__(
callback(i, t, latents)
# 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 11. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 054dba150e63..170002b2514e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import torch
@@ -20,6 +21,7 @@
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -177,6 +179,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_vae_slicing(self):
@@ -429,16 +432,25 @@ def _encode_prompt(
return prompt_embeds
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -703,24 +715,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
- elif output_type == "pil":
- # 8. Post-processing
- image = self.decode_latents(latents)
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
- # 10. Convert to PIL
- image = self.numpy_to_pil(image)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- # 8. Post-processing
- image = self.decode_latents(latents)
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
index 377795090b66..64e8577438ea 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
@@ -14,6 +14,7 @@
import inspect
import math
+import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -21,6 +22,7 @@
from torch.nn import functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import Attention
@@ -228,6 +230,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
@@ -442,17 +445,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -972,14 +984,19 @@ def __call__(
callback(i, t, latents)
# 8. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index db41f22ab4b7..00030a6acd89 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -15,6 +15,7 @@
import inspect
import os
+import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -24,6 +25,7 @@
from torch import nn
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
from ...models.controlnet import ControlNetOutput
@@ -230,6 +232,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
@@ -485,17 +488,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -1061,24 +1073,19 @@ def __call__(
self.controlnet.to("cpu")
torch.cuda.empty_cache()
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
- elif output_type == "pil":
- # 8. Post-processing
- image = self.decode_latents(latents)
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
- # 10. Convert to PIL
- image = self.numpy_to_pil(image)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- # 8. Post-processing
- image = self.decode_latents(latents)
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index 3167881db3db..a5b2a9987fa1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -14,6 +14,7 @@
import contextlib
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -23,6 +24,7 @@
from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -128,6 +130,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
@@ -314,17 +317,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -695,12 +707,12 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 10. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
- # 11. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
index e48d8a46423e..e4fc08b79cfd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union
@@ -23,6 +24,7 @@
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
@@ -357,6 +359,7 @@ def __init__(
inverse_scheduler=inverse_scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
@@ -618,13 +621,17 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -647,6 +654,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -1052,7 +1064,7 @@ def generate_mask(
# 9. Convert to Numpy array or PIL.
if output_type == "pil":
- mask_image = self.numpy_to_pil(mask_image)
+ mask_image = self.image_processor.numpy_to_pil(mask_image)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
@@ -1287,7 +1299,7 @@ def invert(
# 9. Convert to PIL.
if decode_latents and output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.numpy_to_pil(image)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
@@ -1510,15 +1522,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 11. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 2dc762d62529..640fd7f2d94b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import PIL
@@ -21,6 +22,7 @@
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, is_accelerate_available, logging, randn_tensor
@@ -118,6 +120,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_sequential_cpu_offload(self, gpu_id=0):
@@ -183,17 +186,26 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -398,15 +410,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 8. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 266648ce7613..7068408b9dcf 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -22,6 +23,7 @@
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -270,6 +272,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
@@ -495,13 +498,17 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -524,6 +531,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -896,15 +908,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 11. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 12. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 13. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 3256ff2b831f..5a2329a5c51f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -22,6 +23,7 @@
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -209,6 +211,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
@@ -434,17 +437,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -720,15 +732,19 @@ def __call__(
# use original latents corresponding to unmasked portions of the image
latents = (init_latents_orig * mask) + (latents * (1 - mask))
- # 10. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 11. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 12. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index b9dd3aa24b11..65ef5617fc68 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -20,6 +21,7 @@
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -136,6 +138,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
@torch.no_grad()
@@ -386,15 +389,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 10. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 11. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 12. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
@@ -628,13 +635,17 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -657,6 +668,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 2a6e7edc1351..39601ac36c33 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -13,12 +13,14 @@
# limitations under the License.
import importlib
+import warnings
from typing import Callable, List, Optional, Union
import torch
from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
from k_diffusion.sampling import get_sigmas_karras
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...pipelines import DiffusionPipeline
from ...schedulers import LMSDiscreteScheduler
@@ -111,6 +113,7 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
model = ModelWrapper(unet, scheduler.alphas_cumprod)
if scheduler.config.prediction_type == "v_prediction":
@@ -346,17 +349,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -590,15 +602,19 @@ def model_fn(x, t):
# 8. Run k-diffusion solver
latents = self.sampler(model_fn, latents, sigmas)
- # 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 10. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 11. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
index fcda8d526c99..664d58dc812f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -20,6 +21,7 @@
import torch.nn.functional as F
from transformers import CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import EulerDiscreteScheduler
from ...utils import is_accelerate_available, logging, randn_tensor
@@ -91,6 +93,8 @@ def __init__(
unet=unet,
scheduler=scheduler,
)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
@@ -220,6 +224,11 @@ def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_p
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -505,12 +514,12 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 10. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
- # 11. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
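With output_type="latent" now short-circuiting the VAE decode, the latent upscaler composes directly with a base pipeline. A hedged usage sketch (the model ids and the CUDA device are assumptions, not prescribed by this patch):

import torch
from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
).to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
low_res_latents = pipe(prompt, output_type="latent").images  # stay in latent space
image = upscaler(prompt=prompt, image=low_res_latents, num_inference_steps=20, guidance_scale=0).images[0]
image.save("astronaut_2x.png")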
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
index 3fe526418b4f..7bc6b466b46f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
@@ -13,11 +13,13 @@
import copy
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import PNDMScheduler
@@ -129,6 +131,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.with_to_k = with_to_k
@@ -373,17 +376,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -767,24 +779,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- if output_type == "latent":
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
image = latents
has_nsfw_concept = None
- elif output_type == "pil":
- # 8. Post-processing
- image = self.decode_latents(latents)
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
- # 10. Convert to PIL
- image = self.numpy_to_pil(image)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
else:
- # 8. Post-processing
- image = self.decode_latents(latents)
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index 5d69d2071801..22c22b56c7ee 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -12,11 +12,13 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import DDIMScheduler, PNDMScheduler
@@ -123,6 +125,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
@@ -337,17 +340,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -659,15 +671,19 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 8. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
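Because postprocess handles pt, np, and pil uniformly, callers should now be able to request tensors directly from the panorama pipeline. A hedged usage sketch (the model id is the one used in the pipeline docs and, like the CUDA device, is an assumption of this sketch):

import torch
from diffusers import DDIMScheduler, StableDiffusionPanoramaPipeline

model_id = "stabilityai/stable-diffusion-2-base"
scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16).to("cuda")

out = pipe("a photo of the dolomites", height=512, width=1024, output_type="pt")
print(out.images.shape)  # expected: torch.Size([1, 3, 512, 1024]), float values in [0, 1]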
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index c6e0a7620f77..3b7c6dc6b513 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union
@@ -28,6 +29,7 @@
CLIPTokenizer,
)
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import Attention
@@ -358,6 +360,7 @@ def __init__(
inverse_scheduler=inverse_scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
@@ -578,17 +581,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -1045,24 +1057,28 @@ def __call__(
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
- # 11. Post-process the latents.
- edited_image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 12. Run the safety checker.
- edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 13. Convert to PIL.
- if output_type == "pil":
- edited_image = self.numpy_to_pil(edited_image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
if not return_dict:
- return (edited_image, has_nsfw_concept)
+ return (image, has_nsfw_concept)
- return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@torch.no_grad()
@replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
@@ -1259,7 +1275,7 @@ def invert(
# 9. Convert to PIL.
if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.numpy_to_pil(image)
if not return_dict:
return (inverted_latents, image)
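The invert path keeps returning numpy images; only the PIL conversion moves onto the shared image processor. A minimal sketch of that conversion (array contents and the scale factor are illustrative):

import numpy as np
from diffusers.image_processor import VaeImageProcessor

image_processor = VaeImageProcessor(vae_scale_factor=8)

# A batch of HxWx3 float images in [0, 1], as the invert path produces them.
images_np = np.random.rand(2, 64, 64, 3).astype(np.float32)
images_pil = image_processor.numpy_to_pil(images_np)  # -> list of PIL.Image.Image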
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
index 9199eda37e04..db3c148f04e5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -13,12 +13,14 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import torch
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
@@ -140,6 +142,7 @@ def __init__(
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
@@ -354,17 +357,26 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is not None:
- safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
- else:
- has_nsfw_concept = None
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -682,15 +694,19 @@ def get_map_size(module, input, output):
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 8. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
- # 9. Run safety checker
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index de73a92b07d3..b7530ac4ec5c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, List, Optional, Union
import numpy as np
@@ -372,6 +373,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
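decode_latents stays only as a FutureWarning shim; the supported path decodes through the VAE and hands the result to VaeImageProcessor.postprocess. A sketch of the equivalent call, assuming a hypothetical `pipe` that exposes both `vae` and the newly added `image_processor`:

import torch

def decode_with_image_processor(pipe, latents, output_type="np"):
    # Equivalent to the deprecated pipe.decode_latents(latents), but returns whatever
    # representation the caller asks for instead of always a numpy array.
    with torch.no_grad():
        image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
    return pipe.image_processor.postprocess(image, output_type=output_type)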
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index 1867acdf0859..fb907f49553c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -13,12 +13,14 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
from ...models.embeddings import get_timestep_embedding
@@ -136,6 +138,7 @@ def __init__(
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
@@ -474,6 +477,11 @@ def _encode_prompt(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -918,17 +926,17 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 14. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
+
+ image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
- # 15. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
-
if not return_dict:
return (image,)
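Every pipeline in this patch builds its processor the same way: the scale factor is derived from the number of VAE blocks. A standalone sketch using a tiny, made-up VAE config:

from diffusers import AutoencoderKL
from diffusers.image_processor import VaeImageProcessor

vae = AutoencoderKL(
    block_out_channels=(32, 64),
    down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
    up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
    latent_channels=4,
)
# Two blocks -> one spatial downsample -> decoded images are 2x the latent resolution.
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
assert vae_scale_factor == 2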
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index 705eb50795e0..44916049e29f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import PIL
@@ -21,6 +22,7 @@
from diffusers.utils.import_utils import is_accelerate_available
+from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.embeddings import get_timestep_embedding
@@ -138,6 +140,7 @@ def __init__(
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
@@ -429,6 +432,11 @@ def _encode_image(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -814,16 +822,17 @@ def __call__(
callback(i, t, latents)
# 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
+
+ image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
-
if not return_dict:
return (image,)
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index f4f7eefcd07a..d770ee290517 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -363,6 +363,11 @@ def run_safety_checker(self, image, device, dtype, enable_safety_guidance):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index 2827ed4a7378..1d2e61d86b90 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Tuple, Union
import numpy as np
@@ -26,6 +27,7 @@
CLIPVisionModelWithProjection,
)
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_accelerate_available, logging, randn_tensor
@@ -88,6 +90,7 @@ def __init__(
scheduler=scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
if self.text_unet is not None and (
"dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention
@@ -329,6 +332,11 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -572,12 +580,12 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 46eee27bcbfc..4450846300fc 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -13,6 +13,7 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import numpy as np
@@ -21,6 +22,7 @@
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_accelerate_available, logging, randn_tensor
@@ -71,6 +73,7 @@ def __init__(
scheduler=scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
@@ -189,6 +192,11 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -414,12 +422,12 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 8. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
- # 9. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index cd5dd70a2cdc..1fdb21f2b745 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -13,12 +13,14 @@
# limitations under the License.
import inspect
+import warnings
from typing import Callable, List, Optional, Union
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer
+from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_accelerate_available, logging, randn_tensor
@@ -76,6 +78,7 @@ def __init__(
scheduler=scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
if self.text_unet is not None:
self._swap_unet_attention_blocks()
@@ -246,6 +249,11 @@ def normalize_embeddings(encoder_output):
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
@@ -488,12 +496,12 @@ def __call__(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
- # 9. Post-processing
- image = self.decode_latents(latents)
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ else:
+ image = latents
- # 10. Convert to PIL
- if output_type == "pil":
- image = self.numpy_to_pil(image)
+ image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
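The VersatileDiffusion hunks have no safety-checker step, so the new ending reduces to a decode followed by postprocess with its defaults. The same flow as a standalone helper (parameter names are illustrative):

def finalize(vae, image_processor, latents, output_type="pil"):
    # Without NSFW flags every image is denormalized by postprocess' defaults.
    if output_type != "latent":
        image = vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]
    else:
        image = latents
    return image_processor.postprocess(image, output_type=output_type)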
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index 4d19621f0c2c..60eb17e76c0a 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -28,17 +28,18 @@
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = AltDiffusionPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
index 144107ec1c97..1f96d8954156 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -123,6 +123,7 @@ def test_stable_diffusion_img2img_default_case(self):
tokenizer.model_max_length = 77
init_image = self.dummy_image.to(device)
+ init_image = init_image / 2 + 0.5
# make sure here that pndm scheduler skips prk
alt_pipe = AltDiffusionImg2ImgPipeline(
@@ -134,7 +135,7 @@ def test_stable_diffusion_img2img_default_case(self):
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
- alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
+ alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=True)
alt_pipe = alt_pipe.to(device)
alt_pipe.set_progress_bar_config(disable=None)
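The img2img fast test now hands the processor an image in [0, 1] and lets do_normalize map it to the [-1, 1] range the VAE encoder expects, which is the convention this patch relies on. A tiny sketch of that convention (shapes and the scale factor are illustrative):

import torch
from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8, do_normalize=True)

init_image = torch.rand(1, 3, 64, 64)           # values in [0, 1], like init_image / 2 + 0.5 above
model_input = processor.preprocess(init_image)  # mapped to [-1, 1] for the VAE encoder
print(model_input.min().item(), model_input.max().item())  # approximately -1.0 and 1.0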
diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py
index 17feba59e8e4..bb798ff729bf 100644
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -38,6 +38,7 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = PaintByExamplePipeline
params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+ image_params = frozenset([]) # TO_DO: update image_params once the pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 05b72ab6a0fd..52d3b03e5220 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -26,13 +26,13 @@
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = CycleDiffusionPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
"negative_prompt",
@@ -42,6 +42,7 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"})
+ image_params = frozenset([]) # TO_DO: add image_params once the pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 13b4d0dba827..ddbf9f45f274 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -42,17 +42,18 @@
from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
from ...models.test_models_unet_2d_condition import create_lora_layers
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index 279df4a32b29..765b2393c105 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -35,13 +35,14 @@
from diffusers.utils.testing_utils import require_torch_gpu
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = frozenset([]) # TO_DO: add image_params once the pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 3bfa5810428a..fbdfc75faa84 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -33,16 +33,21 @@
from diffusers.utils.testing_utils import require_torch_gpu
from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionImageVariationPipelineFastTests(
+ PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
pipeline_class = StableDiffusionImageVariationPipeline
params = IMAGE_VARIATION_PARAMS
batch_params = IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 20977c346ecc..7a8e2ee05ad4 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -36,16 +36,19 @@
from diffusers.utils.testing_utils import require_torch_gpu
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 8915f524d972..08dc1b2844dc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -35,16 +35,21 @@
from diffusers.utils.testing_utils import require_torch_gpu
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionInstructPix2PixPipelineFastTests(
+ PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
pipeline_class = StableDiffusionInstructPix2PixPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
index bafad63ec2db..b1bed4b3cf25 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
@@ -31,18 +31,19 @@
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
@skip_mps
-class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionModelEditingPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionModelEditingPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 3ead4fe55bab..82e42b095f5d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -32,18 +32,19 @@
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
@skip_mps
-class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPanoramaPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
index 661926daaa3e..af64a23c4003 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
@@ -36,17 +36,20 @@
from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
@skip_mps
-class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionPix2PixZeroPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPix2PixZeroPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
@classmethod
def setUpClass(cls):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
index 73859bdbf7d8..ad0d50df3ce5 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -29,17 +29,18 @@
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionSAGPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
test_cpu_offload = False
def get_dummy_components(self):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index 623dbde99469..be807b5c0c33 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -35,17 +35,18 @@
from diffusers.utils import load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusion2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 846e251f3ce2..60cf9c7982e9 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -29,16 +29,19 @@
from diffusers.utils import load_numpy, skip_mps, slow
from diffusers.utils.testing_utils import require_torch_gpu
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
@skip_mps
-class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionAttendAndExcitePipelineFastTests(
+ PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
pipeline_class = StableDiffusionAttendAndExcitePipeline
test_attention_slicing = False
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index 7a5e02a42af4..7b63583eef77 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -52,19 +52,22 @@
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
@skip_mps
-class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionDepth2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionDepth2ImgPipeline
test_save_load_optional_components = False
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
@@ -132,7 +135,7 @@ def get_dummy_components(self):
backbone_config=backbone_config,
backbone_featmap_shape=[1, 384, 24, 24],
)
- depth_estimator = DPTForDepthEstimation(depth_estimator_config)
+ depth_estimator = DPTForDepthEstimation(depth_estimator_config).eval()
feature_extractor = DPTFeatureExtractor.from_pretrained(
"hf-internal-testing/tiny-random-DPTForDepthEstimation"
)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index c20bc3b47d7b..bd9ce25bdbac 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -34,16 +34,19 @@
from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionDiffEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionDiffEditPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionDiffEditPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"}
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"}
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index 2fa8b9045f43..843a6146dac9 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -27,16 +27,19 @@
from diffusers.utils.testing_utils import require_torch_gpu, slow
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index aff1c1cdbde9..70277d6283e8 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -32,13 +32,13 @@
from diffusers.utils.testing_utils import require_torch_gpu
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
-class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionLatentUpscalePipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
"height",
@@ -49,6 +49,10 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittes
}
required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+
test_cpu_offload = True
@property
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py
index 891323d22fe0..b0e65692e8b5 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -15,14 +15,15 @@
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference
-class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableUnCLIPPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
# TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false
test_xformers_attention = False
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index 69e3225ced52..450e0af8dcdc 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -29,15 +29,19 @@
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import (
+ PipelineLatentTesterMixin,
PipelineTesterMixin,
assert_mean_pixel_difference,
)
-class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableUnCLIPImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset(
+ []
+ ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
embedder_hidden_size = 32
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index aedda7bae026..4a51e997f93a 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -79,7 +79,7 @@ def test_pt_np_pil_outputs_equivalent(self):
self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generate different results from `output_type=='np'`")
max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
- self.assertLess(max_diff, 1e-4, "`output_type=='pil'` generate different results from `output_type=='np'`")
+ self.assertLess(max_diff, 2.0, "`output_type=='pil'` generate different results from `output_type=='np'`")
def test_pt_np_pil_inputs_equivalent(self):
if len(self.image_params) == 0:
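The relaxed PIL tolerance accounts for uint8 quantization: the PIL and numpy outputs come from separate postprocess calls, and a float difference far below 1e-4 can still flip the rounded 8-bit value. A two-line illustration:

import numpy as np

# Two outputs that agree up to float noise can land on different uint8 values once
# scaled by 255 and rounded, so 1e-4 is an impossible bound for the PIL-vs-numpy
# comparison while 2.0 comfortably covers a one-count difference.
a = np.float32(0.4999999)  # e.g. the numpy output path
b = np.float32(0.5000001)  # e.g. the value that ended up in the PIL image
print((a * 255).round(), (b * 255).round())  # 127.0 vs 128.0 -> |diff| == 1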
From 26832aa5ef900e9750cde6ad7e91c43de80c1c46 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Tue, 9 May 2023 16:15:05 -0700
Subject: [PATCH 058/206] [docs] Improve safetensors docstring (#3368)
* clarify safetensor docstring
* fix typo
* apply feedback
---
src/diffusers/loaders.py | 8 ++++----
src/diffusers/models/modeling_utils.py | 8 ++++----
src/diffusers/pipelines/pipeline_utils.py | 8 ++++----
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 17e24ff2f0b2..7513fa2732ba 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -1221,10 +1221,10 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- use_safetensors (`bool`, *optional* ):
- If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the
- default). The pipeline will load using `safetensors` if the safetensors weights are available *and* if
- `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`.
+ use_safetensors (`bool`, *optional*, defaults to `None`):
+ If set to `None`, the pipeline will load the `safetensors` weights if they're available **and** if the
+ `safetensors` library is installed. If set to `True`, the pipeline will forcibly load the models from
+ `safetensors` weights. If set to `False`, the pipeline will *not* use `safetensors`.
extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults
to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 6644042077d2..ef14ec3d09ef 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -406,10 +406,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
variant (`str`, *optional*):
If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is
ignored when using `from_flax`.
- use_safetensors (`bool`, *optional* ):
- If set to `True`, the pipeline will forcibly load the models from `safetensors` weights. If set to
- `None` (the default). The pipeline will load using `safetensors` if safetensors weights are available
- *and* if `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`.
+ use_safetensors (`bool`, *optional*, defaults to `None`):
+ If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
+ `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
+ `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 82bcda54938d..9288248d309b 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -814,10 +814,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
setting this argument to `True` will raise an error.
- use_safetensors (`bool`, *optional* ):
- If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the
- default). The pipeline will load using `safetensors` if the safetensors weights are available *and* if
- `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`.
+ use_safetensors (`bool`, *optional*, defaults to `None`):
+ If set to `None`, the pipeline will load the `safetensors` weights if they're available **and** if the
+ `safetensors` library is installed. If set to `True`, the pipeline will forcibly load the models from
+ `safetensors` weights. If set to `False`, the pipeline will *not* use `safetensors`.
kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
specific pipeline class. The overwritten components are then directly passed to the pipelines
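A minimal usage sketch of the clarified `use_safetensors` behavior (the model id is only an example; any repository that ships both `.bin` and `.safetensors` weights behaves the same):

```py
from diffusers import DiffusionPipeline

# use_safetensors=None (the default): safetensors weights are loaded if the files
# exist in the repository and the `safetensors` library is installed.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# use_safetensors=True: force loading from safetensors weights; this errors out
# if no safetensors weights are available.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)

# use_safetensors=False: never use safetensors; fall back to the pickled `.bin` weights.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=False)
```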
From 94a0c644a8ce5b05a969859e0814ef4883ac870e Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 10 May 2023 07:22:04 +0530
Subject: [PATCH 059/206] add: a warning message when using xformers in a PT
2.0 env. (#3365)
* add: a warning message when using xformers in a PT 2.0 env.
* Apply suggestions from code review
Co-authored-by: Patrick von Platen
---------
Co-authored-by: Patrick von Platen
---
src/diffusers/models/attention_processor.py | 29 ++++++++++++++++++---
1 file changed, 25 insertions(+), 4 deletions(-)
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 6701122fc13b..b727c76e2137 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import warnings
from typing import Callable, Optional, Union
import torch
@@ -72,7 +73,8 @@ def __init__(
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
- self.scale = dim_head**-0.5 if scale_qk else 1.0
+ self.scale_qk = scale_qk
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
self.heads = heads
# for slice_size > 0 the attention score computation
@@ -140,7 +142,7 @@ def __init__(
# but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
if processor is None:
processor = (
- AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and scale_qk else AttnProcessor()
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
)
self.set_processor(processor)
@@ -176,6 +178,11 @@ def set_use_memory_efficient_attention_xformers(
"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
" only available for GPU "
)
+ elif hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
+ warnings.warn(
+ "You have specified to use flash attention with xFormers, but you have PyTorch 2.0 already installed. "
+ "We will default to PyTorch's native efficient flash attention implementation provided by PyTorch 2.0."
+ )
else:
try:
# Make sure we can run the memory efficient attention
@@ -229,7 +236,15 @@ def set_use_memory_efficient_attention_xformers(
if hasattr(self.processor, "to_k_custom_diffusion"):
processor.to(self.processor.to_k_custom_diffusion.weight.device)
else:
- processor = AttnProcessor()
+ # set attention processor
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ processor = (
+ AttnProcessor2_0()
+ if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
+ else AttnProcessor()
+ )
self.set_processor(processor)
@@ -244,7 +259,13 @@ def set_attention_slice(self, slice_size):
elif self.added_kv_proj_dim is not None:
processor = AttnAddedKVProcessor()
else:
- processor = AttnProcessor()
+ # set attention processor
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ processor = (
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+ )
self.set_processor(processor)
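A rough illustration of when the new warning fires, assuming a CUDA machine with PyTorch 2.0 installed (exact behavior depends on the installed torch/xformers versions):

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.to("cuda")

# On PyTorch 2.0+ the attention modules already default to
# torch.nn.functional.scaled_dot_product_attention, so this call now emits the
# warning added above, pointing users to the native flash attention path.
pipe.enable_xformers_memory_efficient_attention()
```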
From edb087a21788f7792959f37f46008246c2bdf3f9 Mon Sep 17 00:00:00 2001
From: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com>
Date: Wed, 10 May 2023 19:14:25 +0100
Subject: [PATCH 060/206] StableDiffusionInpaintingPipeline - resize image
w.r.t height and width (#3322)
* StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t. the passed input height and width. The default is already set to 512. This addresses the common tensor mismatch error. Also moved the type check into the relevant function to keep the main pipeline body tidy.
* Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests
Due to the previous commit, these tests were failing because height and width now need to be passed into the prepare_mask_and_masked_image function. I have updated the code and added height/width variables to each unit test, as that seemed more appropriate than the previous hard-coded values.
* Added a resolution test to StableDiffusionInpaintPipelineSlowTests
This unit test simply gets the inputs and resizes them to a size that would otherwise fail (e.g. one that would throw a tensor mismatch error because it is not a multiple of 8), then passes them through the pipeline and verifies it produces output with the correct dimensions w.r.t. the passed height and width.
---------
Co-authored-by: Patrick von Platen
---
.../pipeline_stable_diffusion_inpaint.py | 23 +--
.../test_stable_diffusion_inpaint.py | 138 ++++++++++++------
2 files changed, 104 insertions(+), 57 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 7068408b9dcf..67ce572ea433 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -36,7 +36,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-def prepare_mask_and_masked_image(image, mask):
+def prepare_mask_and_masked_image(image, mask, height, width):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -64,6 +64,13 @@ def prepare_mask_and_masked_image(image, mask):
tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
dimensions: ``batch x channels x height x width``.
"""
+
+ if image is None:
+ raise ValueError("`image` input cannot be undefined.")
+
+ if mask is None:
+ raise ValueError("`mask_image` input cannot be undefined.")
+
if isinstance(image, torch.Tensor):
if not isinstance(mask, torch.Tensor):
raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
@@ -111,8 +118,9 @@ def prepare_mask_and_masked_image(image, mask):
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
image = [image]
-
if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+ # resize all images w.r.t. the passed height and width
+ image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -126,6 +134,7 @@ def prepare_mask_and_masked_image(image, mask):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+ mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
@@ -799,12 +808,6 @@ def __call__(
negative_prompt_embeds,
)
- if image is None:
- raise ValueError("`image` input cannot be undefined.")
-
- if mask_image is None:
- raise ValueError("`mask_image` input cannot be undefined.")
-
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -830,8 +833,8 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
)
- # 4. Preprocess mask and image
- mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+ # 4. Preprocess mask and image - resizes image and mask w.r.t height and width
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
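A brief usage sketch of the new resizing behavior (illustrative only; the inputs are synthetic placeholders): PIL inputs whose size does not match the requested `height`/`width` are now resized inside the pipeline instead of triggering a tensor shape mismatch.

```py
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

# 127x127 is not a multiple of 8 and previously caused a tensor mismatch error;
# both image and mask are now resized to the requested height/width before encoding.
image = Image.new("RGB", (127, 127))
mask = Image.new("L", (127, 127), 255)

result = pipe(
    prompt="a red couch in a living room",
    image=image,
    mask_image=mask,
    height=512,
    width=512,
).images[0]
```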
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 7a8e2ee05ad4..497d9e53673c 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -303,6 +303,25 @@ def test_inpaint_compile(self):
assert np.abs(expected_slice - image_slice).max() < 1e-4
assert np.abs(expected_slice - image_slice).max() < 1e-3
+ def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
+ pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ pipe.enable_attention_slicing()
+
+ inputs = self.get_inputs(torch_device)
+ # change input image to a random size (one that would cause a tensor mismatch error)
+ inputs['image'] = inputs['image'].resize((127,127))
+ inputs['mask_image'] = inputs['mask_image'].resize((127,127))
+ inputs['height'] = 128
+ inputs['width'] = 128
+ image = pipe(**inputs).images
+ # verify that the returned image has the same height and width as the input height and width
+ assert image.shape == (1, inputs['height'], inputs['width'], 3)
+
@nightly
@require_torch_gpu
@@ -400,12 +419,13 @@ def test_inpaint_dpm(self):
class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
def test_pil_inputs(self):
- im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
+ height, width = 32, 32
+ im = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
im = Image.fromarray(im)
- mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
+ mask = np.random.randint(0, 255, (height, width), dtype=np.uint8) > 127.5
mask = Image.fromarray((mask * 255).astype(np.uint8))
- t_mask, t_masked = prepare_mask_and_masked_image(im, mask)
+ t_mask, t_masked = prepare_mask_and_masked_image(im, mask, height, width)
self.assertTrue(isinstance(t_mask, torch.Tensor))
self.assertTrue(isinstance(t_masked, torch.Tensor))
@@ -413,8 +433,8 @@ def test_pil_inputs(self):
self.assertEqual(t_mask.ndim, 4)
self.assertEqual(t_masked.ndim, 4)
- self.assertEqual(t_mask.shape, (1, 1, 32, 32))
- self.assertEqual(t_masked.shape, (1, 3, 32, 32))
+ self.assertEqual(t_mask.shape, (1, 1, height, width))
+ self.assertEqual(t_masked.shape, (1, 3, height, width))
self.assertTrue(t_mask.dtype == torch.float32)
self.assertTrue(t_masked.dtype == torch.float32)
@@ -427,86 +447,100 @@ def test_pil_inputs(self):
self.assertTrue(t_mask.sum() > 0.0)
def test_np_inputs(self):
- im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
+ height, width = 32, 32
+
+ im_np = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
im_pil = Image.fromarray(im_np)
- mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
+ mask_np = np.random.randint(0, 255, (height, width,), dtype=np.uint8) > 127.5
mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
- t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
+ t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil, height, width)
self.assertTrue((t_mask_np == t_mask_pil).all())
self.assertTrue((t_masked_np == t_masked_pil).all())
def test_torch_3D_2D_inputs(self):
- im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_3D_3D_inputs(self):
- im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_2D_inputs(self):
- im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_3D_inputs(self):
- im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_4D_inputs(self):
- im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, 1, 32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (1, 1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0][0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_batch_4D_3D(self):
- im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (2, 32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (2, height, width,), dtype=torch.uint8) > 127.5
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -514,14 +548,16 @@ def test_torch_batch_4D_3D(self):
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_batch_4D_4D(self):
- im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (2, 1, 32, 32), dtype=torch.uint8) > 127.5
+ height, width = 32, 32
+
+ im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
+ mask_tensor = torch.randint(0, 255, (2, 1, height, width,), dtype=torch.uint8) > 127.5
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy()[0] for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
- nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -529,39 +565,47 @@ def test_torch_batch_4D_4D(self):
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_shape_mismatch(self):
+ height, width = 32, 32
+
# test height and width
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(3, 32, 32), torch.randn(64, 64))
+ prepare_mask_and_masked_image(torch.randn(3, height, width,), torch.randn(64, 64), height, width)
# test batch dim
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 64, 64))
+ prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 64, 64), height, width)
# test batch dim
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 1, 64, 64))
+ prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 1, 64, 64), height, width)
def test_type_mismatch(self):
+ height, width = 32, 32
+
# test tensors-only
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.rand(3, 32, 32).numpy())
+ prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.rand(3, height, width,).numpy(), height, width)
# test tensors-only
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(torch.rand(3, 32, 32).numpy(), torch.rand(3, 32, 32))
+ prepare_mask_and_masked_image(torch.rand(3, height, width,).numpy(), torch.rand(3, height, width,), height, width)
def test_channels_first(self):
+ height, width = 32, 32
+
# test channels first for 3D tensors
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.rand(32, 32, 3), torch.rand(3, 32, 32))
+ prepare_mask_and_masked_image(torch.rand(height, width, 3), torch.rand(3, height, width,), height, width)
def test_tensor_range(self):
+ height, width = 32, 32
+
# test im <= 1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.ones(3, 32, 32) * 2, torch.rand(32, 32))
+ prepare_mask_and_masked_image(torch.ones(3, height, width,) * 2, torch.rand(height, width,), height, width)
# test im >= -1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.ones(3, 32, 32) * (-2), torch.rand(32, 32))
+ prepare_mask_and_masked_image(torch.ones(3, height, width,) * (-2), torch.rand(height, width,), height, width)
# test mask <= 1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2)
+ prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * 2, height, width)
# test mask >= 0
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1)
+ prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * -1, height, width)
From 82e6fa56f0bfc219c26168a27a8ddb3a5488535e Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 10 May 2023 20:16:18 +0200
Subject: [PATCH 061/206] make style
---
.../pipeline_stable_diffusion_inpaint.py | 4 +-
.../test_stable_diffusion_inpaint.py | 391 +++++++++++++++---
2 files changed, 344 insertions(+), 51 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 67ce572ea433..518a9a3e9781 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -120,7 +120,7 @@ def prepare_mask_and_masked_image(image, mask, height, width):
image = [image]
if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
# resize all images w.r.t. the passed height and width
- image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
+ image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -134,7 +134,7 @@ def prepare_mask_and_masked_image(image, mask, height, width):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
- mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+ mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 497d9e53673c..93c3f7ec20ac 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -304,23 +304,23 @@ def test_inpaint_compile(self):
assert np.abs(expected_slice - image_slice).max() < 1e-3
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting", safety_checker=None
- )
- pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
- pipe.to(torch_device)
- pipe.set_progress_bar_config(disable=None)
- pipe.enable_attention_slicing()
-
- inputs = self.get_inputs(torch_device)
- # change input image to a random size (one that would cause a tensor mismatch error)
- inputs['image'] = inputs['image'].resize((127,127))
- inputs['mask_image'] = inputs['mask_image'].resize((127,127))
- inputs['height'] = 128
- inputs['width'] = 128
- image = pipe(**inputs).images
- # verify that the returned image has the same height and width as the input height and width
- assert image.shape == (1, inputs['height'], inputs['width'], 3)
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
+ pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ pipe.enable_attention_slicing()
+
+ inputs = self.get_inputs(torch_device)
+ # change input image to a random size (one that would cause a tensor mismatch error)
+ inputs["image"] = inputs["image"].resize((127, 127))
+ inputs["mask_image"] = inputs["mask_image"].resize((127, 127))
+ inputs["height"] = 128
+ inputs["width"] = 128
+ image = pipe(**inputs).images
+ # verify that the returned image has the same height and width as the input height and width
+ assert image.shape == (1, inputs["height"], inputs["width"], 3)
@nightly
@@ -451,7 +451,18 @@ def test_np_inputs(self):
im_np = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
im_pil = Image.fromarray(im_np)
- mask_np = np.random.randint(0, 255, (height, width,), dtype=np.uint8) > 127.5
+ mask_np = (
+ np.random.randint(
+ 0,
+ 255,
+ (
+ height,
+ width,
+ ),
+ dtype=np.uint8,
+ )
+ > 127.5
+ )
mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
@@ -463,12 +474,34 @@ def test_np_inputs(self):
def test_torch_3D_2D_inputs(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -477,12 +510,35 @@ def test_torch_3D_2D_inputs(self):
def test_torch_3D_3D_inputs(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -491,12 +547,35 @@ def test_torch_3D_3D_inputs(self):
def test_torch_4D_2D_inputs(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -505,12 +584,36 @@ def test_torch_4D_2D_inputs(self):
def test_torch_4D_3D_inputs(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -519,12 +622,37 @@ def test_torch_4D_3D_inputs(self):
def test_torch_4D_4D_inputs(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (1, 1, height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ 1,
+ 1,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0][0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
@@ -533,13 +661,37 @@ def test_torch_4D_4D_inputs(self):
def test_torch_batch_4D_3D(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (2, height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 2,
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ 2,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -550,13 +702,38 @@ def test_torch_batch_4D_3D(self):
def test_torch_batch_4D_4D(self):
height, width = 32, 32
- im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
- mask_tensor = torch.randint(0, 255, (2, 1, height, width,), dtype=torch.uint8) > 127.5
+ im_tensor = torch.randint(
+ 0,
+ 255,
+ (
+ 2,
+ 3,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ mask_tensor = (
+ torch.randint(
+ 0,
+ 255,
+ (
+ 2,
+ 1,
+ height,
+ width,
+ ),
+ dtype=torch.uint8,
+ )
+ > 127.5
+ )
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy()[0] for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
+ t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width
+ )
nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -569,43 +746,159 @@ def test_shape_mismatch(self):
# test height and width
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(3, height, width,), torch.randn(64, 64), height, width)
+ prepare_mask_and_masked_image(
+ torch.randn(
+ 3,
+ height,
+ width,
+ ),
+ torch.randn(64, 64),
+ height,
+ width,
+ )
# test batch dim
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 64, 64), height, width)
+ prepare_mask_and_masked_image(
+ torch.randn(
+ 2,
+ 3,
+ height,
+ width,
+ ),
+ torch.randn(4, 64, 64),
+ height,
+ width,
+ )
# test batch dim
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 1, 64, 64), height, width)
+ prepare_mask_and_masked_image(
+ torch.randn(
+ 2,
+ 3,
+ height,
+ width,
+ ),
+ torch.randn(4, 1, 64, 64),
+ height,
+ width,
+ )
def test_type_mismatch(self):
height, width = 32, 32
# test tensors-only
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.rand(3, height, width,).numpy(), height, width)
+ prepare_mask_and_masked_image(
+ torch.rand(
+ 3,
+ height,
+ width,
+ ),
+ torch.rand(
+ 3,
+ height,
+ width,
+ ).numpy(),
+ height,
+ width,
+ )
# test tensors-only
with self.assertRaises(TypeError):
- prepare_mask_and_masked_image(torch.rand(3, height, width,).numpy(), torch.rand(3, height, width,), height, width)
+ prepare_mask_and_masked_image(
+ torch.rand(
+ 3,
+ height,
+ width,
+ ).numpy(),
+ torch.rand(
+ 3,
+ height,
+ width,
+ ),
+ height,
+ width,
+ )
def test_channels_first(self):
height, width = 32, 32
# test channels first for 3D tensors
with self.assertRaises(AssertionError):
- prepare_mask_and_masked_image(torch.rand(height, width, 3), torch.rand(3, height, width,), height, width)
+ prepare_mask_and_masked_image(
+ torch.rand(height, width, 3),
+ torch.rand(
+ 3,
+ height,
+ width,
+ ),
+ height,
+ width,
+ )
def test_tensor_range(self):
height, width = 32, 32
# test im <= 1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.ones(3, height, width,) * 2, torch.rand(height, width,), height, width)
+ prepare_mask_and_masked_image(
+ torch.ones(
+ 3,
+ height,
+ width,
+ )
+ * 2,
+ torch.rand(
+ height,
+ width,
+ ),
+ height,
+ width,
+ )
# test im >= -1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.ones(3, height, width,) * (-2), torch.rand(height, width,), height, width)
+ prepare_mask_and_masked_image(
+ torch.ones(
+ 3,
+ height,
+ width,
+ )
+ * (-2),
+ torch.rand(
+ height,
+ width,
+ ),
+ height,
+ width,
+ )
# test mask <= 1
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * 2, height, width)
+ prepare_mask_and_masked_image(
+ torch.rand(
+ 3,
+ height,
+ width,
+ ),
+ torch.ones(
+ height,
+ width,
+ )
+ * 2,
+ height,
+ width,
+ )
# test mask >= 0
with self.assertRaises(ValueError):
- prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * -1, height, width)
+ prepare_mask_and_masked_image(
+ torch.rand(
+ 3,
+ height,
+ width,
+ ),
+ torch.ones(
+ height,
+ width,
+ )
+ * -1,
+ height,
+ width,
+ )
From c49e9ede4d3b4eb728c489e7bb3f5959f8d3b663 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Wed, 10 May 2023 16:02:48 -0700
Subject: [PATCH 062/206] [docs] Adapt a model (#3326)
* first draft
* apply feedback
* conv_in.weight thrown away
---
docs/source/en/_toctree.yml | 2 ++
docs/source/en/training/adapt_a_model.mdx | 42 +++++++++++++++++++++++
2 files changed, 44 insertions(+)
create mode 100644 docs/source/en/training/adapt_a_model.mdx
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index f205046ffc90..e8f825244552 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -62,6 +62,8 @@
title: Overview
- local: training/create_dataset
title: Create a dataset for training
+ - local: training/adapt_a_model
+ title: Adapt a model to a new task
- local: training/unconditional_training
title: Unconditional image generation
- local: training/text_inversion
diff --git a/docs/source/en/training/adapt_a_model.mdx b/docs/source/en/training/adapt_a_model.mdx
new file mode 100644
index 000000000000..f1af5fca57a2
--- /dev/null
+++ b/docs/source/en/training/adapt_a_model.mdx
@@ -0,0 +1,42 @@
+# Adapt a model to a new task
+
+Many diffusion systems share the same components, allowing you to adapt a pretrained model for one task to an entirely different task.
+
+This guide will show you how to adapt a pretrained text-to-image model for inpainting by initializing and modifying the architecture of a pretrained [`UNet2DConditionModel`].
+
+## Configure UNet2DConditionModel parameters
+
+A [`UNet2DConditionModel`] by default accepts 4 channels in the [input sample](https://huggingface.co/docs/diffusers/v0.16.0/en/api/models#diffusers.UNet2DConditionModel.in_channels). For example, load a pretrained text-to-image model like [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) and take a look at the number of `in_channels`:
+
+```py
+from diffusers import StableDiffusionPipeline
+
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipeline.unet.config["in_channels"]
+4
+```
+
+Inpainting requires 9 channels in the input sample. You can check this value in a pretrained inpainting model like [`runwayml/stable-diffusion-inpainting`](https://huggingface.co/runwayml/stable-diffusion-inpainting):
+
+```py
+from diffusers import StableDiffusionPipeline
+
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
+pipeline.unet.config["in_channels"]
+9
+```
+
+To adapt your text-to-image model for inpainting, you'll need to change the number of `in_channels` from 4 to 9.
+
+Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model weights, and change `in_channels` to 9. Changing the number of `in_channels` means you need to set `ignore_mismatched_sizes=True` and `low_cpu_mem_usage=False` to avoid a size mismatch error because the shape is different now.
+
+```py
+from diffusers import UNet2DConditionModel
+
+model_id = "runwayml/stable-diffusion-v1-5"
+unet = UNet2DConditionModel.from_pretrained(
+ model_id, subfolder="unet", in_channels=9, low_cpu_mem_usage=False, ignore_mismatched_sizes=True
+)
+```
+
+The pretrained weights of the other components from the text-to-image model are initialized from their checkpoints, but the input channel weights (`conv_in.weight`) of the `unet` are randomly initialized. It is important to finetune the model for inpainting because otherwise the model returns noise.
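An optional sanity check, sketched here for the same checkpoint as above (not part of the doc), that the adapted UNet really has a 9-channel input convolution while the remaining weights stay pretrained:

```py
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="unet",
    in_channels=9,
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
)

# conv_in now maps 9 input channels (4 latents + 4 masked-image latents + 1 mask)
# into the UNet's first block; this is the weight that is randomly re-initialized.
print(unet.conv_in.weight.shape)  # expected: torch.Size([320, 9, 3, 3])
```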
From 5e746753d6f0bbadb3649b1d366c61f748ff68ee Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Thu, 11 May 2023 02:31:27 -0700
Subject: [PATCH 063/206] [docs] Load safetensors (#3333)
* safetensors
* apply feedback
* apply feedback
* Apply suggestions from code review
---------
Co-authored-by: Patrick von Platen
---
docs/source/en/_toctree.yml | 4 +-
.../en/using-diffusers/using_safetensors.mdx | 105 ++++++++----------
2 files changed, 48 insertions(+), 61 deletions(-)
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index e8f825244552..246b467d8b04 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -26,6 +26,8 @@
title: Load and compare different schedulers
- local: using-diffusers/custom_pipeline_overview
title: Load community pipelines
+ - local: using-diffusers/using_safetensors
+ title: Load safetensors
- local: using-diffusers/kerascv
title: Load KerasCV Stable Diffusion checkpoints
title: Loading & Hub
@@ -50,8 +52,6 @@
title: Community pipelines
- local: using-diffusers/contribute_pipeline
title: How to contribute a community pipeline
- - local: using-diffusers/using_safetensors
- title: Using safetensors
- local: using-diffusers/stable_diffusion_jax_how_to
title: Stable Diffusion in JAX/Flax
- local: using-diffusers/weighted_prompts
diff --git a/docs/source/en/using-diffusers/using_safetensors.mdx b/docs/source/en/using-diffusers/using_safetensors.mdx
index b522f3236fbb..93867db1c426 100644
--- a/docs/source/en/using-diffusers/using_safetensors.mdx
+++ b/docs/source/en/using-diffusers/using_safetensors.mdx
@@ -1,87 +1,74 @@
-# What is safetensors ?
+# Load safetensors
-[safetensors](https://github.com/huggingface/safetensors) is a different format
-from the classic `.bin` which uses Pytorch which uses pickle. It contains the
-exact same data, which is just the model weights (or tensors).
+[safetensors](https://github.com/huggingface/safetensors) is a safe and fast file format for storing and loading tensors. Typically, PyTorch model weights are saved or *pickled* into a `.bin` file with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. However, `pickle` is not secure and pickled files may contain malicious code that can be executed. safetensors is a secure alternative to `pickle`, making it ideal for sharing model weights.
-Pickle is notoriously unsafe which allow any malicious file to execute arbitrary code.
-The hub itself tries to prevent issues from it, but it's not a silver bullet.
+This guide will show you how to load `.safetensors` files, and how to convert Stable Diffusion model weights stored in other formats to `.safetensors`. Before you start, make sure you have safetensors installed:
-`safetensors` first and foremost goal is to make loading machine learning models *safe*
-in the sense that no takeover of your computer can be done.
-
-Hence the name.
-
-# Why use safetensors ?
-
-**Safety** can be one reason, if you're attempting to use a not well known model and
-you're not sure about the source of the file.
-
-And a secondary reason, is **the speed of loading**. Safetensors can load models much faster
-than regular pickle files. If you spend a lot of times switching models, this can be
-a huge timesave.
-
-Numbers taken AMD EPYC 7742 64-Core Processor
+```bash
+!pip install safetensors
```
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+If you look at the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main) repository, you'll see that the weights inside the `text_encoder`, `unet`, and `vae` subfolders are stored in the `.safetensors` format. By default, 🤗 Diffusers automatically loads these `.safetensors` files from their subfolders if they're available in the model repository.
-# Loaded in safetensors 0:00:02.033658
-# Loaded in Pytorch 0:00:02.663379
-```
+For more explicit control, you can optionally set `use_safetensors=True` (if `safetensors` is not installed, you'll get an error message asking you to install it):
-This is for the entire loading time, the actual weights loading time to load 500MB:
+```py
+from diffusers import DiffusionPipeline
-```
-Safetensors: 3.4873ms
-PyTorch: 172.7537ms
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
```
-Performance in general is a tricky business, and there are a few things to understand:
+However, model weights are not necessarily stored in separate subfolders like in the example above. Sometimes, all the weights are stored in a single `.safetensors` file. In this case, if the weights are Stable Diffusion weights, you can load the file directly with the [`~diffusers.loaders.FromCkptMixin.from_ckpt`] method:
-- If you're using the model for the first time from the hub, you will have to download the weights.
- That's extremely likely to be much slower than any loading method, therefore you will not see any difference
-- If you're loading the model for the first time (let's say after a reboot) then your machine will have to
- actually read the disk. It's likely to be as slow in both cases. Again the speed difference may not be as visible (this depends on hardware and the actual model).
-- The best performance benefit is when the model was already loaded previously on your computer and you're switching from one model to another. Your OS, is trying really hard not to read from disk, since this is slow, so it will keep the files around in RAM, making it loading again much faster. Since safetensors is doing zero-copy of the tensors, reloading will be faster than pytorch since it has at least once extra copy to do.
+```py
+from diffusers import StableDiffusionPipeline
-# How to use safetensors ?
+pipeline = StableDiffusionPipeline.from_ckpt(
+ "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
+)
+```
-If you have `safetensors` installed, and all the weights are available in `safetensors` format, \
-then by default it will use that instead of the pytorch weights.
+## Convert to safetensors
-If you are really paranoid about this, the ultimate weapon would be disabling `torch.load`:
-```python
-import torch
+Not all weights on the Hub are available in the `.safetensors` format, and you may encounter weights stored as `.bin`. In this case, use the [Convert Space](https://huggingface.co/spaces/diffusers/convert) to convert the weights to `.safetensors`. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file on the Hub. This way, if the pickled files contain any malicious code, it ends up on the Hub - which has a [security scanner](https://huggingface.co/docs/hub/security-pickle#hubs-security-scanner) to detect unsafe files and suspicious pickle imports - instead of on your computer.
+
-def _raise():
- raise RuntimeError("I don't want to use pickle")
+You can use the model with the new `.safetensors` weights by specifying the reference to the Pull Request in the `revision` parameter (you can also test it in this [Check PR](https://huggingface.co/spaces/diffusers/check_pr) Space on the Hub), for example `refs/pr/22`:
+```py
+from diffusers import DiffusionPipeline
-torch.load = lambda *args, **kwargs: _raise()
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", revision="refs/pr/22")
```
-# I want to use model X but it doesn't have safetensors weights.
+## Why use safetensors?
-Just go to this [space](https://huggingface.co/spaces/diffusers/convert).
-This will create a new PR with the weights, let's say `refs/pr/22`.
+There are several reasons for using safetensors:
-This space will download the pickled version, convert it, and upload it on the hub as a PR.
-If anything bad is contained in the file, it's Huggingface hub that will get issues, not your own computer.
-And we're equipped with dealing with it.
+- Safety is the number one reason for using safetensors. As open-source development and model distribution grow, it is important to be able to trust that the model weights you download don't contain any malicious code. The current size of the header in safetensors prevents parsing extremely large JSON files.
+- Loading speed when switching between models is another reason to use safetensors, which performs a zero-copy of the tensors. It is especially fast compared to `pickle` if you're loading the weights to CPU (the default case), and just as fast if not faster when directly loading the weights to GPU. You'll only notice the performance difference if the model is already loaded, and not if you're downloading the weights or loading the model for the first time.
-Then in order to use the model, even before the branch gets accepted by the original author you can do:
+ The time it takes to load the entire pipeline:
-```python
-from diffusers import DiffusionPipeline
+ ```py
+ from diffusers import StableDiffusionPipeline
-pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", revision="refs/pr/22")
-```
+ pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+ "Loaded in safetensors 0:00:02.033658"
+ "Loaded in PyTorch 0:00:02.663379"
+ ```
-or you can test it directly online with this [space](https://huggingface.co/spaces/diffusers/check_pr).
+ But the actual time it takes to load 500MB of the model weights is only:
-And that's it !
+ ```bash
+ safetensors: 3.4873ms
+ PyTorch: 172.7537ms
+ ```
-Anything unclear, concerns, or found a bugs ? [Open an issue](https://github.com/huggingface/diffusers/issues/new/choose)
+- Lazy loading is also supported in safetensors, which is useful in distributed settings to only load some of the tensors. This format allowed the [BLOOM](https://huggingface.co/bigscience/bloom) model to be loaded in 45 seconds on 8 GPUs instead of 10 minutes with regular PyTorch weights.
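The loading-speed claim can be spot-checked with a rough, hypothetical benchmark sketch; the file names below are placeholders for locally downloaded weight files, and the numbers will vary with hardware and OS file caching:

```py
import time

import torch
from safetensors.torch import load_file

start = time.perf_counter()
state_dict = load_file("diffusion_pytorch_model.safetensors")  # zero-copy, memory-mapped load
print(f"safetensors: {time.perf_counter() - start:.4f}s")

start = time.perf_counter()
state_dict = torch.load("diffusion_pytorch_model.bin", map_location="cpu")  # pickle-based load
print(f"PyTorch:     {time.perf_counter() - start:.4f}s")
```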
From f740d357c9bdfe1c7672161fba724a98f336600a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 11 May 2023 11:31:49 +0200
Subject: [PATCH 064/206] make style
---
docs/source/en/using-diffusers/using_safetensors.mdx | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/docs/source/en/using-diffusers/using_safetensors.mdx b/docs/source/en/using-diffusers/using_safetensors.mdx
index 93867db1c426..2767b95f3bcc 100644
--- a/docs/source/en/using-diffusers/using_safetensors.mdx
+++ b/docs/source/en/using-diffusers/using_safetensors.mdx
@@ -57,11 +57,11 @@ There are several reasons for using safetensors:
The time it takes to load the entire pipeline:
```py
- from diffusers import StableDiffusionPipeline
+ from diffusers import StableDiffusionPipeline
- pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
- "Loaded in safetensors 0:00:02.033658"
- "Loaded in PyTorch 0:00:02.663379"
+ pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+ "Loaded in safetensors 0:00:02.033658"
+ "Loaded in PyTorch 0:00:02.663379"
```
But the actual time it takes to load 500MB of the model weights is only:
From e0b56d2b189330afed74e984a3309e3877450d42 Mon Sep 17 00:00:00 2001
From: sudowind
Date: Thu, 11 May 2023 21:10:16 +0800
Subject: [PATCH 065/206] [Docs] Fix stable_diffusion.mdx typo (#3398)
Fix typo in the last code block. Correct "prommpts" to "prompts".
---
docs/source/en/stable_diffusion.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/stable_diffusion.mdx b/docs/source/en/stable_diffusion.mdx
index 0cec07834507..d02e93033614 100644
--- a/docs/source/en/stable_diffusion.mdx
+++ b/docs/source/en/stable_diffusion.mdx
@@ -246,7 +246,7 @@ image_grid(images, rows=2, cols=4)
Pretty impressive! Let's tweak the second image - corresponding to the `Generator` with a seed of `1` - a bit more by adding some text about the age of the subject:
```python
-prommpts = [
+prompts = [
"portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
From 01c056f09441a8670d0a88f24e2d4fb4a2956ae8 Mon Sep 17 00:00:00 2001
From: Takuma Mori
Date: Thu, 11 May 2023 22:58:07 +0900
Subject: [PATCH 066/206] Support ControlNet v1.1 shuffle properly (#3340)
* add inferring_controlnet_cond_batch
* Revert "add inferring_controlnet_cond_batch"
This reverts commit abe8d6311d4b7f5b9409ca709c7fabf80d06c1a9.
* set guess_mode to True
whenever global_pool_conditions is True
Co-authored-by: Patrick von Platen
* nit
* add integration test
---------
Co-authored-by: Patrick von Platen
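A usage sketch mirroring the integration test added below: with a v1.1 shuffle ControlNet, `global_pool_conditions` is True in the model config, so after this change the pipeline enables guess mode on its own and the caller passes nothing extra.

```py
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image

controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
)
pipe.enable_model_cpu_offload()

image = load_image(
    "https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png"
)
# guess_mode is forced internally because controlnet.config.global_pool_conditions is True.
result = pipe("New York", image, num_inference_steps=20).images[0]
```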
---
src/diffusers/models/controlnet.py | 2 +-
.../pipeline_stable_diffusion_controlnet.py | 7 +++++
.../test_stable_diffusion_controlnet.py | 31 +++++++++++++++++++
3 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index 7b36d2eed96a..0b0ce0be547f 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -558,7 +558,7 @@ def forward(
mid_block_res_sample = self.controlnet_mid_block(sample)
# 6. scaling
- if guess_mode:
+ if guess_mode and not self.config.global_pool_conditions:
scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
scales = scales * conditioning_scale
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index 00030a6acd89..1cef221ea6e1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -930,6 +930,13 @@ def __call__(
if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+ global_pool_conditions = (
+ self.controlnet.config.global_pool_conditions
+ if isinstance(self.controlnet, ControlNetModel)
+ else self.controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
prompt,
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index 765b2393c105..a91b7c5e75fa 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -623,6 +623,37 @@ def test_stable_diffusion_compile(self):
assert np.abs(expected_image - image).max() < 1e-1
+ def test_v11_shuffle_global_pool_conditions(self):
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
+
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ generator = torch.Generator(device="cpu").manual_seed(0)
+ prompt = "New York"
+ image = load_image(
+ "https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png"
+ )
+
+ output = pipe(
+ prompt,
+ image,
+ generator=generator,
+ output_type="np",
+ num_inference_steps=3,
+ guidance_scale=7.0,
+ )
+
+ image = output.images[0]
+ assert image.shape == (512, 640, 3)
+
+ image_slice = image[-3:, -3:, -1]
+ expected_slice = np.array([0.1338, 0.1597, 0.1202, 0.1687, 0.1377, 0.1017, 0.2070, 0.1574, 0.1348])
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
@slow
@require_torch_gpu
From 90f5f3c4d4b9fc5166e471b475a70a3dd3077d0a Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Thu, 11 May 2023 21:08:14 +0530
Subject: [PATCH 067/206] [Tests] better determinism (#3374)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* enable deterministic pytorch and cuda operations.
* disable manual seeding.
* make style && make quality for unet_2d tests.
* enable determinism for the unet2dconditional model.
* add CUBLAS_WORKSPACE_CONFIG for better reproducibility.
* relax tolerance (very weird issue, though).
* revert to torch manual_seed() where needed.
* relax more tolerance.
* better placement of the cuda variable and relax more tolerance.
* enable determinism for 3d condition model.
* relax tolerance.
* add: determinism to alt_diffusion.
* relax tolerance for alt diffusion.
* dance diffusion.
* dance diffusion is flaky.
* test_dict_tuple_outputs_equivalent edit.
* fix two more tests.
* fix more ddim tests.
* fix: argument.
* change to diff in place of difference.
* fix: test_save_load call.
* test_save_load_float16 call.
* fix: expected_max_diff
* fix: paint by example.
* relax tolerance.
* add determinism to 1d unet model.
* torch 2.0 regressions seem to be brutal
* determinism to vae.
* add reason to skipping.
* up tolerance.
* determinism to vq.
* determinism to cuda.
* determinism to the generic test pipeline file.
* refactor general pipelines testing a bit.
* determinism to alt diffusion i2i
* up tolerance for alt diff i2i and audio diff
* up tolerance.
* determinism to audioldm
* increase tolerance for audioldm lms.
* increase tolerance for paint by example.
* increase tolerance for repaint.
* determinism to cycle diffusion and sd 1.
* relax tol for cycle diffusion 🚲
* relax tol for sd 1.0
* relax tol for controlnet.
* determinism to img var.
* relax tol for img variation.
* tolerance to i2i sd
* make style
* determinism to inpaint.
* relax tolerance for inpainting.
* determinism for inpainting legacy
* relax tolerance.
* determinism to instruct pix2pix
* determinism to model editing.
* model editing tolerance.
* panorama determinism
* determinism to pix2pix zero.
* determinism to sag.
* sd 2 determinism
* sd tolerance
* disallow tf32 matmul.
* relax tolerance is all you need.
* make style and determinism to sd 2 depth
* relax tolerance for depth.
* tolerance to diffedit.
* tolerance to sd 2 inpaint.
* up tolerance.
* determinism in upscaling.
* tolerance in upscaler.
* more tolerance relaxation.
* determinism to v pred.
* up tol for v_pred
* unclip determinism
* determinism to unclip img2img
* determinism to text to video.
* determinism to last set of tests
* up tol.
* vq cumsum doesn't have a deterministic kernel
* relax tol
* relax tol
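Condensed, the determinism setup this patch spreads across the test modules and the CI workflow boils down to the sketch below; the env var must be set before the first CUDA call, which is why the workflow change further down exports it rather than the tests themselves.

```py
import os

import torch

# Matches the CI job: required by cuBLAS for deterministic GEMMs on CUDA.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":16:8")

# Module-level switches added to the affected test files.
torch.backends.cuda.matmul.allow_tf32 = False
torch.use_deterministic_algorithms(True)
```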
---
.github/workflows/push_tests.yml | 3 ++
tests/models/test_modeling_common.py | 4 +-
tests/models/test_models_unet_1d.py | 2 +-
tests/models/test_models_unet_2d.py | 11 +----
tests/models/test_models_unet_2d_condition.py | 13 +++---
tests/models/test_models_unet_3d_condition.py | 9 ++--
tests/models/test_models_vae.py | 10 +++--
tests/models/test_models_vq.py | 1 +
tests/others/test_ema.py | 4 ++
.../altdiffusion/test_alt_diffusion.py | 7 +++
.../test_alt_diffusion_img2img.py | 5 ++-
.../audio_diffusion/test_audio_diffusion.py | 1 +
tests/pipelines/audioldm/test_audioldm.py | 6 ++-
.../dance_diffusion/test_dance_diffusion.py | 5 ++-
tests/pipelines/ddim/test_ddim.py | 12 +++++
tests/pipelines/deepfloyd_if/test_if.py | 2 +-
.../pipelines/deepfloyd_if/test_if_img2img.py | 4 +-
.../test_if_img2img_superresolution.py | 2 +-
.../deepfloyd_if/test_if_inpainting.py | 2 +-
.../test_if_inpainting_superresolution.py | 2 +-
.../deepfloyd_if/test_if_superresolution.py | 2 +-
.../paint_by_example/test_paint_by_example.py | 4 ++
tests/pipelines/repaint/test_repaint.py | 1 +
.../stable_diffusion/test_cycle_diffusion.py | 3 +-
.../stable_diffusion/test_stable_diffusion.py | 19 +++++---
.../test_stable_diffusion_controlnet.py | 22 ++++++----
.../test_stable_diffusion_image_variation.py | 6 ++-
.../test_stable_diffusion_img2img.py | 6 ++-
.../test_stable_diffusion_inpaint.py | 13 +++---
.../test_stable_diffusion_inpaint_legacy.py | 9 ++--
...st_stable_diffusion_instruction_pix2pix.py | 4 ++
.../test_stable_diffusion_model_editing.py | 7 +++
.../test_stable_diffusion_panorama.py | 3 +-
.../test_stable_diffusion_pix2pix_zero.py | 1 +
.../test_stable_diffusion_sag.py | 4 ++
.../test_stable_diffusion.py | 13 ++++--
...test_stable_diffusion_attend_and_excite.py | 21 ++++++++-
.../test_stable_diffusion_depth.py | 10 +++--
.../test_stable_diffusion_diffedit.py | 4 ++
.../test_stable_diffusion_inpaint.py | 6 ++-
.../test_stable_diffusion_latent_upscale.py | 20 ++++++++-
.../test_stable_diffusion_v_pred.py | 3 +-
.../stable_unclip/test_stable_unclip.py | 4 ++
.../test_stable_unclip_img2img.py | 4 ++
tests/pipelines/test_pipelines.py | 21 ++++++---
tests/pipelines/test_pipelines_common.py | 44 ++++++++++---------
.../text_to_video/test_text_to_video.py | 3 +-
tests/pipelines/unclip/test_unclip.py | 4 ++
.../unclip/test_unclip_image_variation.py | 6 ++-
.../vq_diffusion/test_vq_diffusion.py | 4 +-
50 files changed, 272 insertions(+), 104 deletions(-)
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 2d4875b80ced..7966a416fcf1 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -72,6 +72,9 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+ # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+ CUBLAS_WORKSPACE_CONFIG: :16:8
+
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 4a94a77fcabb..b2c5f2d79d4f 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -268,7 +268,7 @@ def test_from_save_pretrained_dtype(self):
new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype)
assert new_model.dtype == dtype
- def test_determinism(self):
+ def test_determinism(self, expected_max_diff=1e-5):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict)
model.to(torch_device)
@@ -288,7 +288,7 @@ def test_determinism(self):
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2))
- self.assertLessEqual(max_diff, 1e-5)
+ self.assertLessEqual(max_diff, expected_max_diff)
def test_output(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
index f954d876fa76..78f759cb1a24 100644
--- a/tests/models/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -152,7 +152,7 @@ def test_unet_1d_maestro(self):
output_sum = output.abs().sum()
output_max = output.abs().max()
- assert (output_sum - 224.0896).abs() < 4e-2
+ assert (output_sum - 224.0896).abs() < 0.5
assert (output_max - 0.0607).abs() < 4e-4
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index c20b0ef7d0a4..8f9a6b813f19 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -27,6 +27,7 @@
logger = logging.get_logger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class Unet2DModelTests(ModelTesterMixin, unittest.TestCase):
@@ -246,10 +247,6 @@ def test_output_pretrained_ve_mid(self):
model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256")
model.to(torch_device)
- torch.manual_seed(0)
- if torch.cuda.is_available():
- torch.cuda.manual_seed_all(0)
-
batch_size = 4
num_channels = 3
sizes = (256, 256)
@@ -262,7 +259,7 @@ def test_output_pretrained_ve_mid(self):
output_slice = output[0, -3:, -3:, -1].flatten().cpu()
# fmt: off
- expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114])
+ expected_output_slice = torch.tensor([-4842.8691, -6499.6631, -3800.1953, -7978.2686, -10980.7129, -20028.8535, 8148.2822, 2342.2905, 567.7608])
# fmt: on
self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
@@ -271,10 +268,6 @@ def test_output_pretrained_ve_large(self):
model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update")
model.to(torch_device)
- torch.manual_seed(0)
- if torch.cuda.is_available():
- torch.cuda.manual_seed_all(0)
-
batch_size = 4
num_channels = 3
sizes = (32, 32)
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 2576297762a8..d3ca5ea3048e 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -39,6 +39,7 @@
logger = logging.get_logger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
def create_lora_layers(model, mock_weights: bool = True):
@@ -442,8 +443,8 @@ def test_lora_processors(self):
sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- assert (sample1 - sample2).abs().max() < 1e-4
- assert (sample3 - sample4).abs().max() < 1e-4
+ assert (sample1 - sample2).abs().max() < 3e-3
+ assert (sample3 - sample4).abs().max() < 3e-3
# sample 2 and sample 3 should be different
assert (sample2 - sample3).abs().max() > 1e-4
@@ -587,7 +588,7 @@ def test_lora_on_off(self):
new_sample = model(**inputs_dict).sample
assert (sample - new_sample).abs().max() < 1e-4
- assert (sample - old_sample).abs().max() < 1e-4
+ assert (sample - old_sample).abs().max() < 3e-3
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
@@ -642,7 +643,7 @@ def test_custom_diffusion_processors(self):
with torch.no_grad():
sample2 = model(**inputs_dict).sample
- assert (sample1 - sample2).abs().max() < 1e-4
+ assert (sample1 - sample2).abs().max() < 3e-3
def test_custom_diffusion_save_load(self):
# enable deterministic behavior for gradient checkpointing
@@ -677,7 +678,7 @@ def test_custom_diffusion_save_load(self):
assert (sample - new_sample).abs().max() < 1e-4
# custom diffusion and no custom diffusion should be the same
- assert (sample - old_sample).abs().max() < 1e-4
+ assert (sample - old_sample).abs().max() < 3e-3
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
@@ -957,7 +958,7 @@ def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
expected_output_slice = torch.tensor(expected_slice)
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+ assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
@parameterized.expand(
[
diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py
index f245045bb3bb..08863adfeaac 100644
--- a/tests/models/test_models_unet_3d_condition.py
+++ b/tests/models/test_models_unet_3d_condition.py
@@ -35,6 +35,7 @@
logger = logging.get_logger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
def create_lora_layers(model, mock_weights: bool = True):
@@ -224,11 +225,11 @@ def test_lora_processors(self):
sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- assert (sample1 - sample2).abs().max() < 1e-4
- assert (sample3 - sample4).abs().max() < 1e-4
+ assert (sample1 - sample2).abs().max() < 3e-3
+ assert (sample3 - sample4).abs().max() < 3e-3
# sample 2 and sample 3 should be different
- assert (sample2 - sample3).abs().max() > 1e-4
+ assert (sample2 - sample3).abs().max() > 3e-3
def test_lora_save_load(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -365,7 +366,7 @@ def test_lora_on_off(self):
new_sample = model(**inputs_dict).sample
assert (sample - new_sample).abs().max() < 1e-4
- assert (sample - old_sample).abs().max() < 1e-4
+ assert (sample - old_sample).abs().max() < 3e-3
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 6cb71bebb9c0..fd4cf0114f51 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -21,11 +21,13 @@
from diffusers import AutoencoderKL
from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils.import_utils import is_xformers_available
from .test_modeling_common import ModelTesterMixin
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase):
@@ -225,7 +227,7 @@ def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+ assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
@parameterized.expand(
[
@@ -271,7 +273,7 @@ def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps):
output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
+ assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
@parameterized.expand(
[
@@ -321,6 +323,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
@parameterized.expand([13, 16, 27])
@require_torch_gpu
+ @unittest.skipIf(not is_xformers_available(), reason="xformers is not required when using PyTorch 2.0.")
def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed):
model = self.get_sd_vae_model(fp16=True)
encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True)
@@ -338,6 +341,7 @@ def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed):
@parameterized.expand([13, 16, 37])
@require_torch_gpu
+ @unittest.skipIf(not is_xformers_available(), reason="xformers is not required when using PyTorch 2.0.")
def test_stable_diffusion_decode_xformers_vs_2_0(self, seed):
model = self.get_sd_vae_model()
encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
@@ -375,5 +379,5 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice):
output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
expected_output_slice = torch.tensor(expected_slice)
- tolerance = 1e-3 if torch_device != "mps" else 1e-2
+ tolerance = 3e-3 if torch_device != "mps" else 1e-2
assert torch_all_close(output_slice, expected_output_slice, atol=tolerance)
diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py
index 015d2abfc6fa..f0be6f6a6d64 100644
--- a/tests/models/test_models_vq.py
+++ b/tests/models/test_models_vq.py
@@ -24,6 +24,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class VQModelTests(ModelTesterMixin, unittest.TestCase):
diff --git a/tests/others/test_ema.py b/tests/others/test_ema.py
index 812d83e2f241..5526aadc4757 100644
--- a/tests/others/test_ema.py
+++ b/tests/others/test_ema.py
@@ -23,6 +23,10 @@
from diffusers.utils.testing_utils import skip_mps, torch_device
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class EMAModelTests(unittest.TestCase):
model_id = "hf-internal-testing/tiny-stable-diffusion-pipe"
batch_size = 1
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index 60eb17e76c0a..9237f7435b95 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -33,6 +33,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -126,6 +127,12 @@ def get_dummy_inputs(self, device, seed=0):
}
return inputs
+ def test_attention_slicing_forward_pass(self):
+ super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
def test_alt_diffusion_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
index 1f96d8954156..35a4e91284cd 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -37,6 +37,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
@@ -251,7 +252,7 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
assert image.shape == (504, 760, 3)
expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@slow
@@ -297,4 +298,4 @@ def test_stable_diffusion_img2img_pipeline_default(self):
assert image.shape == (512, 768, 3)
# img2img is flaky across GPUs even in fp32, so using MAE here
- assert np.abs(expected_image - image).max() < 1e-3
+ assert np.abs(expected_image - image).max() < 1e-2
diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py
index 0eb6252410f5..a848bd031797 100644
--- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py
+++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py
@@ -34,6 +34,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class PipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py
index ec72108fafc9..566b2c2d2cd0 100644
--- a/tests/pipelines/audioldm/test_audioldm.py
+++ b/tests/pipelines/audioldm/test_audioldm.py
@@ -42,6 +42,10 @@
from ..test_pipelines_common import PipelineTesterMixin
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = AudioLDMPipeline
params = TEXT_TO_AUDIO_PARAMS
@@ -413,4 +417,4 @@ def test_audioldm_lms(self):
audio_slice = audio[27780:27790]
expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
max_diff = np.abs(expected_slice - audio_slice).max()
- assert max_diff < 1e-2
+ assert max_diff < 3e-2
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 5db90a3aa740..361839043c9f 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -103,7 +103,7 @@ def test_save_load_local(self):
@skip_mps
def test_dict_tuple_outputs_equivalent(self):
- return super().test_dict_tuple_outputs_equivalent()
+ return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
@skip_mps
def test_save_load_optional_components(self):
@@ -113,6 +113,9 @@ def test_save_load_optional_components(self):
def test_attention_slicing_forward_pass(self):
return super().test_attention_slicing_forward_pass()
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 319bd778e3b2..e997ae45d975 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -87,6 +87,18 @@ def test_inference(self):
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
+ def test_dict_tuple_outputs_equivalent(self):
+ super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
+ def test_save_load_local(self):
+ super().test_save_load_local(expected_max_difference=3e-3)
+
+ def test_save_load_optional_components(self):
+ super().test_save_load_optional_components(expected_max_difference=3e-3)
+
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py
index bf01c2350d22..f4cb52d25a8d 100644
--- a/tests/pipelines/deepfloyd_if/test_if.py
+++ b/tests/pipelines/deepfloyd_if/test_if.py
@@ -68,7 +68,7 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py
index b4c99a8ab93a..c85063af9e30 100644
--- a/tests/pipelines/deepfloyd_if/test_if_img2img.py
+++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py
@@ -66,11 +66,11 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_float16_inference(self):
- self._test_float16_inference(expected_max_diff=1e-1)
+ super().test_float16_inference(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
index 626ab321f895..e7c8d58a3e0c 100644
--- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
@@ -65,7 +65,7 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py
index 37d818c7a910..6837ad36baf5 100644
--- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py
+++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py
@@ -68,7 +68,7 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
index 30062cb2f8d0..fc130091b5e5 100644
--- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
@@ -70,7 +70,7 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py
index 14acfa5415c2..9e418ca6aff5 100644
--- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py
@@ -63,7 +63,7 @@ def test_save_load_optional_components(self):
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
def test_save_load_float16(self):
# Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
- self._test_save_load_float16(expected_max_diff=1e-1)
+ super().test_save_load_float16(expected_max_diff=1e-1)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(expected_max_diff=1e-2)
diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py
index bb798ff729bf..80ba3f5ed37f 100644
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -32,6 +32,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@@ -161,6 +162,9 @@ def test_paint_by_example_image_tensor(self):
assert out_1.shape == (1, 64, 64, 3)
assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py
index 4f98675bc5af..59968eaf101c 100644
--- a/tests/pipelines/repaint/test_repaint.py
+++ b/tests/pipelines/repaint/test_repaint.py
@@ -27,6 +27,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 52d3b03e5220..3d6bfff1bbd1 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -30,6 +30,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -266,4 +267,4 @@ def test_cycle_diffusion_pipeline(self):
)
image = output.images
- assert np.abs(image - expected_image).max() < 1e-2
+ assert np.abs(image - expected_image).max() < 2e-2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index ddbf9f45f274..1f52a09b672b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -47,6 +47,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -540,6 +541,12 @@ def test_stable_diffusion_height_width_opt(self):
image_shape = output.images[0].shape[:2]
assert image_shape == (192, 192)
+ def test_attention_slicing_forward_pass(self):
+ super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
@@ -574,7 +581,7 @@ def test_stable_diffusion_1_1_pndm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_1_4_pndm(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
@@ -587,7 +594,7 @@ def test_stable_diffusion_1_4_pndm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_ddim(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
@@ -615,7 +622,7 @@ def test_stable_diffusion_lms(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_dpm(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
@@ -629,7 +636,7 @@ def test_stable_diffusion_dpm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_attention_slicing(self):
torch.cuda.reset_peak_memory_stats()
@@ -904,7 +911,7 @@ def test_stable_diffusion_textual_inversion(self):
)
max_diff = np.abs(expected_image - image).max()
- assert max_diff < 5e-2
+ assert max_diff < 8e-1
def test_stable_diffusion_compile(self):
if version.parse(torch.__version__) < version.parse("2.0"):
@@ -1048,7 +1055,7 @@ def test_stable_diffusion_ddim(self):
"/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
)
max_diff = np.abs(expected_image - image).max()
- assert max_diff < 1e-3
+ assert max_diff < 3e-3
def test_stable_diffusion_lms(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index a91b7c5e75fa..bd1470f5ebd1 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -38,6 +38,10 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class StableDiffusionControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
@@ -334,7 +338,7 @@ def test_canny(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 9e-2
def test_depth(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")
@@ -361,7 +365,7 @@ def test_depth(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 8e-1
def test_hed(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")
@@ -388,7 +392,7 @@ def test_hed(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 8e-2
def test_mlsd(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")
@@ -415,7 +419,7 @@ def test_mlsd(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 5e-2
def test_normal(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")
@@ -442,7 +446,7 @@ def test_normal(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 5e-2
def test_openpose(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
@@ -469,7 +473,7 @@ def test_openpose(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 8e-2
def test_scribble(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble")
@@ -496,7 +500,7 @@ def test_scribble(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 8e-2
def test_seg(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
@@ -523,7 +527,7 @@ def test_seg(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy"
)
- assert np.abs(expected_image - image).max() < 5e-3
+ assert np.abs(expected_image - image).max() < 8e-2
def test_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
@@ -621,7 +625,7 @@ def test_stable_diffusion_compile(self):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
)
- assert np.abs(expected_image - image).max() < 1e-1
+ assert np.abs(expected_image - image).max() < 1.0
def test_v11_shuffle_global_pool_conditions(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index fbdfc75faa84..8c27a568d24d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -37,6 +37,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionImageVariationPipelineFastTests(
@@ -148,6 +149,9 @@ def test_stable_diffusion_img_variation_multiple_images(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
@@ -188,7 +192,7 @@ def test_stable_diffusion_img_variation_pipeline_default(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 6e-3
def test_stable_diffusion_img_variation_intermediate_state(self):
number_of_steps = 0
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 2f63371c1a0d..4afc16d9b65f 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -45,6 +45,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -211,7 +212,10 @@ def test_save_load_optional_components(self):
@skip_mps
def test_attention_slicing_forward_pass(self):
- return super().test_attention_slicing_forward_pass()
+ return super().test_attention_slicing_forward_pass(expected_max_diff=5e-3)
+
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 93c3f7ec20ac..cdf138c4e178 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -40,6 +40,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -155,6 +156,9 @@ def test_stable_diffusion_inpaint_image_tensor(self):
assert out_pil.shape == (1, 64, 64, 3)
assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 5e-2
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
@@ -203,7 +207,7 @@ def test_stable_diffusion_inpaint_ddim(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0427, 0.0460, 0.0483, 0.0460, 0.0584, 0.0521, 0.1549, 0.1695, 0.1794])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 6e-4
def test_stable_diffusion_inpaint_fp16(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
@@ -238,7 +242,7 @@ def test_stable_diffusion_inpaint_pndm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 5e-3
def test_stable_diffusion_inpaint_k_lms(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
@@ -256,7 +260,7 @@ def test_stable_diffusion_inpaint_k_lms(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.9314, 0.7575, 0.9432, 0.8885, 0.9028, 0.7298, 0.9811, 0.9667, 0.7633])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 6e-3
def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
@@ -300,8 +304,7 @@ def test_inpaint_compile(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
- assert np.abs(expected_slice - image_slice).max() < 1e-3
+ assert np.abs(expected_slice - image_slice).max() < 3e-3
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index f56fa31a9601..8647041fbb6f 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -38,6 +38,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase):
@@ -435,7 +436,7 @@ def test_stable_diffusion_inpaint_legacy_pndm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.5665, 0.6117, 0.6430, 0.4057, 0.4594, 0.5658, 0.1596, 0.3106, 0.4305])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 3e-3
def test_stable_diffusion_inpaint_legacy_batched(self):
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
@@ -468,8 +469,8 @@ def test_stable_diffusion_inpaint_legacy_batched(self):
[0.3592432, 0.4233033, 0.3914635, 0.31014425, 0.3702293, 0.39412856, 0.17526966, 0.2642669, 0.37480092]
)
- assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4
- assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4
+ assert np.abs(expected_slice_0 - image_slice_0).max() < 3e-3
+ assert np.abs(expected_slice_1 - image_slice_1).max() < 3e-3
def test_stable_diffusion_inpaint_legacy_k_lms(self):
pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
@@ -487,7 +488,7 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.4534, 0.4467, 0.4329, 0.4329, 0.4339, 0.4220, 0.4244, 0.4332, 0.4426])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 3e-3
def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
number_of_steps = 0
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 08dc1b2844dc..99a069493885 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -39,6 +39,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionInstructPix2PixPipelineFastTests(
@@ -196,6 +197,9 @@ def test_stable_diffusion_pix2pix_euler(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
index b1bed4b3cf25..b448dbef1ebe 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
@@ -36,6 +36,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
@skip_mps
@@ -175,6 +176,12 @@ def test_stable_diffusion_model_editing_pndm(self):
with self.assertRaises(ValueError):
_ = sd_pipe(**inputs).images
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=5e-3)
+
+ def test_attention_slicing_forward_pass(self):
+ super().test_attention_slicing_forward_pass(expected_max_diff=5e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 82e42b095f5d..61708b36bfee 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -37,6 +37,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
@skip_mps
@@ -130,7 +131,7 @@ def test_inference_batch_consistent(self):
# override to speed the overall test timing up.
def test_inference_batch_single_identical(self):
- super().test_inference_batch_single_identical(batch_size=2)
+ super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3e-3)
def test_stable_diffusion_panorama_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
index af64a23c4003..90cc85646462 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
@@ -40,6 +40,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
@skip_mps
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
index ad0d50df3ce5..7cb8ab409a9b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -34,6 +34,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -112,6 +113,9 @@ def get_dummy_inputs(self, device, seed=0):
}
return inputs
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index be807b5c0c33..bc4ab7d66431 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -40,6 +40,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -245,6 +246,12 @@ def test_stable_diffusion_long_prompt(self):
assert cap_logger.out.count("@") == 25
assert cap_logger_3.out == ""
+ def test_attention_slicing_forward_pass(self):
+ super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
@@ -279,7 +286,7 @@ def test_stable_diffusion_default_ddim(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 7e-3
def test_stable_diffusion_pndm(self):
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
@@ -293,7 +300,7 @@ def test_stable_diffusion_pndm(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 7e-3
def test_stable_diffusion_k_lms(self):
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
@@ -307,7 +314,7 @@ def test_stable_diffusion_k_lms(self):
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.10440, 0.13115, 0.11100, 0.10141, 0.11440, 0.07215, 0.11332, 0.09693, 0.10006])
- assert np.abs(image_slice - expected_slice).max() < 1e-4
+ assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_attention_slicing(self):
torch.cuda.reset_peak_memory_stats()
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 60cf9c7982e9..898d5741043f 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -33,6 +33,10 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(False)
+
+
@skip_mps
class StableDiffusionAttendAndExcitePipelineFastTests(
PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
@@ -141,12 +145,27 @@ def test_inference(self):
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
+ def test_cpu_offload_forward_pass(self):
+ super().test_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
def test_inference_batch_consistent(self):
# NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
self._test_inference_batch_consistent(batch_sizes=[1, 2])
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(batch_size=2)
+ self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4)
+
+ def test_dict_tuple_outputs_equivalent(self):
+ super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
+ def test_pt_np_pil_outputs_equivalent(self):
+ super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4)
+
+ def test_save_load_local(self):
+ super().test_save_load_local(expected_max_difference=5e-4)
+
+ def test_save_load_optional_components(self):
+ super().test_save_load_optional_components(expected_max_difference=4e-4)
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index 7b63583eef77..ae1eefa68242 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -56,6 +56,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
@skip_mps
@@ -362,6 +363,9 @@ def test_stable_diffusion_depth2img_pil(self):
def test_attention_slicing_forward_pass(self):
return super().test_attention_slicing_forward_pass()
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=7e-3)
+
@slow
@require_torch_gpu
@@ -402,7 +406,7 @@ def test_stable_diffusion_depth2img_pipeline_default(self):
assert image.shape == (1, 480, 640, 3)
expected_slice = np.array([0.5435, 0.4992, 0.3783, 0.4411, 0.5842, 0.4654, 0.3786, 0.5077, 0.4655])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 6e-1
def test_stable_diffusion_depth2img_pipeline_k_lms(self):
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
@@ -420,7 +424,7 @@ def test_stable_diffusion_depth2img_pipeline_k_lms(self):
assert image.shape == (1, 480, 640, 3)
expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.6370, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 8e-4
def test_stable_diffusion_depth2img_pipeline_ddim(self):
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
@@ -438,7 +442,7 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self):
assert image.shape == (1, 480, 640, 3)
expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.6420, 0.6522, 0.6555, 0.6436])
- assert np.abs(expected_slice - image_slice).max() < 1e-4
+ assert np.abs(expected_slice - image_slice).max() < 5e-4
def test_stable_diffusion_depth2img_intermediate_state(self):
number_of_steps = 0
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index bd9ce25bdbac..d32f4d665f55 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -38,6 +38,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusionDiffEditPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -252,6 +253,9 @@ def test_inversion(self):
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=5e-3)
+
@require_torch_gpu
@slow
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index 843a6146dac9..77242add93e9 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -31,6 +31,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -133,6 +134,9 @@ def test_stable_diffusion_inpaint(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+ def test_inference_batch_single_identical(self):
+ super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
@slow
@require_torch_gpu
@@ -175,7 +179,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
image = output.images[0]
assert image.shape == (512, 512, 3)
- assert np.abs(expected_image - image).max() < 1e-3
+ assert np.abs(expected_image - image).max() < 9e-3
def test_stable_diffusion_inpaint_pipeline_fp16(self):
init_image = load_image(
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 70277d6283e8..539b4b1cc350 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -163,8 +163,26 @@ def test_inference(self):
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
+ def test_attention_slicing_forward_pass(self):
+ super().test_attention_slicing_forward_pass(expected_max_diff=7e-3)
+
+ def test_cpu_offload_forward_pass(self):
+ super().test_cpu_offload_forward_pass(expected_max_diff=3e-3)
+
+ def test_dict_tuple_outputs_equivalent(self):
+ super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
+
def test_inference_batch_single_identical(self):
- self._test_inference_batch_single_identical(relax_max_difference=False)
+ super().test_inference_batch_single_identical(expected_max_diff=7e-3)
+
+ def test_pt_np_pil_outputs_equivalent(self):
+ super().test_pt_np_pil_outputs_equivalent(expected_max_diff=3e-3)
+
+ def test_save_load_local(self):
+ super().test_save_load_local(expected_max_difference=3e-3)
+
+ def test_save_load_optional_components(self):
+ super().test_save_load_optional_components(expected_max_difference=3e-3)
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index 083640a87ba9..a874cbb7e0c5 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -34,6 +34,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
@@ -382,7 +383,7 @@ def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
image = output.images[0]
assert image.shape == (768, 768, 3)
- assert np.abs(expected_image - image).max() < 7.5e-2
+ assert np.abs(expected_image - image).max() < 9e-1
def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
expected_image = load_numpy(
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py
index b0e65692e8b5..78775a938b5b 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -19,6 +19,10 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableUnCLIPPipeline
params = TEXT_TO_IMAGE_PARAMS
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index 450e0af8dcdc..dcd4300b85c1 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -35,6 +35,10 @@
)
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableUnCLIPImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 70b1431d630a..df1a3b6ac7bb 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -58,16 +58,23 @@
CONFIG_NAME,
WEIGHTS_NAME,
floats_tensor,
- is_flax_available,
nightly,
require_torch_2,
slow,
torch_device,
)
-from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, load_numpy, require_compel, require_torch_gpu
+from diffusers.utils.testing_utils import (
+ CaptureLogger,
+ get_tests_dir,
+ load_numpy,
+ require_compel,
+ require_flax,
+ require_torch_gpu,
+)
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
class DownloadTests(unittest.TestCase):
@@ -691,6 +698,9 @@ def test_local_custom_pipeline_file(self):
@slow
@require_torch_gpu
def test_download_from_git(self):
+ # Because adaptive_avg_pool2d_backward_cuda
+ # does not have a deterministic implementation.
+ torch.use_deterministic_algorithms(False)
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
@@ -712,6 +722,7 @@ def test_download_from_git(self):
image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
assert image.shape == (512, 512, 3)
+ torch.use_deterministic_algorithms(True)
def test_save_pipeline_change_config(self):
pipe = DiffusionPipeline.from_pretrained(
@@ -1402,15 +1413,13 @@ def test_output_format(self):
assert isinstance(images, list)
assert isinstance(images[0], PIL.Image.Image)
+ @require_flax
def test_from_flax_from_pt(self):
pipe_pt = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
)
pipe_pt.to(torch_device)
- if not is_flax_available():
- raise ImportError("Make sure flax is installed.")
-
from diffusers import FlaxStableDiffusionPipeline
with tempfile.TemporaryDirectory() as tmpdirname:
@@ -1474,7 +1483,7 @@ def test_weighted_prompts_compel(self):
f"/compel/forest_{i}.npy"
)
- assert np.abs(image - expected_image).max() < 1e-2
+ assert np.abs(image - expected_image).max() < 3e-1
@nightly
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 4a51e997f93a..f23e850f4d54 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -65,7 +65,7 @@ def convert_pt_to_type(image, input_image_type):
return inputs
- def test_pt_np_pil_outputs_equivalent(self):
+ def test_pt_np_pil_outputs_equivalent(self, expected_max_diff=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
@@ -76,7 +76,9 @@ def test_pt_np_pil_outputs_equivalent(self):
output_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pil"))[0]
max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
- self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generate different results from `output_type=='np'`")
+ self.assertLess(
+ max_diff, expected_max_diff, "`output_type=='pt'` generate different results from `output_type=='np'`"
+ )
max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
self.assertLess(max_diff, 2.0, "`output_type=='pil'` generate different results from `output_type=='np'`")
@@ -188,7 +190,7 @@ def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
- def test_save_load_local(self):
+ def test_save_load_local(self, expected_max_difference=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
@@ -207,7 +209,7 @@ def test_save_load_local(self):
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
- self.assertLess(max_diff, 1e-4)
+ self.assertLess(max_diff, expected_max_difference)
def test_pipeline_call_signature(self):
self.assertTrue(
@@ -308,8 +310,8 @@ def _test_inference_batch_consistent(
logger.setLevel(level=diffusers.logging.WARNING)
- def test_inference_batch_single_identical(self, batch_size=3):
- self._test_inference_batch_single_identical(batch_size=batch_size)
+ def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=1e-4):
+ self._test_inference_batch_single_identical(batch_size=batch_size, expected_max_diff=expected_max_diff)
def _test_inference_batch_single_identical(
self,
@@ -391,7 +393,7 @@ def _test_inference_batch_single_identical(
if test_mean_pixel_difference:
assert_mean_pixel_difference(output_batch[0][0], output[0][0])
- def test_dict_tuple_outputs_equivalent(self):
+ def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
@@ -401,7 +403,7 @@ def test_dict_tuple_outputs_equivalent(self):
output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
- self.assertLess(max_diff, 1e-4)
+ self.assertLess(max_diff, expected_max_difference)
def test_components_function(self):
init_components = self.get_dummy_components()
@@ -411,7 +413,7 @@ def test_components_function(self):
self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
- def test_float16_inference(self):
+ def test_float16_inference(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
@@ -425,10 +427,10 @@ def test_float16_inference(self):
output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0]
max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
- self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.")
+ self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
@unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
- def test_save_load_float16(self):
+ def test_save_load_float16(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
for name, module in components.items():
if hasattr(module, "half"):
@@ -457,9 +459,11 @@ def test_save_load_float16(self):
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
- self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.")
+ self.assertLess(
+ max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
+ )
- def test_save_load_optional_components(self):
+ def test_save_load_optional_components(self, expected_max_difference=1e-4):
if not hasattr(self.pipeline_class, "_optional_components"):
return
@@ -491,7 +495,7 @@ def test_save_load_optional_components(self):
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
- self.assertLess(max_diff, 1e-4)
+ self.assertLess(max_diff, expected_max_difference)
@unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
def test_to_device(self):
@@ -525,8 +529,8 @@ def test_to_dtype(self):
model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
- def test_attention_slicing_forward_pass(self):
- self._test_attention_slicing_forward_pass()
+ def test_attention_slicing_forward_pass(self, expected_max_diff=1e-3):
+ self._test_attention_slicing_forward_pass(expected_max_diff=expected_max_diff)
def _test_attention_slicing_forward_pass(
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
@@ -557,7 +561,7 @@ def _test_attention_slicing_forward_pass(
torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
)
- def test_cpu_offload_forward_pass(self):
+ def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
if not self.test_cpu_offload:
return
@@ -574,7 +578,7 @@ def test_cpu_offload_forward_pass(self):
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
- self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
+ self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
@@ -657,8 +661,8 @@ def test_num_images_per_prompt(self):
# Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used.
# This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a
# reference image.
-def assert_mean_pixel_difference(image, expected_image):
+def assert_mean_pixel_difference(image, expected_image, expected_max_diff=10):
image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32)
expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
avg_diff = np.abs(image - expected_image).mean()
- assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average"
+ assert avg_diff < expected_max_diff, f"Error image deviates {avg_diff} pixels on average"
diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py
index b59653694616..212becbb6729 100644
--- a/tests/pipelines/text_to_video/test_text_to_video.py
+++ b/tests/pipelines/text_to_video/test_text_to_video.py
@@ -33,6 +33,7 @@
torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
@skip_mps
@@ -140,7 +141,7 @@ def test_text_to_video_default_case(self):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_attention_slicing_forward_pass(self):
- self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
+ self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py
index 5c9181c08e3f..5357e5b0e7ef 100644
--- a/tests/pipelines/unclip/test_unclip.py
+++ b/tests/pipelines/unclip/test_unclip.py
@@ -29,6 +29,10 @@
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = UnCLIPPipeline
params = TEXT_TO_IMAGE_PARAMS - {
diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py
index c1b8be9cd49e..ded162102dd6 100644
--- a/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -43,6 +43,10 @@
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = UnCLIPImageVariationPipeline
params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"}
@@ -516,4 +520,4 @@ def test_unclip_image_variation_karlo(self):
assert image.shape == (256, 256, 3)
- assert_mean_pixel_difference(image, expected_image)
+ assert_mean_pixel_difference(image, expected_image, 15)
diff --git a/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
index d97a7b2f6564..3f5ef16cff72 100644
--- a/tests/pipelines/vq_diffusion/test_vq_diffusion.py
+++ b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
@@ -189,7 +189,7 @@ def test_vq_diffusion_classifier_free_sampling(self):
expected_slice = np.array([0.6693, 0.6075, 0.4959, 0.5701, 0.5583, 0.4333, 0.6171, 0.5684, 0.4988])
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 2.0
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -225,4 +225,4 @@ def test_vq_diffusion_classifier_free_sampling(self):
image = output.images[0]
assert image.shape == (256, 256, 3)
- assert np.abs(expected_image - image).max() < 1e-2
+ assert np.abs(expected_image - image).max() < 2.0
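The tolerance bumps in the hunks above go hand in hand with the new module-level `torch.use_deterministic_algorithms(True)` calls: once deterministic kernels are enforced, ops without a deterministic CUDA implementation (such as `adaptive_avg_pool2d_backward_cuda`, hit by `test_download_from_git`) raise unless the flag is temporarily turned off. A minimal sketch of that toggle pattern, using a hypothetical helper that restores the flag even if the wrapped call fails:

```python
import torch

# Module-level test configuration mirroring the patch: disable TF32 matmuls
# and force deterministic kernels for reproducible output slices.
torch.backends.cuda.matmul.allow_tf32 = False
torch.use_deterministic_algorithms(True)


def run_without_determinism(fn, *args, **kwargs):
    """Illustrative helper: temporarily allow non-deterministic kernels for ops
    that have no deterministic implementation, then restore the global flag."""
    torch.use_deterministic_algorithms(False)
    try:
        return fn(*args, **kwargs)
    finally:
        torch.use_deterministic_algorithms(True)
```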
From d71db894eb515152f37db97627a5c09700c9b469 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Thu, 11 May 2023 08:52:28 -0700
Subject: [PATCH 068/206] [docs] Add transformers to install (#3388)
add transformers to install
---
docs/source/en/installation.mdx | 46 +++++++++++++++++----------------
1 file changed, 24 insertions(+), 22 deletions(-)
diff --git a/docs/source/en/installation.mdx b/docs/source/en/installation.mdx
index 8639bcfca95b..218ccd7bc4f6 100644
--- a/docs/source/en/installation.mdx
+++ b/docs/source/en/installation.mdx
@@ -12,9 +12,9 @@ specific language governing permissions and limitations under the License.
# Installation
-Install 🤗 Diffusers for whichever deep learning library you’re working with.
+Install 🤗 Diffusers for whichever deep learning library you're working with.
-🤗 Diffusers is tested on Python 3.7+, PyTorch 1.7.0+ and flax. Follow the installation instructions below for the deep learning library you are using:
+🤗 Diffusers is tested on Python 3.7+, PyTorch 1.7.0+ and Flax. Follow the installation instructions below for the deep learning library you are using:
- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.
@@ -37,27 +37,28 @@ Activate the virtual environment:
source .env/bin/activate
```
-Now you're ready to install 🤗 Diffusers with the following command:
-
-**For PyTorch**
+🤗 Diffusers also relies on the 🤗 Transformers library, and you can install both with the following command:
+
+
```bash
-pip install diffusers["torch"]
+pip install diffusers["torch"] transformers
```
-
-**For Flax**
-
+
+
```bash
-pip install diffusers["flax"]
+pip install diffusers["flax"] transformers
```
+
+
## Install from source
-Before intsalling `diffusers` from source, make sure you have `torch` and `accelerate` installed.
+Before installing 🤗 Diffusers from source, make sure you have `torch` and 🤗 Accelerate installed.
-For `torch` installation refer to the `torch` [docs](https://pytorch.org/get-started/locally/#start-locally).
+For `torch` installation, refer to the `torch` [installation](https://pytorch.org/get-started/locally/#start-locally) guide.
-To install `accelerate`
+To install 🤗 Accelerate:
```bash
pip install accelerate
@@ -74,7 +75,7 @@ The `main` version is useful for staying up-to-date with the latest developments
For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
However, this means the `main` version may not always be stable.
We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
-If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues), so we can fix it even sooner!
+If you run into a problem, please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose), so we can fix it even sooner!
## Editable install
@@ -90,21 +91,22 @@ git clone https://github.com/huggingface/diffusers.git
cd diffusers
```
-**For PyTorch**
-
-```
+
+
+```bash
pip install -e ".[torch]"
```
-
-**For Flax**
-
-```
+
+
+```bash
pip install -e ".[flax]"
```
+
+
These commands will link the folder you cloned the repository to and your Python library paths.
Python will now look inside the folder you cloned to in addition to the normal library paths.
-For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the folder you cloned to: `~/diffusers/`.
+For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
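Since the quickstart now installs 🤗 Diffusers together with 🤗 Transformers, a quick way to confirm that either the pip or the editable install worked is to import both libraries and load a tiny test checkpoint. This is a minimal sketch, not part of the docs change; the tiny model id is the one already used by the test suite in this series:

```python
import diffusers
import transformers
from diffusers import DiffusionPipeline

print("diffusers:", diffusers.__version__)
print("transformers:", transformers.__version__)

# Loading a pipeline exercises both libraries end to end (the text encoder and
# tokenizer come from transformers, the UNet/VAE/scheduler from diffusers).
pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
print(type(pipe).__name__)
```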
From af2a237676ada656889de5e5b96ce609e37ed8c4 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 11 May 2023 08:59:20 -0700
Subject: [PATCH 069/206] [deepspeed] partial ZeRO-3 support (#3076)
* [deepspeed] partial ZeRO-3 support
* cleanup
* improve deepspeed fixes
* Improve
* make style
---------
Co-authored-by: Patrick von Platen
---
examples/text_to_image/train_text_to_image.py | 34 ++++++++++++++++---
src/diffusers/training_utils.py | 23 ++++++++++---
2 files changed, 48 insertions(+), 9 deletions(-)
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index f9592e5adca3..1a6f4cde27ab 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -29,6 +29,7 @@
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
@@ -36,6 +37,7 @@
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
+from transformers.utils import ContextManagers
import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
@@ -464,10 +466,34 @@ def main():
tokenizer = CLIPTokenizer.from_pretrained(
args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
)
- text_encoder = CLIPTextModel.from_pretrained(
- args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
- )
- vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+
+ def deepspeed_zero_init_disabled_context_manager():
+ """
+ returns either a context list that includes one that will disable zero.Init or an empty context list
+ """
+ deepspeed_plugin = AcceleratorState() if accelerate.state.is_initialized() else None
+ if deepspeed_plugin is None:
+ return []
+
+ return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
+
+ # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3.
+ # For this to work properly all models must be run through `accelerate.prepare`. But accelerate
+ # will try to assign the same optimizer with the same weights to all models during
+ # `deepspeed.initialize`, which of course doesn't work.
+ #
+ # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2
+ # frozen models from being partitioned during `zero.Init` which gets called during
+ # `from_pretrained`. So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding
+ # across multiple GPUs and only UNet2DConditionModel will get ZeRO sharded.
+ with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
+ text_encoder = CLIPTextModel.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+ )
+ vae = AutoencoderKL.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+ )
+
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
)
diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py
index 340b96e29ac5..1a3abb49a065 100644
--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -1,3 +1,4 @@
+import contextlib
import copy
import os
import random
@@ -6,7 +7,11 @@
import numpy as np
import torch
-from .utils import deprecate
+from .utils import deprecate, is_transformers_available
+
+
+if is_transformers_available():
+ import transformers
def enable_full_determinism(seed: int):
@@ -197,11 +202,19 @@ def step(self, parameters: Iterable[torch.nn.Parameter]):
self.cur_decay_value = decay
one_minus_decay = 1 - decay
+ context_manager = contextlib.nullcontext
+ if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
+ import deepspeed
+
for s_param, param in zip(self.shadow_params, parameters):
- if param.requires_grad:
- s_param.sub_(one_minus_decay * (s_param - param))
- else:
- s_param.copy_(param)
+ if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
+ context_manager = deepspeed.zero.GatheredParameters(param, modifier_rank=None)
+
+ with context_manager():
+ if param.requires_grad:
+ s_param.sub_(one_minus_decay * (s_param - param))
+ else:
+ s_param.copy_(param)
def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
"""
From 58c6f9cb719cf6ee5fda9302801f3030c73b83a9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 11 May 2023 19:03:27 +0200
Subject: [PATCH 070/206] Add omegaconf for tests (#3400)
Add omegaconf
---
docker/diffusers-pytorch-cuda/Dockerfile | 3 ++-
setup.py | 2 ++
src/diffusers/dependency_versions_table.py | 1 +
3 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile
index 8087be429996..6946685be280 100644
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -37,6 +37,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
numpy \
scipy \
tensorboard \
- transformers
+ transformers \
+ omegaconf
CMD ["/bin/bash"]
diff --git a/setup.py b/setup.py
index e8c80c492f88..a972df80b509 100644
--- a/setup.py
+++ b/setup.py
@@ -96,6 +96,7 @@
"k-diffusion>=0.0.12",
"librosa",
"numpy",
+ "omegaconf",
"parameterized",
"protobuf>=3.20.3,<4",
"pytest",
@@ -191,6 +192,7 @@ def run(self):
"Jinja2",
"k-diffusion",
"librosa",
+ "omegaconf",
"parameterized",
"pytest",
"pytest-timeout",
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 0b6f544b9817..19a843470ee1 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -20,6 +20,7 @@
"k-diffusion": "k-diffusion>=0.0.12",
"librosa": "librosa",
"numpy": "numpy",
+ "omegaconf": "omegaconf",
"parameterized": "parameterized",
"protobuf": "protobuf>=3.20.3,<4",
"pytest": "pytest",
From f92253015cc27557d78af62f7c62038dd8afd6f4 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 11 May 2023 20:28:09 +0200
Subject: [PATCH 071/206] Fix various bugs with LoRA Dreambooth and Dreambooth
script (#3353)
* Improve checkpointing lora
* fix more
* Improve doc string
* Update src/diffusers/loaders.py
* make style
* Apply suggestions from code review
* Update src/diffusers/loaders.py
* Apply suggestions from code review
* Apply suggestions from code review
* better
* Fix all
* Fix multi-GPU dreambooth
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* Fix all
* make style
* make style
---------
Co-authored-by: Pedro Cuenca
---
examples/dreambooth/train_dreambooth.py | 55 +++++++-------
examples/dreambooth/train_dreambooth_lora.py | 80 +++++++++++++++++---
src/diffusers/loaders.py | 53 ++++++++++---
3 files changed, 135 insertions(+), 53 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 190f4625a16c..5d2107f024d1 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -22,7 +22,6 @@
import warnings
from pathlib import Path
-import accelerate
import numpy as np
import torch
import torch.nn.functional as F
@@ -733,36 +732,34 @@ def main(args):
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
)
- # `accelerate` 0.16.0 will have better support for customized saving
- if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
- # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
- def save_model_hook(models, weights, output_dir):
- for model in models:
- sub_dir = "unet" if type(model) == type(unet) else "text_encoder"
- model.save_pretrained(os.path.join(output_dir, sub_dir))
-
- # make sure to pop weight so that corresponding model is not saved again
- weights.pop()
-
- def load_model_hook(models, input_dir):
- while len(models) > 0:
- # pop models so that they are not loaded again
- model = models.pop()
-
- if type(model) == type(text_encoder):
- # load transformers style into model
- load_model = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder")
- model.config = load_model.config
- else:
- # load diffusers style into model
- load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
- model.register_to_config(**load_model.config)
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ for model in models:
+ sub_dir = "unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder"
+ model.save_pretrained(os.path.join(output_dir, sub_dir))
+
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+ def load_model_hook(models, input_dir):
+ while len(models) > 0:
+ # pop models so that they are not loaded again
+ model = models.pop()
+
+ if isinstance(model, type(accelerator.unwrap_model(text_encoder))):
+ # load transformers style into model
+ load_model = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder")
+ model.config = load_model.config
+ else:
+ # load diffusers style into model
+ load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
+ model.register_to_config(**load_model.config)
- model.load_state_dict(load_model.state_dict())
- del load_model
+ model.load_state_dict(load_model.state_dict())
+ del load_model
- accelerator.register_save_state_pre_hook(save_model_hook)
- accelerator.register_load_state_pre_hook(load_model_hook)
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
vae.requires_grad_(False)
if not args.train_text_encoder:
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 0bf3333a6209..16adfe4b83fc 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -834,7 +834,6 @@ def main(args):
unet.set_attn_processor(unet_lora_attn_procs)
unet_lora_layers = AttnProcsLayers(unet.attn_processors)
- accelerator.register_for_checkpointing(unet_lora_layers)
# The text encoder comes from 🤗 transformers, so we cannot directly modify it.
# So, instead, we monkey-patch the forward calls of its attention-blocks. For this,
@@ -853,9 +852,68 @@ def main(args):
)
temp_pipeline._modify_text_encoder(text_lora_attn_procs)
text_encoder = temp_pipeline.text_encoder
- accelerator.register_for_checkpointing(text_encoder_lora_layers)
del temp_pipeline
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ # There are only two options here: either just the unet attn processor layers
+ # or both the unet and text encoder attention layers.
+ unet_lora_layers_to_save = None
+ text_encoder_lora_layers_to_save = None
+
+ if args.train_text_encoder:
+ text_encoder_keys = accelerator.unwrap_model(text_encoder_lora_layers).state_dict().keys()
+ unet_keys = accelerator.unwrap_model(unet_lora_layers).state_dict().keys()
+
+ for model in models:
+ state_dict = model.state_dict()
+
+ if (
+ text_encoder_lora_layers is not None
+ and text_encoder_keys is not None
+ and state_dict.keys() == text_encoder_keys
+ ):
+ # text encoder
+ text_encoder_lora_layers_to_save = state_dict
+ elif state_dict.keys() == unet_keys:
+ # unet
+ unet_lora_layers_to_save = state_dict
+
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+ LoraLoaderMixin.save_lora_weights(
+ output_dir,
+ unet_lora_layers=unet_lora_layers_to_save,
+ text_encoder_lora_layers=text_encoder_lora_layers_to_save,
+ )
+
+ def load_model_hook(models, input_dir):
+ # Note we DON'T pass the unet and text encoder here on purpose
+ # so that we don't accidentally override the LoRA layers of
+ # unet_lora_layers and text_encoder_lora_layers which are stored in `models`
+ # with new torch.nn.Modules / weights. We simply use the pipeline class as
+ # an easy way to load the lora checkpoints
+ temp_pipeline = DiffusionPipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ revision=args.revision,
+ torch_dtype=weight_dtype,
+ )
+ temp_pipeline.load_lora_weights(input_dir)
+
+ # load lora weights into models
+ models[0].load_state_dict(AttnProcsLayers(temp_pipeline.unet.attn_processors).state_dict())
+ if len(models) > 1:
+ models[1].load_state_dict(AttnProcsLayers(temp_pipeline.text_encoder_lora_attn_procs).state_dict())
+
+ # delete temporary pipeline and pop models
+ del temp_pipeline
+ for _ in range(len(models)):
+ models.pop()
+
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
+
# Enable TF32 for faster training on Ampere GPUs,
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
if args.allow_tf32:
@@ -1130,17 +1188,10 @@ def compute_text_embeddings(prompt):
progress_bar.update(1)
global_step += 1
- if global_step % args.checkpointing_steps == 0:
- if accelerator.is_main_process:
+ if accelerator.is_main_process:
+ if global_step % args.checkpointing_steps == 0:
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
- # We combine the text encoder and UNet LoRA parameters with a simple
- # custom logic. `accelerator.save_state()` won't know that. So,
- # use `LoraLoaderMixin.save_lora_weights()`.
- LoraLoaderMixin.save_lora_weights(
- save_directory=save_path,
- unet_lora_layers=unet_lora_layers,
- text_encoder_lora_layers=text_encoder_lora_layers,
- )
+ accelerator.save_state(save_path)
logger.info(f"Saved state to {save_path}")
logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
@@ -1217,8 +1268,12 @@ def compute_text_embeddings(prompt):
accelerator.wait_for_everyone()
if accelerator.is_main_process:
unet = unet.to(torch.float32)
+ unet_lora_layers = accelerator.unwrap_model(unet_lora_layers)
+
if text_encoder is not None:
text_encoder = text_encoder.to(torch.float32)
+ text_encoder_lora_layers = accelerator.unwrap_model(text_encoder_lora_layers)
+
LoraLoaderMixin.save_lora_weights(
save_directory=args.output_dir,
unet_lora_layers=unet_lora_layers,
@@ -1250,6 +1305,7 @@ def compute_text_embeddings(prompt):
pipeline.load_lora_weights(args.output_dir)
# run inference
+ images = []
if args.validation_prompt and args.num_validation_images > 0:
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
images = [
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 7513fa2732ba..a1f0d8ec2a52 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -70,6 +70,9 @@ def __init__(self, state_dict: Dict[str, torch.Tensor]):
self.mapping = dict(enumerate(state_dict.keys()))
self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
+ # .processor for unet, .k_proj, ".q_proj", ".v_proj", and ".out_proj" for text encoder
+ self.split_keys = [".processor", ".k_proj", ".q_proj", ".v_proj", ".out_proj"]
+
# we add a hook to state_dict() and load_state_dict() so that the
# naming fits with `unet.attn_processors`
def map_to(module, state_dict, *args, **kwargs):
@@ -81,10 +84,19 @@ def map_to(module, state_dict, *args, **kwargs):
return new_state_dict
+ def remap_key(key, state_dict):
+ for k in self.split_keys:
+ if k in key:
+ return key.split(k)[0] + k
+
+ raise ValueError(
+ f"There seems to be a problem with the state_dict: {set(state_dict.keys())}. {key} has to have one of {self.split_keys}."
+ )
+
def map_from(module, state_dict, *args, **kwargs):
all_keys = list(state_dict.keys())
for key in all_keys:
- replace_key = key.split(".processor")[0] + ".processor"
+ replace_key = remap_key(key, state_dict)
new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}")
state_dict[new_key] = state_dict[key]
del state_dict[key]
@@ -898,6 +910,9 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
attn_procs_text_encoder = self._load_text_encoder_attn_procs(text_encoder_lora_state_dict)
self._modify_text_encoder(attn_procs_text_encoder)
+ # save lora attn procs of text encoder so that it can be easily retrieved
+ self._text_encoder_lora_attn_procs = attn_procs_text_encoder
+
# Otherwise, we're dealing with the old format. This means the `state_dict` should only
# contain the module names of the `unet` as its keys WITHOUT any prefix.
elif not all(
@@ -907,6 +922,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`."
warnings.warn(warn_message)
+ @property
+ def text_encoder_lora_attn_procs(self):
+ if hasattr(self, "_text_encoder_lora_attn_procs"):
+ return self._text_encoder_lora_attn_procs
+ return
+
def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]):
r"""
Monkey-patches the forward passes of attention modules of the text encoder.
@@ -1110,7 +1131,7 @@ def _load_text_encoder_attn_procs(
def save_lora_weights(
self,
save_directory: Union[str, os.PathLike],
- unet_lora_layers: Dict[str, torch.nn.Module] = None,
+ unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
text_encoder_lora_layers: Dict[str, torch.nn.Module] = None,
is_main_process: bool = True,
weight_name: str = None,
@@ -1123,13 +1144,14 @@ def save_lora_weights(
Arguments:
save_directory (`str` or `os.PathLike`):
Directory to which to save. Will be created if it doesn't exist.
- unet_lora_layers (`Dict[str, torch.nn.Module`]):
+ unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
State dict of the LoRA layers corresponding to the UNet. Specifying this helps to make the
- serialization process easier and cleaner.
- text_encoder_lora_layers (`Dict[str, torch.nn.Module`]):
+ serialization process easier and cleaner. Values can be either LoRA torch.nn.Module layers or torch
+ weights.
+ text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
State dict of the LoRA layers corresponding to the `text_encoder`. Since the `text_encoder` comes from
`transformers`, we cannot rejig it. That is why we have to explicitly pass the text encoder LoRA state
- dict.
+ dict. Values can be either LoRA torch.nn.Module layers or torch weights.
is_main_process (`bool`, *optional*, defaults to `True`):
Whether the process calling this is the main process or not. Useful when in distributed training like
TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
@@ -1157,15 +1179,22 @@ def save_function(weights, filename):
# Create a flat dictionary.
state_dict = {}
if unet_lora_layers is not None:
- unet_lora_state_dict = {
- f"{self.unet_name}.{module_name}": param
- for module_name, param in unet_lora_layers.state_dict().items()
- }
+ weights = (
+ unet_lora_layers.state_dict() if isinstance(unet_lora_layers, torch.nn.Module) else unet_lora_layers
+ )
+
+ unet_lora_state_dict = {f"{self.unet_name}.{module_name}": param for module_name, param in weights.items()}
state_dict.update(unet_lora_state_dict)
+
if text_encoder_lora_layers is not None:
+ weights = (
+ text_encoder_lora_layers.state_dict()
+ if isinstance(text_encoder_lora_layers, torch.nn.Module)
+ else text_encoder_lora_layers
+ )
+
text_encoder_lora_state_dict = {
- f"{self.text_encoder_name}.{module_name}": param
- for module_name, param in text_encoder_lora_layers.state_dict().items()
+ f"{self.text_encoder_name}.{module_name}": param for module_name, param in weights.items()
}
state_dict.update(text_encoder_lora_state_dict)
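The common thread of this fix is that checkpointing now runs through Accelerate's save/load state hooks, so a plain `accelerator.save_state(path)` / `accelerator.load_state(path)` round-trip produces and consumes checkpoints in the desired layout; the LoRA script follows the same pattern but serializes through `LoraLoaderMixin.save_lora_weights`. A condensed sketch of the hook wiring used by the non-LoRA Dreambooth script, with the argument plumbing simplified:

```python
import os

from diffusers import UNet2DConditionModel


def register_checkpoint_hooks(accelerator, unet, text_encoder, text_encoder_cls):
    # Write each tracked model to its own subfolder on `accelerator.save_state(...)`
    # and reload it in the matching load hook.
    def save_model_hook(models, weights, output_dir):
        for model in models:
            sub_dir = "unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder"
            model.save_pretrained(os.path.join(output_dir, sub_dir))
            # Pop the weights so Accelerate does not serialize this model a second time.
            weights.pop()

    def load_model_hook(models, input_dir):
        while len(models) > 0:
            model = models.pop()
            if isinstance(model, type(accelerator.unwrap_model(text_encoder))):
                loaded = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder")
                model.config = loaded.config
            else:
                loaded = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet")
                model.register_to_config(**loaded.config)
            model.load_state_dict(loaded.state_dict())
            del loaded

    accelerator.register_save_state_pre_hook(save_model_hook)
    accelerator.register_load_state_pre_hook(load_model_hook)
```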
From 1a5797c6d4491a879ea5285c4efc377664e0332d Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 11 May 2023 21:28:37 +0200
Subject: [PATCH 072/206] Fix docker file (#3402)
* up
* up
---
docker/diffusers-pytorch-cuda/Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile
index 6946685be280..a51a12ee2838 100644
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -26,7 +26,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
torch \
torchvision \
- torchaudio \
+ torchaudio && \
python3 -m pip install --no-cache-dir \
accelerate \
datasets \
From 3a237f4fa25dd5a3f354428f0e2e869d08089dc7 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 12 May 2023 14:32:22 +0530
Subject: [PATCH 073/206] fix: deepspeed_plugin retrieval from accelerate state
(#3410)
---
examples/text_to_image/train_text_to_image.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 1a6f4cde27ab..82370fc4e2dd 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -471,7 +471,7 @@ def deepspeed_zero_init_disabled_context_manager():
"""
returns either a context list that includes one that will disable zero.Init or an empty context list
"""
- deepspeed_plugin = AcceleratorState() if accelerate.state.is_initialized() else None
+ deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None
if deepspeed_plugin is None:
return []
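The one-line change matters because `AcceleratorState()` itself is a valid, non-`None` object whenever Accelerate has been initialized; it is the `deepspeed_plugin` attribute that indicates whether DeepSpeed is actually configured. A minimal sketch of the corrected retrieval:

```python
import accelerate
from accelerate.state import AcceleratorState


def get_deepspeed_plugin():
    # Returns None when Accelerate is not initialized or DeepSpeed is not in use.
    if not accelerate.state.is_initialized():
        return None
    return AcceleratorState().deepspeed_plugin
```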
From 7f6373d2640e25ae99d520eef0b94649e8134dfe Mon Sep 17 00:00:00 2001
From: Laureηt
Date: Fri, 12 May 2023 13:48:26 +0200
Subject: [PATCH 074/206] [Docs] Add `sigmoid` beta_scheduler to docstrings of
relevant Schedulers (#3399)
* Add `sigmoid` beta scheduler to `DDPMScheduler` docstring
* Add `sigmoid` beta scheduler to `RePaintScheduler` docstring
---------
Co-authored-by: Patrick von Platen
---
src/diffusers/schedulers/scheduling_ddpm.py | 2 +-
src/diffusers/schedulers/scheduling_repaint.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index a8a71fe420aa..5d24766d68c7 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -91,7 +91,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
beta_end (`float`): the final `beta` value.
beta_schedule (`str`):
the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
- `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ `linear`, `scaled_linear`, `squaredcos_cap_v2` or `sigmoid`.
trained_betas (`np.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
variance_type (`str`):
diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py
index 96af210f06b1..f2f97b38f3d3 100644
--- a/src/diffusers/schedulers/scheduling_repaint.py
+++ b/src/diffusers/schedulers/scheduling_repaint.py
@@ -89,7 +89,7 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin):
beta_end (`float`): the final `beta` value.
beta_schedule (`str`):
the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
- `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ `linear`, `scaled_linear`, `squaredcos_cap_v2` or `sigmoid`.
eta (`float`):
The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 -0.0 is DDIM and
1.0 is DDPM scheduler respectively.
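The newly documented option is selected the same way as the other schedules, purely through the scheduler config; a minimal usage sketch with the default `beta_start`/`beta_end`:

```python
from diffusers import DDPMScheduler

# The sigmoid schedule maps a sigmoid ramp onto [beta_start, beta_end].
scheduler = DDPMScheduler(beta_schedule="sigmoid")
print(scheduler.betas[:3], scheduler.betas[-3:])
```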
From b1b92f4a98442f8d820fb271272bbc24e384146e Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 12 May 2023 14:14:04 +0200
Subject: [PATCH 075/206] Don't install accelerate and transformers from source
(#3415)
---
.github/workflows/push_tests_fast.yml | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml
index 525df28cbaa8..50ef729161d3 100644
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -62,8 +62,6 @@ jobs:
run: |
apt-get update && apt-get install libsndfile1-dev -y
python -m pip install -e .[quality,test]
- python -m pip install -U git+https://github.com/huggingface/transformers
- python -m pip install git+https://github.com/huggingface/accelerate
- name: Environment
run: |
@@ -137,8 +135,8 @@ jobs:
${CONDA_RUN} python -m pip install --upgrade pip
${CONDA_RUN} python -m pip install -e .[quality,test]
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
- ${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
+ ${CONDA_RUN} python -m pip install accelerate --upgrade
+ ${CONDA_RUN} python -m pip install transformers --upgrade
- name: Environment
shell: arch -arch arm64 bash {0}
From 03e51269782bd79e453232d1d3417c77e5412819 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 12 May 2023 14:15:23 +0200
Subject: [PATCH 076/206] Don't install transformers and accelerate from source
(#3414)
---
.github/workflows/push_tests.yml | 4 ----
1 file changed, 4 deletions(-)
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 7966a416fcf1..749da4dea81a 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -61,8 +61,6 @@ jobs:
- name: Install dependencies
run: |
python -m pip install -e .[quality,test]
- python -m pip install -U git+https://github.com/huggingface/transformers
- python -m pip install git+https://github.com/huggingface/accelerate
- name: Environment
run: |
@@ -134,8 +132,6 @@ jobs:
- name: Install dependencies
run: |
python -m pip install -e .[quality,test,training]
- python -m pip install git+https://github.com/huggingface/accelerate
- python -m pip install -U git+https://github.com/huggingface/transformers
- name: Environment
run: |
From 28f404349d69da1af7b52f18b022bc7971951a41 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 12 May 2023 15:01:03 +0200
Subject: [PATCH 077/206] Improve fast tests (#3416)
Update pr_tests.yml
---
.github/workflows/pr_tests.yml | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index d06b576fa631..23a7659166c0 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -69,8 +69,6 @@ jobs:
run: |
apt-get update && apt-get install libsndfile1-dev -y
python -m pip install -e .[quality,test]
- python -m pip install -U git+https://github.com/huggingface/transformers
- python -m pip install git+https://github.com/huggingface/accelerate
- name: Environment
run: |
@@ -152,8 +150,8 @@ jobs:
${CONDA_RUN} python -m pip install --upgrade pip
${CONDA_RUN} python -m pip install -e .[quality,test]
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
- ${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
+ ${CONDA_RUN} python -m pip install accelerate --upgrade
+ ${CONDA_RUN} python -m pip install transformers --upgrade
- name: Environment
shell: arch -arch arm64 bash {0}
From 909742dbd6873052995dc6cd5f4150ff238015d2 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Fri, 12 May 2023 08:54:09 -0600
Subject: [PATCH 078/206] attention refactor: the trilogy (#3387)
* Replace `AttentionBlock` with `Attention`
* use _from_deprecated_attn_block check re: @patrickvonplaten
---
src/diffusers/models/attention.py | 174 +-----------------
src/diffusers/models/attention_processor.py | 129 ++++++++++++-
src/diffusers/models/modeling_utils.py | 46 +++++
src/diffusers/models/unet_2d_blocks.py | 67 +++++--
.../pipeline_stable_diffusion_upscale.py | 12 +-
tests/models/test_layers_utils.py | 55 +-----
6 files changed, 235 insertions(+), 248 deletions(-)
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 134f84fc9d50..0b313b83d360 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -11,189 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import math
-from typing import Callable, Optional
+from typing import Optional
import torch
import torch.nn.functional as F
from torch import nn
from ..utils import maybe_allow_in_graph
-from ..utils.import_utils import is_xformers_available
from .attention_processor import Attention
from .embeddings import CombinedTimestepLabelEmbeddings
-if is_xformers_available():
- import xformers
- import xformers.ops
-else:
- xformers = None
-
-
-class AttentionBlock(nn.Module):
- """
- An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
- to the N-d case.
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
- Uses three q, k, v linear layers to compute attention.
-
- Parameters:
- channels (`int`): The number of channels in the input and output.
- num_head_channels (`int`, *optional*):
- The number of channels in each head. If None, then `num_heads` = 1.
- norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
- rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
- eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
- """
-
- # IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. Do not use it anymore
-
- def __init__(
- self,
- channels: int,
- num_head_channels: Optional[int] = None,
- norm_num_groups: int = 32,
- rescale_output_factor: float = 1.0,
- eps: float = 1e-5,
- ):
- super().__init__()
- self.channels = channels
-
- self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
- self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
-
- # define q,k,v as linear layers
- self.query = nn.Linear(channels, channels)
- self.key = nn.Linear(channels, channels)
- self.value = nn.Linear(channels, channels)
-
- self.rescale_output_factor = rescale_output_factor
- self.proj_attn = nn.Linear(channels, channels, bias=True)
-
- self._use_memory_efficient_attention_xformers = False
- self._use_2_0_attn = True
- self._attention_op = None
-
- def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True):
- batch_size, seq_len, dim = tensor.shape
- head_size = self.num_heads
- tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
- tensor = tensor.permute(0, 2, 1, 3)
- if merge_head_and_batch:
- tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
- return tensor
-
- def reshape_batch_dim_to_heads(self, tensor, unmerge_head_and_batch=True):
- head_size = self.num_heads
-
- if unmerge_head_and_batch:
- batch_head_size, seq_len, dim = tensor.shape
- batch_size = batch_head_size // head_size
-
- tensor = tensor.reshape(batch_size, head_size, seq_len, dim)
- else:
- batch_size, _, seq_len, dim = tensor.shape
-
- tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size, seq_len, dim * head_size)
- return tensor
-
- def set_use_memory_efficient_attention_xformers(
- self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
- ):
- if use_memory_efficient_attention_xformers:
- if not is_xformers_available():
- raise ModuleNotFoundError(
- (
- "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
- " xformers"
- ),
- name="xformers",
- )
- elif not torch.cuda.is_available():
- raise ValueError(
- "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
- " only available for GPU "
- )
- else:
- try:
- # Make sure we can run the memory efficient attention
- _ = xformers.ops.memory_efficient_attention(
- torch.randn((1, 2, 40), device="cuda"),
- torch.randn((1, 2, 40), device="cuda"),
- torch.randn((1, 2, 40), device="cuda"),
- )
- except Exception as e:
- raise e
- self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
- self._attention_op = attention_op
-
- def forward(self, hidden_states):
- residual = hidden_states
- batch, channel, height, width = hidden_states.shape
-
- # norm
- hidden_states = self.group_norm(hidden_states)
-
- hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
-
- # proj to q, k, v
- query_proj = self.query(hidden_states)
- key_proj = self.key(hidden_states)
- value_proj = self.value(hidden_states)
-
- scale = 1 / math.sqrt(self.channels / self.num_heads)
-
- _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers
- use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn
-
- query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn)
- key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn)
- value_proj = self.reshape_heads_to_batch_dim(value_proj, merge_head_and_batch=not use_torch_2_0_attn)
-
- if self._use_memory_efficient_attention_xformers:
- # Memory efficient attention
- hidden_states = xformers.ops.memory_efficient_attention(
- query_proj, key_proj, value_proj, attn_bias=None, op=self._attention_op, scale=scale
- )
- hidden_states = hidden_states.to(query_proj.dtype)
- elif use_torch_2_0_attn:
- # the output of sdp = (batch, num_heads, seq_len, head_dim)
- # TODO: add support for attn.scale when we move to Torch 2.1
- hidden_states = F.scaled_dot_product_attention(
- query_proj, key_proj, value_proj, dropout_p=0.0, is_causal=False
- )
- hidden_states = hidden_states.to(query_proj.dtype)
- else:
- attention_scores = torch.baddbmm(
- torch.empty(
- query_proj.shape[0],
- query_proj.shape[1],
- key_proj.shape[1],
- dtype=query_proj.dtype,
- device=query_proj.device,
- ),
- query_proj,
- key_proj.transpose(-1, -2),
- beta=0,
- alpha=scale,
- )
- attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
- hidden_states = torch.bmm(attention_probs, value_proj)
-
- # reshape hidden_states
- hidden_states = self.reshape_batch_dim_to_heads(hidden_states, unmerge_head_and_batch=not use_torch_2_0_attn)
-
- # compute next hidden_states
- hidden_states = self.proj_attn(hidden_states)
-
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
-
- # res connect and rescale
- hidden_states = (hidden_states + residual) / self.rescale_output_factor
- return hidden_states
-
-
@maybe_allow_in_graph
class BasicTransformerBlock(nn.Module):
r"""
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index b727c76e2137..f88400da0333 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -65,6 +65,10 @@ def __init__(
out_bias: bool = True,
scale_qk: bool = True,
only_cross_attention: bool = False,
+ eps: float = 1e-5,
+ rescale_output_factor: float = 1.0,
+ residual_connection: bool = False,
+ _from_deprecated_attn_block=False,
processor: Optional["AttnProcessor"] = None,
):
super().__init__()
@@ -72,6 +76,12 @@ def __init__(
cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
+ self.rescale_output_factor = rescale_output_factor
+ self.residual_connection = residual_connection
+
+ # we make use of this private variable to know whether this class is loaded
+ # with an deprecated state dict so that we can convert it on the fly
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
self.scale_qk = scale_qk
self.scale = dim_head**-0.5 if self.scale_qk else 1.0
@@ -91,7 +101,7 @@ def __init__(
)
if norm_num_groups is not None:
- self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
+ self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
else:
self.group_norm = None
@@ -407,10 +417,22 @@ def __call__(
encoder_hidden_states=None,
attention_mask=None,
):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
@@ -434,6 +456,14 @@ def __call__(
# dropout
hidden_states = attn.to_out[1](hidden_states)
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
@@ -474,11 +504,22 @@ def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query)
@@ -502,6 +543,14 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
# dropout
hidden_states = attn.to_out[1](hidden_states)
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
@@ -762,12 +811,23 @@ def __init__(self, attention_op: Optional[Callable] = None):
self.attention_op = attention_op
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
@@ -792,6 +852,15 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
@@ -801,6 +870,14 @@ def __init__(self):
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
@@ -812,6 +889,9 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
# (batch, heads, source_length, target_length)
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
@@ -840,6 +920,15 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
@@ -858,11 +947,22 @@ def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optio
self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query).contiguous()
@@ -887,6 +987,14 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
# dropout
hidden_states = attn.to_out[1](hidden_states)
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
@@ -980,11 +1088,22 @@ def __init__(self, slice_size):
self.slice_size = slice_size
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+ residual = hidden_states
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
query = attn.to_q(hidden_states)
dim = query.shape[-1]
query = attn.head_to_batch_dim(query)
@@ -1025,6 +1144,14 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
# dropout
hidden_states = attn.to_out[1](hidden_states)
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
return hidden_states
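
Each processor modified above repeats the same spatial handling around its query/key/value math. A minimal sketch of that shared pattern, assuming an `attn` object that exposes the new `group_norm`, `residual_connection`, and `rescale_output_factor` attributes, with a caller-supplied `run_attention` callable standing in for the processor-specific attention computation:

```python
# Sketch only: mirrors the 4D-input handling added to every processor above.
def spatial_attention_wrapper(attn, hidden_states, run_attention):
    residual = hidden_states
    input_ndim = hidden_states.ndim

    if input_ndim == 4:
        # (B, C, H, W) -> (B, H*W, C): treat the spatial positions as the sequence
        batch_size, channel, height, width = hidden_states.shape
        hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

    if attn.group_norm is not None:
        # GroupNorm expects channels in dim 1, hence the transpose round-trip
        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

    # Processor-specific QKV projections, attention, and output projection go here
    hidden_states = run_attention(hidden_states)

    if input_ndim == 4:
        # Back to (B, C, H, W)
        hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

    if attn.residual_connection:
        hidden_states = hidden_states + residual

    return hidden_states / attn.rescale_output_factor
```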
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index ef14ec3d09ef..e7cfcd71062f 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -583,6 +583,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
if device_map is None:
param_device = "cpu"
state_dict = load_state_dict(model_file, variant=variant)
+ model._convert_deprecated_attention_blocks(state_dict)
# move the params from meta device to cpu
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
if len(missing_keys) > 0:
@@ -625,6 +626,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
model = cls.from_config(config, **unused_kwargs)
state_dict = load_state_dict(model_file, variant=variant)
+ model._convert_deprecated_attention_blocks(state_dict)
model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
model,
@@ -803,3 +805,47 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool
return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
else:
return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
+
+ def _convert_deprecated_attention_blocks(self, state_dict):
+ deprecated_attention_block_paths = []
+
+ def recursive_find_attn_block(name, module):
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+ deprecated_attention_block_paths.append(name)
+
+ for sub_name, sub_module in module.named_children():
+ sub_name = sub_name if name == "" else f"{name}.{sub_name}"
+ recursive_find_attn_block(sub_name, sub_module)
+
+ recursive_find_attn_block("", self)
+
+ # NOTE: we have to check if the deprecated parameters are in the state dict
+ # because it is possible we are loading from a state dict that was already
+ # converted
+
+ for path in deprecated_attention_block_paths:
+ # group_norm path stays the same
+
+ # query -> to_q
+ if f"{path}.query.weight" in state_dict:
+ state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
+ if f"{path}.query.bias" in state_dict:
+ state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
+
+ # key -> to_k
+ if f"{path}.key.weight" in state_dict:
+ state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
+ if f"{path}.key.bias" in state_dict:
+ state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
+
+ # value -> to_v
+ if f"{path}.value.weight" in state_dict:
+ state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
+ if f"{path}.value.bias" in state_dict:
+ state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
+
+ # proj_attn -> to_out.0
+ if f"{path}.proj_attn.weight" in state_dict:
+ state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
+ if f"{path}.proj_attn.bias" in state_dict:
+ state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
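
A toy example of the key remapping implemented above, with a hypothetical module path and tensor shapes (not taken from the patch), showing how a deprecated attention block's parameters end up under `Attention`-style names:

```python
import torch

# Hypothetical old-style checkpoint keys for a single deprecated attention block
old_state_dict = {
    "mid_block.attentions.0.query.weight": torch.zeros(32, 32),
    "mid_block.attentions.0.key.weight": torch.zeros(32, 32),
    "mid_block.attentions.0.value.weight": torch.zeros(32, 32),
    "mid_block.attentions.0.proj_attn.weight": torch.zeros(32, 32),
}

# Same renames that _convert_deprecated_attention_blocks applies per discovered path
renames = {"query": "to_q", "key": "to_k", "value": "to_v", "proj_attn": "to_out.0"}
path = "mid_block.attentions.0"

for old_name, new_name in renames.items():
    for suffix in ("weight", "bias"):
        old_key = f"{path}.{old_name}.{suffix}"
        if old_key in old_state_dict:
            old_state_dict[f"{path}.{new_name}.{suffix}"] = old_state_dict.pop(old_key)

# Keys now match the Attention module: to_q.weight, to_k.weight, to_v.weight, to_out.0.weight
print(sorted(old_state_dict))
```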
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 2f7b19b7328a..0004f074c563 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -18,7 +18,7 @@
import torch.nn.functional as F
from torch import nn
-from .attention import AdaGroupNorm, AttentionBlock
+from .attention import AdaGroupNorm
from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
from .dual_transformer_2d import DualTransformer2DModel
from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
@@ -427,12 +427,17 @@ def __init__(
for _ in range(num_layers):
if self.add_attention:
attentions.append(
- AttentionBlock(
+ Attention(
in_channels,
- num_head_channels=attn_num_head_channels,
+ heads=in_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else in_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=resnet_groups,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
else:
@@ -711,12 +716,17 @@ def __init__(
)
)
attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=resnet_groups,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
@@ -1060,12 +1070,17 @@ def __init__(
)
)
attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=resnet_groups,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
@@ -1134,11 +1149,17 @@ def __init__(
)
)
self.attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
+ norm_num_groups=32,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
@@ -1703,12 +1724,17 @@ def __init__(
)
)
attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=resnet_groups,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
@@ -2037,12 +2063,17 @@ def __init__(
)
)
attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=resnet_groups,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
@@ -2109,11 +2140,17 @@ def __init__(
)
self.attentions.append(
- AttentionBlock(
+ Attention(
out_channels,
- num_head_channels=attn_num_head_channels,
+ heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
+ dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
+ norm_num_groups=32,
+ residual_connection=True,
+ bias=True,
+ upcast_softmax=True,
+ _from_deprecated_attn_block=True,
)
)
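
The same substitution recurs in every block of this file: `num_head_channels` is turned into an explicit `heads`/`dim_head` pair, and the deprecated block's implicit behavior is restated as keyword arguments. A sketch of the mapping with hypothetical channel counts, assuming a diffusers build that already contains the `Attention` changes from this patch:

```python
# Sketch: assumes a diffusers build with this patch (Attention gains eps,
# rescale_output_factor, residual_connection and _from_deprecated_attn_block).
from diffusers.models.attention_processor import Attention

channels = 512                # hypothetical feature-map channels
attn_num_head_channels = 64   # old API: channels per head; None meant a single head

attn = Attention(
    channels,
    heads=channels // attn_num_head_channels if attn_num_head_channels is not None else 1,
    dim_head=attn_num_head_channels if attn_num_head_channels is not None else channels,
    rescale_output_factor=1.0,
    eps=1e-6,
    norm_num_groups=32,
    residual_connection=True,   # the deprecated block always added its input back
    bias=True,                  # its q/k/v projections carried biases
    upcast_softmax=True,
    _from_deprecated_attn_block=True,  # lets from_pretrained remap old checkpoint keys
)
```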
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index b7530ac4ec5c..6bb463a6a65f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -19,11 +19,11 @@
import numpy as np
import PIL
import torch
-import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor
from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
from ..pipeline_utils import DiffusionPipeline
@@ -709,12 +709,14 @@ def __call__(
# make sure the VAE is in float32 mode, as it overflows in float16
self.vae.to(dtype=torch.float32)
- # TODO(Patrick, William) - clean up when attention is refactored
- use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention")
- use_xformers = self.vae.decoder.mid_block.attentions[0]._use_memory_efficient_attention_xformers
+ use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ ]
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
- if not use_torch_2_0_attn and not use_xformers:
+ if not use_torch_2_0_or_xformers:
self.vae.post_quant_conv.to(latents.dtype)
self.vae.decoder.conv_in.to(latents.dtype)
self.vae.decoder.mid_block.to(latents.dtype)
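
The hunk above keys the float32 upcasting decision on the type of the VAE's attention processor. A hedged sketch of how such a check can be phrased with `isinstance`, so that processor instances rather than classes are matched; this illustrates the idea and is not code from the patch:

```python
# Sketch: report whether the VAE decoder's attention already uses a
# memory-efficient processor, in which case full-float32 VAE decoding is not needed.
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
)


def uses_memory_efficient_attention(vae) -> bool:
    processor = vae.decoder.mid_block.attentions[0].processor
    return isinstance(
        processor,
        (AttnProcessor2_0, XFormersAttnProcessor, LoRAXFormersAttnProcessor),
    )
```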
diff --git a/tests/models/test_layers_utils.py b/tests/models/test_layers_utils.py
index db0d6c78d902..98fa1afcbb9d 100644
--- a/tests/models/test_layers_utils.py
+++ b/tests/models/test_layers_utils.py
@@ -20,7 +20,7 @@
import torch
from torch import nn
-from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU, AttentionBlock
+from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU
from diffusers.models.embeddings import get_timestep_embedding
from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
from diffusers.models.transformer_2d import Transformer2DModel
@@ -314,59 +314,6 @@ def test_restnet_with_kernel_sde_vp(self):
assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
-class AttentionBlockTests(unittest.TestCase):
- @unittest.skipIf(
- torch_device == "mps", "Matmul crashes on MPS, see https://github.com/pytorch/pytorch/issues/84039"
- )
- def test_attention_block_default(self):
- torch.manual_seed(0)
- if torch.cuda.is_available():
- torch.cuda.manual_seed_all(0)
-
- sample = torch.randn(1, 32, 64, 64).to(torch_device)
- attentionBlock = AttentionBlock(
- channels=32,
- num_head_channels=1,
- rescale_output_factor=1.0,
- eps=1e-6,
- norm_num_groups=32,
- ).to(torch_device)
- with torch.no_grad():
- attention_scores = attentionBlock(sample)
-
- assert attention_scores.shape == (1, 32, 64, 64)
- output_slice = attention_scores[0, -1, -3:, -3:]
-
- expected_slice = torch.tensor(
- [-1.4975, -0.0038, -0.7847, -1.4567, 1.1220, -0.8962, -1.7394, 1.1319, -0.5427], device=torch_device
- )
- assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
-
- def test_attention_block_sd(self):
- # This version uses SD params and is compatible with mps
- torch.manual_seed(0)
- if torch.cuda.is_available():
- torch.cuda.manual_seed_all(0)
-
- sample = torch.randn(1, 512, 64, 64).to(torch_device)
- attentionBlock = AttentionBlock(
- channels=512,
- rescale_output_factor=1.0,
- eps=1e-6,
- norm_num_groups=32,
- ).to(torch_device)
- with torch.no_grad():
- attention_scores = attentionBlock(sample)
-
- assert attention_scores.shape == (1, 512, 64, 64)
- output_slice = attention_scores[0, -1, -3:, -3:]
-
- expected_slice = torch.tensor(
- [-0.6621, -0.0156, -3.2766, 0.8025, -0.8609, 0.2820, 0.0905, -1.1179, -3.2126], device=torch_device
- )
- assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
-
-
class Transformer2DModelTests(unittest.TestCase):
def test_spatial_transformer_default(self):
torch.manual_seed(0)
From bdefabd1a8f155235ee4b65c91ef96ce60602c51 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 13 May 2023 15:12:01 +0530
Subject: [PATCH 079/206] [Docs] update the PT 2.0 optimization doc with latest
findings (#3370)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* add: benchmarking stats for A100 and V100.
* Apply suggestions from code review
Co-authored-by: Patrick von Platen
* address patrick's comments.
* add: rtx 4090 stats
* ⚔ benchmark reports done
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* 3313 pr link.
* add: plots.
Co-authored-by: Pedro
* fix formattimg
* update number percent.
---------
Co-authored-by: Patrick von Platen
Co-authored-by: Pedro Cuenca
---
docs/source/en/optimization/torch2.0.mdx | 515 ++++++++++++++++-------
1 file changed, 374 insertions(+), 141 deletions(-)
diff --git a/docs/source/en/optimization/torch2.0.mdx b/docs/source/en/optimization/torch2.0.mdx
index 206ac4e447cc..2bcf3fa82115 100644
--- a/docs/source/en/optimization/torch2.0.mdx
+++ b/docs/source/en/optimization/torch2.0.mdx
@@ -12,19 +12,20 @@ specific language governing permissions and limitations under the License.
# Accelerated PyTorch 2.0 support in Diffusers
-Starting from version `0.13.0`, Diffusers supports the latest optimization from the upcoming [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) release. These include:
-1. Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies required.
+Starting from version `0.13.0`, Diffusers supports the latest optimization from [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). These include:
+1. Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies (such as `xformers`) required.
2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for extra performance boost when individual models are compiled.
## Installation
-To benefit from the accelerated attention implementation and `torch.compile`, you just need to install the latest versions of PyTorch 2.0 from `pip`, and make sure you are on diffusers 0.13.0 or later. As explained below, `diffusers` automatically uses the attention optimizations (but not `torch.compile`) when available.
+To benefit from the accelerated attention implementation and `torch.compile()`, you just need to install the latest versions of PyTorch 2.0 from pip, and make sure you are on diffusers 0.13.0 or later. As explained below, diffusers automatically uses the optimized attention processor ([`AttnProcessor2_0`](https://github.com/huggingface/diffusers/blob/1a5797c6d4491a879ea5285c4efc377664e0332d/src/diffusers/models/attention_processor.py#L798)) (but not `torch.compile()`)
+when PyTorch 2.0 is available.
```bash
pip install --upgrade torch torchvision diffusers
```
-## Using accelerated transformers and torch.compile.
+## Using accelerated transformers and `torch.compile`.
1. **Accelerated Transformers implementation**
@@ -46,13 +47,13 @@ pip install --upgrade torch torchvision diffusers
If you want to enable it explicitly (which is not required), you can do so as shown below.
- ```Python
+ ```diff
import torch
from diffusers import DiffusionPipeline
- from diffusers.models.attention_processor import AttnProcessor2_0
+ + from diffusers.models.attention_processor import AttnProcessor2_0
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
- pipe.unet.set_attn_processor(AttnProcessor2_0())
+ + pipe.unet.set_attn_processor(AttnProcessor2_0())
prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]
@@ -60,151 +61,383 @@ pip install --upgrade torch torchvision diffusers
This should be as fast and memory efficient as `xFormers`. More details [in our benchmark](#benchmark).
+ It is possible to revert to the vanilla attention processor ([`AttnProcessor`](https://github.com/huggingface/diffusers/blob/1a5797c6d4491a879ea5285c4efc377664e0332d/src/diffusers/models/attention_processor.py#L402)), which can be helpful to make the pipeline more deterministic, or if you need to convert a fine-tuned model to other formats such as [Core ML](https://huggingface.co/docs/diffusers/v0.16.0/en/optimization/coreml#how-to-run-stable-diffusion-with-core-ml). To use the normal attention processor you can use the [`~diffusers.UNet2DConditionModel.set_default_attn_processor`] function:
-2. **torch.compile**
-
- To get an additional speedup, we can use the new `torch.compile` feature. To do so, we simply wrap our `unet` with `torch.compile`. For more information and different options, refer to the
- [torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
-
- ```python
+ ```Python
import torch
from diffusers import DiffusionPipeline
+ from diffusers.models.attention_processor import AttnProcessor
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
- pipe.unet = torch.compile(pipe.unet)
+ pipe.unet.set_default_attn_processor()
- batch_size = 10
- prompt = "A photo of an astronaut riding a horse on marse."
+ prompt = "a photo of an astronaut riding a horse on mars"
+ image = pipe(prompt).images[0]
+ ```
+
+2. **torch.compile**
+
+ To get an additional speedup, we can use the new `torch.compile` feature. Since the UNet of the pipeline is usually the most computationally expensive, we wrap the `unet` with `torch.compile` leaving rest of the sub-models (text encoder and VAE) as is. For more information and different options, refer to the
+ [torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
+
+ ```python
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images
```
- Depending on the type of GPU, `compile()` can yield between 2-9% of _additional speed-up_ over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).
+ Depending on the type of GPU, `compile()` can yield between **5% - 300%** of _additional speed-up_ over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).
- Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times.
+ Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times. Calling the compiled pipeline on a different image size will re-trigger compilation which can be expensive.
## Benchmark
-We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, `torch.nn.functional.scaled_dot_product_attention` and `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
-For the benchmark we used the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 50 steps. The `xFormers` benchmark is done using the `torch==1.13.1` version, while the accelerated transformers optimizations are tested using nightly versions of PyTorch 2.0. The tables below summarize the results we got.
-
-Please refer to [our featured blog post in the PyTorch site](https://pytorch.org/blog/accelerated-diffusers-pt-20/) for more details.
-
-### FP16 benchmark
-
-The table below shows the benchmark results for inference using `fp16`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster/slower) on all the GPUs we tested.
-And using `torch.compile` gives further speed-up of up of 10% over `xFormers`, but it's mostly noticeable on the A100 GPU.
-
-___The time reported is in seconds.___
-
-| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) |
-| --- | --- | --- | --- | --- | --- | --- |
-| A100 | 1 | 2.69 | 2.7 | 1.98 | 2.47 | 8.52 |
-| A100 | 2 | 3.21 | 3.04 | 2.38 | 2.78 | 8.55 |
-| A100 | 4 | 5.27 | 3.91 | 3.89 | 3.53 | 9.72 |
-| A100 | 8 | 9.74 | 7.03 | 7.04 | 6.62 | 5.83 |
-| A100 | 10 | 12.02 | 8.7 | 8.67 | 8.45 | 2.87 |
-| A100 | 16 | 18.95 | 13.57 | 13.55 | 13.20 | 2.73 |
-| A100 | 32 (1) | OOM | 26.56 | 26.68 | 25.85 | 2.67 |
-| A100 | 64 | | 52.51 | 53.03 | 50.93 | 3.01 |
-| | | | | | | |
-| A10 | 4 | 13.94 | 9.81 | 10.01 | 9.35 | 4.69 |
-| A10 | 8 | 27.09 | 19 | 19.53 | 18.33 | 3.53 |
-| A10 | 10 | 33.69 | 23.53 | 24.19 | 22.52 | 4.29 |
-| A10 | 16 | OOM | 37.55 | 38.31 | 36.81 | 1.97 |
-| A10 | 32 (1) | | 77.19 | 78.43 | 76.64 | 0.71 |
-| A10 | 64 (1) | | 173.59 | 158.99 | 155.14 | 10.63 |
-| | | | | | | |
-| T4 | 4 | 38.81 | 30.09 | 29.74 | 27.55 | 8.44 |
-| T4 | 8 | OOM | 55.71 | 55.99 | 53.85 | 3.34 |
-| T4 | 10 | OOM | 68.96 | 69.86 | 65.35 | 5.23 |
-| T4 | 16 | OOM | 111.47 | 113.26 | 106.93 | 4.07 |
-| | | | | | | |
-| V100 | 4 | 9.84 | 8.16 | 8.09 | 7.65 | 6.25 |
-| V100 | 8 | OOM | 15.62 | 15.44 | 14.59 | 6.59 |
-| V100 | 10 | OOM | 19.52 | 19.28 | 18.18 | 6.86 |
-| V100 | 16 | OOM | 30.29 | 29.84 | 28.22 | 6.83 |
-| | | | | | | |
-| 3090 | 1 | 2.94 | 2.5 | 2.42 | 2.33 | 6.80 |
-| 3090 | 4 | 10.04 | 7.82 | 7.72 | 7.38 | 5.63 |
-| 3090 | 8 | 19.27 | 14.97 | 14.88 | 14.15 | 5.48 |
-| 3090 | 10| 24.08 | 18.7 | 18.62 | 18.12 | 3.10 |
-| 3090 | 16 | OOM | 29.06 | 28.88 | 28.2 | 2.96 |
-| 3090 | 32 (1) | | 58.05 | 57.42 | 56.28 | 3.05 |
-| 3090 | 64 (1) | | 126.54 | 114.27 | 112.21 | 11.32 |
-| | | | | | | |
-| 3090 Ti | 1 | 2.7 | 2.26 | 2.19 | 2.12 | 6.19 |
-| 3090 Ti | 4 | 9.07 | 7.14 | 7.00 | 6.71 | 6.02 |
-| 3090 Ti | 8 | 17.51 | 13.65 | 13.53 | 12.94 | 5.20 |
-| 3090 Ti | 10 (2) | 21.79 | 16.85 | 16.77 | 16.44 | 2.43 |
-| 3090 Ti | 16 | OOM | 26.1 | 26.04 | 25.53 | 2.18 |
-| 3090 Ti | 32 (1) | | 51.78 | 51.71 | 50.91 | 1.68 |
-| 3090 Ti | 64 (1) | | 112.02 | 102.78 | 100.89 | 9.94 |
-| | | | | | | |
-| 4090 | 1 | 4.47 | 3.98 | 1.28 | 1.21 | 69.60 |
-| 4090 | 4 | 10.48 | 8.37 | 3.76 | 3.56 | 57.47 |
-| 4090 | 8 | 14.33 | 10.22 | 7.43 | 6.99 | 31.60 |
-| 4090 | 16 | | 17.07 | 14.98 | 14.58 | 14.59 |
-| 4090 | 32 (1) | | 39.03 | 30.18 | 29.49 | 24.44 |
-| 4090 | 64 (1) | | 77.29 | 61.34 | 59.96 | 22.42 |
-
-
-
-### FP32 benchmark
-
-The table below shows the benchmark results for inference using `fp32`. In this case, `torch.nn.functional.scaled_dot_product_attention` is faster than `xFormers` on all the GPUs we tested.
-
-Using `torch.compile` in addition to the accelerated transformers implementation can yield up to 19% performance improvement over `xFormers` in Ampere and Ada cards, and up to 20% (Ampere) or 28% (Ada) over vanilla attention.
-
-| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) | Speed over vanilla (%) |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| A100 | 1 | 4.97 | 3.86 | 2.6 | 2.86 | 25.91 | 42.45 |
-| A100 | 2 | 9.03 | 6.76 | 4.41 | 4.21 | 37.72 | 53.38 |
-| A100 | 4 | 16.70 | 12.42 | 7.94 | 7.54 | 39.29 | 54.85 |
-| A100 | 10 | OOM | 29.93 | 18.70 | 18.46 | 38.32 | |
-| A100 | 16 | | 47.08 | 29.41 | 29.04 | 38.32 | |
-| A100 | 32 | | 92.89 | 57.55 | 56.67 | 38.99 | |
-| A100 | 64 | | 185.3 | 114.8 | 112.98 | 39.03 | |
-| | | | | | | |
-| A10 | 1 | 10.59 | 8.81 | 7.51 | 7.35 | 16.57 | 30.59 |
-| A10 | 4 | 34.77 | 27.63 | 22.77 | 22.07 | 20.12 | 36.53 |
-| A10 | 8 | | 56.19 | 43.53 | 43.86 | 21.94 | |
-| A10 | 16 | | 116.49 | 88.56 | 86.64 | 25.62 | |
-| A10 | 32 | | 221.95 | 175.74 | 168.18 | 24.23 | |
-| A10 | 48 | | 333.23 | 264.84 | | 20.52 | |
-| | | | | | | |
-| T4 | 1 | 28.2 | 24.49 | 23.93 | 23.56 | 3.80 | 16.45 |
-| T4 | 2 | 52.77 | 45.7 | 45.88 | 45.06 | 1.40 | 14.61 |
-| T4 | 4 | OOM | 85.72 | 85.78 | 84.48 | 1.45 | |
-| T4 | 8 | | 149.64 | 150.75 | 148.4 | 0.83 | |
-| | | | | | | |
-| V100 | 1 | 7.4 | 6.84 | 6.8 | 6.66 | 2.63 | 10.00 |
-| V100 | 2 | 13.85 | 12.81 | 12.66 | 12.35 | 3.59 | 10.83 |
-| V100 | 4 | OOM | 25.73 | 25.31 | 24.78 | 3.69 | |
-| V100 | 8 | | 43.95 | 43.37 | 42.25 | 3.87 | |
-| V100 | 16 | | 84.99 | 84.73 | 82.55 | 2.87 | |
-| | | | | | | |
-| 3090 | 1 | 7.09 | 6.78 | 5.34 | 5.35 | 21.09 | 24.54 |
-| 3090 | 4 | 22.69 | 21.45 | 18.56 | 18.18 | 15.24 | 19.88 |
-| 3090 | 8 | | 42.59 | 36.68 | 35.61 | 16.39 | |
-| 3090 | 16 | | 85.35 | 72.93 | 70.18 | 17.77 | |
-| 3090 | 32 (1) | | 162.05 | 143.46 | 138.67 | 14.43 | |
-| | | | | | | |
-| 3090 Ti | 1 | 6.45 | 6.19 | 4.99 | 4.89 | 21.00 | 24.19 |
-| 3090 Ti | 4 | 20.32 | 19.31 | 17.02 | 16.48 | 14.66 | 18.90 |
-| 3090 Ti | 8 | | 37.93 | 33.21 | 32.24 | 15.00 | |
-| 3090 Ti | 16 | | 75.37 | 66.63 | 64.5 | 14.42 | |
-| 3090 Ti | 32 (1) | | 142.55 | 128.89 | 124.92 | 12.37 | |
-| | | | | | | |
-| 4090 | 1 | 5.54 | 4.99 | 2.66 | 2.58 | 48.30 | 53.43 |
-| 4090 | 4 | 13.67 | 11.4 | 8.81 | 8.46 | 25.79 | 38.11 |
-| 4090 | 8 | | 19.79 | 17.55 | 16.62 | 16.02 | |
-| 4090 | 16 | | 38.62 | 35.65 | 34.07 | 11.78 | |
-| 4090 | 32 (1) | | 76.57 | 69.48 | 65.35 | 14.65 | |
-| 4090 | 48 | | 114.44 | 106.3 | | 7.11 | |
-
-
-(1) Batch Size >= 32 requires enable_vae_slicing() because of https://github.com/pytorch/pytorch/issues/81665.
-This is required for PyTorch 1.13.1, and also for PyTorch 2.0 and large batch sizes.
-
-For more details about how this benchmark was run, please refer to [this PR](https://github.com/huggingface/diffusers/pull/2303) and to [the blog post](https://pytorch.org/blog/accelerated-diffusers-pt-20/).
+We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. We used `diffusers 0.17.0.dev0`, which [makes sure `torch.compile()` is leveraged optimally](https://github.com/huggingface/diffusers/pull/3313).
+
+### Benchmarking code
+
+#### Stable Diffusion text-to-image
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+path = "runwayml/stable-diffusion-v1-5"
+
+run_compile = True # Set True / False
+
+pipe = DiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.unet.to(memory_format=torch.channels_last)
+
+if run_compile:
+ print("Run torch compile")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "ghibli style, a fantasy landscape with castles"
+
+for _ in range(3):
+ images = pipe(prompt=prompt).images
+```
+
+#### Stable Diffusion image-to-image
+
+```python
+from diffusers import StableDiffusionImg2ImgPipeline
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+
+path = "runwayml/stable-diffusion-v1-5"
+
+run_compile = True # Set True / False
+
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.unet.to(memory_format=torch.channels_last)
+
+if run_compile:
+ print("Run torch compile")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "ghibli style, a fantasy landscape with castles"
+
+for _ in range(3):
+ image = pipe(prompt=prompt, image=init_image).images[0]
+```
+
+#### Stable Diffusion - inpatining
+
+```python
+from diffusers import StableDiffusionInpaintPipeline
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+def download_image(url):
+ response = requests.get(url)
+ return Image.open(BytesIO(response.content)).convert("RGB")
+
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+path = "runwayml/stable-diffusion-inpainting"
+
+run_compile = True # Set True / False
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(path, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+pipe.unet.to(memory_format=torch.channels_last)
+
+if run_compile:
+ print("Run torch compile")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "ghibli style, a fantasy landscape with castles"
+
+for _ in range(3):
+ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+#### ControlNet
+
+```python
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+
+path = "runwayml/stable-diffusion-v1-5"
+
+run_compile = True # Set True / False
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ path, controlnet=controlnet, torch_dtype=torch.float16
+)
+
+pipe = pipe.to("cuda")
+pipe.unet.to(memory_format=torch.channels_last)
+pipe.controlnet.to(memory_format=torch.channels_last)
+
+if run_compile:
+ print("Run torch compile")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+ pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "ghibli style, a fantasy landscape with castles"
+
+for _ in range(3):
+ image = pipe(prompt=prompt, image=init_image).images[0]
+```
+
+#### IF text-to-image + upscaling
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+run_compile = True # Set True / False
+
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16)
+pipe.to("cuda")
+pipe_2 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-II-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16)
+pipe_2.to("cuda")
+pipe_3 = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16)
+pipe_3.to("cuda")
+
+
+pipe.unet.to(memory_format=torch.channels_last)
+pipe_2.unet.to(memory_format=torch.channels_last)
+pipe_3.unet.to(memory_format=torch.channels_last)
+
+if run_compile:
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+ pipe_2.unet = torch.compile(pipe_2.unet, mode="reduce-overhead", fullgraph=True)
+ pipe_3.unet = torch.compile(pipe_3.unet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "the blue hulk"
+
+prompt_embeds = torch.randn((1, 2, 4096), dtype=torch.float16)
+neg_prompt_embeds = torch.randn((1, 2, 4096), dtype=torch.float16)
+
+for _ in range(3):
+ image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=neg_prompt_embeds, output_type="pt").images
+ image_2 = pipe_2(image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=neg_prompt_embeds, output_type="pt").images
+ image_3 = pipe_3(prompt=prompt, image=image, noise_level=100).images
+```
+
+To give you a pictorial overview of the possible speed-ups that can be obtained with PyTorch 2.0 and `torch.compile()`,
+here is a plot that shows relative speed-ups for the [Stable Diffusion text-to-image pipeline](StableDiffusionPipeline) across five
+different GPU families (with a batch size of 4):
+
+
+
+To give you an even better idea of how this speed-up holds for the other pipelines presented above, consider the following
+plot that shows the benchmarking numbers from an A100 across three different batch sizes
+(with PyTorch 2.0 nightly and `torch.compile()`):
+
+
+
+_(Our benchmarking metric for the plots above is **number of iterations/second**)_
+
+But we reveal all the benchmarking numbers in the interest of transparency!
+
+In the following tables, we report our findings in terms of the number of **_iterations processed per second_**.
+
+### A100 (batch size: 1)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 21.66 | 23.13 | 44.03 | 49.74 |
+| SD - img2img | 21.81 | 22.40 | 43.92 | 46.32 |
+| SD - inpaint | 22.24 | 23.23 | 43.76 | 49.25 |
+| SD - controlnet | 15.02 | 15.82 | 32.13 | 36.08 |
+| IF | 20.21 / <br>13.84 / <br>24.00 | 20.12 / <br>13.70 / <br>24.03 | ❌ | 97.34 / <br>27.23 / <br>111.66 |
+
+### A100 (batch size: 4)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 11.6 | 13.12 | 14.62 | 17.27 |
+| SD - img2img | 11.47 | 13.06 | 14.66 | 17.25 |
+| SD - inpaint | 11.67 | 13.31 | 14.88 | 17.48 |
+| SD - controlnet | 8.28 | 9.38 | 10.51 | 12.41 |
+| IF | 25.02 | 18.04 | ❌ | 48.47 |
+
+### A100 (batch size: 16)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 3.04 | 3.6 | 3.83 | 4.68 |
+| SD - img2img | 2.98 | 3.58 | 3.83 | 4.67 |
+| SD - inpaint | 3.04 | 3.66 | 3.9 | 4.76 |
+| SD - controlnet | 2.15 | 2.58 | 2.74 | 3.35 |
+| IF | 8.78 | 9.82 | ❌ | 16.77 |
+
+### V100 (batch size: 1)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 18.99 | 19.14 | 20.95 | 22.17 |
+| SD - img2img | 18.56 | 19.18 | 20.95 | 22.11 |
+| SD - inpaint | 19.14 | 19.06 | 21.08 | 22.20 |
+| SD - controlnet | 13.48 | 13.93 | 15.18 | 15.88 |
+| IF | 20.01 / <br>9.08 / <br>23.34 | 19.79 / <br>8.98 / <br>24.10 | ❌ | 55.75 / <br>11.57 / <br>57.67 |
+
+### V100 (batch size: 4)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 5.96 | 5.89 | 6.83 | 6.86 |
+| SD - img2img | 5.90 | 5.91 | 6.81 | 6.82 |
+| SD - inpaint | 5.99 | 6.03 | 6.93 | 6.95 |
+| SD - controlnet | 4.26 | 4.29 | 4.92 | 4.93 |
+| IF | 15.41 | 14.76 | ❌ | 22.95 |
+
+### V100 (batch size: 16)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 1.66 | 1.66 | 1.92 | 1.90 |
+| SD - img2img | 1.65 | 1.65 | 1.91 | 1.89 |
+| SD - inpaint | 1.69 | 1.69 | 1.95 | 1.93 |
+| SD - controlnet | 1.19 | 1.19 | OOM after warmup | 1.36 |
+| IF | 5.43 | 5.29 | ❌ | 7.06 |
+
+### T4 (batch size: 1)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 6.9 | 6.95 | 7.3 | 7.56 |
+| SD - img2img | 6.84 | 6.99 | 7.04 | 7.55 |
+| SD - inpaint | 6.91 | 6.7 | 7.01 | 7.37 |
+| SD - controlnet | 4.89 | 4.86 | 5.35 | 5.48 |
+| IF | 17.42 / <br>2.47 / <br>18.52 | 16.96 / <br>2.45 / <br>18.69 | ❌ | 24.63 / <br>2.47 / <br>23.39 |
+
+### T4 (batch size: 4)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 1.79 | 1.79 | 2.03 | 1.99 |
+| SD - img2img | 1.77 | 1.77 | 2.05 | 2.04 |
+| SD - inpaint | 1.81 | 1.82 | 2.09 | 2.09 |
+| SD - controlnet | 1.34 | 1.27 | 1.47 | 1.46 |
+| IF | 5.79 | 5.61 | ❌ | 7.39 |
+
+### T4 (batch size: 16)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 2.34s | 2.30s | OOM after 2nd iteration | 1.99s |
+| SD - img2img | 2.35s | 2.31s | OOM after warmup | 2.00s |
+| SD - inpaint | 2.30s | 2.26s | OOM after 2nd iteration | 1.95s |
+| SD - controlnet | OOM after 2nd iteration | OOM after 2nd iteration | OOM after warmup | OOM after warmup |
+| IF * | 1.44 | 1.44 | ❌ | 1.94 |
+
+### RTX 3090 (batch size: 1)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 22.56 | 22.84 | 23.84 | 25.69 |
+| SD - img2img | 22.25 | 22.61 | 24.1 | 25.83 |
+| SD - inpaint | 22.22 | 22.54 | 24.26 | 26.02 |
+| SD - controlnet | 16.03 | 16.33 | 17.38 | 18.56 |
+| IF | 27.08 / <br>9.07 / <br>31.23 | 26.75 / <br>8.92 / <br>31.47 | ❌ | 68.08 / <br>11.16 / <br>65.29 |
+
+### RTX 3090 (batch size: 4)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 6.46 | 6.35 | 7.29 | 7.3 |
+| SD - img2img | 6.33 | 6.27 | 7.31 | 7.26 |
+| SD - inpaint | 6.47 | 6.4 | 7.44 | 7.39 |
+| SD - controlnet | 4.59 | 4.54 | 5.27 | 5.26 |
+| IF | 16.81 | 16.62 | ❌ | 21.57 |
+
+### RTX 3090 (batch size: 16)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 1.7 | 1.69 | 1.93 | 1.91 |
+| SD - img2img | 1.68 | 1.67 | 1.93 | 1.9 |
+| SD - inpaint | 1.72 | 1.71 | 1.97 | 1.94 |
+| SD - controlnet | 1.23 | 1.22 | 1.4 | 1.38 |
+| IF | 5.01 | 5.00 | ❌ | 6.33 |
+
+### RTX 4090 (batch size: 1)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 40.5 | 41.89 | 44.65 | 49.81 |
+| SD - img2img | 40.39 | 41.95 | 44.46 | 49.8 |
+| SD - inpaint | 40.51 | 41.88 | 44.58 | 49.72 |
+| SD - controlnet | 29.27 | 30.29 | 32.26 | 36.03 |
+| IF | 69.71 / <br>18.78 / <br>85.49 | 69.13 / <br>18.80 / <br>85.56 | ❌ | 124.60 / <br>26.37 / <br>138.79 |
+
+### RTX 4090 (batch size: 4)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 12.62 | 12.84 | 15.32 | 15.59 |
+| SD - img2img | 12.61 | 12.79 | 15.35 | 15.66 |
+| SD - inpaint | 12.65 | 12.81 | 15.3 | 15.58 |
+| SD - controlnet | 9.1 | 9.25 | 11.03 | 11.22 |
+| IF | 31.88 | 31.14 | ❌ | 43.92 |
+
+### RTX 4090 (batch size: 16)
+
+| **Pipeline** | **torch 2.0 - <br>no compile** | **torch nightly - <br>no compile** | **torch 2.0 - <br>compile** | **torch nightly - <br>compile** |
+|:---:|:---:|:---:|:---:|:---:|
+| SD - txt2img | 3.17 | 3.2 | 3.84 | 3.85 |
+| SD - img2img | 3.16 | 3.2 | 3.84 | 3.85 |
+| SD - inpaint | 3.17 | 3.2 | 3.85 | 3.85 |
+| SD - controlnet | 2.23 | 2.3 | 2.7 | 2.75 |
+| IF | 9.26 | 9.2 | ❌ | 13.31 |
+
+## Notes
+
+* Follow [this PR](https://github.com/huggingface/diffusers/pull/3313) for more details on the environment used for conducting the benchmarks.
+* For the IF pipeline and batch sizes > 1, a batch size greater than 1 was only used in the first text-to-image stage and NOT for upscaling; that is, the two upscaling pipelines received a batch size of 1.
+
+*Thanks to [Horace He](https://github.com/Chillee) from the PyTorch team for their support in improving our support of `torch.compile()` in Diffusers.*
\ No newline at end of file
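
The tables report iterations per second, but the timing harness itself is not shown here; the exact setup lives in the PR linked under Notes. A minimal sketch of one plausible way to take such a measurement, with a warm-up call so that compilation time is excluded:

```python
# Sketch only: one plausible timing harness for the numbers reported above.
import time

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

prompt = "ghibli style, a fantasy landscape with castles"
num_inference_steps = 50

# First call triggers compilation, so warm up before timing
_ = pipe(prompt, num_inference_steps=num_inference_steps)

torch.cuda.synchronize()
start = time.perf_counter()
_ = pipe(prompt, num_inference_steps=num_inference_steps)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

print(f"{num_inference_steps / elapsed:.2f} iterations/second")
```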
From 7a32b6beeb0cfdefed645253dce23d9b0a78597f Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 15 May 2023 11:02:34 +0200
Subject: [PATCH 080/206] Fix style rendering (#3433)
* Fix style rendering.
* Fix typo
---
docs/source/en/optimization/fp16.mdx | 2 ++
docs/source/en/optimization/torch2.0.mdx | 3 ++-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/docs/source/en/optimization/fp16.mdx b/docs/source/en/optimization/fp16.mdx
index 596312a0ffe0..4081cfc6efd6 100644
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -60,8 +60,10 @@ image = pipe(prompt).images[0]
```
+
It is strongly discouraged to make use of [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than using pure
float16 precision.
+
## Sliced attention for additional memory savings
diff --git a/docs/source/en/optimization/torch2.0.mdx b/docs/source/en/optimization/torch2.0.mdx
index 2bcf3fa82115..05a4043d26d1 100644
--- a/docs/source/en/optimization/torch2.0.mdx
+++ b/docs/source/en/optimization/torch2.0.mdx
@@ -18,6 +18,7 @@ Starting from version `0.13.0`, Diffusers supports the latest optimization from
## Installation
+
To benefit from the accelerated attention implementation and `torch.compile()`, you just need to install the latest versions of PyTorch 2.0 from pip, and make sure you are on diffusers 0.13.0 or later. As explained below, diffusers automatically uses the optimized attention processor ([`AttnProcessor2_0`](https://github.com/huggingface/diffusers/blob/1a5797c6d4491a879ea5285c4efc377664e0332d/src/diffusers/models/attention_processor.py#L798)) (but not `torch.compile()`)
when PyTorch 2.0 is available.
@@ -153,7 +154,7 @@ for _ in range(3):
image = pipe(prompt=prompt, image=init_image).images[0]
```
-#### Stable Diffusion - inpatining
+#### Stable Diffusion - inpainting
```python
from diffusers import StableDiffusionInpaintPipeline
From 29b1325a5ae28fa8d7f459b372582287ffc571e5 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Mon, 15 May 2023 09:47:14 -0600
Subject: [PATCH 081/206] unCLIP scheduler do not use note (#3417)
---
src/diffusers/schedulers/scheduling_unclip.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py
index 6403ee3f1518..d44edcb1812a 100644
--- a/src/diffusers/schedulers/scheduling_unclip.py
+++ b/src/diffusers/schedulers/scheduling_unclip.py
@@ -75,6 +75,9 @@ def alpha_bar(time_step):
class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
"""
+ NOTE: do not use this scheduler. The DDPM scheduler has been updated to support the changes made here. This
+ scheduler will be removed and replaced with DDPM.
+
This is a modified DDPM Scheduler specifically for the karlo unCLIP model.
This scheduler has some minor variations in how it calculates the learned range variance and dynamically
From 326f326e1781b1fb888611a37795b474fe496dd8 Mon Sep 17 00:00:00 2001
From: Jongwoo Han
Date: Tue, 16 May 2023 20:51:10 +0900
Subject: [PATCH 082/206] Replace deprecated command with environment file
(#3409)
Co-authored-by: Patrick von Platen
---
.github/actions/setup-miniconda/action.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/actions/setup-miniconda/action.yml b/.github/actions/setup-miniconda/action.yml
index 8a82ae8b17bf..cc755d3aad79 100644
--- a/.github/actions/setup-miniconda/action.yml
+++ b/.github/actions/setup-miniconda/action.yml
@@ -27,7 +27,7 @@ runs:
- name: Get date
id: get-date
shell: bash
- run: echo "::set-output name=today::$(/bin/date -u '+%Y%m%d')d"
+ run: echo "today=$(/bin/date -u '+%Y%m%d')d" >> $GITHUB_OUTPUT
- name: Setup miniconda cache
id: miniconda-cache
uses: actions/cache@v2
@@ -143,4 +143,4 @@ runs:
echo "There is ${AVAIL}KB free space left in $MOUNT, continue"
fi
fi
- done
\ No newline at end of file
+ done
From d2285f51589bbee18673272611b709d306e7f911 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 16 May 2023 13:58:24 +0200
Subject: [PATCH 083/206] fix warning message pipeline loading (#3446)
---
src/diffusers/pipelines/pipeline_utils.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 9288248d309b..a4d3dd1f1673 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -296,8 +296,7 @@ def maybe_raise_or_warn(
if not issubclass(model_cls, expected_class_obj):
raise ValueError(
- f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
- f" {expected_class_obj}"
+ f"{passed_class_obj[name]} is of type: {model_cls}, but should be" f" {expected_class_obj}"
)
else:
logger.warning(
From 9d44e2fb6600e80f410b2c05139c001fb0fa9794 Mon Sep 17 00:00:00 2001
From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com>
Date: Tue, 16 May 2023 06:28:01 -0700
Subject: [PATCH 084/206] add stable diffusion tensorrt img2img pipeline
(#3419)
* add stable diffusion tensorrt img2img pipeline
Signed-off-by: Asfiya Baig
* update docstrings
Signed-off-by: Asfiya Baig
---------
Signed-off-by: Asfiya Baig
---
examples/community/README.md | 44 +-
.../stable_diffusion_tensorrt_img2img.py | 1055 +++++++++++++++++
.../stable_diffusion_tensorrt_txt2img.py | 10 +-
3 files changed, 1102 insertions(+), 7 deletions(-)
mode change 100644 => 100755 examples/community/README.md
create mode 100755 examples/community/stable_diffusion_tensorrt_img2img.py
mode change 100644 => 100755 examples/community/stable_diffusion_tensorrt_txt2img.py
diff --git a/examples/community/README.md b/examples/community/README.md
old mode 100644
new mode 100755
index 3d034b30fcff..47b129ce9e7e
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -31,11 +31,10 @@ If a community doesn't work as expected, please open an issue and ping the autho
| UnCLIP Image Interpolation Pipeline | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |
| DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - | [Aengus (Duc-Anh)](https://github.com/aengusng8) |
| CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) |
-| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| TensorRT Stable Diffusion Text to Image Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Text to Image Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.0986) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
-
-
+| TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
```py
@@ -1282,3 +1281,42 @@ pipe = pipe.to("cuda")
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
```
+
+### TensorRT Image2Image Stable Diffusion Pipeline
+
+The TensorRT pipeline can be used to accelerate Stable Diffusion Image2Image inference.
+
+NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
+
+```python
+import requests
+from io import BytesIO
+from PIL import Image
+import torch
+from diffusers import DDIMScheduler
+from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline
+
+# Use the DDIMScheduler here instead of the pipeline's default scheduler
+scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
+ subfolder="scheduler")
+
+
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+ custom_pipeline="stable_diffusion_tensorrt_img2img",
+ revision='fp16',
+ torch_dtype=torch.float16,
+ scheduler=scheduler,)
+
+# re-use cached folder to save ONNX models and TensorRT Engines
+pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", revision='fp16',)
+
+pipe = pipe.to("cuda")
+
+url = "https://pajoca.com/wp-content/uploads/2022/09/tekito-yamakawa-1.png"
+response = requests.get(url)
+input_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "photorealistic new zealand hills"
+image = pipe(prompt, image=input_image, strength=0.75,).images[0]
+image.save('tensorrt_img2img_new_zealand_hills.png')
+```
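
As a side note on the `strength=0.75` argument above: strength decides how far into the noise schedule the img2img run starts, so only part of `num_inference_steps` is actually executed. Below is a minimal sketch of that arithmetic, mirroring the `__initialize_timesteps` helper defined in the pipeline file added by this patch (the function name and values here are illustrative, not part of the pipeline API):

```python
def executed_steps(num_inference_steps: int, strength: float, offset: int = 0) -> int:
    # Mirror of __initialize_timesteps: start denoising part-way into the schedule.
    init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)
    t_start = max(num_inference_steps - init_timestep + offset, 0)
    return num_inference_steps - t_start


print(executed_steps(50, 0.75))  # 37 of the 50 scheduled steps run
print(executed_steps(50, 1.0))   # 50: the input image is effectively ignored
```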
diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py
new file mode 100755
index 000000000000..67c7c2d00fbf
--- /dev/null
+++ b/examples/community/stable_diffusion_tensorrt_img2img.py
@@ -0,0 +1,1055 @@
+#
+# Copyright 2023 The HuggingFace Inc. team.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+from collections import OrderedDict
+from copy import copy
+from typing import List, Optional, Union
+
+import numpy as np
+import onnx
+import onnx_graphsurgeon as gs
+import PIL
+import tensorrt as trt
+import torch
+from huggingface_hub import snapshot_download
+from onnx import shape_inference
+from polygraphy import cuda
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.onnx.loader import fold_constants
+from polygraphy.backend.trt import (
+ CreateConfig,
+ Profile,
+ engine_from_bytes,
+ engine_from_network,
+ network_from_onnx_path,
+ save_engine,
+)
+from polygraphy.backend.trt import util as trt_util
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion import (
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionPipelineOutput,
+ StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import DIFFUSERS_CACHE, logging
+
+
+"""
+Installation instructions
+python3 -m pip install --upgrade transformers diffusers>=0.16.0
+python3 -m pip install --upgrade tensorrt>=8.6.1
+python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+python3 -m pip install onnxruntime
+"""
+
+TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+# Map of numpy dtype -> torch dtype
+numpy_to_torch_dtype_dict = {
+ np.uint8: torch.uint8,
+ np.int8: torch.int8,
+ np.int16: torch.int16,
+ np.int32: torch.int32,
+ np.int64: torch.int64,
+ np.float16: torch.float16,
+ np.float32: torch.float32,
+ np.float64: torch.float64,
+ np.complex64: torch.complex64,
+ np.complex128: torch.complex128,
+}
+if np.version.full_version >= "1.24.0":
+ numpy_to_torch_dtype_dict[np.bool_] = torch.bool
+else:
+ numpy_to_torch_dtype_dict[np.bool] = torch.bool
+
+# Map of torch dtype -> numpy dtype
+torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}
+
+
+def device_view(t):
+ return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype])
+
+
+def preprocess_image(image):
+ """
+    image: PIL.Image.Image (resized to a multiple of 32 and returned as a torch.Tensor scaled to [-1, 1])
+ """
+ w, h = image.size
+ w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h))
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).contiguous()
+ return 2.0 * image - 1.0
+
+
+class Engine:
+ def __init__(self, engine_path):
+ self.engine_path = engine_path
+ self.engine = None
+ self.context = None
+ self.buffers = OrderedDict()
+ self.tensors = OrderedDict()
+
+ def __del__(self):
+ [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+ del self.engine
+ del self.context
+ del self.buffers
+ del self.tensors
+
+ def build(
+ self,
+ onnx_path,
+ fp16,
+ input_profile=None,
+ enable_preview=False,
+ enable_all_tactics=False,
+ timing_cache=None,
+ workspace_size=0,
+ ):
+ logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+ p = Profile()
+ if input_profile:
+ for name, dims in input_profile.items():
+ assert len(dims) == 3
+ p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+
+ config_kwargs = {}
+
+ config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
+ if enable_preview:
+ # Faster dynamic shapes made optional since it increases engine build time.
+ config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805)
+ if workspace_size > 0:
+ config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
+ if not enable_all_tactics:
+ config_kwargs["tactic_sources"] = []
+
+ engine = engine_from_network(
+ network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
+ config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs),
+ save_timing_cache=timing_cache,
+ )
+ save_engine(engine, path=self.engine_path)
+
+ def load(self):
+ logger.warning(f"Loading TensorRT engine: {self.engine_path}")
+ self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+
+ def activate(self):
+ self.context = self.engine.create_execution_context()
+
+ def allocate_buffers(self, shape_dict=None, device="cuda"):
+ for idx in range(trt_util.get_bindings_per_profile(self.engine)):
+ binding = self.engine[idx]
+ if shape_dict and binding in shape_dict:
+ shape = shape_dict[binding]
+ else:
+ shape = self.engine.get_binding_shape(binding)
+ dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+ if self.engine.binding_is_input(binding):
+ self.context.set_binding_shape(idx, shape)
+ tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
+ self.tensors[binding] = tensor
+ self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
+
+ def infer(self, feed_dict, stream):
+ start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
+ # shallow copy of ordered dict
+ device_buffers = copy(self.buffers)
+ for name, buf in feed_dict.items():
+ assert isinstance(buf, cuda.DeviceView)
+ device_buffers[name] = buf
+ bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
+ noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
+ if not noerror:
+ raise ValueError("ERROR: inference failed.")
+
+ return self.tensors
+
+
+class Optimizer:
+ def __init__(self, onnx_graph):
+ self.graph = gs.import_onnx(onnx_graph)
+
+ def cleanup(self, return_onnx=False):
+ self.graph.cleanup().toposort()
+ if return_onnx:
+ return gs.export_onnx(self.graph)
+
+ def select_outputs(self, keep, names=None):
+ self.graph.outputs = [self.graph.outputs[o] for o in keep]
+ if names:
+ for i, name in enumerate(names):
+ self.graph.outputs[i].name = name
+
+ def fold_constants(self, return_onnx=False):
+ onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+ self.graph = gs.import_onnx(onnx_graph)
+ if return_onnx:
+ return onnx_graph
+
+ def infer_shapes(self, return_onnx=False):
+ onnx_graph = gs.export_onnx(self.graph)
+ if onnx_graph.ByteSize() > 2147483648:
+ raise TypeError("ERROR: model size exceeds supported 2GB limit")
+ else:
+ onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+ self.graph = gs.import_onnx(onnx_graph)
+ if return_onnx:
+ return onnx_graph
+
+
+class BaseModel:
+ def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77):
+ self.model = model
+ self.name = "SD Model"
+ self.fp16 = fp16
+ self.device = device
+
+ self.min_batch = 1
+ self.max_batch = max_batch_size
+ self.min_image_shape = 256 # min image resolution: 256x256
+ self.max_image_shape = 1024 # max image resolution: 1024x1024
+ self.min_latent_shape = self.min_image_shape // 8
+ self.max_latent_shape = self.max_image_shape // 8
+
+ self.embedding_dim = embedding_dim
+ self.text_maxlen = text_maxlen
+
+ def get_model(self):
+ return self.model
+
+ def get_input_names(self):
+ pass
+
+ def get_output_names(self):
+ pass
+
+ def get_dynamic_axes(self):
+ return None
+
+ def get_sample_input(self, batch_size, image_height, image_width):
+ pass
+
+ def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+ return None
+
+ def get_shape_dict(self, batch_size, image_height, image_width):
+ return None
+
+ def optimize(self, onnx_graph):
+ opt = Optimizer(onnx_graph)
+ opt.cleanup()
+ opt.fold_constants()
+ opt.infer_shapes()
+ onnx_opt_graph = opt.cleanup(return_onnx=True)
+ return onnx_opt_graph
+
+ def check_dims(self, batch_size, image_height, image_width):
+ assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 and image_width % 8 == 0  # both dimensions must be multiples of 8
+ latent_height = image_height // 8
+ latent_width = image_width // 8
+ assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+ assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+ return (latent_height, latent_width)
+
+ def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+ min_batch = batch_size if static_batch else self.min_batch
+ max_batch = batch_size if static_batch else self.max_batch
+ latent_height = image_height // 8
+ latent_width = image_width // 8
+ min_image_height = image_height if static_shape else self.min_image_shape
+ max_image_height = image_height if static_shape else self.max_image_shape
+ min_image_width = image_width if static_shape else self.min_image_shape
+ max_image_width = image_width if static_shape else self.max_image_shape
+ min_latent_height = latent_height if static_shape else self.min_latent_shape
+ max_latent_height = latent_height if static_shape else self.max_latent_shape
+ min_latent_width = latent_width if static_shape else self.min_latent_shape
+ max_latent_width = latent_width if static_shape else self.max_latent_shape
+ return (
+ min_batch,
+ max_batch,
+ min_image_height,
+ max_image_height,
+ min_image_width,
+ max_image_width,
+ min_latent_height,
+ max_latent_height,
+ min_latent_width,
+ max_latent_width,
+ )
+
+
+def getOnnxPath(model_name, onnx_dir, opt=True):
+ return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx")
+
+
+def getEnginePath(model_name, engine_dir):
+ return os.path.join(engine_dir, model_name + ".plan")
+
+
+def build_engines(
+ models: dict,
+ engine_dir,
+ onnx_dir,
+ onnx_opset,
+ opt_image_height,
+ opt_image_width,
+ opt_batch_size=1,
+ force_engine_rebuild=False,
+ static_batch=False,
+ static_shape=True,
+ enable_preview=False,
+ enable_all_tactics=False,
+ timing_cache=None,
+ max_workspace_size=0,
+):
+ built_engines = {}
+ if not os.path.isdir(onnx_dir):
+ os.makedirs(onnx_dir)
+ if not os.path.isdir(engine_dir):
+ os.makedirs(engine_dir)
+
+ # Export models to ONNX
+ for model_name, model_obj in models.items():
+ engine_path = getEnginePath(model_name, engine_dir)
+ if force_engine_rebuild or not os.path.exists(engine_path):
+ logger.warning("Building Engines...")
+ logger.warning("Engine build can take a while to complete")
+ onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+ onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+ if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+ if force_engine_rebuild or not os.path.exists(onnx_path):
+ logger.warning(f"Exporting model: {onnx_path}")
+ model = model_obj.get_model()
+ with torch.inference_mode(), torch.autocast("cuda"):
+ inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+ torch.onnx.export(
+ model,
+ inputs,
+ onnx_path,
+ export_params=True,
+ opset_version=onnx_opset,
+ do_constant_folding=True,
+ input_names=model_obj.get_input_names(),
+ output_names=model_obj.get_output_names(),
+ dynamic_axes=model_obj.get_dynamic_axes(),
+ )
+ del model
+ torch.cuda.empty_cache()
+ gc.collect()
+ else:
+ logger.warning(f"Found cached model: {onnx_path}")
+
+ # Optimize onnx
+ if force_engine_rebuild or not os.path.exists(onnx_opt_path):
+                    logger.warning(f"Generating optimized model: {onnx_opt_path}")
+ onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path))
+ onnx.save(onnx_opt_graph, onnx_opt_path)
+ else:
+ logger.warning(f"Found cached optimized model: {onnx_opt_path} ")
+
+ # Build TensorRT engines
+ for model_name, model_obj in models.items():
+ engine_path = getEnginePath(model_name, engine_dir)
+ engine = Engine(engine_path)
+ onnx_path = getOnnxPath(model_name, onnx_dir, opt=False)
+ onnx_opt_path = getOnnxPath(model_name, onnx_dir)
+
+ if force_engine_rebuild or not os.path.exists(engine.engine_path):
+ engine.build(
+ onnx_opt_path,
+ fp16=True,
+ input_profile=model_obj.get_input_profile(
+ opt_batch_size,
+ opt_image_height,
+ opt_image_width,
+ static_batch=static_batch,
+ static_shape=static_shape,
+ ),
+ enable_preview=enable_preview,
+ timing_cache=timing_cache,
+ workspace_size=max_workspace_size,
+ )
+ built_engines[model_name] = engine
+
+ # Load and activate TensorRT engines
+ for model_name, model_obj in models.items():
+ engine = built_engines[model_name]
+ engine.load()
+ engine.activate()
+
+ return built_engines
+
+
+def runEngine(engine, feed_dict, stream):
+ return engine.infer(feed_dict, stream)
+
+
+class CLIP(BaseModel):
+ def __init__(self, model, device, max_batch_size, embedding_dim):
+ super(CLIP, self).__init__(
+ model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+ )
+ self.name = "CLIP"
+
+ def get_input_names(self):
+ return ["input_ids"]
+
+ def get_output_names(self):
+ return ["text_embeddings", "pooler_output"]
+
+ def get_dynamic_axes(self):
+ return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}}
+
+ def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+ self.check_dims(batch_size, image_height, image_width)
+ min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(
+ batch_size, image_height, image_width, static_batch, static_shape
+ )
+ return {
+ "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+ }
+
+ def get_shape_dict(self, batch_size, image_height, image_width):
+ self.check_dims(batch_size, image_height, image_width)
+ return {
+ "input_ids": (batch_size, self.text_maxlen),
+ "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+ }
+
+ def get_sample_input(self, batch_size, image_height, image_width):
+ self.check_dims(batch_size, image_height, image_width)
+ return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+
+ def optimize(self, onnx_graph):
+ opt = Optimizer(onnx_graph)
+ opt.select_outputs([0]) # delete graph output#1
+ opt.cleanup()
+ opt.fold_constants()
+ opt.infer_shapes()
+ opt.select_outputs([0], names=["text_embeddings"]) # rename network output
+ opt_onnx_graph = opt.cleanup(return_onnx=True)
+ return opt_onnx_graph
+
+
+def make_CLIP(model, device, max_batch_size, embedding_dim, inpaint=False):
+ return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class UNet(BaseModel):
+ def __init__(
+ self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4
+ ):
+ super(UNet, self).__init__(
+ model=model,
+ fp16=fp16,
+ device=device,
+ max_batch_size=max_batch_size,
+ embedding_dim=embedding_dim,
+ text_maxlen=text_maxlen,
+ )
+ self.unet_dim = unet_dim
+ self.name = "UNet"
+
+ def get_input_names(self):
+ return ["sample", "timestep", "encoder_hidden_states"]
+
+ def get_output_names(self):
+ return ["latent"]
+
+ def get_dynamic_axes(self):
+ return {
+ "sample": {0: "2B", 2: "H", 3: "W"},
+ "encoder_hidden_states": {0: "2B"},
+ "latent": {0: "2B", 2: "H", 3: "W"},
+ }
+
+ def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ (
+ min_batch,
+ max_batch,
+ _,
+ _,
+ _,
+ _,
+ min_latent_height,
+ max_latent_height,
+ min_latent_width,
+ max_latent_width,
+ ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+ return {
+ "sample": [
+ (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width),
+ (2 * batch_size, self.unet_dim, latent_height, latent_width),
+ (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width),
+ ],
+ "encoder_hidden_states": [
+ (2 * min_batch, self.text_maxlen, self.embedding_dim),
+ (2 * batch_size, self.text_maxlen, self.embedding_dim),
+ (2 * max_batch, self.text_maxlen, self.embedding_dim),
+ ],
+ }
+
+ def get_shape_dict(self, batch_size, image_height, image_width):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ return {
+ "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width),
+ "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim),
+ "latent": (2 * batch_size, 4, latent_height, latent_width),
+ }
+
+ def get_sample_input(self, batch_size, image_height, image_width):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ dtype = torch.float16 if self.fp16 else torch.float32
+ return (
+ torch.randn(
+ 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device
+ ),
+ torch.tensor([1.0], dtype=torch.float32, device=self.device),
+ torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
+ )
+
+
+def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False):
+ return UNet(
+ model,
+ fp16=True,
+ device=device,
+ max_batch_size=max_batch_size,
+ embedding_dim=embedding_dim,
+ unet_dim=(9 if inpaint else 4),
+ )
+
+
+class VAE(BaseModel):
+ def __init__(self, model, device, max_batch_size, embedding_dim):
+ super(VAE, self).__init__(
+ model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+ )
+ self.name = "VAE decoder"
+
+ def get_input_names(self):
+ return ["latent"]
+
+ def get_output_names(self):
+ return ["images"]
+
+ def get_dynamic_axes(self):
+ return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}}
+
+ def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ (
+ min_batch,
+ max_batch,
+ _,
+ _,
+ _,
+ _,
+ min_latent_height,
+ max_latent_height,
+ min_latent_width,
+ max_latent_width,
+ ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+ return {
+ "latent": [
+ (min_batch, 4, min_latent_height, min_latent_width),
+ (batch_size, 4, latent_height, latent_width),
+ (max_batch, 4, max_latent_height, max_latent_width),
+ ]
+ }
+
+ def get_shape_dict(self, batch_size, image_height, image_width):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ return {
+ "latent": (batch_size, 4, latent_height, latent_width),
+ "images": (batch_size, 3, image_height, image_width),
+ }
+
+ def get_sample_input(self, batch_size, image_height, image_width):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False):
+ return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TorchVAEEncoder(torch.nn.Module):
+ def __init__(self, model):
+ super().__init__()
+ self.vae_encoder = model
+
+ def forward(self, x):
+ return self.vae_encoder.encode(x).latent_dist.sample()
+
+
+class VAEEncoder(BaseModel):
+ def __init__(self, model, device, max_batch_size, embedding_dim):
+ super(VAEEncoder, self).__init__(
+ model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim
+ )
+ self.name = "VAE encoder"
+
+ def get_model(self):
+ vae_encoder = TorchVAEEncoder(self.model)
+ return vae_encoder
+
+ def get_input_names(self):
+ return ["images"]
+
+ def get_output_names(self):
+ return ["latent"]
+
+ def get_dynamic_axes(self):
+ return {"images": {0: "B", 2: "8H", 3: "8W"}, "latent": {0: "B", 2: "H", 3: "W"}}
+
+ def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+ assert batch_size >= self.min_batch and batch_size <= self.max_batch
+ min_batch = batch_size if static_batch else self.min_batch
+ max_batch = batch_size if static_batch else self.max_batch
+ self.check_dims(batch_size, image_height, image_width)
+ (
+ min_batch,
+ max_batch,
+ min_image_height,
+ max_image_height,
+ min_image_width,
+ max_image_width,
+ _,
+ _,
+ _,
+ _,
+ ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+
+ return {
+ "images": [
+ (min_batch, 3, min_image_height, min_image_width),
+ (batch_size, 3, image_height, image_width),
+ (max_batch, 3, max_image_height, max_image_width),
+ ]
+ }
+
+ def get_shape_dict(self, batch_size, image_height, image_width):
+ latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+ return {
+ "images": (batch_size, 3, image_height, image_width),
+ "latent": (batch_size, 4, latent_height, latent_width),
+ }
+
+ def get_sample_input(self, batch_size, image_height, image_width):
+ self.check_dims(batch_size, image_height, image_width)
+ return torch.randn(batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device)
+
+
+def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False):
+ return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)
+
+
+class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
+ r"""
+ Pipeline for image-to-image generation using TensorRT accelerated Stable Diffusion.
+
+ This model inherits from [`StableDiffusionImg2ImgPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: DDIMScheduler,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = True,
+ stages=["clip", "unet", "vae", "vae_encoder"],
+ image_height: int = 512,
+ image_width: int = 512,
+ max_batch_size: int = 16,
+ # ONNX export parameters
+ onnx_opset: int = 17,
+ onnx_dir: str = "onnx",
+ # TensorRT engine build parameters
+ engine_dir: str = "engine",
+ build_preview_features: bool = True,
+ force_engine_rebuild: bool = False,
+ timing_cache: str = "timing_cache",
+ ):
+ super().__init__(
+ vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+ )
+
+ self.vae.forward = self.vae.decode
+
+ self.stages = stages
+ self.image_height, self.image_width = image_height, image_width
+ self.inpaint = False
+ self.onnx_opset = onnx_opset
+ self.onnx_dir = onnx_dir
+ self.engine_dir = engine_dir
+ self.force_engine_rebuild = force_engine_rebuild
+ self.timing_cache = timing_cache
+ self.build_static_batch = False
+ self.build_dynamic_shape = False
+ self.build_preview_features = build_preview_features
+
+ self.max_batch_size = max_batch_size
+ # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
+ if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512:
+ self.max_batch_size = 4
+
+ self.stream = None # loaded in loadResources()
+ self.models = {} # loaded in __loadModels()
+ self.engine = {} # loaded in build_engines()
+
+ def __loadModels(self):
+ # Load pipeline models
+ self.embedding_dim = self.text_encoder.config.hidden_size
+ models_args = {
+ "device": self.torch_device,
+ "max_batch_size": self.max_batch_size,
+ "embedding_dim": self.embedding_dim,
+ "inpaint": self.inpaint,
+ }
+ if "clip" in self.stages:
+ self.models["clip"] = make_CLIP(self.text_encoder, **models_args)
+ if "unet" in self.stages:
+ self.models["unet"] = make_UNet(self.unet, **models_args)
+ if "vae" in self.stages:
+ self.models["vae"] = make_VAE(self.vae, **models_args)
+ if "vae_encoder" in self.stages:
+ self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)
+
+ @classmethod
+ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+
+ cls.cached_folder = (
+ pretrained_model_name_or_path
+ if os.path.isdir(pretrained_model_name_or_path)
+ else snapshot_download(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ )
+ )
+
+ def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False):
+ super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings)
+
+ self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir)
+ self.engine_dir = os.path.join(self.cached_folder, self.engine_dir)
+ self.timing_cache = os.path.join(self.cached_folder, self.timing_cache)
+
+ # set device
+ self.torch_device = self._execution_device
+ logger.warning(f"Running inference on device: {self.torch_device}")
+
+ # load models
+ self.__loadModels()
+
+ # build engines
+ self.engine = build_engines(
+ self.models,
+ self.engine_dir,
+ self.onnx_dir,
+ self.onnx_opset,
+ opt_image_height=self.image_height,
+ opt_image_width=self.image_width,
+ force_engine_rebuild=self.force_engine_rebuild,
+ static_batch=self.build_static_batch,
+ static_shape=not self.build_dynamic_shape,
+ enable_preview=self.build_preview_features,
+ timing_cache=self.timing_cache,
+ )
+
+ return self
+
+ def __initialize_timesteps(self, timesteps, strength):
+ self.scheduler.set_timesteps(timesteps)
+ offset = self.scheduler.steps_offset if hasattr(self.scheduler, "steps_offset") else 0
+ init_timestep = int(timesteps * strength) + offset
+ init_timestep = min(init_timestep, timesteps)
+ t_start = max(timesteps - init_timestep + offset, 0)
+ timesteps = self.scheduler.timesteps[t_start:].to(self.torch_device)
+ return timesteps, t_start
+
+ def __preprocess_images(self, batch_size, images=()):
+ init_images = []
+ for image in images:
+ image = image.to(self.torch_device).float()
+ image = image.repeat(batch_size, 1, 1, 1)
+ init_images.append(image)
+ return tuple(init_images)
+
+ def __encode_image(self, init_image):
+ init_latents = runEngine(self.engine["vae_encoder"], {"images": device_view(init_image)}, self.stream)[
+ "latent"
+ ]
+ init_latents = 0.18215 * init_latents
+ return init_latents
+
+ def __encode_prompt(self, prompt, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+ """
+ # Tokenize prompt
+ text_input_ids = (
+ self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ .input_ids.type(torch.int32)
+ .to(self.torch_device)
+ )
+
+ text_input_ids_inp = device_view(text_input_ids)
+ # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
+ text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[
+ "text_embeddings"
+ ].clone()
+
+ # Tokenize negative prompt
+ uncond_input_ids = (
+ self.tokenizer(
+ negative_prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ .input_ids.type(torch.int32)
+ .to(self.torch_device)
+ )
+ uncond_input_ids_inp = device_view(uncond_input_ids)
+ uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[
+ "text_embeddings"
+ ]
+
+ # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16)
+
+ return text_embeddings
+
+ def __denoise_latent(
+ self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None
+ ):
+ if not isinstance(timesteps, torch.Tensor):
+ timesteps = self.scheduler.timesteps
+ for step_index, timestep in enumerate(timesteps):
+ # Expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep)
+ if isinstance(mask, torch.Tensor):
+ latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+ # Predict the noise residual
+ timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep
+
+ sample_inp = device_view(latent_model_input)
+ timestep_inp = device_view(timestep_float)
+ embeddings_inp = device_view(text_embeddings)
+ noise_pred = runEngine(
+ self.engine["unet"],
+ {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
+ self.stream,
+ )["latent"]
+
+ # Perform guidance
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample
+
+ latents = 1.0 / 0.18215 * latents
+ return latents
+
+ def __decode_latent(self, latents):
+ images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"]
+ images = (images / 2 + 0.5).clamp(0, 1)
+ return images.cpu().permute(0, 2, 3, 1).float().numpy()
+
+ def __loadResources(self, image_height, image_width, batch_size):
+ self.stream = cuda.Stream()
+
+ # Allocate buffers for TensorRT engine bindings
+ for model_name, obj in self.models.items():
+ self.engine[model_name].allocate_buffers(
+ shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device
+ )
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass
+                `prompt_embeds` instead.
+ image (`PIL.Image.Image`):
+                `Image` or tensor representing an image batch to be used as the starting point for the
+                image-to-image generation. Noise is added to it according to `strength` before denoising.
+ strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+ will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+ denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+ be maximum and the denoising process will run for the full number of iterations specified in
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+
+ """
+ self.generator = generator
+ self.denoising_steps = num_inference_steps
+ self.guidance_scale = guidance_scale
+
+ # Pre-compute latent input scales and linear multistep coefficients
+ self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device)
+
+ # Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ prompt = [prompt]
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}")
+
+ if negative_prompt is None:
+ negative_prompt = [""] * batch_size
+
+ if negative_prompt is not None and isinstance(negative_prompt, str):
+ negative_prompt = [negative_prompt]
+
+ assert len(prompt) == len(negative_prompt)
+
+ if batch_size > self.max_batch_size:
+ raise ValueError(
+ f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
+ )
+
+ # load resources
+ self.__loadResources(self.image_height, self.image_width, batch_size)
+
+ with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER):
+ # Initialize timesteps
+ timesteps, t_start = self.__initialize_timesteps(self.denoising_steps, strength)
+ latent_timestep = timesteps[:1].repeat(batch_size)
+
+ # Pre-process input image
+ if isinstance(image, PIL.Image.Image):
+ image = preprocess_image(image)
+ init_image = self.__preprocess_images(batch_size, (image,))[0]
+
+ # VAE encode init image
+ init_latents = self.__encode_image(init_image)
+
+ # Add noise to latents using timesteps
+ noise = torch.randn(
+ init_latents.shape, generator=self.generator, device=self.torch_device, dtype=torch.float32
+ )
+ latents = self.scheduler.add_noise(init_latents, noise, latent_timestep)
+
+ # CLIP text encoder
+ text_embeddings = self.__encode_prompt(prompt, negative_prompt)
+
+ # UNet denoiser
+ latents = self.__denoise_latent(latents, text_embeddings, timesteps=timesteps, step_offset=t_start)
+
+ # VAE decode latent
+ images = self.__decode_latent(latents)
+
+ images = self.numpy_to_pil(images)
+ return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=None)
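
For readers unfamiliar with the Polygraphy CUDA interop used throughout this file: `device_view` wraps a CUDA tensor's existing memory as a `DeviceView`, so `Engine.infer` can bind it without copying through host memory. A minimal sketch under that assumption (the tensor shape below is illustrative and requires a CUDA-enabled torch build):

```python
import numpy as np
import torch
from polygraphy import cuda

# A latent batch shaped like the UNet "sample" binding (2 * batch, 4, H/8, W/8).
latents = torch.randn(2, 4, 64, 64, dtype=torch.float16, device="cuda")

# Equivalent to the device_view() helper above: share the tensor's device memory.
view = cuda.DeviceView(ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float16)
assert view.ptr == latents.data_ptr()  # same allocation, no copy
```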
diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py
old mode 100644
new mode 100755
index aa7b5c12313b..b51f3176b958
--- a/examples/community/stable_diffusion_tensorrt_txt2img.py
+++ b/examples/community/stable_diffusion_tensorrt_txt2img.py
@@ -54,8 +54,9 @@
"""
Installation instructions
-python3 -m pip install --upgrade tensorrt
-python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+python3 -m pip install --upgrade transformers diffusers>=0.16.0
+python3 -m pip install --upgrade tensorrt>=8.6.1
+python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
python3 -m pip install onnxruntime
"""
@@ -132,7 +133,7 @@ def build(
config_kwargs["tactic_sources"] = []
engine = engine_from_network(
- network_from_onnx_path(onnx_path),
+ network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs),
save_timing_cache=timing_cache,
)
@@ -633,6 +634,7 @@ def __init__(
onnx_dir: str = "onnx",
# TensorRT engine build parameters
engine_dir: str = "engine",
+ build_preview_features: bool = True,
force_engine_rebuild: bool = False,
timing_cache: str = "timing_cache",
):
@@ -652,7 +654,7 @@ def __init__(
self.timing_cache = timing_cache
self.build_static_batch = False
self.build_dynamic_shape = False
- self.build_preview_features = False
+ self.build_preview_features = build_preview_features
self.max_batch_size = max_batch_size
# TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
From 886575ee43c3e7060d74e2feb2018111e0998013 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 16 May 2023 20:07:21 +0200
Subject: [PATCH 085/206] Refactor controlnet and add img2img and inpaint
(#3386)
* refactor controlnet and add img2img and inpaint
* First draft to get pipelines to work
* make style
* Fix more
* Fix more
* More tests
* Fix more
* Make inpainting work
* make style and more tests
* Apply suggestions from code review
* up
* make style
* Fix imports
* Fix more
* Fix more
* Improve examples
* add test
* Make sure import is correctly deprecated
* Make sure everything works in compile mode
* make sure authorship is correctly attributed
---
docs/source/en/_toctree.yml | 4 +-
.../{stable_diffusion => }/controlnet.mdx | 61 +-
docs/source/en/api/pipelines/overview.mdx | 2 +-
docs/source/en/index.mdx | 2 +-
src/diffusers/__init__.py | 2 +
src/diffusers/pipeline_utils.py | 10 +
src/diffusers/pipelines/__init__.py | 8 +-
.../pipelines/controlnet/__init__.py | 22 +
.../pipelines/controlnet/multicontrolnet.py | 66 +
.../controlnet/pipeline_controlnet.py | 1035 ++++++++++++++
.../controlnet/pipeline_controlnet_img2img.py | 1113 +++++++++++++++
.../controlnet/pipeline_controlnet_inpaint.py | 1228 +++++++++++++++++
.../controlnet/pipeline_flax_controlnet.py | 537 +++++++
.../pipeline_semantic_stable_diffusion.py | 2 +-
.../pipelines/stable_diffusion/__init__.py | 2 -
...peline_flax_stable_diffusion_controlnet.py | 529 +------
.../pipeline_stable_diffusion_controlnet.py | 1102 +--------------
.../dummy_torch_and_transformers_objects.py | 30 +
tests/pipelines/controlnet/__init__.py | 0
.../test_controlnet.py} | 10 +-
.../controlnet/test_controlnet_img2img.py | 366 +++++
.../controlnet/test_controlnet_inpaint.py | 379 +++++
.../test_flax_controlnet.py} | 2 +-
.../test_stable_diffusion_image_variation.py | 5 +-
.../test_stable_diffusion_inpaint.py | 5 +-
25 files changed, 4878 insertions(+), 1644 deletions(-)
rename docs/source/en/api/pipelines/{stable_diffusion => }/controlnet.mdx (67%)
create mode 100644 src/diffusers/pipelines/controlnet/__init__.py
create mode 100644 src/diffusers/pipelines/controlnet/multicontrolnet.py
create mode 100644 src/diffusers/pipelines/controlnet/pipeline_controlnet.py
create mode 100644 src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
create mode 100644 src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
create mode 100644 src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
create mode 100644 tests/pipelines/controlnet/__init__.py
rename tests/pipelines/{stable_diffusion/test_stable_diffusion_controlnet.py => controlnet/test_controlnet.py} (98%)
create mode 100644 tests/pipelines/controlnet/test_controlnet_img2img.py
create mode 100644 tests/pipelines/controlnet/test_controlnet_inpaint.py
rename tests/pipelines/{stable_diffusion/test_stable_diffusion_flax_controlnet.py => controlnet/test_flax_controlnet.py} (98%)
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 246b467d8b04..52d8988206f1 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -148,6 +148,8 @@
title: Audio Diffusion
- local: api/pipelines/audioldm
title: AudioLDM
+ - local: api/pipelines/controlnet
+ title: ControlNet
- local: api/pipelines/cycle_diffusion
title: Cycle Diffusion
- local: api/pipelines/dance_diffusion
@@ -203,8 +205,6 @@
title: Self-Attention Guidance
- local: api/pipelines/stable_diffusion/panorama
title: MultiDiffusion Panorama
- - local: api/pipelines/stable_diffusion/controlnet
- title: Text-to-Image Generation with ControlNet Conditioning
- local: api/pipelines/stable_diffusion/model_editing
title: Text-to-Image Model Editing
- local: api/pipelines/stable_diffusion/diffedit
diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/controlnet.mdx
similarity index 67%
rename from docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx
rename to docs/source/en/api/pipelines/controlnet.mdx
index fd5c87821c01..f9e4c3c47e3e 100644
--- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx
+++ b/docs/source/en/api/pipelines/controlnet.mdx
@@ -22,7 +22,7 @@ The abstract of the paper is the following:
*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.*
-This model was contributed by the amazing community contributor [takuma104](https://huggingface.co/takuma104) ❤️ .
+This model was contributed by the community contributor [takuma104](https://huggingface.co/takuma104) ❤️ .
Resources:
@@ -33,7 +33,9 @@ Resources:
| Pipeline | Tasks | Demo
|---|---|:---:|
-| [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
+| [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/controlnet/pipeline_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
+| [StableDiffusionControlNetImg2ImgPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py) | *Image-to-Image Generation with ControlNet Conditioning* |
+| [StableDiffusionControlNetInpaintPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py) | *Inpainting Generation with ControlNet Conditioning* |
## Usage example
@@ -301,21 +303,22 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h
### ControlNet v1.1
-| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
-|---|---|---|---|
-|[lllyasviel/control_v11p_sd15_canny](https://huggingface.co/lllyasviel/control_v11p_sd15_canny) *Trained with canny edge detection* | A monochrome image with white edges on a black background.| | |
-|[lllyasviel/control_v11e_sd15_ip2p](https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p) *Trained with pixel to pixel instruction* | No condition .| | |
-|[lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint) Trained with image inpainting | No condition.| | |
-|[lllyasviel/control_v11p_sd15_mlsd](https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd) Trained with multi-level line segment detection | An image with annotated line segments.| | |
-|[lllyasviel/control_v11f1p_sd15_depth](https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth) Trained with depth estimation | An image with depth information, usually represented as a grayscale image.| | |
-|[lllyasviel/control_v11p_sd15_normalbae](https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae) Trained with surface normal estimation | An image with surface normal information, usually represented as a color-coded image.| | |
-|[lllyasviel/control_v11p_sd15_seg](https://huggingface.co/lllyasviel/control_v11p_sd15_seg) Trained with image segmentation | An image with segmented regions, usually represented as a color-coded image.| | |
-|[lllyasviel/control_v11p_sd15_lineart](https://huggingface.co/lllyasviel/control_v11p_sd15_lineart) Trained with line art generation | An image with line art, usually black lines on a white background.| | |
-|[lllyasviel/control_v11p_sd15s2_lineart_anime](https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime) Trained with anime line art generation | An image with anime-style line art.| | |
-|[lllyasviel/control_v11p_sd15_openpose](https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime) Trained with human pose estimation | An image with human poses, usually represented as a set of keypoints or skeletons.| | |
-|[lllyasviel/control_v11p_sd15_scribble](https://huggingface.co/lllyasviel/control_v11p_sd15_scribble) Trained with scribble-based image generation | An image with scribbles, usually random or user-drawn strokes.| | |
-|[lllyasviel/control_v11p_sd15_softedge](https://huggingface.co/lllyasviel/control_v11p_sd15_softedge) Trained with soft edge image generation | An image with soft edges, usually to create a more painterly or artistic effect.| | |
-|[lllyasviel/control_v11e_sd15_shuffle](https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle) Trained with image shuffling | An image with shuffled patches or regions.| | |
+| Model Name | Control Image Overview| Condition Image | Control Image Example | Generated Image Example |
+|---|---|---|---|---|
+|[lllyasviel/control_v11p_sd15_canny](https://huggingface.co/lllyasviel/control_v11p_sd15_canny) | *Trained with canny edge detection* | A monochrome image with white edges on a black background.| | |
+|[lllyasviel/control_v11e_sd15_ip2p](https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p) | *Trained with pixel to pixel instruction* | No condition .| | |
+|[lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint) | Trained with image inpainting | No condition.| | |
+|[lllyasviel/control_v11p_sd15_mlsd](https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd) | Trained with multi-level line segment detection | An image with annotated line segments.| | |
+|[lllyasviel/control_v11f1p_sd15_depth](https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth) | Trained with depth estimation | An image with depth information, usually represented as a grayscale image.| | |
+|[lllyasviel/control_v11p_sd15_normalbae](https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae) | Trained with surface normal estimation | An image with surface normal information, usually represented as a color-coded image.| | |
+|[lllyasviel/control_v11p_sd15_seg](https://huggingface.co/lllyasviel/control_v11p_sd15_seg) | Trained with image segmentation | An image with segmented regions, usually represented as a color-coded image.| | |
+|[lllyasviel/control_v11p_sd15_lineart](https://huggingface.co/lllyasviel/control_v11p_sd15_lineart) | Trained with line art generation | An image with line art, usually black lines on a white background.| | |
+|[lllyasviel/control_v11p_sd15s2_lineart_anime](https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime) | Trained with anime line art generation | An image with anime-style line art.| | |
+|[lllyasviel/control_v11p_sd15_openpose](https://huggingface.co/lllyasviel/control_v11p_sd15_openpose) | Trained with human pose estimation | An image with human poses, usually represented as a set of keypoints or skeletons.| | |
+|[lllyasviel/control_v11p_sd15_scribble](https://huggingface.co/lllyasviel/control_v11p_sd15_scribble) | Trained with scribble-based image generation | An image with scribbles, usually random or user-drawn strokes.| | |
+|[lllyasviel/control_v11p_sd15_softedge](https://huggingface.co/lllyasviel/control_v11p_sd15_softedge) | Trained with soft edge image generation | An image with soft edges, usually to create a more painterly or artistic effect.| | |
+|[lllyasviel/control_v11e_sd15_shuffle](https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle) | Trained with image shuffling | An image with shuffled patches or regions.| | |
+|[lllyasviel/control_v11f1e_sd15_tile](https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile) | Trained with image tiling | A blurry image or part of an image .| | |
## StableDiffusionControlNetPipeline
[[autodoc]] StableDiffusionControlNetPipeline
@@ -329,6 +332,30 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h
- disable_xformers_memory_efficient_attention
- load_textual_inversion
+## StableDiffusionControlNetImg2ImgPipeline
+[[autodoc]] StableDiffusionControlNetImg2ImgPipeline
+ - all
+ - __call__
+ - enable_attention_slicing
+ - disable_attention_slicing
+ - enable_vae_slicing
+ - disable_vae_slicing
+ - enable_xformers_memory_efficient_attention
+ - disable_xformers_memory_efficient_attention
+ - load_textual_inversion
+
+## StableDiffusionControlNetInpaintPipeline
+[[autodoc]] StableDiffusionControlNetInpaintPipeline
+ - all
+ - __call__
+ - enable_attention_slicing
+ - disable_attention_slicing
+ - enable_vae_slicing
+ - disable_vae_slicing
+ - enable_xformers_memory_efficient_attention
+ - disable_xformers_memory_efficient_attention
+ - load_textual_inversion
+
## FlaxStableDiffusionControlNetPipeline
[[autodoc]] FlaxStableDiffusionControlNetPipeline
- all
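
The newly documented `StableDiffusionControlNetImg2ImgPipeline` takes both an init image and a separate conditioning image. A minimal usage sketch, assuming the conditioning input is exposed as `control_image` and using a canny ControlNet checkpoint; the model IDs, prompt, and parameter values are chosen for illustration (requires `opencv-python` for the edge map):

```python
import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetImg2ImgPipeline
from diffusers.utils import load_image

init_image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
)

# Build a canny-edge conditioning image from the init image.
edges = cv2.Canny(np.array(init_image), 100, 200)
control_image = Image.fromarray(np.stack([edges] * 3, axis=-1))

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "a renaissance portrait, oil painting",
    image=init_image,
    control_image=control_image,
    strength=0.8,
    num_inference_steps=30,
).images[0]
```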
diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx
index 91716784f8fe..2b2f95590016 100644
--- a/docs/source/en/api/pipelines/overview.mdx
+++ b/docs/source/en/api/pipelines/overview.mdx
@@ -46,7 +46,7 @@ available a colab notebook to directly try them out.
|---|---|:---:|:---:|
| [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
| [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
-| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
+| [controlnet](./api/pipelines/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
| [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
| [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 46a985ac2f8d..66548663827a 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -53,7 +53,7 @@ The library has three main components:
|---|---|:---:|
| [alt_diffusion](./api/pipelines/alt_diffusion) | [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
| [audio_diffusion](./api/pipelines/audio_diffusion) | [Audio Diffusion](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation |
-| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
+| [controlnet](./api/pipelines/controlnet) | [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
| [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
| [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index a8293ea77fef..0d48a16b6216 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -132,6 +132,8 @@
PaintByExamplePipeline,
SemanticStableDiffusionPipeline,
StableDiffusionAttendAndExcitePipeline,
+ StableDiffusionControlNetImg2ImgPipeline,
+ StableDiffusionControlNetInpaintPipeline,
StableDiffusionControlNetPipeline,
StableDiffusionDepth2ImgPipeline,
StableDiffusionDiffEditPipeline,
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5c0c2337dc04..87709d5f616c 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -17,3 +17,13 @@
# It only exists so that temporarily `from diffusers.pipelines import DiffusionPipeline` works
from .pipelines import DiffusionPipeline, ImagePipelineOutput # noqa: F401
+from .utils import deprecate
+
+
+deprecate(
+ "pipelines_utils",
+ "0.22.0",
+ "Importing `DiffusionPipeline` or `ImagePipelineOutput` from diffusers.pipeline_utils is deprecated. Please import from diffusers.pipelines.pipeline_utils instead.",
+ standard_warn=False,
+ stacklevel=3,
+)
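
For readers tracking this shim, a minimal sketch of the old and new import paths (taken directly from the deprecation message above; the old path keeps working until 0.22.0 but triggers the warning):

```py
# Deprecated shim path: still resolves, but emits the deprecation above.
from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput

# Preferred path going forward, as stated in the deprecation message.
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
```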
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 3cddad4a6b26..9b44f4e5eb14 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -44,6 +44,11 @@
else:
from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
from .audioldm import AudioLDMPipeline
+ from .controlnet import (
+ StableDiffusionControlNetImg2ImgPipeline,
+ StableDiffusionControlNetInpaintPipeline,
+ StableDiffusionControlNetPipeline,
+ )
from .deepfloyd_if import (
IFImg2ImgPipeline,
IFImg2ImgSuperResolutionPipeline,
@@ -58,7 +63,6 @@
from .stable_diffusion import (
CycleDiffusionPipeline,
StableDiffusionAttendAndExcitePipeline,
- StableDiffusionControlNetPipeline,
StableDiffusionDepth2ImgPipeline,
StableDiffusionDiffEditPipeline,
StableDiffusionImageVariationPipeline,
@@ -133,8 +137,8 @@
except OptionalDependencyNotAvailable:
from ..utils.dummy_flax_and_transformers_objects import * # noqa F403
else:
+ from .controlnet import FlaxStableDiffusionControlNetPipeline
from .stable_diffusion import (
- FlaxStableDiffusionControlNetPipeline,
FlaxStableDiffusionImg2ImgPipeline,
FlaxStableDiffusionInpaintPipeline,
FlaxStableDiffusionPipeline,
diff --git a/src/diffusers/pipelines/controlnet/__init__.py b/src/diffusers/pipelines/controlnet/__init__.py
new file mode 100644
index 000000000000..76ab63bdb116
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/__init__.py
@@ -0,0 +1,22 @@
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_flax_available,
+ is_torch_available,
+ is_transformers_available,
+)
+
+
+try:
+ if not (is_transformers_available() and is_torch_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
+else:
+ from .multicontrolnet import MultiControlNetModel
+ from .pipeline_controlnet import StableDiffusionControlNetPipeline
+ from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline
+ from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
+
+
+if is_transformers_available() and is_flax_available():
+ from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline
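
Since the ControlNet pipelines now live in their own subpackage, and the `src/diffusers/__init__.py` hunk above re-exports the new classes at the top level, both import styles below should resolve to the same objects. A sketch, assuming torch and transformers are installed so the optional-dependency guard passes:

```py
# Top-level imports, enabled by the additions to src/diffusers/__init__.py above.
from diffusers import (
    StableDiffusionControlNetImg2ImgPipeline,
    StableDiffusionControlNetInpaintPipeline,
    StableDiffusionControlNetPipeline,
)

# The same classes, plus the Multi-ControlNet wrapper, via the new subpackage directly.
from diffusers.pipelines.controlnet import MultiControlNetModel, StableDiffusionControlNetPipeline
```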
diff --git a/src/diffusers/pipelines/controlnet/multicontrolnet.py b/src/diffusers/pipelines/controlnet/multicontrolnet.py
new file mode 100644
index 000000000000..91d40b20124c
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/multicontrolnet.py
@@ -0,0 +1,66 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from ...models.controlnet import ControlNetModel, ControlNetOutput
+from ...models.modeling_utils import ModelMixin
+
+
+class MultiControlNetModel(ModelMixin):
+ r"""
+ Wrapper class for multiple `ControlNetModel` instances, used for Multi-ControlNet.
+
+ This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be
+ compatible with `ControlNetModel`.
+
+ Args:
+ controlnets (`List[ControlNetModel]`):
+ Provides additional conditioning to the unet during the denoising process. You must pass multiple
+ `ControlNetModel` instances as a list.
+ """
+
+ def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
+ super().__init__()
+ self.nets = nn.ModuleList(controlnets)
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[torch.Tensor, float, int],
+ encoder_hidden_states: torch.Tensor,
+ controlnet_cond: List[torch.Tensor],
+ conditioning_scale: List[float],
+ class_labels: Optional[torch.Tensor] = None,
+ timestep_cond: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guess_mode: bool = False,
+ return_dict: bool = True,
+ ) -> Union[ControlNetOutput, Tuple]:
+ for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
+ down_samples, mid_sample = controlnet(
+ sample,
+ timestep,
+ encoder_hidden_states,
+ image,
+ scale,
+ class_labels,
+ timestep_cond,
+ attention_mask,
+ cross_attention_kwargs,
+ guess_mode,
+ return_dict,
+ )
+
+ # merge samples
+ if i == 0:
+ down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
+ else:
+ down_block_res_samples = [
+ samples_prev + samples_curr
+ for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
+ ]
+ mid_block_res_sample += mid_sample
+
+ return down_block_res_samples, mid_block_res_sample
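
To make the wrapper's role concrete, here is a hedged sketch of how it is typically reached in practice: passing a list of `ControlNetModel`s to the pipeline constructor, which (per the `__init__` shown further below) wraps them in `MultiControlNetModel` so that each ControlNet's down/mid residuals are summed in `forward`. The checkpoint names are publicly available examples, not part of this diff:

```py
import torch

from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

# Two conditioning models, e.g. canny edges and openpose skeletons.
canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)

# Passing a list makes the pipeline wrap them in MultiControlNetModel,
# whose forward() adds the residuals of each ControlNet together.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=[canny, pose], torch_dtype=torch.float16
)
```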
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
new file mode 100644
index 000000000000..8a2ffbbff171
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
@@ -0,0 +1,1035 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+ PIL_INTERPOLATION,
+ is_accelerate_available,
+ is_accelerate_version,
+ is_compiled_module,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> # !pip install opencv-python transformers accelerate
+ >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+ >>> from diffusers.utils import load_image
+ >>> import numpy as np
+ >>> import torch
+
+ >>> import cv2
+ >>> from PIL import Image
+
+ >>> # download an image
+ >>> image = load_image(
+ ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+ ... )
+ >>> image = np.array(image)
+
+ >>> # get canny image
+ >>> image = cv2.Canny(image, 100, 200)
+ >>> image = image[:, :, None]
+ >>> image = np.concatenate([image, image, image], axis=2)
+ >>> canny_image = Image.fromarray(image)
+
+ >>> # load control net and stable diffusion v1-5
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+ >>> pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+ ... )
+
+ >>> # speed up diffusion process with faster scheduler and memory optimization
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+ >>> # remove following line if xformers is not installed
+ >>> pipe.enable_xformers_memory_efficient_attention()
+
+ >>> pipe.enable_model_cpu_offload()
+
+ >>> # generate image
+ >>> generator = torch.manual_seed(0)
+ >>> image = pipe(
+ ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image
+ ... ).images[0]
+ ```
+"""
+
+
+class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ In addition the pipeline inherits the following loading methods:
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+ as a list, the outputs from each ControlNet are added together to create one combined additional
+ conditioning.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPImageProcessor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `safety_checker=None` instead."
+ )
+
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("Please install accelerate via `pip install accelerate`")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ # the safety checker can offload the vae again
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # the ControlNet hook has to be manually offloaded as it alternates with the unet
+ cpu_offload_with_hook(self.controlnet, device)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ return image, has_nsfw_concept
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+ def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents, return_dict=False)[0]
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ # `prompt` needs more sophisticated handling when there are multiple
+ # conditionings.
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(prompt, list):
+ logger.warning(
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+ " prompts. The conditionings will be fixed across the prompts."
+ )
+
+ # Check `image`
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ self.check_image(image, prompt, prompt_embeds)
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if not isinstance(image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+ # When `image` is a nested list:
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+ elif any(isinstance(i, list) for i in image):
+ raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
+ elif len(image) != len(self.controlnet.nets):
+ raise ValueError(
+ "For multiple controlnets: `image` must have the same length as the number of controlnets."
+ )
+
+ for image_ in image:
+ self.check_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if isinstance(controlnet_conditioning_scale, list):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
+
+ def check_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, torch.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+ raise TypeError(
+ "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ elif image_is_tensor:
+ image_batch_size = image.shape[0]
+ elif image_is_pil_list:
+ image_batch_size = len(image)
+ elif image_is_tensor_list:
+ image_batch_size = len(image)
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
+ def prepare_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ if not isinstance(image, torch.Tensor):
+ if isinstance(image, PIL.Image.Image):
+ image = [image]
+
+ if isinstance(image[0], PIL.Image.Image):
+ images = []
+
+ for image_ in image:
+ image_ = image_.convert("RGB")
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = np.array(image_)
+ image_ = image_[None, :]
+ images.append(image_)
+
+ image = images
+
+ image = np.concatenate(image, axis=0)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ elif isinstance(image[0], torch.Tensor):
+ image = torch.cat(image, dim=0)
+
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, dim=0)
+
+ image = image.to(device=device, dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = torch.cat([image] * 2)
+
+ return image
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ def _default_height_width(self, height, width, image):
+ # NOTE: It is possible that a list of images have different
+ # dimensions for each image, so just checking the first image
+ # is not _exactly_ correct, but it is simple.
+ while isinstance(image, list):
+ image = image[0]
+
+ if height is None:
+ if isinstance(image, PIL.Image.Image):
+ height = image.height
+ elif isinstance(image, torch.Tensor):
+ height = image.shape[2]
+
+ height = (height // 8) * 8 # round down to nearest multiple of 8
+
+ if width is None:
+ if isinstance(image, PIL.Image.Image):
+ width = image.width
+ elif isinstance(image, torch.Tensor):
+ width = image.shape[3]
+
+ width = (width // 8) * 8 # round down to nearest multiple of 8
+
+ return height, width
+
+ # override DiffusionPipeline
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ ):
+ if isinstance(self.controlnet, ControlNetModel):
+ super().save_pretrained(save_directory, safe_serialization, variant)
+ else:
+ raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.")
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+ guess_mode: bool = False,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+ `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance for the UNet.
+ If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+ also be accepted as an image. The dimensions of the output image default to `image`'s dimensions. If
+ height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+ specified in init, images must be passed as a list such that each element of the list can be correctly
+ batched for input to a single controlnet.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2 of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+ corresponding scale as a list.
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height, width = self._default_height_width(height, width, image)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ controlnet_conditioning_scale,
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ global_pool_conditions = (
+ controlnet.config.global_pool_conditions
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Prepare image
+ if isinstance(controlnet, ControlNetModel):
+ image = self.prepare_image(
+ image=image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+ elif isinstance(controlnet, MultiControlNetModel):
+ images = []
+
+ for image_ in image:
+ image_ = self.prepare_image(
+ image=image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+
+ images.append(image_)
+
+ image = images
+ else:
+ assert False
+
+ # 5. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 6. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 8. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # controlnet(s) inference
+ if guess_mode and do_classifier_free_guidance:
+ # Infer ControlNet only for the conditional batch.
+ controlnet_latent_model_input = latents
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+ else:
+ controlnet_latent_model_input = latent_model_input
+ controlnet_prompt_embeds = prompt_embeds
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ controlnet_latent_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=image,
+ conditioning_scale=controlnet_conditioning_scale,
+ guess_mode=guess_mode,
+ return_dict=False,
+ )
+
+ if guess_mode and do_classifier_free_guidance:
+ # ControlNet was inferred only for the conditional batch.
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
+ # add 0 to the unconditional batch to keep it unchanged.
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ # If we do sequential model offloading, let's offload unet and controlnet
+ # manually for max memory savings
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.unet.to("cpu")
+ self.controlnet.to("cpu")
+ torch.cuda.empty_cache()
+
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
+
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
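
Closing out this file, a hedged sketch of calling the pipeline with multiple ControlNets, continuing the two-ControlNet `pipe` sketched after `multicontrolnet.py` above; `canny_image` and `pose_image` are assumed to be pre-computed conditioning images of matching size:

```py
import torch

generator = torch.manual_seed(0)
image = pipe(
    "futuristic-looking woman",
    image=[canny_image, pose_image],           # one conditioning image per ControlNet
    controlnet_conditioning_scale=[1.0, 0.8],  # per-ControlNet residual scaling
    num_inference_steps=20,
    generator=generator,
).images[0]
```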
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
new file mode 100644
index 000000000000..cb5492790353
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -0,0 +1,1113 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+ PIL_INTERPOLATION,
+ deprecate,
+ is_accelerate_available,
+ is_accelerate_version,
+ is_compiled_module,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> # !pip install opencv-python transformers accelerate
+ >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+ >>> from diffusers.utils import load_image
+ >>> import numpy as np
+ >>> import torch
+
+ >>> import cv2
+ >>> from PIL import Image
+
+ >>> # download an image
+ >>> image = load_image(
+ ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+ ... )
+ >>> np_image = np.array(image)
+
+ >>> # get canny image
+ >>> np_image = cv2.Canny(np_image, 100, 200)
+ >>> np_image = np_image[:, :, None]
+ >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
+ >>> canny_image = Image.fromarray(np_image)
+
+ >>> # load control net and stable diffusion v1-5
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+ >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+ ... )
+
+ >>> # speed up diffusion process with faster scheduler and memory optimization
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+ >>> pipe.enable_model_cpu_offload()
+
+ >>> # generate image
+ >>> generator = torch.manual_seed(0)
+ >>> image = pipe(
+ ... "futuristic-looking woman",
+ ... num_inference_steps=20,
+ ... generator=generator,
+ ... image=image,
+ ... control_image=canny_image,
+ ... ).images[0]
+ ```
+"""
+
+
+def prepare_image(image):
+ if isinstance(image, torch.Tensor):
+ # Batch single image
+ if image.ndim == 3:
+ image = image.unsqueeze(0)
+
+ image = image.to(dtype=torch.float32)
+ else:
+ # preprocess image
+ if isinstance(image, (PIL.Image.Image, np.ndarray)):
+ image = [image]
+
+ if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+ image = [np.array(i.convert("RGB"))[None, :] for i in image]
+ image = np.concatenate(image, axis=0)
+ elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+ image = np.concatenate([i[None, :] for i in image], axis=0)
+
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+ return image
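
A small sketch of what the module-level `prepare_image` helper above produces; the 8x8 white image is hypothetical, purely to show the output shape and the [-1, 1] value range used for the img2img init image (as opposed to the [0, 1] range the control image gets in the pipeline-level `prepare_image` method shown earlier):

```py
import numpy as np
import PIL.Image

from diffusers.pipelines.controlnet.pipeline_controlnet_img2img import prepare_image

pil_image = PIL.Image.fromarray(np.full((8, 8, 3), 255, dtype=np.uint8))
tensor = prepare_image(pil_image)
print(tensor.shape)                # torch.Size([1, 3, 8, 8])
print(tensor.min(), tensor.max())  # both 1.0 for an all-white image; the general range is [-1, 1]
```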
+
+
+class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
+ r"""
+ Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ In addition the pipeline inherits the following loading methods:
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+ as a list, the outputs from each ControlNet are added together to create one combined additional
+ conditioning.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPImageProcessor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `safety_checker=None` instead."
+ )
+
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
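+ # Note: the VAE downsamples by a factor of 2 at each block transition, so a config with 4 entries in
+ # `block_out_channels` (as in the SD 1.x/2.x VAEs) gives a scale factor of 2**3 = 8, i.e. a 512x512
+ # image maps to a 64x64 latent.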
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("Please install accelerate via `pip install accelerate`")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
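+
+ # Usage sketch (comment only, assuming `pipe` is an already constructed instance of this pipeline and
+ # `prompt`, `init_image`, `control_image` are user-provided inputs): sequential offloading keeps every
+ # submodule on the CPU until its `forward` runs, minimizing peak GPU memory at the cost of extra
+ # host-to-device transfers on every denoising step.
+ #
+ # pipe.enable_sequential_cpu_offload()
+ # result = pipe(prompt, image=init_image, control_image=control_image).images[0]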
+
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ # the safety checker can offload the vae again
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # the ControlNet hook has to be manually offloaded, as it alternates with the unet
+ cpu_offload_with_hook(self.controlnet, device)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
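+
+ # Usage sketch (comment only, same assumptions as above): model offloading moves whole submodules to
+ # the GPU on demand and keeps each one there until the next submodule runs, so it is much faster than
+ # sequential offloading while still freeing most GPU memory between stages.
+ #
+ # pipe.enable_model_cpu_offload()  # requires accelerate>=0.17.0
+ # result = pipe(prompt, image=init_image, control_image=control_image).images[0]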
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
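+
+ # Shape note (comment only): with classifier-free guidance enabled, the returned tensor stacks the
+ # negative embeddings first and the prompt embeddings second, giving a shape of
+ # (2 * batch_size * num_images_per_prompt, seq_len, hidden_dim); without guidance it is
+ # (batch_size * num_images_per_prompt, seq_len, hidden_dim).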
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ return image, has_nsfw_concept
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+ def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents, return_dict=False)[0]
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
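+
+ # Migration sketch (comment only): the non-deprecated equivalent of `decode_latents` plus PIL
+ # conversion, mirroring what `__call__` below already does:
+ #
+ # image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ # pil_images = self.image_processor.postprocess(image, output_type="pil")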
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ # `prompt` needs more sophisticated handling when there are multiple
+ # conditionings.
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(prompt, list):
+ logger.warning(
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+ " prompts. The conditionings will be fixed across the prompts."
+ )
+
+ # Check `image`
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ self.check_image(image, prompt, prompt_embeds)
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if not isinstance(image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+ # When `image` is a nested list:
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+ elif any(isinstance(i, list) for i in image):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif len(image) != len(self.controlnet.nets):
+ raise ValueError(
+ "For multiple controlnets: `image` must have the same length as the number of controlnets."
+ )
+
+ for image_ in image:
+ self.check_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if isinstance(controlnet_conditioning_scale, list):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
+
+ def check_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, torch.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+ raise TypeError(
+ "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ elif image_is_tensor:
+ image_batch_size = image.shape[0]
+ elif image_is_pil_list:
+ image_batch_size = len(image)
+ elif image_is_tensor_list:
+ image_batch_size = len(image)
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+ def prepare_control_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ if not isinstance(image, torch.Tensor):
+ if isinstance(image, PIL.Image.Image):
+ image = [image]
+
+ if isinstance(image[0], PIL.Image.Image):
+ images = []
+
+ for image_ in image:
+ image_ = image_.convert("RGB")
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = np.array(image_)
+ image_ = image_[None, :]
+ images.append(image_)
+
+ image = images
+
+ image = np.concatenate(image, axis=0)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ elif isinstance(image[0], torch.Tensor):
+ image = torch.cat(image, dim=0)
+
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, dim=0)
+
+ image = image.to(device=device, dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = torch.cat([image] * 2)
+
+ return image
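+
+ # Shape note (comment only): for a single PIL conditioning image with batch_size=2 and
+ # num_images_per_prompt=1, the returned tensor has shape (2, 3, height, width) with values in [0, 1],
+ # and (4, 3, height, width) when classifier-free guidance is on and `guess_mode` is off, since the
+ # batch is duplicated for the unconditional pass.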
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+ def get_timesteps(self, num_inference_steps, strength, device):
+ # get the original timestep using init_timestep
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+ return timesteps, num_inference_steps - t_start
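+
+ # Worked example (comment only): with num_inference_steps=50 and strength=0.8,
+ # init_timestep = min(int(50 * 0.8), 50) = 40 and t_start = 50 - 40 = 10, so only the last 40
+ # scheduler timesteps are used and the method returns (timesteps[10 * order:], 40).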
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+ raise ValueError(
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+ )
+
+ image = image.to(device=device, dtype=dtype)
+
+ batch_size = batch_size * num_images_per_prompt
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if isinstance(generator, list):
+ init_latents = [
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+ ]
+ init_latents = torch.cat(init_latents, dim=0)
+ else:
+ init_latents = self.vae.encode(image).latent_dist.sample(generator)
+
+ init_latents = self.vae.config.scaling_factor * init_latents
+
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+ # expand init_latents for batch_size
+ deprecation_message = (
+ f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+ " your script to pass as many initial images as text prompts to suppress this warning."
+ )
+ deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+ )
+ else:
+ init_latents = torch.cat([init_latents], dim=0)
+
+ shape = init_latents.shape
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+ # get latents
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+ latents = init_latents
+
+ return latents
+
+ def _default_height_width(self, height, width, image):
+ # NOTE: It is possible that a list of images have different
+ # dimensions for each image, so just checking the first image
+ # is not _exactly_ correct, but it is simple.
+ while isinstance(image, list):
+ image = image[0]
+
+ if height is None:
+ if isinstance(image, PIL.Image.Image):
+ height = image.height
+ elif isinstance(image, torch.Tensor):
+ height = image.shape[2]
+
+ height = (height // 8) * 8 # round down to nearest multiple of 8
+
+ if width is None:
+ if isinstance(image, PIL.Image.Image):
+ width = image.width
+ elif isinstance(image, torch.Tensor):
+ width = image.shape[3]
+
+ width = (width // 8) * 8 # round down to nearest multiple of 8
+
+ return height, width
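+
+ # Worked example (comment only): a 600x513 PIL image yields height = (513 // 8) * 8 = 512 and
+ # width = (600 // 8) * 8 = 600, since both dimensions are rounded down to the nearest multiple of 8.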
+
+ # override DiffusionPipeline
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ ):
+ if isinstance(self.controlnet, ControlNetModel):
+ super().save_pretrained(save_directory, safe_serialization, variant)
+ else:
+ raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.")
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+ control_image: Union[
+ torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]
+ ] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+ guess_mode: bool = False,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, or `List[PIL.Image.Image]`):
+ The initial image to be used as the starting point for the generation. The dimensions of the output
+ image default to `image`'s dimensions.
+ control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+ `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance for the UNet.
+ If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+ also be accepted as an image. If height and/or width are passed, the control image is resized
+ accordingly. If multiple ControlNets are specified in init, images must be passed as a list such that
+ each element of the list can be correctly batched for input to a single ControlNet.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2 of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.8):
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+ corresponding scale as a list. Note that by default, this pipeline uses a smaller conditioning scale
+ than [`~StableDiffusionControlNetPipeline.__call__`].
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height, width = self._default_height_width(height, width, image)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ control_image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ controlnet_conditioning_scale,
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ global_pool_conditions = (
+ controlnet.config.global_pool_conditions
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+ # 4. Prepare the init image
+ image = prepare_image(image)
+
+ # 5. Prepare the ControlNet conditioning image
+ if isinstance(controlnet, ControlNetModel):
+ control_image = self.prepare_control_image(
+ image=control_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+ elif isinstance(controlnet, MultiControlNetModel):
+ control_images = []
+
+ for control_image_ in control_image:
+ control_image_ = self.prepare_control_image(
+ image=control_image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+
+ control_images.append(control_image_)
+
+ control_image = control_images
+ else:
+ assert False
+
+ # 6. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+ # 7. Prepare latent variables
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
+
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 9. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # controlnet(s) inference
+ if guess_mode and do_classifier_free_guidance:
+ # Infer ControlNet only for the conditional batch.
+ controlnet_latent_model_input = latents
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+ else:
+ controlnet_latent_model_input = latent_model_input
+ controlnet_prompt_embeds = prompt_embeds
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ controlnet_latent_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=control_image,
+ conditioning_scale=controlnet_conditioning_scale,
+ guess_mode=guess_mode,
+ return_dict=False,
+ )
+
+ if guess_mode and do_classifier_free_guidance:
+ # Inferred ControlNet only for the conditional batch.
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
+ # add 0 to the unconditional batch to keep it unchanged.
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
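+ # classifier-free guidance combines the two halves of the batch as
+ # uncond + guidance_scale * (text - uncond), i.e. it extrapolates away from the unconditional
+ # prediction toward the text-conditioned one; guidance_scale=1 recovers the conditional prediction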
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ # If model CPU offloading is enabled, offload the unet and controlnet
+ # manually for max memory savings
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.unet.to("cpu")
+ self.controlnet.to("cpu")
+ torch.cuda.empty_cache()
+
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
+
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
new file mode 100644
index 000000000000..a146a1cc2908
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
@@ -0,0 +1,1228 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import inspect
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+ PIL_INTERPOLATION,
+ is_accelerate_available,
+ is_accelerate_version,
+ is_compiled_module,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from .multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> # !pip install opencv-python transformers accelerate
+ >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
+ >>> from diffusers.utils import load_image
+ >>> import numpy as np
+ >>> import torch
+
+ >>> import cv2
+ >>> from PIL import Image
+
+ >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+ >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+ >>> init_image = load_image(img_url).resize((512, 512))
+ >>> mask_image = load_image(mask_url).resize((512, 512))
+
+ >>> image = np.array(init_image)
+
+ >>> # get canny image
+ >>> image = cv2.Canny(image, 100, 200)
+ >>> image = image[:, :, None]
+ >>> image = np.concatenate([image, image, image], axis=2)
+ >>> canny_image = Image.fromarray(image)
+
+ >>> # load control net and stable diffusion inpainting
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+ >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+ ... "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
+ ... )
+
+ >>> # speed up diffusion process with faster scheduler and memory optimization
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+ >>> pipe.enable_model_cpu_offload()
+
+ >>> # generate image
+ >>> generator = torch.manual_seed(0)
+ >>> image = pipe(
+ ... "spiderman",
+ ... num_inference_steps=30,
+ ... generator=generator,
+ ... image=init_image,
+ ... mask_image=mask_image,
+ ... control_image=canny_image,
+ ... ).images[0]
+ ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
+def prepare_mask_and_masked_image(image, mask, height, width):
+ """
+ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+ converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+ ``image`` and ``1`` for the ``mask``.
+
+ The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+ binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+ Args:
+ image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+ It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+ ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+ mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
+ It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+ ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+
+
+ Raises:
+ ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+ should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+ TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+ (or the other way around).
+
+ Returns:
+ tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+ dimensions: ``batch x channels x height x width``.
+ """
+
+ if image is None:
+ raise ValueError("`image` input cannot be undefined.")
+
+ if mask is None:
+ raise ValueError("`mask_image` input cannot be undefined.")
+
+ if isinstance(image, torch.Tensor):
+ if not isinstance(mask, torch.Tensor):
+ raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+ # Batch single image
+ if image.ndim == 3:
+ assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+ image = image.unsqueeze(0)
+
+ # Batch and add channel dim for single mask
+ if mask.ndim == 2:
+ mask = mask.unsqueeze(0).unsqueeze(0)
+
+ # Batch single mask or add channel dim
+ if mask.ndim == 3:
+ # Single batched mask, no channel dim or single mask not batched but channel dim
+ if mask.shape[0] == 1:
+ mask = mask.unsqueeze(0)
+
+ # Batched masks no channel dim
+ else:
+ mask = mask.unsqueeze(1)
+
+ assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+ assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+ assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+ # Check image is in [-1, 1]
+ if image.min() < -1 or image.max() > 1:
+ raise ValueError("Image should be in [-1, 1] range")
+
+ # Check mask is in [0, 1]
+ if mask.min() < 0 or mask.max() > 1:
+ raise ValueError("Mask should be in [0, 1] range")
+
+ # Binarize mask
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+
+ # Image as float32
+ image = image.to(dtype=torch.float32)
+ elif isinstance(mask, torch.Tensor):
+ raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+ else:
+ # preprocess image
+ if isinstance(image, (PIL.Image.Image, np.ndarray)):
+ image = [image]
+ if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+ # resize all images w.r.t. the passed height and width
+ image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
+ image = [np.array(i.convert("RGB"))[None, :] for i in image]
+ image = np.concatenate(image, axis=0)
+ elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+ image = np.concatenate([i[None, :] for i in image], axis=0)
+
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+ # preprocess mask
+ if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+ mask = [mask]
+
+ if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+ mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+ mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+ mask = mask.astype(np.float32) / 255.0
+ elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+ mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+ mask = torch.from_numpy(mask)
+
+ masked_image = image * (mask < 0.5)
+
+ return mask, masked_image
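+
+ # Usage sketch (comment only, assuming hypothetical `init_image` and `mask_image` PIL inputs): the
+ # helper resizes both to (width, height), normalizes the image to [-1, 1], binarizes the mask at 0.5,
+ # and zeroes out the region to be inpainted, e.g.
+ #
+ # mask, masked_image = prepare_mask_and_masked_image(init_image, mask_image, 512, 512)
+ # # mask.shape == (1, 1, 512, 512), masked_image.shape == (1, 3, 512, 512)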
+
+
+class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion with ControlNet guidance.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ In addition the pipeline inherits the following loading methods:
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+ as a list, the outputs from each ControlNet are added together to create one combined additional
+ conditioning.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPImageProcessor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPImageProcessor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ )
+
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("Please install accelerate via `pip install accelerate`")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ # the safety checker can offload the vae again
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # the ControlNet hook has to be manually offloaded, as it alternates with the unet
+ cpu_offload_with_hook(self.controlnet, device)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ # textual inversion: process multi-vector tokens if necessary
+ if isinstance(self, TextualInversionLoaderMixin):
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
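+ # Shape sketch (illustrative, assuming the standard CLIP text encoder with hidden size 768 and a
+ # maximum length of 77 tokens): with `batch_size=2`, `num_images_per_prompt=3` and classifier-free
+ # guidance enabled, `_encode_prompt` returns a single tensor of shape
+ # (2 * batch_size * num_images_per_prompt, 77, 768) = (12, 77, 768), where the first half holds the
+ # negative (unconditional) embeddings and the second half the positive ones, matching the
+ # `torch.cat([negative_prompt_embeds, prompt_embeds])` above.
+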
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is None:
+ has_nsfw_concept = None
+ else:
+ if torch.is_tensor(image):
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+ else:
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ return image, has_nsfw_concept
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+ def decode_latents(self, latents):
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
+ " use VaeImageProcessor instead",
+ FutureWarning,
+ )
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents, return_dict=False)[0]
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
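+ # Minimal sketch of the introspection used above (illustrative only): `inspect.signature` is what
+ # decides whether a given scheduler understands `eta` and `generator`, e.g.
+ # >>> import inspect
+ # >>> from diffusers import DDIMScheduler, EulerDiscreteScheduler
+ # >>> "eta" in inspect.signature(DDIMScheduler().step).parameters  # True, so `eta` is forwarded
+ # >>> "eta" in inspect.signature(EulerDiscreteScheduler().step).parameters  # False, so `eta` is dropped
+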
+ def check_inputs(
+ self,
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ # `prompt` needs more sophisticated handling when there are multiple
+ # conditionings.
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(prompt, list):
+ logger.warning(
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+ " prompts. The conditionings will be fixed across the prompts."
+ )
+
+ # Check `image`
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ self.check_image(image, prompt, prompt_embeds)
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if not isinstance(image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+ # When `image` is a nested list:
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+ elif any(isinstance(i, list) for i in image):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif len(image) != len(self.controlnet.nets):
+ raise ValueError(
+ "For multiple controlnets: `image` must have the same length as the number of controlnets."
+ )
+
+ for image_ in image:
+ self.check_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if isinstance(controlnet_conditioning_scale, list):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
+
+ def check_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, torch.Tensor)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
+ raise TypeError(
+ "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ elif image_is_tensor:
+ image_batch_size = image.shape[0]
+ elif image_is_pil_list:
+ image_batch_size = len(image)
+ elif image_is_tensor_list:
+ image_batch_size = len(image)
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+ def prepare_control_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ if not isinstance(image, torch.Tensor):
+ if isinstance(image, PIL.Image.Image):
+ image = [image]
+
+ if isinstance(image[0], PIL.Image.Image):
+ images = []
+
+ for image_ in image:
+ image_ = image_.convert("RGB")
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = np.array(image_)
+ image_ = image_[None, :]
+ images.append(image_)
+
+ image = images
+
+ image = np.concatenate(image, axis=0)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ elif isinstance(image[0], torch.Tensor):
+ image = torch.cat(image, dim=0)
+
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, dim=0)
+
+ image = image.to(device=device, dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = torch.cat([image] * 2)
+
+ return image
+
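+ # Shape sketch (illustrative): a single 512x512 PIL control image with `batch_size=2`,
+ # `num_images_per_prompt=1` and classifier-free guidance enabled becomes a float tensor of shape
+ # (1, 3, 512, 512), is repeated to (2, 3, 512, 512) via `repeat_interleave`, and is finally doubled
+ # to (4, 3, 512, 512) by the `torch.cat([image] * 2)` above (unless `guess_mode` is set).
+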
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
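+ # Shape sketch (illustrative): with the usual `vae_scale_factor` of 8, `num_channels_latents=4`
+ # (the VAE latent channels) and a 512x512 output, `prepare_latents` draws noise of shape
+ # (batch_size * num_images_per_prompt, 4, 512 // 8, 512 // 8) = (N, 4, 64, 64) and scales it by
+ # `self.scheduler.init_noise_sigma`.
+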
+ def _default_height_width(self, height, width, image):
+ # NOTE: It is possible that the images in a list have different dimensions,
+ # so checking only the first image is not _exactly_ correct, but it keeps
+ # things simple.
+ while isinstance(image, list):
+ image = image[0]
+
+ if height is None:
+ if isinstance(image, PIL.Image.Image):
+ height = image.height
+ elif isinstance(image, torch.Tensor):
+ height = image.shape[2]
+
+ height = (height // 8) * 8 # round down to nearest multiple of 8
+
+ if width is None:
+ if isinstance(image, PIL.Image.Image):
+ width = image.width
+ elif isinstance(image, torch.Tensor):
+ width = image.shape[3]
+
+ width = (width // 8) * 8 # round down to nearest multiple of 8
+
+ return height, width
+
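+ # Worked example (illustrative): for a PIL control image with height 513 and width 769 and
+ # `height=None`, `width=None`, the method picks up 513 and 769 from the image and rounds both down
+ # to the nearest multiple of 8, i.e. height = (513 // 8) * 8 = 512 and width = (769 // 8) * 8 = 768.
+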
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
+ def prepare_mask_latents(
+ self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+ ):
+ # resize the mask to latents shape as we concatenate the mask to the latents
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+ # and half precision
+ mask = torch.nn.functional.interpolate(
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+ )
+ mask = mask.to(device=device, dtype=dtype)
+
+ masked_image = masked_image.to(device=device, dtype=dtype)
+
+ # encode the mask image into latents space so we can concatenate it to the latents
+ if isinstance(generator, list):
+ masked_image_latents = [
+ self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ masked_image_latents = torch.cat(masked_image_latents, dim=0)
+ else:
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+ masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+ if mask.shape[0] < batch_size:
+ if batch_size % mask.shape[0] != 0:
+ raise ValueError(
+ "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+ f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+ " of masks that you pass is divisible by the total requested batch size."
+ )
+ mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+ if masked_image_latents.shape[0] < batch_size:
+ if batch_size % masked_image_latents.shape[0] != 0:
+ raise ValueError(
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+ f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
+ )
+ masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
+
+ # align device to prevent device errors when concatenating it with the latent model input
+ masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+ return mask, masked_image_latents
+
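+ # Resolution sketch (illustrative): for a 512x512 generation with `vae_scale_factor=8`, the mask is
+ # interpolated down to 64x64 so it can later be concatenated channel-wise with the 64x64 latents,
+ # while the masked image is encoded through the VAE to the same 64x64 latent grid; under
+ # classifier-free guidance both tensors are duplicated along the batch dimension.
+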
+ # override DiffusionPipeline
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = False,
+ variant: Optional[str] = None,
+ ):
+ if isinstance(self.controlnet, ControlNetModel):
+ super().save_pretrained(save_directory, safe_serialization, variant)
+ else:
+ raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.")
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
+ control_image: Union[
+ torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]
+ ] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
+ guess_mode: bool = False,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
+ The original image to be inpainted. Regions selected by `mask_image` are repainted according to
+ `prompt`. The dimensions of the output image default to `image`'s dimensions; if height and/or width
+ are passed, `image` is resized accordingly.
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ The mask selecting the region of `image` to inpaint.
+ control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, or `List[PIL.Image.Image]`):
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance for the
+ UNet. If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as-is;
+ `PIL.Image.Image` can also be accepted as an image. If multiple ControlNets are specified in init,
+ images must be passed as a list such that each element of the list can be correctly batched for
+ input to a single ControlNet.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5):
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+ corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
+ than for [`~StableDiffusionControlNetPipeline.__call__`].
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ In this mode, the ControlNet encoder tries its best to recognize the content of the input image even if
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height, width = self._default_height_width(height, width, image)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ control_image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ controlnet_conditioning_scale,
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ global_pool_conditions = (
+ controlnet.config.global_pool_conditions
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Prepare image
+ if isinstance(controlnet, ControlNetModel):
+ control_image = self.prepare_control_image(
+ image=control_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+ elif isinstance(controlnet, MultiControlNetModel):
+ control_images = []
+
+ for control_image_ in control_image:
+ control_image_ = self.prepare_control_image(
+ image=control_image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+
+ control_images.append(control_image_)
+
+ control_image = control_images
+ else:
+ assert False
+
+ # 5. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 6. Prepare latent variables
+ num_channels_latents = self.vae.config.latent_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 7. Preprocess mask and image - resizing them w.r.t. height and width - and prepare mask latent variables
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width)
+ mask, masked_image_latents = self.prepare_mask_latents(
+ mask,
+ masked_image,
+ batch_size * num_images_per_prompt,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ do_classifier_free_guidance,
+ )
+
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 9. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ if guess_mode and do_classifier_free_guidance:
+ # Infer ControlNet only for the conditional batch.
+ controlnet_latent_model_input = latents
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+ else:
+ controlnet_latent_model_input = latent_model_input
+ controlnet_prompt_embeds = prompt_embeds
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ controlnet_latent_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=control_image,
+ conditioning_scale=controlnet_conditioning_scale,
+ guess_mode=guess_mode,
+ return_dict=False,
+ )
+
+ if guess_mode and do_classifier_free_guidance:
+ # ControlNet was inferred only for the conditional batch.
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
+ # add 0 to the unconditional batch to keep it unchanged.
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+ # predict the noise residual
+ latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ # If we do model cpu offloading, offload the unet and controlnet
+ # manually for max memory savings
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.unet.to("cpu")
+ self.controlnet.to("cpu")
+ torch.cuda.empty_cache()
+
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
+
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
new file mode 100644
index 000000000000..6003fc96b0ad
--- /dev/null
+++ b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py
@@ -0,0 +1,537 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from PIL import Image
+from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel
+from ...schedulers import (
+ FlaxDDIMScheduler,
+ FlaxDPMSolverMultistepScheduler,
+ FlaxLMSDiscreteScheduler,
+ FlaxPNDMScheduler,
+)
+from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
+from ..stable_diffusion import FlaxStableDiffusionPipelineOutput
+from ..stable_diffusion.safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+# Set to True to use python for loop instead of jax.fori_loop for easier debugging
+DEBUG = False
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import jax
+ >>> import numpy as np
+ >>> import jax.numpy as jnp
+ >>> from flax.jax_utils import replicate
+ >>> from flax.training.common_utils import shard
+ >>> from diffusers.utils import load_image
+ >>> from PIL import Image
+ >>> from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel
+
+
+ >>> def image_grid(imgs, rows, cols):
+ ... w, h = imgs[0].size
+ ... grid = Image.new("RGB", size=(cols * w, rows * h))
+ ... for i, img in enumerate(imgs):
+ ... grid.paste(img, box=(i % cols * w, i // cols * h))
+ ... return grid
+
+
+ >>> def create_key(seed=0):
+ ... return jax.random.PRNGKey(seed)
+
+
+ >>> rng = create_key(0)
+
+ >>> # get canny image
+ >>> canny_image = load_image(
+ ... "https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/blog_post_cell_10_output_0.jpeg"
+ ... )
+
+ >>> prompts = "best quality, extremely detailed"
+ >>> negative_prompts = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+ >>> # load control net and stable diffusion v1-5
+ >>> controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+ ... "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32
+ ... )
+ >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32
+ ... )
+ >>> params["controlnet"] = controlnet_params
+
+ >>> num_samples = jax.device_count()
+ >>> rng = jax.random.split(rng, jax.device_count())
+
+ >>> prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples)
+ >>> negative_prompt_ids = pipe.prepare_text_inputs([negative_prompts] * num_samples)
+ >>> processed_image = pipe.prepare_image_inputs([canny_image] * num_samples)
+
+ >>> p_params = replicate(params)
+ >>> prompt_ids = shard(prompt_ids)
+ >>> negative_prompt_ids = shard(negative_prompt_ids)
+ >>> processed_image = shard(processed_image)
+
+ >>> output = pipe(
+ ... prompt_ids=prompt_ids,
+ ... image=processed_image,
+ ... params=p_params,
+ ... prng_seed=rng,
+ ... num_inference_steps=50,
+ ... neg_prompt_ids=negative_prompt_ids,
+ ... jit=True,
+ ... ).images
+
+ >>> output_images = pipe.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
+ >>> output_images = image_grid(output_images, num_samples // 4, 4)
+ >>> output_images.save("generated_image.png")
+ ```
+"""
+
+
+class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet Guidance.
+
+ This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`FlaxAutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`FlaxCLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.FlaxCLIPTextModel),
+ specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ controlnet ([`FlaxControlNetModel`]):
+ Provides additional conditioning to the unet during the denoising process.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+ [`FlaxDPMSolverMultistepScheduler`].
+ safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: FlaxAutoencoderKL,
+ text_encoder: FlaxCLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: FlaxUNet2DConditionModel,
+ controlnet: FlaxControlNetModel,
+ scheduler: Union[
+ FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+ ],
+ safety_checker: FlaxStableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ dtype: jnp.dtype = jnp.float32,
+ ):
+ super().__init__()
+ self.dtype = dtype
+
+ if safety_checker is None:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+ def prepare_text_inputs(self, prompt: Union[str, List[str]]):
+ if not isinstance(prompt, (str, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ text_input = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="np",
+ )
+
+ return text_input.input_ids
+
+ def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
+ if not isinstance(image, (Image.Image, list)):
+ raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+ if isinstance(image, Image.Image):
+ image = [image]
+
+ processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
+
+ return processed_images
+
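+ # Shape sketch (illustrative): passing a list of N PIL images to `prepare_image_inputs` yields a
+ # jnp.float32 array of shape (N, 3, H, W), since `preprocess` returns one (1, 3, H, W) array per
+ # image and the results are concatenated along the batch axis.
+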
+ def _get_has_nsfw_concepts(self, features, params):
+ has_nsfw_concepts = self.safety_checker(features, params)
+ return has_nsfw_concepts
+
+ def _run_safety_checker(self, images, safety_model_params, jit=False):
+ # safety_model_params should already be replicated when jit is True
+ pil_images = [Image.fromarray(image) for image in images]
+ features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+ if jit:
+ features = shard(features)
+ has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+ has_nsfw_concepts = unshard(has_nsfw_concepts)
+ safety_model_params = unreplicate(safety_model_params)
+ else:
+ has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+ images_was_copied = False
+ for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+ if has_nsfw_concept:
+ if not images_was_copied:
+ images_was_copied = True
+ images = images.copy()
+
+ images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image
+
+ if any(has_nsfw_concepts):
+ warnings.warn(
+ "Potential NSFW content was detected in one or more images. A black image will be returned"
+ " instead. Try again with a different prompt and/or seed."
+ )
+
+ return images, has_nsfw_concepts
+
+ def _generate(
+ self,
+ prompt_ids: jnp.array,
+ image: jnp.array,
+ params: Union[Dict, FrozenDict],
+ prng_seed: jax.random.KeyArray,
+ num_inference_steps: int,
+ guidance_scale: float,
+ latents: Optional[jnp.array] = None,
+ neg_prompt_ids: Optional[jnp.array] = None,
+ controlnet_conditioning_scale: float = 1.0,
+ ):
+ height, width = image.shape[-2:]
+ if height % 64 != 0 or width % 64 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
+
+ # get prompt text embeddings
+ prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+ # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+ # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+ batch_size = prompt_ids.shape[0]
+
+ max_length = prompt_ids.shape[-1]
+
+ if neg_prompt_ids is None:
+ uncond_input = self.tokenizer(
+ [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+ ).input_ids
+ else:
+ uncond_input = neg_prompt_ids
+ negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+ context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+
+ image = jnp.concatenate([image] * 2)
+
+ latents_shape = (
+ batch_size,
+ self.unet.config.in_channels,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+ if latents is None:
+ latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+ def loop_body(step, args):
+ latents, scheduler_state = args
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ latents_input = jnp.concatenate([latents] * 2)
+
+ t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+ timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+ latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
+ {"params": params["controlnet"]},
+ jnp.array(latents_input),
+ jnp.array(timestep, dtype=jnp.int32),
+ encoder_hidden_states=context,
+ controlnet_cond=image,
+ conditioning_scale=controlnet_conditioning_scale,
+ return_dict=False,
+ )
+
+ # predict the noise residual
+ noise_pred = self.unet.apply(
+ {"params": params["unet"]},
+ jnp.array(latents_input),
+ jnp.array(timestep, dtype=jnp.int32),
+ encoder_hidden_states=context,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ ).sample
+
+ # perform guidance
+ noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+ return latents, scheduler_state
+
+ scheduler_state = self.scheduler.set_timesteps(
+ params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape
+ )
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * params["scheduler"].init_noise_sigma
+
+ if DEBUG:
+ # run with python for loop
+ for i in range(num_inference_steps):
+ latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+ else:
+ latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
+
+ # scale and decode the image latents with vae
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+ image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+ return image
+
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt_ids: jnp.array,
+ image: jnp.array,
+ params: Union[Dict, FrozenDict],
+ prng_seed: jax.random.KeyArray,
+ num_inference_steps: int = 50,
+ guidance_scale: Union[float, jnp.array] = 7.5,
+ latents: jnp.array = None,
+ neg_prompt_ids: jnp.array = None,
+ controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
+ return_dict: bool = True,
+ jit: bool = False,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt_ids (`jnp.array`):
+ The prompt or prompts to guide the image generation.
+ image (`jnp.array`):
+ Array representing the ControlNet input condition. ControlNet uses this input condition to generate
+ guidance for the UNet.
+ params (`Dict` or `FrozenDict`): Dictionary containing the model parameters/weights
+ prng_seed (`jax.random.KeyArray` or `jax.Array`): Array containing random number generator key
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ latents (`jnp.array`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ controlnet_conditioning_scale (`float` or `jnp.array`, *optional*, defaults to 1.0):
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original unet.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+ a plain tuple.
+ jit (`bool`, defaults to `False`):
+ Whether to run `pmap` versions of the generation and safety scoring functions. NOTE: This argument
+ exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a future release.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+ element is a list of `bool`s denoting whether the corresponding generated image likely represents
+ "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+ """
+
+ height, width = image.shape[-2:]
+
+ if isinstance(guidance_scale, float):
+ # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+ # shape information, as they may be sharded (when `jit` is `True`), or not.
+ guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+ if len(prompt_ids.shape) > 2:
+ # Assume sharded
+ guidance_scale = guidance_scale[:, None]
+
+ if isinstance(controlnet_conditioning_scale, float):
+ # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+ # shape information, as they may be sharded (when `jit` is `True`), or not.
+ controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
+ if len(prompt_ids.shape) > 2:
+ # Assume sharded
+ controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
+
+ if jit:
+ images = _p_generate(
+ self,
+ prompt_ids,
+ image,
+ params,
+ prng_seed,
+ num_inference_steps,
+ guidance_scale,
+ latents,
+ neg_prompt_ids,
+ controlnet_conditioning_scale,
+ )
+ else:
+ images = self._generate(
+ prompt_ids,
+ image,
+ params,
+ prng_seed,
+ num_inference_steps,
+ guidance_scale,
+ latents,
+ neg_prompt_ids,
+ controlnet_conditioning_scale,
+ )
+
+ if self.safety_checker is not None:
+ safety_params = params["safety_checker"]
+ images_uint8_casted = (images * 255).round().astype("uint8")
+ num_devices, batch_size = images.shape[:2]
+
+ images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+ images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+ images = np.asarray(images)
+
+ # block images
+ if any(has_nsfw_concept):
+ for i, is_nsfw in enumerate(has_nsfw_concept):
+ if is_nsfw:
+ images[i] = np.asarray(images_uint8_casted[i])
+
+ images = images.reshape(num_devices, batch_size, height, width, 3)
+ else:
+ images = np.asarray(images)
+ has_nsfw_concept = False
+
+ if not return_dict:
+ return (images, has_nsfw_concept)
+
+ return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
+# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
+@partial(
+ jax.pmap,
+ in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0),
+ static_broadcasted_argnums=(0, 5),
+)
+def _p_generate(
+ pipe,
+ prompt_ids,
+ image,
+ params,
+ prng_seed,
+ num_inference_steps,
+ guidance_scale,
+ latents,
+ neg_prompt_ids,
+ controlnet_conditioning_scale,
+):
+ return pipe._generate(
+ prompt_ids,
+ image,
+ params,
+ prng_seed,
+ num_inference_steps,
+ guidance_scale,
+ latents,
+ neg_prompt_ids,
+ controlnet_conditioning_scale,
+ )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+ return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+ # einops.rearrange(x, 'd b ... -> (d b) ...')
+ num_devices, batch_size = x.shape[:2]
+ rest = x.shape[2:]
+ return x.reshape(num_devices * batch_size, *rest)
+
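+# Shape sketch (illustrative): `unshard` folds the leading device axis into the batch axis, e.g. an
+# array of shape (8, 1, 512, 512, 3) produced across 8 devices becomes (8, 512, 512, 3).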
+
+def preprocess(image, dtype):
+ image = image.convert("RGB")
+ w, h = image.size
+ w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64
+ image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+ image = jnp.array(image).astype(dtype) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ return image
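+# Worked example (illustrative): a 640x517 PIL image is resized to 640x512 by the `x - x % 64`
+# rounding above, scaled to [0, 1], and returned as a float array of shape (1, 3, 512, 640).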
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index e3fe20e196d8..911a5018de18 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -8,10 +8,10 @@
from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging, randn_tensor
+from ..pipeline_utils import DiffusionPipeline
from . import SemanticStableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index b89dde319cb3..f39ae67a9aff 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -45,7 +45,6 @@ class StableDiffusionPipelineOutput(BaseOutput):
from .pipeline_cycle_diffusion import CycleDiffusionPipeline
from .pipeline_stable_diffusion import StableDiffusionPipeline
from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
- from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline
from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
@@ -130,7 +129,6 @@ class FlaxStableDiffusionPipelineOutput(BaseOutput):
from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
- from .pipeline_flax_stable_diffusion_controlnet import FlaxStableDiffusionControlNetPipeline
from .pipeline_flax_stable_diffusion_img2img import FlaxStableDiffusionImg2ImgPipeline
from .pipeline_flax_stable_diffusion_inpaint import FlaxStableDiffusionInpaintPipeline
from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py
index 7035242a0cda..bec2424ece4d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py
@@ -12,526 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import warnings
-from functools import partial
-from typing import Dict, List, Optional, Union
+# NOTE: This file is deprecated and will be removed in a future version.
+# It only exists so that temporarily `from diffusers.pipelines import DiffusionPipeline` works
-import jax
-import jax.numpy as jnp
-import numpy as np
-from flax.core.frozen_dict import FrozenDict
-from flax.jax_utils import unreplicate
-from flax.training.common_utils import shard
-from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
+from ...utils import deprecate
+from ..controlnet.pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline # noqa: F401
-from ...models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel
-from ...schedulers import (
- FlaxDDIMScheduler,
- FlaxDPMSolverMultistepScheduler,
- FlaxLMSDiscreteScheduler,
- FlaxPNDMScheduler,
-)
-from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring
-from ..pipeline_flax_utils import FlaxDiffusionPipeline
-from . import FlaxStableDiffusionPipelineOutput
-from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
-
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-# Set to True to use python for loop instead of jax.fori_loop for easier debugging
-DEBUG = False
-
-EXAMPLE_DOC_STRING = """
- Examples:
- ```py
- >>> import jax
- >>> import numpy as np
- >>> import jax.numpy as jnp
- >>> from flax.jax_utils import replicate
- >>> from flax.training.common_utils import shard
- >>> from diffusers.utils import load_image
- >>> from PIL import Image
- >>> from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel
-
-
- >>> def image_grid(imgs, rows, cols):
- ... w, h = imgs[0].size
- ... grid = Image.new("RGB", size=(cols * w, rows * h))
- ... for i, img in enumerate(imgs):
- ... grid.paste(img, box=(i % cols * w, i // cols * h))
- ... return grid
-
-
- >>> def create_key(seed=0):
- ... return jax.random.PRNGKey(seed)
-
-
- >>> rng = create_key(0)
-
- >>> # get canny image
- >>> canny_image = load_image(
- ... "https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/blog_post_cell_10_output_0.jpeg"
- ... )
-
- >>> prompts = "best quality, extremely detailed"
- >>> negative_prompts = "monochrome, lowres, bad anatomy, worst quality, low quality"
-
- >>> # load control net and stable diffusion v1-5
- >>> controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
- ... "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32
- ... )
- >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
- ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32
- ... )
- >>> params["controlnet"] = controlnet_params
-
- >>> num_samples = jax.device_count()
- >>> rng = jax.random.split(rng, jax.device_count())
-
- >>> prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples)
- >>> negative_prompt_ids = pipe.prepare_text_inputs([negative_prompts] * num_samples)
- >>> processed_image = pipe.prepare_image_inputs([canny_image] * num_samples)
-
- >>> p_params = replicate(params)
- >>> prompt_ids = shard(prompt_ids)
- >>> negative_prompt_ids = shard(negative_prompt_ids)
- >>> processed_image = shard(processed_image)
-
- >>> output = pipe(
- ... prompt_ids=prompt_ids,
- ... image=processed_image,
- ... params=p_params,
- ... prng_seed=rng,
- ... num_inference_steps=50,
- ... neg_prompt_ids=negative_prompt_ids,
- ... jit=True,
- ... ).images
-
- >>> output_images = pipe.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
- >>> output_images = image_grid(output_images, num_samples // 4, 4)
- >>> output_images.save("generated_image.png")
- ```
-"""
-
-
-class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
- r"""
- Pipeline for text-to-image generation using Stable Diffusion with ControlNet Guidance.
-
- This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods the
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
- Args:
- vae ([`FlaxAutoencoderKL`]):
- Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
- text_encoder ([`FlaxCLIPTextModel`]):
- Frozen text-encoder. Stable Diffusion uses the text portion of
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.FlaxCLIPTextModel),
- specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
- tokenizer (`CLIPTokenizer`):
- Tokenizer of class
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
- unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        controlnet ([`FlaxControlNetModel`]):
- Provides additional conditioning to the unet during the denoising process.
- scheduler ([`SchedulerMixin`]):
- A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
- [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
- [`FlaxDPMSolverMultistepScheduler`].
- safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
- Classification module that estimates whether generated images could be considered offensive or harmful.
- Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
- feature_extractor ([`CLIPFeatureExtractor`]):
- Model that extracts features from generated images to be used as inputs for the `safety_checker`.
- """
-
- def __init__(
- self,
- vae: FlaxAutoencoderKL,
- text_encoder: FlaxCLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: FlaxUNet2DConditionModel,
- controlnet: FlaxControlNetModel,
- scheduler: Union[
- FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
- ],
- safety_checker: FlaxStableDiffusionSafetyChecker,
- feature_extractor: CLIPFeatureExtractor,
- dtype: jnp.dtype = jnp.float32,
- ):
- super().__init__()
- self.dtype = dtype
-
- if safety_checker is None:
- logger.warn(
- f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
- " results in services or applications open to the public. Both the diffusers team and Hugging Face"
- " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
- " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
- " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
- )
-
- self.register_modules(
- vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- unet=unet,
- controlnet=controlnet,
- scheduler=scheduler,
- safety_checker=safety_checker,
- feature_extractor=feature_extractor,
- )
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-
- def prepare_text_inputs(self, prompt: Union[str, List[str]]):
- if not isinstance(prompt, (str, list)):
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
- text_input = self.tokenizer(
- prompt,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_tensors="np",
- )
-
- return text_input.input_ids
-
- def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
- if not isinstance(image, (Image.Image, list)):
- raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
-
- if isinstance(image, Image.Image):
- image = [image]
-
- processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
-
- return processed_images
-
- def _get_has_nsfw_concepts(self, features, params):
- has_nsfw_concepts = self.safety_checker(features, params)
- return has_nsfw_concepts
-
- def _run_safety_checker(self, images, safety_model_params, jit=False):
- # safety_model_params should already be replicated when jit is True
- pil_images = [Image.fromarray(image) for image in images]
- features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
-
- if jit:
- features = shard(features)
- has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
- has_nsfw_concepts = unshard(has_nsfw_concepts)
- safety_model_params = unreplicate(safety_model_params)
- else:
- has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
-
- images_was_copied = False
- for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
- if has_nsfw_concept:
- if not images_was_copied:
- images_was_copied = True
- images = images.copy()
-
- images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image
-
- if any(has_nsfw_concepts):
- warnings.warn(
- "Potential NSFW content was detected in one or more images. A black image will be returned"
- " instead. Try again with a different prompt and/or seed."
- )
-
-        return images, has_nsfw_concepts
-
-    def _generate(
- self,
- prompt_ids: jnp.array,
- image: jnp.array,
- params: Union[Dict, FrozenDict],
- prng_seed: jax.random.KeyArray,
- num_inference_steps: int,
- guidance_scale: float,
- latents: Optional[jnp.array] = None,
- neg_prompt_ids: Optional[jnp.array] = None,
- controlnet_conditioning_scale: float = 1.0,
- ):
- height, width = image.shape[-2:]
- if height % 64 != 0 or width % 64 != 0:
- raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
-
- # get prompt text embeddings
- prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
-
- # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
- # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
- batch_size = prompt_ids.shape[0]
-
- max_length = prompt_ids.shape[-1]
-
- if neg_prompt_ids is None:
- uncond_input = self.tokenizer(
- [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
- ).input_ids
- else:
- uncond_input = neg_prompt_ids
- negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
- context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
-
- image = jnp.concatenate([image] * 2)
-
- latents_shape = (
- batch_size,
- self.unet.config.in_channels,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
- if latents is None:
- latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
- else:
- if latents.shape != latents_shape:
- raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-
- def loop_body(step, args):
- latents, scheduler_state = args
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
- latents_input = jnp.concatenate([latents] * 2)
-
- t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
- timestep = jnp.broadcast_to(t, latents_input.shape[0])
-
- latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
-
- down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
- {"params": params["controlnet"]},
- jnp.array(latents_input),
- jnp.array(timestep, dtype=jnp.int32),
- encoder_hidden_states=context,
- controlnet_cond=image,
- conditioning_scale=controlnet_conditioning_scale,
- return_dict=False,
- )
-
- # predict the noise residual
- noise_pred = self.unet.apply(
- {"params": params["unet"]},
- jnp.array(latents_input),
- jnp.array(timestep, dtype=jnp.int32),
- encoder_hidden_states=context,
- down_block_additional_residuals=down_block_res_samples,
- mid_block_additional_residual=mid_block_res_sample,
- ).sample
-
- # perform guidance
- noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
- noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
-
- # compute the previous noisy sample x_t -> x_t-1
- latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
- return latents, scheduler_state
-
- scheduler_state = self.scheduler.set_timesteps(
- params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape
- )
-
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * params["scheduler"].init_noise_sigma
-
- if DEBUG:
- # run with python for loop
- for i in range(num_inference_steps):
- latents, scheduler_state = loop_body(i, (latents, scheduler_state))
- else:
- latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
-
- # scale and decode the image latents with vae
- latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
-
- image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
- return image
-
- @replace_example_docstring(EXAMPLE_DOC_STRING)
- def __call__(
- self,
- prompt_ids: jnp.array,
- image: jnp.array,
- params: Union[Dict, FrozenDict],
- prng_seed: jax.random.KeyArray,
- num_inference_steps: int = 50,
- guidance_scale: Union[float, jnp.array] = 7.5,
- latents: jnp.array = None,
- neg_prompt_ids: jnp.array = None,
- controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
- return_dict: bool = True,
- jit: bool = False,
- ):
- r"""
- Function invoked when calling the pipeline for generation.
-
- Args:
- prompt_ids (`jnp.array`):
- The prompt or prompts to guide the image generation.
- image (`jnp.array`):
-                Array representing the ControlNet input condition. ControlNet uses this input condition to generate
-                guidance for the UNet.
- params (`Dict` or `FrozenDict`): Dictionary containing the model parameters/weights
- prng_seed (`jax.random.KeyArray` or `jax.Array`): Array containing random number generator key
- num_inference_steps (`int`, *optional*, defaults to 50):
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
- expense of slower inference.
- guidance_scale (`float`, *optional*, defaults to 7.5):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
- latents (`jnp.array`, *optional*):
- Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
- controlnet_conditioning_scale (`float` or `jnp.array`, *optional*, defaults to 1.0):
- The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
- to the residual in the original unet.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
- a plain tuple.
- jit (`bool`, defaults to `False`):
- Whether to run `pmap` versions of the generation and safety scoring functions. NOTE: This argument
- exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a future release.
-
- Examples:
-
- Returns:
- [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
- [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
- element is a list of `bool`s denoting whether the corresponding generated image likely represents
- "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
- """
-
- height, width = image.shape[-2:]
-
- if isinstance(guidance_scale, float):
- # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
- # shape information, as they may be sharded (when `jit` is `True`), or not.
- guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
- if len(prompt_ids.shape) > 2:
- # Assume sharded
- guidance_scale = guidance_scale[:, None]
-
- if isinstance(controlnet_conditioning_scale, float):
- # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
- # shape information, as they may be sharded (when `jit` is `True`), or not.
- controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
- if len(prompt_ids.shape) > 2:
- # Assume sharded
- controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
-
- if jit:
- images = _p_generate(
- self,
- prompt_ids,
- image,
- params,
- prng_seed,
- num_inference_steps,
- guidance_scale,
- latents,
- neg_prompt_ids,
- controlnet_conditioning_scale,
- )
- else:
- images = self._generate(
- prompt_ids,
- image,
- params,
- prng_seed,
- num_inference_steps,
- guidance_scale,
- latents,
- neg_prompt_ids,
- controlnet_conditioning_scale,
- )
-
- if self.safety_checker is not None:
- safety_params = params["safety_checker"]
- images_uint8_casted = (images * 255).round().astype("uint8")
- num_devices, batch_size = images.shape[:2]
-
- images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
- images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
- images = np.asarray(images)
-
- # block images
- if any(has_nsfw_concept):
- for i, is_nsfw in enumerate(has_nsfw_concept):
- if is_nsfw:
- images[i] = np.asarray(images_uint8_casted[i])
-
- images = images.reshape(num_devices, batch_size, height, width, 3)
- else:
- images = np.asarray(images)
- has_nsfw_concept = False
-
- if not return_dict:
- return (images, has_nsfw_concept)
-
- return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
-
-
-# Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
-# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
-@partial(
- jax.pmap,
- in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0),
- static_broadcasted_argnums=(0, 5),
+deprecate(
+ "stable diffusion controlnet",
+ "0.22.0",
+    "Importing `FlaxStableDiffusionControlNetPipeline` from diffusers.pipelines.stable_diffusion.pipeline_flax_stable_diffusion_controlnet is deprecated. Please import `from diffusers import FlaxStableDiffusionControlNetPipeline` instead.",
+ standard_warn=False,
+ stacklevel=3,
)
-def _p_generate(
- pipe,
- prompt_ids,
- image,
- params,
- prng_seed,
- num_inference_steps,
- guidance_scale,
- latents,
- neg_prompt_ids,
- controlnet_conditioning_scale,
-):
- return pipe._generate(
- prompt_ids,
- image,
- params,
- prng_seed,
- num_inference_steps,
- guidance_scale,
- latents,
- neg_prompt_ids,
- controlnet_conditioning_scale,
- )
-
-
-@partial(jax.pmap, static_broadcasted_argnums=(0,))
-def _p_get_has_nsfw_concepts(pipe, features, params):
- return pipe._get_has_nsfw_concepts(features, params)
-
-
-def unshard(x: jnp.ndarray):
- # einops.rearrange(x, 'd b ... -> (d b) ...')
- num_devices, batch_size = x.shape[:2]
- rest = x.shape[2:]
- return x.reshape(num_devices * batch_size, *rest)
-
-
-def preprocess(image, dtype):
- image = image.convert("RGB")
- w, h = image.size
- w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64
- image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
- image = jnp.array(image).astype(dtype) / 255.0
- image = image[None].transpose(0, 3, 1, 2)
- return image
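The module above is reduced to a thin shim: it re-exports `FlaxStableDiffusionControlNetPipeline` from the `controlnet` subpackage and calls `deprecate(...)` at import time, so the legacy path keeps working until 0.22.0 while steering users toward the new import. A rough sketch of the expected behavior, assuming the shim re-exports the very same class object:

```py
# Sketch of the expected shim behavior (assumption: both paths resolve to the
# same class object; the legacy path additionally emits a deprecation notice).
from diffusers import FlaxStableDiffusionControlNetPipeline as canonical
from diffusers.pipelines.stable_diffusion.pipeline_flax_stable_diffusion_controlnet import (
    FlaxStableDiffusionControlNetPipeline as legacy,
)

assert legacy is canonical
```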
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index 1cef221ea6e1..c7555e2ebad4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -12,1093 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-
-import inspect
-import os
-import warnings
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import PIL.Image
-import torch
-import torch.nn.functional as F
-from torch import nn
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-
-from ...image_processor import VaeImageProcessor
-from ...loaders import TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
-from ...models.controlnet import ControlNetOutput
-from ...models.modeling_utils import ModelMixin
-from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (
- PIL_INTERPOLATION,
- is_accelerate_available,
- is_accelerate_version,
- logging,
- randn_tensor,
- replace_example_docstring,
+# NOTE: This file is deprecated and will be removed in a future version.
+# It only exists so that `from diffusers.pipelines import DiffusionPipeline` temporarily keeps working
+from ...utils import deprecate
+from ..controlnet.multicontrolnet import MultiControlNetModel # noqa: F401
+from ..controlnet.pipeline_controlnet import StableDiffusionControlNetPipeline # noqa: F401
+
+
+deprecate(
+ "stable diffusion controlnet",
+ "0.22.0",
+    "Importing `StableDiffusionControlNetPipeline` or `MultiControlNetModel` from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet is deprecated. Please import `from diffusers import StableDiffusionControlNetPipeline` and `from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel` instead.",
+ standard_warn=False,
+ stacklevel=3,
)
-from ..pipeline_utils import DiffusionPipeline
-from . import StableDiffusionPipelineOutput
-from .safety_checker import StableDiffusionSafetyChecker
-
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
- Examples:
- ```py
- >>> # !pip install opencv-python transformers accelerate
- >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
- >>> from diffusers.utils import load_image
- >>> import numpy as np
- >>> import torch
-
- >>> import cv2
- >>> from PIL import Image
-
- >>> # download an image
- >>> image = load_image(
- ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
- ... )
- >>> image = np.array(image)
-
- >>> # get canny image
- >>> image = cv2.Canny(image, 100, 200)
- >>> image = image[:, :, None]
- >>> image = np.concatenate([image, image, image], axis=2)
- >>> canny_image = Image.fromarray(image)
-
- >>> # load control net and stable diffusion v1-5
- >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
- >>> pipe = StableDiffusionControlNetPipeline.from_pretrained(
- ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
- ... )
-
- >>> # speed up diffusion process with faster scheduler and memory optimization
- >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
- >>> # remove following line if xformers is not installed
- >>> pipe.enable_xformers_memory_efficient_attention()
-
- >>> pipe.enable_model_cpu_offload()
-
- >>> # generate image
- >>> generator = torch.manual_seed(0)
- >>> image = pipe(
- ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image
- ... ).images[0]
- ```
-"""
-
-
-class MultiControlNetModel(ModelMixin):
- r"""
- Multiple `ControlNetModel` wrapper class for Multi-ControlNet
-
- This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be
- compatible with `ControlNetModel`.
-
- Args:
- controlnets (`List[ControlNetModel]`):
- Provides additional conditioning to the unet during the denoising process. You must set multiple
- `ControlNetModel` as a list.
- """
-
- def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
- super().__init__()
- self.nets = nn.ModuleList(controlnets)
-
- def forward(
- self,
- sample: torch.FloatTensor,
- timestep: Union[torch.Tensor, float, int],
- encoder_hidden_states: torch.Tensor,
-        controlnet_cond: List[torch.Tensor],
- conditioning_scale: List[float],
- class_labels: Optional[torch.Tensor] = None,
- timestep_cond: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- guess_mode: bool = False,
- return_dict: bool = True,
- ) -> Union[ControlNetOutput, Tuple]:
- for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
- down_samples, mid_sample = controlnet(
- sample,
- timestep,
- encoder_hidden_states,
- image,
- scale,
- class_labels,
- timestep_cond,
- attention_mask,
- cross_attention_kwargs,
- guess_mode,
- return_dict,
- )
-
- # merge samples
- if i == 0:
- down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
- else:
- down_block_res_samples = [
- samples_prev + samples_curr
- for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
- ]
- mid_block_res_sample += mid_sample
-
- return down_block_res_samples, mid_block_res_sample
-
-
-class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
- r"""
- Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
-
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
- In addition the pipeline inherits the following loading methods:
- - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
-
- Args:
- vae ([`AutoencoderKL`]):
- Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
- text_encoder ([`CLIPTextModel`]):
- Frozen text-encoder. Stable Diffusion uses the text portion of
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
- the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
- tokenizer (`CLIPTokenizer`):
- Tokenizer of class
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
- unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
- controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
- Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
- as a list, the outputs from each ControlNet are added together to create one combined additional
- conditioning.
- scheduler ([`SchedulerMixin`]):
- A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
- [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
- safety_checker ([`StableDiffusionSafetyChecker`]):
- Classification module that estimates whether generated images could be considered offensive or harmful.
- Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
- feature_extractor ([`CLIPImageProcessor`]):
- Model that extracts features from generated images to be used as inputs for the `safety_checker`.
- """
- _optional_components = ["safety_checker", "feature_extractor"]
-
- def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
- scheduler: KarrasDiffusionSchedulers,
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool = True,
- ):
- super().__init__()
-
- if safety_checker is None and requires_safety_checker:
- logger.warning(
- f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
- " results in services or applications open to the public. Both the diffusers team and Hugging Face"
- " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
- " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
- " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
- )
-
- if safety_checker is not None and feature_extractor is None:
- raise ValueError(
-                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
- " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
- )
-
- if isinstance(controlnet, (list, tuple)):
- controlnet = MultiControlNetModel(controlnet)
-
- self.register_modules(
- vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- unet=unet,
- controlnet=controlnet,
- scheduler=scheduler,
- safety_checker=safety_checker,
- feature_extractor=feature_extractor,
- )
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- self.register_to_config(requires_safety_checker=requires_safety_checker)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
-        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- # the safety checker can offload the vae again
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # control net hook has to be manually offloaded as it alternates with unet
- cpu_offload_with_hook(self.controlnet, device)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
- def _encode_prompt(
- self,
- prompt,
- device,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- ):
- r"""
- Encodes the prompt into text encoder hidden states.
-
- Args:
- prompt (`str` or `List[str]`, *optional*):
- prompt to be encoded
- device: (`torch.device`):
- torch device
- num_images_per_prompt (`int`):
- number of images that should be generated per prompt
- do_classifier_free_guidance (`bool`):
- whether to use classifier free guidance or not
- negative_prompt (`str` or `List[str]`, *optional*):
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
- less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
- provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
- argument.
- """
- if prompt is not None and isinstance(prompt, str):
- batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
-
- if prompt_embeds is None:
-            # textual inversion: process multi-vector tokens if necessary
- if isinstance(self, TextualInversionLoaderMixin):
- prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
-
- text_inputs = self.tokenizer(
- prompt,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_tensors="pt",
- )
- text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
- text_input_ids, untruncated_ids
- ):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
- )
- logger.warning(
- "The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
- )
-
- if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
- attention_mask = text_inputs.attention_mask.to(device)
- else:
- attention_mask = None
-
- prompt_embeds = self.text_encoder(
- text_input_ids.to(device),
- attention_mask=attention_mask,
- )
- prompt_embeds = prompt_embeds[0]
-
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
- bs_embed, seq_len, _ = prompt_embeds.shape
- # duplicate text embeddings for each generation per prompt, using mps friendly method
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
- # get unconditional embeddings for classifier free guidance
- if do_classifier_free_guidance and negative_prompt_embeds is None:
- uncond_tokens: List[str]
- if negative_prompt is None:
- uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(negative_prompt):
- raise TypeError(
-                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}."
- )
- elif isinstance(negative_prompt, str):
- uncond_tokens = [negative_prompt]
- elif batch_size != len(negative_prompt):
- raise ValueError(
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`."
- )
- else:
- uncond_tokens = negative_prompt
-
-            # textual inversion: process multi-vector tokens if necessary
- if isinstance(self, TextualInversionLoaderMixin):
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
- max_length = prompt_embeds.shape[1]
- uncond_input = self.tokenizer(
- uncond_tokens,
- padding="max_length",
- max_length=max_length,
- truncation=True,
- return_tensors="pt",
- )
-
- if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
- attention_mask = uncond_input.attention_mask.to(device)
- else:
- attention_mask = None
-
- negative_prompt_embeds = self.text_encoder(
- uncond_input.input_ids.to(device),
- attention_mask=attention_mask,
- )
- negative_prompt_embeds = negative_prompt_embeds[0]
-
- if do_classifier_free_guidance:
- # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
- seq_len = negative_prompt_embeds.shape[1]
-
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
- prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
- return prompt_embeds
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
- def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is None:
- has_nsfw_concept = None
- else:
- if torch.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
- else:
- feature_extractor_input = self.image_processor.numpy_to_pil(image)
- safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
- return image, has_nsfw_concept
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
- def decode_latents(self, latents):
- warnings.warn(
- "The decode_latents method is deprecated and will be removed in a future version. Please"
- " use VaeImageProcessor instead",
- FutureWarning,
- )
- latents = 1 / self.vae.config.scaling_factor * latents
- image = self.vae.decode(latents, return_dict=False)[0]
- image = (image / 2 + 0.5).clamp(0, 1)
- # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
- return image
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
- def prepare_extra_step_kwargs(self, generator, eta):
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
- # and should be between [0, 1]
-
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
- extra_step_kwargs = {}
- if accepts_eta:
- extra_step_kwargs["eta"] = eta
-
- # check if the scheduler accepts generator
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
- if accepts_generator:
- extra_step_kwargs["generator"] = generator
- return extra_step_kwargs
-
- def check_inputs(
- self,
- prompt,
- image,
- height,
- width,
- callback_steps,
- negative_prompt=None,
- prompt_embeds=None,
- negative_prompt_embeds=None,
- controlnet_conditioning_scale=1.0,
- ):
- if height % 8 != 0 or width % 8 != 0:
- raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
- if (callback_steps is None) or (
- callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
- ):
- raise ValueError(
- f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}."
- )
-
- if prompt is not None and prompt_embeds is not None:
- raise ValueError(
- f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two."
- )
- elif prompt is None and prompt_embeds is None:
- raise ValueError(
- "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
- )
- elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
- if negative_prompt is not None and negative_prompt_embeds is not None:
- raise ValueError(
- f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
- )
-
- if prompt_embeds is not None and negative_prompt_embeds is not None:
- if prompt_embeds.shape != negative_prompt_embeds.shape:
- raise ValueError(
- "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
- f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
- f" {negative_prompt_embeds.shape}."
- )
-
- # `prompt` needs more sophisticated handling when there are multiple
- # conditionings.
- if isinstance(self.controlnet, MultiControlNetModel):
- if isinstance(prompt, list):
- logger.warning(
- f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
- " prompts. The conditionings will be fixed across the prompts."
- )
-
- # Check `image`
- is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
- self.controlnet, torch._dynamo.eval_frame.OptimizedModule
- )
- if (
- isinstance(self.controlnet, ControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, ControlNetModel)
- ):
- self.check_image(image, prompt, prompt_embeds)
- elif (
- isinstance(self.controlnet, MultiControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
- ):
- if not isinstance(image, list):
- raise TypeError("For multiple controlnets: `image` must be type `list`")
-
- # When `image` is a nested list:
- # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
- elif any(isinstance(i, list) for i in image):
-                raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
- elif len(image) != len(self.controlnet.nets):
- raise ValueError(
- "For multiple controlnets: `image` must have the same length as the number of controlnets."
- )
-
- for image_ in image:
- self.check_image(image_, prompt, prompt_embeds)
- else:
- assert False
-
- # Check `controlnet_conditioning_scale`
- if (
- isinstance(self.controlnet, ControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, ControlNetModel)
- ):
- if not isinstance(controlnet_conditioning_scale, float):
- raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
- elif (
- isinstance(self.controlnet, MultiControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
- ):
- if isinstance(controlnet_conditioning_scale, list):
- if any(isinstance(i, list) for i in controlnet_conditioning_scale):
-                    raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
- elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
- self.controlnet.nets
- ):
- raise ValueError(
- "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
- " the same length as the number of controlnets"
- )
- else:
- assert False
-
- def check_image(self, image, prompt, prompt_embeds):
- image_is_pil = isinstance(image, PIL.Image.Image)
- image_is_tensor = isinstance(image, torch.Tensor)
- image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
- image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
-
- if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
- raise TypeError(
- "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
- )
-
- if image_is_pil:
- image_batch_size = 1
- elif image_is_tensor:
- image_batch_size = image.shape[0]
- elif image_is_pil_list:
- image_batch_size = len(image)
- elif image_is_tensor_list:
- image_batch_size = len(image)
-
- if prompt is not None and isinstance(prompt, str):
- prompt_batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- prompt_batch_size = len(prompt)
- elif prompt_embeds is not None:
- prompt_batch_size = prompt_embeds.shape[0]
-
- if image_batch_size != 1 and image_batch_size != prompt_batch_size:
- raise ValueError(
- f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
- )
-
- def prepare_image(
- self,
- image,
- width,
- height,
- batch_size,
- num_images_per_prompt,
- device,
- dtype,
- do_classifier_free_guidance=False,
- guess_mode=False,
- ):
- if not isinstance(image, torch.Tensor):
- if isinstance(image, PIL.Image.Image):
- image = [image]
-
- if isinstance(image[0], PIL.Image.Image):
- images = []
-
- for image_ in image:
- image_ = image_.convert("RGB")
- image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
- image_ = np.array(image_)
- image_ = image_[None, :]
- images.append(image_)
-
- image = images
-
- image = np.concatenate(image, axis=0)
- image = np.array(image).astype(np.float32) / 255.0
- image = image.transpose(0, 3, 1, 2)
- image = torch.from_numpy(image)
- elif isinstance(image[0], torch.Tensor):
- image = torch.cat(image, dim=0)
-
- image_batch_size = image.shape[0]
-
- if image_batch_size == 1:
- repeat_by = batch_size
- else:
- # image batch size is the same as prompt batch size
- repeat_by = num_images_per_prompt
-
- image = image.repeat_interleave(repeat_by, dim=0)
-
- image = image.to(device=device, dtype=dtype)
-
- if do_classifier_free_guidance and not guess_mode:
- image = torch.cat([image] * 2)
-
- return image
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
- if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
- )
-
- if latents is None:
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
- else:
- latents = latents.to(device)
-
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
- return latents
-
- def _default_height_width(self, height, width, image):
- # NOTE: It is possible that a list of images have different
- # dimensions for each image, so just checking the first image
- # is not _exactly_ correct, but it is simple.
- while isinstance(image, list):
- image = image[0]
-
- if height is None:
- if isinstance(image, PIL.Image.Image):
- height = image.height
- elif isinstance(image, torch.Tensor):
- height = image.shape[2]
-
- height = (height // 8) * 8 # round down to nearest multiple of 8
-
- if width is None:
- if isinstance(image, PIL.Image.Image):
- width = image.width
- elif isinstance(image, torch.Tensor):
- width = image.shape[3]
-
- width = (width // 8) * 8 # round down to nearest multiple of 8
-
- return height, width
-
- # override DiffusionPipeline
- def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- safe_serialization: bool = False,
- variant: Optional[str] = None,
- ):
- if isinstance(self.controlnet, ControlNetModel):
- super().save_pretrained(save_directory, safe_serialization, variant)
- else:
- raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.")
-
- @torch.no_grad()
- @replace_example_docstring(EXAMPLE_DOC_STRING)
- def __call__(
- self,
- prompt: Union[str, List[str]] = None,
- image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
- height: Optional[int] = None,
- width: Optional[int] = None,
- num_inference_steps: int = 50,
- guidance_scale: float = 7.5,
- negative_prompt: Optional[Union[str, List[str]]] = None,
- num_images_per_prompt: Optional[int] = 1,
- eta: float = 0.0,
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- output_type: Optional[str] = "pil",
- return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
- callback_steps: int = 1,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
- guess_mode: bool = False,
- ):
- r"""
- Function invoked when calling the pipeline for generation.
-
- Args:
- prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
-                instead.
- image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
- `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
- The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
-                the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
-                also be accepted as an image. The dimensions of the output image default to `image`'s dimensions. If
- height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
- specified in init, images must be passed as a list such that each element of the list can be correctly
- batched for input to a single controlnet.
- height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
- The height in pixels of the generated image.
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
- The width in pixels of the generated image.
- num_inference_steps (`int`, *optional*, defaults to 50):
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
- expense of slower inference.
- guidance_scale (`float`, *optional*, defaults to 7.5):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
- negative_prompt (`str` or `List[str]`, *optional*):
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
- less than `1`).
- num_images_per_prompt (`int`, *optional*, defaults to 1):
- The number of images to generate per prompt.
- eta (`float`, *optional*, defaults to 0.0):
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
- [`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
- One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
- to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
- Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
- provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
- argument.
- output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between
- [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
- plain tuple.
- callback (`Callable`, *optional*):
- A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
- callback_steps (`int`, *optional*, defaults to 1):
- The frequency at which the `callback` function will be called. If not specified, the callback will be
- called at every step.
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
- controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
- The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
- to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
- corresponding scale as a list.
- guess_mode (`bool`, *optional*, defaults to `False`):
-                In this mode, the ControlNet encoder will try its best to recognize the content of the input image even
-                if you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
-
- Examples:
-
- Returns:
- [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
- When returning a tuple, the first element is a list with the generated images, and the second element is a
- list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
- (nsfw) content, according to the `safety_checker`.
- """
- # 0. Default height and width to unet
- height, width = self._default_height_width(height, width, image)
-
- # 1. Check inputs. Raise error if not correct
- self.check_inputs(
- prompt,
- image,
- height,
- width,
- callback_steps,
- negative_prompt,
- prompt_embeds,
- negative_prompt_embeds,
- controlnet_conditioning_scale,
- )
-
- # 2. Define call parameters
- if prompt is not None and isinstance(prompt, str):
- batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
-
- device = self._execution_device
- # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
- # corresponds to doing no classifier free guidance.
- do_classifier_free_guidance = guidance_scale > 1.0
-
- if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
- controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
-
- global_pool_conditions = (
- self.controlnet.config.global_pool_conditions
- if isinstance(self.controlnet, ControlNetModel)
- else self.controlnet.nets[0].config.global_pool_conditions
- )
- guess_mode = guess_mode or global_pool_conditions
-
- # 3. Encode input prompt
- prompt_embeds = self._encode_prompt(
- prompt,
- device,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt,
- prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds,
- )
-
- # 4. Prepare image
- is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
- self.controlnet, torch._dynamo.eval_frame.OptimizedModule
- )
- if (
- isinstance(self.controlnet, ControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, ControlNetModel)
- ):
- image = self.prepare_image(
- image=image,
- width=width,
- height=height,
- batch_size=batch_size * num_images_per_prompt,
- num_images_per_prompt=num_images_per_prompt,
- device=device,
- dtype=self.controlnet.dtype,
- do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode,
- )
- elif (
- isinstance(self.controlnet, MultiControlNetModel)
- or is_compiled
- and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
- ):
- images = []
-
- for image_ in image:
- image_ = self.prepare_image(
- image=image_,
- width=width,
- height=height,
- batch_size=batch_size * num_images_per_prompt,
- num_images_per_prompt=num_images_per_prompt,
- device=device,
- dtype=self.controlnet.dtype,
- do_classifier_free_guidance=do_classifier_free_guidance,
- guess_mode=guess_mode,
- )
-
- images.append(image_)
-
- image = images
- else:
- assert False
-
- # 5. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, device=device)
- timesteps = self.scheduler.timesteps
-
- # 6. Prepare latent variables
- num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- batch_size * num_images_per_prompt,
- num_channels_latents,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- )
-
- # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
- # 8. Denoising loop
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
- with self.progress_bar(total=num_inference_steps) as progress_bar:
- for i, t in enumerate(timesteps):
- # expand the latents if we are doing classifier free guidance
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
- # controlnet(s) inference
- if guess_mode and do_classifier_free_guidance:
- # Infer ControlNet only for the conditional batch.
- controlnet_latent_model_input = latents
- controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
- else:
- controlnet_latent_model_input = latent_model_input
- controlnet_prompt_embeds = prompt_embeds
-
- down_block_res_samples, mid_block_res_sample = self.controlnet(
- controlnet_latent_model_input,
- t,
- encoder_hidden_states=controlnet_prompt_embeds,
- controlnet_cond=image,
- conditioning_scale=controlnet_conditioning_scale,
- guess_mode=guess_mode,
- return_dict=False,
- )
-
- if guess_mode and do_classifier_free_guidance:
- # Inferred ControlNet only for the conditional batch.
- # To apply the output of ControlNet to both the unconditional and conditional batches,
- # add 0 to the unconditional batch to keep it unchanged.
- down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
- mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
-
- # predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs,
- down_block_additional_residuals=down_block_res_samples,
- mid_block_additional_residual=mid_block_res_sample,
- return_dict=False,
- )[0]
-
- # perform guidance
- if do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
- # compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
- progress_bar.update()
- if callback is not None and i % callback_steps == 0:
- callback(i, t, latents)
-
- # If we do sequential model offloading, let's offload unet and controlnet
- # manually for max memory savings
- if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
- self.unet.to("cpu")
- self.controlnet.to("cpu")
- torch.cuda.empty_cache()
-
- if not output_type == "latent":
- image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
- image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
- else:
- image = latents
- has_nsfw_concept = None
-
- if has_nsfw_concept is None:
- do_denormalize = [True] * image.shape[0]
- else:
- do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
- image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
- # Offload last model to CPU
- if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
- self.final_offload_hook.offload()
-
- if not return_dict:
- return (image, has_nsfw_concept)
-
- return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
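A quick illustration of the guess-mode bookkeeping removed above may help: when classifier-free guidance is active, the ControlNet is only run on the conditional half of the batch, and zero residuals are prepended for the unconditional half before the residuals are handed to the UNet. A minimal sketch with toy tensors (shapes are illustrative and not taken from the pipeline):

    import torch

    # Residuals as the ControlNet would return them for the conditional batch only
    # (batch size 1); the channel/spatial sizes here are toy values.
    down_block_res_samples = [torch.randn(1, 32, 8, 8), torch.randn(1, 64, 4, 4)]
    mid_block_res_sample = torch.randn(1, 64, 4, 4)

    # Prepend zeros for the unconditional batch so the residuals line up with the
    # doubled latent_model_input used for classifier-free guidance.
    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

    assert all(d.shape[0] == 2 for d in down_block_res_samples)
    assert mid_block_res_sample.shape[0] == 2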
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index f3708107e82a..4c6c595c41d8 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -212,6 +212,36 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class StableDiffusionControlNetImg2ImgPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionControlNetInpaintPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class StableDiffusionControlNetPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
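For readers unfamiliar with the dummy-object pattern used in this file: these placeholders are exported when optional backends are missing, so importing the class name still succeeds and a helpful error is only raised on use. A simplified, self-contained sketch of the idea (the real `DummyObject` and `requires_backends` in `diffusers.utils` are more featureful):

    # Simplified stand-ins; the real implementations live in diffusers.utils.
    class DummyObject(type):
        """Metaclass whose classes refuse to be instantiated without their backends."""

        def __call__(cls, *args, **kwargs):
            raise ImportError(
                f"{cls.__name__} requires the backends {cls._backends}, which are not installed."
            )


    class StableDiffusionControlNetInpaintPipeline(metaclass=DummyObject):
        _backends = ["torch", "transformers"]


    try:
        StableDiffusionControlNetInpaintPipeline()
    except ImportError as err:
        print(err)  # points the user at the missing backends instead of failing at import time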
diff --git a/tests/pipelines/controlnet/__init__.py b/tests/pipelines/controlnet/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
similarity index 98%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
rename to tests/pipelines/controlnet/test_controlnet.py
index bd1470f5ebd1..0453bb38e1ee 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -34,7 +34,10 @@
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import require_torch_gpu
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..pipeline_params import (
+ TEXT_TO_IMAGE_BATCH_PARAMS,
+ TEXT_TO_IMAGE_PARAMS,
+)
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
@@ -42,7 +45,7 @@
torch.use_deterministic_algorithms(True)
-class StableDiffusionControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
@@ -155,6 +158,7 @@ class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unitt
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
@@ -307,7 +311,7 @@ def test_save_load_optional_components(self):
@slow
@require_torch_gpu
-class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase):
+class ControlNetPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py
new file mode 100644
index 000000000000..b83a8af2778b
--- /dev/null
+++ b/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -0,0 +1,366 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ StableDiffusionControlNetImg2ImgPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
+from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import require_torch_gpu
+
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
+class ControlNetImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = StableDiffusionControlNetImg2ImgPipeline
+ params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+ batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+ def get_dummy_components(self):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ torch.manual_seed(0)
+ controlnet = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "controlnet": controlnet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ controlnet_embedder_scale_factor = 2
+ control_image = randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ )
+ image = floats_tensor(control_image.shape, rng=random.Random(seed)).to(device)
+ image = image.cpu().permute(0, 2, 3, 1)[0]
+ image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ "image": image,
+ "control_image": control_image,
+ }
+
+ return inputs
+
+ def test_attention_slicing_forward_pass(self):
+ return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+ def test_inference_batch_single_identical(self):
+ self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+
+class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = StableDiffusionControlNetImg2ImgPipeline
+ params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+ batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+
+ def get_dummy_components(self):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ torch.manual_seed(0)
+ controlnet1 = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ controlnet2 = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+ components = {
+ "unet": unet,
+ "controlnet": controlnet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ controlnet_embedder_scale_factor = 2
+
+ control_image = [
+ randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ ),
+ randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ ),
+ ]
+
+ image = floats_tensor(control_image[0].shape, rng=random.Random(seed)).to(device)
+ image = image.cpu().permute(0, 2, 3, 1)[0]
+ image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ "image": image,
+ "control_image": control_image,
+ }
+
+ return inputs
+
+ def test_attention_slicing_forward_pass(self):
+ return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+ def test_inference_batch_single_identical(self):
+ self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+ def test_save_pretrained_raise_not_implemented_exception(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ try:
+ # save_pretrained is not implemented for Multi-ControlNet
+ pipe.save_pretrained(tmpdir)
+ except NotImplementedError:
+ pass
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_float16(self):
+ ...
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_local(self):
+ ...
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_optional_components(self):
+ ...
+
+
+@slow
+@require_torch_gpu
+class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
+ def tearDown(self):
+ super().tearDown()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def test_canny(self):
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+ pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ generator = torch.Generator(device="cpu").manual_seed(0)
+ prompt = "evil space-punk bird"
+ control_image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+ ).resize((512, 512))
+ image = load_image(
+ "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
+ ).resize((512, 512))
+
+ output = pipe(
+ prompt,
+ image,
+ control_image=control_image,
+ generator=generator,
+ output_type="np",
+ num_inference_steps=50,
+ strength=0.6,
+ )
+
+ image = output.images[0]
+
+ assert image.shape == (512, 512, 3)
+
+ expected_image = load_numpy(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/img2img.npy"
+ )
+
+ assert np.abs(expected_image - image).max() < 9e-2
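The slow test above consumes a precomputed `bird_canny.png`. For reference, a control image of that kind is typically produced with an edge detector; a minimal sketch assuming `opencv-python` and Pillow are installed (the thresholds are illustrative):

    import cv2
    import numpy as np
    from PIL import Image

    from diffusers.utils import load_image

    image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png")
    image = np.array(image)

    edges = cv2.Canny(image, 100, 200)  # low/high thresholds are illustrative
    edges = edges[:, :, None]
    edges = np.concatenate([edges, edges, edges], axis=2)  # ControlNet conditioning expects a 3-channel image
    control_image = Image.fromarray(edges).resize((512, 512))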
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py
new file mode 100644
index 000000000000..786b0e608ef0
--- /dev/null
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py
@@ -0,0 +1,379 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This model implementation is heavily based on:
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ DDIMScheduler,
+ StableDiffusionControlNetInpaintPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
+from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device
+from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import require_torch_gpu
+
+from ..pipeline_params import (
+ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
+ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
+)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.use_deterministic_algorithms(True)
+
+
+class ControlNetInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = StableDiffusionControlNetInpaintPipeline
+ params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+ batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+ image_params = frozenset([])
+
+ def get_dummy_components(self):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=9,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ torch.manual_seed(0)
+ controlnet = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "controlnet": controlnet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ controlnet_embedder_scale_factor = 2
+ control_image = randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ )
+ init_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ init_image = init_image.cpu().permute(0, 2, 3, 1)[0]
+
+ image = Image.fromarray(np.uint8(init_image)).convert("RGB").resize((64, 64))
+ mask_image = Image.fromarray(np.uint8(init_image + 4)).convert("RGB").resize((64, 64))
+
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ "image": image,
+ "mask_image": mask_image,
+ "control_image": control_image,
+ }
+
+ return inputs
+
+ def test_attention_slicing_forward_pass(self):
+ return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+ def test_inference_batch_single_identical(self):
+ self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+
+class MultiControlNetInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+ pipeline_class = StableDiffusionControlNetInpaintPipeline
+ params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
+ batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+
+ def get_dummy_components(self):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=9,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ torch.manual_seed(0)
+ controlnet1 = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ controlnet2 = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ cross_attention_dim=32,
+ conditioning_embedding_out_channels=(16, 32),
+ )
+ torch.manual_seed(0)
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ controlnet = MultiControlNetModel([controlnet1, controlnet2])
+
+ components = {
+ "unet": unet,
+ "controlnet": controlnet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ controlnet_embedder_scale_factor = 2
+
+ control_image = [
+ randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ ),
+ randn_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ generator=generator,
+ device=torch.device(device),
+ ),
+ ]
+ init_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+ init_image = init_image.cpu().permute(0, 2, 3, 1)[0]
+
+ image = Image.fromarray(np.uint8(init_image)).convert("RGB").resize((64, 64))
+ mask_image = Image.fromarray(np.uint8(init_image + 4)).convert("RGB").resize((64, 64))
+
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "numpy",
+ "image": image,
+ "mask_image": mask_image,
+ "control_image": control_image,
+ }
+
+ return inputs
+
+ def test_attention_slicing_forward_pass(self):
+ return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
+
+ def test_inference_batch_single_identical(self):
+ self._test_inference_batch_single_identical(expected_max_diff=2e-3)
+
+ def test_save_pretrained_raise_not_implemented_exception(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ try:
+ # save_pretrained is not implemented for Multi-ControlNet
+ pipe.save_pretrained(tmpdir)
+ except NotImplementedError:
+ pass
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_float16(self):
+ ...
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_local(self):
+ ...
+
+ # override PipelineTesterMixin
+ @unittest.skip("save pretrained not implemented")
+ def test_save_load_optional_components(self):
+ ...
+
+
+@slow
+@require_torch_gpu
+class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
+ def tearDown(self):
+ super().tearDown()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def test_canny(self):
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+ pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None, controlnet=controlnet
+ )
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ generator = torch.Generator(device="cpu").manual_seed(0)
+ image = load_image(
+ "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
+ ).resize((512, 512))
+
+ mask_image = load_image(
+ "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
+ "/stable_diffusion_inpaint/input_bench_mask.png"
+ ).resize((512, 512))
+
+ prompt = "pitch black hole"
+
+ control_image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+ ).resize((512, 512))
+
+ output = pipe(
+ prompt,
+ image=image,
+ mask_image=mask_image,
+ control_image=control_image,
+ generator=generator,
+ output_type="np",
+ num_inference_steps=3,
+ )
+
+ image = output.images[0]
+
+ assert image.shape == (512, 512, 3)
+
+ expected_image = load_numpy(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/inpaint.npy"
+ )
+
+ assert np.abs(expected_image - image).max() < 9e-2
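As a reminder of the inpainting convention exercised by the test above, `mask_image` is a grayscale image where white pixels mark the region the pipeline may repaint and black pixels are preserved. A purely illustrative sketch of building such a mask (this is not the test's `input_bench_mask.png`):

    from PIL import Image, ImageDraw

    mask_image = Image.new("L", (512, 512), 0)      # black = keep the original pixels
    draw = ImageDraw.Draw(mask_image)
    draw.rectangle((128, 128, 384, 384), fill=255)  # white = region to inpaint
    mask_image.save("mask.png")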
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py b/tests/pipelines/controlnet/test_flax_controlnet.py
similarity index 98%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py
rename to tests/pipelines/controlnet/test_flax_controlnet.py
index 268c01320177..4ad75b407acc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py
+++ b/tests/pipelines/controlnet/test_flax_controlnet.py
@@ -30,7 +30,7 @@
@slow
@require_flax
-class FlaxStableDiffusionControlNetPipelineIntegrationTests(unittest.TestCase):
+class FlaxControlNetPipelineIntegrationTests(unittest.TestCase):
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 8c27a568d24d..0ce55ae78ae0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -46,9 +46,8 @@ class StableDiffusionImageVariationPipelineFastTests(
pipeline_class = StableDiffusionImageVariationPipeline
params = IMAGE_VARIATION_PARAMS
batch_params = IMAGE_VARIATION_BATCH_PARAMS
- image_params = frozenset(
- []
- ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+ image_params = frozenset([])
+ # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index cdf138c4e178..a215e4da6697 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -47,9 +47,8 @@ class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipelin
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
- image_params = frozenset(
- []
- ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+ image_params = frozenset([])
+ # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
def get_dummy_components(self):
torch.manual_seed(0)
From 17f9aed79cd073f4475bd3af1c6f34b681839685 Mon Sep 17 00:00:00 2001
From: clarencechen
Date: Tue, 16 May 2023 11:26:53 -0700
Subject: [PATCH 086/206] [Scheduler] DPM-Solver (++) Inverse Scheduler (#3335)
* Add DPM-Solver Multistep Inverse Scheduler
* Add draft tests for DiffEdit
* Add inverse sde-dpmsolver steps to tune image diversity from inverted latents
* Fix tests
---------
Co-authored-by: Patrick von Platen
---
docs/source/en/_toctree.yml | 2 +
.../multistep_dpm_solver_inverse.mdx | 22 +
src/diffusers/__init__.py | 1 +
src/diffusers/schedulers/__init__.py | 1 +
.../scheduling_dpmsolver_multistep_inverse.py | 701 ++++++++++++++++++
src/diffusers/utils/dummy_pt_objects.py | 15 +
.../test_stable_diffusion_diffedit.py | 77 ++
7 files changed, 819 insertions(+)
create mode 100644 docs/source/en/api/schedulers/multistep_dpm_solver_inverse.mdx
create mode 100644 src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 52d8988206f1..645cbb04c1d0 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -252,6 +252,8 @@
title: Euler scheduler
- local: api/schedulers/heun
title: Heun Scheduler
+ - local: api/schedulers/multistep_dpm_solver_inverse
+ title: Inverse Multistep DPM-Solver
- local: api/schedulers/ipndm
title: IPNDM
- local: api/schedulers/lms_discrete
diff --git a/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.mdx b/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.mdx
new file mode 100644
index 000000000000..1b3348a5a3ea
--- /dev/null
+++ b/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.mdx
@@ -0,0 +1,22 @@
+
+
+# Inverse Multistep DPM-Solver (DPMSolverMultistepInverse)
+
+## Overview
+
+This scheduler is the inverted scheduler of [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://arxiv.org/abs/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models
+](https://arxiv.org/abs/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf) and the ad-hoc notebook implementation for DiffEdit latent inversion [here](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/blob/main/diffedit.ipynb).
+
+## DPMSolverMultistepInverseScheduler
+[[autodoc]] DPMSolverMultistepInverseScheduler
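A hypothetical usage sketch (not part of this patch) of how the inverse scheduler can be paired with its forward counterpart, e.g. for DiffEdit-style latent inversion; the class and method names follow the code added below:

    from diffusers import DPMSolverMultistepInverseScheduler, DPMSolverMultistepScheduler

    # Build the inverse scheduler from the same config as the forward one so that
    # both walk the same noise schedule, just in opposite directions.
    forward_scheduler = DPMSolverMultistepScheduler(beta_schedule="scaled_linear")
    inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(forward_scheduler.config)

    inverse_scheduler.set_timesteps(num_inference_steps=50)
    print(inverse_scheduler.timesteps[:5])  # runs from the clean end of the chain towards the noisy end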
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 0d48a16b6216..9b3f8adad376 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -76,6 +76,7 @@
DDIMScheduler,
DDPMScheduler,
DEISMultistepScheduler,
+ DPMSolverMultistepInverseScheduler,
DPMSolverMultistepScheduler,
DPMSolverSinglestepScheduler,
EulerAncestralDiscreteScheduler,
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index c4b62c722257..05414e32fc9e 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -33,6 +33,7 @@
from .scheduling_ddpm import DDPMScheduler
from .scheduling_deis_multistep import DEISMultistepScheduler
from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
+ from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler
from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
from .scheduling_euler_discrete import EulerDiscreteScheduler
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
new file mode 100644
index 000000000000..b424ebbff262
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
@@ -0,0 +1,701 @@
+# Copyright 2023 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
+ """
+ DPMSolverMultistepInverseScheduler is the reverse scheduler of [`DPMSolverMultistepScheduler`].
+
+ We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+ diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+ thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+ stable-diffusion).
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`SchedulerMixin`] provides general loading and saving functionality via the [`~SchedulerMixin.save_pretrained`] and
+ [`~SchedulerMixin.from_pretrained`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ solver_order (`int`, default `2`):
+ the order of DPM-Solver; can be `1`, `2`, or `3`. We recommend using `solver_order=2` for guided
+ sampling, and `solver_order=3` for unconditional sampling.
+ prediction_type (`str`, default `epsilon`, optional):
+ prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+ process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4 of
+ https://imagen.research.google/video/paper.pdf)
+ thresholding (`bool`, default `False`):
+ whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+ For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+ use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+ models (such as stable-diffusion).
+ dynamic_thresholding_ratio (`float`, default `0.995`):
+ the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+ (https://arxiv.org/abs/2205.11487).
+ sample_max_value (`float`, default `1.0`):
+ the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+ `algorithm_type="dpmsolver++"`.
+ algorithm_type (`str`, default `dpmsolver++`):
+ the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++` or `sde-dpmsolver` or
+ `sde-dpmsolver++`. The `dpmsolver` type implements the algorithms in https://arxiv.org/abs/2206.00927, and
+ the `dpmsolver++` type implements the algorithms in https://arxiv.org/abs/2211.01095. We recommend using
+ `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling (e.g. stable-diffusion).
+ solver_type (`str`, default `midpoint`):
+ the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+ the sample quality, especially for a small number of steps. We empirically find that `midpoint` solvers are
+ slightly better, so we recommend using the `midpoint` type.
+ lower_order_final (`bool`, default `True`):
+ whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+ find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+ This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
+ noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
+ of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
+ lambda_min_clipped (`float`, default `-inf`):
+ the clipping threshold for the minimum value of lambda(t) for numerical stability. This is critical for
+ cosine (squaredcos_cap_v2) noise schedule.
+ variance_type (`str`, *optional*):
+ Set to "learned" or "learned_range" for diffusion models that predict variance. For example, OpenAI's
+ guided-diffusion (https://github.com/openai/guided-diffusion) predicts both mean and variance of the
+ Gaussian distribution in the model's output. DPM-Solver only needs the "mean" output because it is based on
+ diffusion ODEs.
+ """
+
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+ order = 1
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ solver_order: int = 2,
+ prediction_type: str = "epsilon",
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ lower_order_final: bool = True,
+ use_karras_sigmas: Optional[bool] = False,
+ lambda_min_clipped: float = -float("inf"),
+ variance_type: Optional[str] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+ # Currently we only support VP-type noise schedule
+ self.alpha_t = torch.sqrt(self.alphas_cumprod)
+ self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+ self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # settings for DPM-Solver
+ if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
+ if algorithm_type == "deis":
+ self.register_to_config(algorithm_type="dpmsolver++")
+ else:
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+
+ if solver_type not in ["midpoint", "heun"]:
+ if solver_type in ["logrho", "bh1", "bh2"]:
+ self.register_to_config(solver_type="midpoint")
+ else:
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32).copy()
+ self.timesteps = torch.from_numpy(timesteps)
+ self.model_outputs = [None] * solver_order
+ self.lower_order_nums = 0
+ self.use_karras_sigmas = use_karras_sigmas
+
+ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ # Clipping the minimum of all lambda(t) for numerical stability.
+ # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped)
+ self.noisiest_timestep = self.config.num_train_timesteps - 1 - clipped_idx
+ timesteps = (
+ np.linspace(0, self.noisiest_timestep, num_inference_steps + 1).round()[:-1].copy().astype(np.int64)
+ )
+
+ if self.use_karras_sigmas:
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ log_sigmas = np.log(sigmas)
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+ timesteps = timesteps.copy().astype(np.int64)
+
+ # when num_inference_steps == num_train_timesteps, we can end up with
+ # duplicates in timesteps.
+ _, unique_indices = np.unique(timesteps, return_index=True)
+ timesteps = timesteps[np.sort(unique_indices)]
+
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+
+ self.num_inference_steps = len(timesteps)
+
+ self.model_outputs = [
+ None,
+ ] * self.config.solver_order
+ self.lower_order_nums = 0
+
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+ """
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+ https://arxiv.org/abs/2205.11487
+ """
+ dtype = sample.dtype
+ batch_size, channels, height, width = sample.shape
+
+ if dtype not in (torch.float32, torch.float64):
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
+
+ # Flatten sample for doing quantile calculation along each image
+ sample = sample.reshape(batch_size, channels * height * width)
+
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
+
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+ s = torch.clamp(
+ s, min=1, max=self.config.sample_max_value
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+ sample = sample.reshape(batch_size, channels, height, width)
+ sample = sample.to(dtype)
+
+ return sample
+
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+ def _sigma_to_t(self, sigma, log_sigmas):
+ # get log sigma
+ log_sigma = np.log(sigma)
+
+ # get distribution
+ dists = log_sigma - log_sigmas[:, np.newaxis]
+
+ # get sigmas range
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+ high_idx = low_idx + 1
+
+ low = log_sigmas[low_idx]
+ high = log_sigmas[high_idx]
+
+ # interpolate sigmas
+ w = (low - log_sigma) / (low - high)
+ w = np.clip(w, 0, 1)
+
+ # transform interpolation to time range
+ t = (1 - w) * low_idx + w * high_idx
+ t = t.reshape(sigma.shape)
+ return t
+
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
+ """Constructs the noise schedule of Karras et al. (2022)."""
+
+ sigma_min: float = in_sigmas[-1].item()
+ sigma_max: float = in_sigmas[0].item()
+
+ rho = 7.0 # 7.0 is the value used in the paper
+ ramp = np.linspace(0, 1, num_inference_steps)
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+ return sigmas
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output
+ def convert_model_output(
+ self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+ ) -> torch.FloatTensor:
+ """
+ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+ DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
+ discretize an integral of the data prediction model. So we need to first convert the model output to the
+ corresponding type to match the algorithm.
+
+ Note that the algorithm type and the model type are decoupled. That is to say, we can use either DPM-Solver or
+ DPM-Solver++ for both noise prediction and data prediction models.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the converted model output.
+ """
+
+ # DPM-Solver++ needs to solve an integral of the data prediction model.
+ if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+ if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned", "learned_range"]:
+ model_output = model_output[:, :3]
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = (sample - sigma_t * model_output) / alpha_t
+ elif self.config.prediction_type == "sample":
+ x0_pred = model_output
+ elif self.config.prediction_type == "v_prediction":
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = alpha_t * sample - sigma_t * model_output
+ else:
+ raise ValueError(
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+ " `v_prediction` for the DPMSolverMultistepScheduler."
+ )
+
+ if self.config.thresholding:
+ x0_pred = self._threshold_sample(x0_pred)
+
+ return x0_pred
+
+ # DPM-Solver needs to solve an integral of the noise prediction model.
+ elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+ if self.config.prediction_type == "epsilon":
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
+ if self.config.variance_type in ["learned", "learned_range"]:
+ epsilon = model_output[:, :3]
+ else:
+ epsilon = model_output
+ elif self.config.prediction_type == "sample":
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ epsilon = (sample - alpha_t * model_output) / sigma_t
+ elif self.config.prediction_type == "v_prediction":
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ epsilon = alpha_t * model_output + sigma_t * sample
+ else:
+ raise ValueError(
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+ " `v_prediction` for the DPMSolverMultistepScheduler."
+ )
+
+ if self.config.thresholding:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = (sample - sigma_t * epsilon) / alpha_t
+ x0_pred = self._threshold_sample(x0_pred)
+ epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+ return epsilon
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update
+ def dpm_solver_first_order_update(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ noise: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
+ """
+ One step for the first-order DPM-Solver (equivalent to DDIM).
+
+ See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
+ alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
+ sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
+ h = lambda_t - lambda_s
+ if self.config.algorithm_type == "dpmsolver++":
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+ elif self.config.algorithm_type == "dpmsolver":
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+ elif self.config.algorithm_type == "sde-dpmsolver++":
+ assert noise is not None
+ x_t = (
+ (sigma_t / sigma_s * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.algorithm_type == "sde-dpmsolver":
+ assert noise is not None
+ x_t = (
+ (alpha_t / alpha_s) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
+ return x_t
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update
+ def multistep_dpm_solver_second_order_update(
+ self,
+ model_output_list: List[torch.FloatTensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ noise: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
+ """
+ One step for the second-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[torch.FloatTensor]`):
+ direct outputs from the learned diffusion model at the current and latter timesteps.
+ timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+ m0, m1 = model_output_list[-1], model_output_list[-2]
+ lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+ r0 = h_0 / h
+ D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ )
+ elif self.config.algorithm_type == "sde-dpmsolver++":
+ assert noise is not None
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
+ )
+ elif self.config.algorithm_type == "sde-dpmsolver":
+ assert noise is not None
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * (torch.exp(h) - 1.0)) * D1
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
+ )
+ return x_t
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update
+ def multistep_dpm_solver_third_order_update(
+ self,
+ model_output_list: List[torch.FloatTensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ """
+ One step for the third-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[torch.FloatTensor]`):
+ direct outputs from the learned diffusion model at the current and latter timesteps.
+ timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+ lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+ self.lambda_t[t],
+ self.lambda_t[s0],
+ self.lambda_t[s1],
+ self.lambda_t[s2],
+ )
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+ r0, r1 = h_0 / h, h_1 / h
+ D0 = m0
+ D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+ D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+ )
+ return x_t
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ generator=None,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Step function propagating the sample with the multistep DPM-Solver.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): whether to return a [`~scheduling_utils.SchedulerOutput`] instead of a plain tuple
+
+ Returns:
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero()
+ if len(step_index) == 0:
+ step_index = len(self.timesteps) - 1
+ else:
+ step_index = step_index.item()
+ prev_timestep = (
+ self.noisiest_timestep if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+ )
+ lower_order_final = (
+ (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+ lower_order_second = (
+ (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+
+ model_output = self.convert_model_output(model_output, timestep, sample)
+ for i in range(self.config.solver_order - 1):
+ self.model_outputs[i] = self.model_outputs[i + 1]
+ self.model_outputs[-1] = model_output
+
+ if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+ noise = randn_tensor(
+ model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+ )
+ else:
+ noise = None
+
+ if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+ prev_sample = self.dpm_solver_first_order_update(
+ model_output, timestep, prev_timestep, sample, noise=noise
+ )
+ elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+ timestep_list = [self.timesteps[step_index - 1], timestep]
+ prev_sample = self.multistep_dpm_solver_second_order_update(
+ self.model_outputs, timestep_list, prev_timestep, sample, noise=noise
+ )
+ else:
+ timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+ prev_sample = self.multistep_dpm_solver_third_order_update(
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
+
+ if self.lower_order_nums < self.config.solver_order:
+ self.lower_order_nums += 1
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample)
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input
+ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.FloatTensor:
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+ alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+ timesteps = timesteps.to(original_samples.device)
+
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
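
For reference, the Karras schedule construction and the log-sigma interpolation used by the new scheduler are easier to follow outside the class. The sketch below is a minimal NumPy rendition under assumed inputs: the toy 1000-step training sigmas and the 25-step count are illustrative, not values taken from the library.

import numpy as np

# toy training-time sigmas, increasing with timestep index (as derived from alphas_cumprod)
train_sigmas = np.linspace(0.1, 10.0, 1000)
log_sigmas = np.log(train_sigmas)

def karras_sigmas(sigma_min, sigma_max, num_steps, rho=7.0):
    # Karras et al. (2022): interpolate linearly in sigma**(1/rho) space, rho=7 as in the paper
    ramp = np.linspace(0, 1, num_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho  # decreasing, sigma_max -> sigma_min

def sigma_to_t(sigma, log_sigmas):
    # invert sigma(t) by linear interpolation in log-sigma space, yielding fractional timesteps
    log_sigma = np.log(sigma)
    dists = log_sigma - log_sigmas[:, np.newaxis]
    low_idx = np.cumsum(dists >= 0, axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
    high_idx = low_idx + 1
    low, high = log_sigmas[low_idx], log_sigmas[high_idx]
    w = np.clip((low - log_sigma) / (low - high), 0, 1)
    return (1 - w) * low_idx + w * high_idx

sigmas = karras_sigmas(train_sigmas[0], train_sigmas[-1], num_steps=25)
timesteps = sigma_to_t(sigmas, log_sigmas)  # fractional indices into the 1000-step training schedule
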
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 014e193aa32a..e07b7cb27da7 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -450,6 +450,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
+class DPMSolverMultistepInverseScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
class DPMSolverMultistepScheduler(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index d32f4d665f55..c9da7b06893f 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -27,6 +27,8 @@
AutoencoderKL,
DDIMInverseScheduler,
DDIMScheduler,
+ DPMSolverMultistepInverseScheduler,
+ DPMSolverMultistepScheduler,
StableDiffusionDiffEditPipeline,
UNet2DConditionModel,
)
@@ -256,6 +258,30 @@ def test_inversion(self):
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-3)
+ def test_inversion_dpm(self):
+ device = "cpu"
+
+ components = self.get_dummy_components()
+
+ scheduler_args = {"beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "scaled_linear"}
+ components["scheduler"] = DPMSolverMultistepScheduler(**scheduler_args)
+ components["inverse_scheduler"] = DPMSolverMultistepInverseScheduler(**scheduler_args)
+
+ pipe = self.pipeline_class(**components)
+ pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inversion_inputs(device)
+ image = pipe.invert(**inputs).images
+ image_slice = image[0, -1, -3:, -3:]
+
+ self.assertEqual(image.shape, (2, 32, 32, 3))
+ expected_slice = np.array(
+ [0.5150, 0.5134, 0.5043, 0.5376, 0.4694, 0.5105, 0.5015, 0.4407, 0.4799],
+ )
+ max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+ self.assertLessEqual(max_diff, 1e-3)
+
@require_torch_gpu
@slow
@@ -320,3 +346,54 @@ def test_stable_diffusion_diffedit_full(self):
/ 255
)
assert np.abs((expected_image - image).max()) < 5e-1
+
+ def test_stable_diffusion_diffedit_dpm(self):
+ generator = torch.manual_seed(0)
+
+ pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
+ )
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+ pipe.inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(pipe.scheduler.config)
+ pipe.enable_model_cpu_offload()
+ pipe.set_progress_bar_config(disable=None)
+
+ source_prompt = "a bowl of fruit"
+ target_prompt = "a bowl of pears"
+
+ mask_image = pipe.generate_mask(
+ image=self.raw_image,
+ source_prompt=source_prompt,
+ target_prompt=target_prompt,
+ generator=generator,
+ )
+
+ inv_latents = pipe.invert(
+ prompt=source_prompt,
+ image=self.raw_image,
+ inpaint_strength=0.7,
+ generator=generator,
+ num_inference_steps=25,
+ ).latents
+
+ image = pipe(
+ prompt=target_prompt,
+ mask_image=mask_image,
+ image_latents=inv_latents,
+ generator=generator,
+ negative_prompt=source_prompt,
+ inpaint_strength=0.7,
+ num_inference_steps=25,
+ output_type="numpy",
+ ).images[0]
+
+ expected_image = (
+ np.array(
+ load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+ "/diffedit/pears.png"
+ ).resize((768, 768))
+ )
+ / 255
+ )
+ assert np.abs((expected_image - image).max()) < 5e-1
From 754fac82d2e0237edff20c4eee3f0f2ea4ab91a5 Mon Sep 17 00:00:00 2001
From: Laureηt
Date: Tue, 16 May 2023 20:33:34 +0200
Subject: [PATCH 087/206] [Docs] Fix incomplete docstring for resnet.py (#3438)
Fix incomplete docstrings for resnet.py
---
src/diffusers/models/resnet.py | 86 ++++++++++++++++++++++++----------
1 file changed, 62 insertions(+), 24 deletions(-)
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index d9d539959c09..debe120e8ead 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -24,14 +24,17 @@
class Upsample1D(nn.Module):
- """
- An upsampling layer with an optional convolution.
+ """A 1D upsampling layer with an optional convolution.
Parameters:
- channels: channels in the inputs and outputs.
- use_conv: a bool determining if a convolution is applied.
- use_conv_transpose:
- out_channels:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ use_conv_transpose (`bool`, default `False`):
+ option to use a convolution transpose.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
"""
def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -62,14 +65,17 @@ def forward(self, x):
class Downsample1D(nn.Module):
- """
- A downsampling layer with an optional convolution.
+ """A 1D downsampling layer with an optional convolution.
Parameters:
- channels: channels in the inputs and outputs.
- use_conv: a bool determining if a convolution is applied.
- out_channels:
- padding:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
+ padding (`int`, default `1`):
+ padding for the convolution.
"""
def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -93,14 +99,17 @@ def forward(self, x):
class Upsample2D(nn.Module):
- """
- An upsampling layer with an optional convolution.
+ """A 2D upsampling layer with an optional convolution.
Parameters:
- channels: channels in the inputs and outputs.
- use_conv: a bool determining if a convolution is applied.
- use_conv_transpose:
- out_channels:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ use_conv_transpose (`bool`, default `False`):
+ option to use a convolution transpose.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
"""
def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -162,14 +171,17 @@ def forward(self, hidden_states, output_size=None):
class Downsample2D(nn.Module):
- """
- A downsampling layer with an optional convolution.
+ """A 2D downsampling layer with an optional convolution.
Parameters:
- channels: channels in the inputs and outputs.
- use_conv: a bool determining if a convolution is applied.
- out_channels:
- padding:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
+ padding (`int`, default `1`):
+ padding for the convolution.
"""
def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -209,6 +221,19 @@ def forward(self, hidden_states):
class FirUpsample2D(nn.Module):
+ """A 2D FIR upsampling layer with an optional convolution.
+
+ Parameters:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
+ fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
+ kernel for the FIR filter.
+ """
+
def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
super().__init__()
out_channels = out_channels if out_channels else channels
@@ -309,6 +334,19 @@ def forward(self, hidden_states):
class FirDownsample2D(nn.Module):
+ """A 2D FIR downsampling layer with an optional convolution.
+
+ Parameters:
+ channels (`int`):
+ number of channels in the inputs and outputs.
+ use_conv (`bool`, default `False`):
+ option to use a convolution.
+ out_channels (`int`, optional):
+ number of output channels. Defaults to `channels`.
+ fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
+ kernel for the FIR filter.
+ """
+
def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
super().__init__()
out_channels = out_channels if out_channels else channels
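
The documented defaults can be sanity-checked with a small sketch; the channel count and input size below are arbitrary, and the import path assumes the module layout touched by this diff (`diffusers.models.resnet`).

import torch
from diffusers.models.resnet import Downsample2D, Upsample2D

# out_channels defaults to channels; use_conv adds a 3x3 convolution on top of the resize
up = Upsample2D(channels=64, use_conv=True)
down = Downsample2D(channels=64, use_conv=True, padding=1)

x = torch.randn(1, 64, 32, 32)
print(up(x).shape)    # torch.Size([1, 64, 64, 64])  -- upsample by 2, then conv
print(down(x).shape)  # torch.Size([1, 64, 16, 16])  -- strided conv, downsample by 2
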
From 92ea5baca2815ecd51f96bedb0fb766b313196f8 Mon Sep 17 00:00:00 2001
From: superlabs-dev <133080491+superlabs-dev@users.noreply.github.com>
Date: Wed, 17 May 2023 03:33:47 +0900
Subject: [PATCH 088/206] fix tiled vae blend extent range (#3384)
fix tiled vae blend extent range
---
src/diffusers/models/autoencoder_kl.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py
index 1a8a204d80ce..a4894e78c43f 100644
--- a/src/diffusers/models/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoder_kl.py
@@ -196,12 +196,14 @@ def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[Decode
return DecoderOutput(sample=decoded)
def blend_v(self, a, b, blend_extent):
- for y in range(min(a.shape[2], b.shape[2], blend_extent)):
+ blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+ for y in range(blend_extent):
b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
return b
def blend_h(self, a, b, blend_extent):
- for x in range(min(a.shape[3], b.shape[3], blend_extent)):
+ blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+ for x in range(blend_extent):
b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
return b
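
The clamp matters when a trailing tile is narrower (or shorter) than the requested blend width: without it, `a[..., -blend_extent + x]` reads from the wrong end of `a`. A standalone sketch of the corrected horizontal blend, with hypothetical tile shapes, shows the case the fix handles.

import torch

def blend_h(a, b, blend_extent):
    # clamp to the actual overlap so a[..., -blend_extent + x] walks over a's last columns
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

left = torch.zeros(1, 4, 8, 8)
right = torch.ones(1, 4, 8, 6)              # trailing tile narrower than the requested blend width
out = blend_h(left, right, blend_extent=8)  # effective blend width is clamped to 6
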
From 0392eceba8d42b24fcecc56b2cc1f4582dbefcc4 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Tue, 16 May 2023 20:35:47 +0200
Subject: [PATCH 089/206] Small update to "Next steps" section (#3443)
Small update to "Next steps" section:
- PyTorch 2 is recommended.
- Updated improvement figures.
---
docs/source/en/stable_diffusion.mdx | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/source/en/stable_diffusion.mdx b/docs/source/en/stable_diffusion.mdx
index d02e93033614..64c90c7f6477 100644
--- a/docs/source/en/stable_diffusion.mdx
+++ b/docs/source/en/stable_diffusion.mdx
@@ -266,6 +266,6 @@ image_grid(images)
In this tutorial, you learned how to optimize a [`DiffusionPipeline`] for computational and memory efficiency as well as improving the quality of generated outputs. If you're interested in making your pipeline even faster, take a look at the following resources:
-- Enable [xFormers](./optimization/xformers) memory efficient attention mechanism for faster speed and reduced memory consumption.
-- Learn how in [PyTorch 2.0](./optimization/torch2.0), [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 2-9% faster inference speed.
-- Many optimization techniques for inference are also included in this memory and speed [guide](./optimization/fp16), such as memory offloading.
+- Learn how [PyTorch 2.0](./optimization/torch2.0) and [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 5 - 300% faster inference speed.
+- If you can't use PyTorch 2, we recommend you install [xFormers](./optimization/xformers). Its memory-efficient attention mechanism works great with PyTorch 1.13.1 for faster speed and reduced memory consumption.
+- Other optimization techniques, such as model offloading, are covered in [this guide](./optimization/fp16).
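
For readers who want to try the PyTorch 2.0 path the updated docs point to, a minimal sketch follows; the checkpoint name and compile mode are just one reasonable choice, and actual speedups depend on hardware, resolution, and batch size.

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# compile the UNet, which dominates per-step latency; the first call pays a one-off compilation cost
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
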
From 6070b32fcfd13fdf81547c91f9333fb117bc3982 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Wed, 17 May 2023 07:51:07 +0530
Subject: [PATCH 090/206] Allow arbitrary aspect ratio in
IFSuperResolutionPipeline (#3298)
* Update pipeline_if_superresolution.py
Allow arbitrary aspect ratio in IFSuperResolutionPipeline by using the input image shape
* IFSuperResolutionPipeline: allow the user to override the height and width through the arguments
* update IFSuperResolutionPipeline width/height doc string to match StableDiffusionInpaintPipeline conventions
---------
Co-authored-by: Patrick von Platen
---
.../deepfloyd_if/pipeline_if_superresolution.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
index 1ba8f888a8e3..2fe8e6a9d5d5 100644
--- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
+++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py
@@ -695,6 +695,8 @@ def preprocess_image(self, image, num_images_per_prompt, device):
def __call__(
self,
prompt: Union[str, List[str]] = None,
+ height: int = None,
+ width: int = None,
image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None,
num_inference_steps: int = 50,
timesteps: List[int] = None,
@@ -720,6 +722,10 @@ def __call__(
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
instead.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
+ The width in pixels of the generated image.
image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`):
The image to be upscaled.
num_inference_steps (`int`, *optional*, defaults to 50):
@@ -806,8 +812,8 @@ def __call__(
# 2. Define call parameters
- height = self.unet.config.sample_size
- width = self.unet.config.sample_size
+ height = height or self.unet.config.sample_size
+ width = width or self.unet.config.sample_size
device = self._execution_device
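
With this change, callers can request a non-square stage-II output instead of the UNet's default `sample_size`. The sketch below is a hedged usage example: the checkpoint name, the placeholder input image, and the assumption that the checkpoint's text encoder is loaded are not part of this patch.

import torch
from diffusers import IFSuperResolutionPipeline
from diffusers.utils import load_image

pipe = IFSuperResolutionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# placeholder low-resolution stage-I output with a 2:3 aspect ratio
low_res = load_image("https://example.com/stage_one_output_64x96.png")

image = pipe(
    prompt="a photo of a red fox in the snow",
    image=low_res,
    height=256,  # overrides unet.config.sample_size
    width=384,
).images[0]
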
From c09c4f3ab7ab7d46727949e003facb391e1e8b8d Mon Sep 17 00:00:00 2001
From: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com>
Date: Wed, 17 May 2023 03:05:16 -0700
Subject: [PATCH 091/206] Adding 'strength' parameter to
StableDiffusionInpaintingPipeline (#3424)
* Added explanation of 'strength' parameter
* Added get_timesteps function which relies on new strength parameter
* Added `strength` parameter which defaults to 1.
* Swapped ordering so `noise_timestep` can be calculated before masking the image;
this is required when you aren't applying 100% noise to the masked region, e.g. strength < 1.
* Added strength to check_inputs, throws error if out of range
* Changed `prepare_latents` to initialise latents w.r.t. strength;
inspired by the stable diffusion img2img pipeline, init latents are created by encoding the init image into a VAE latent and adding noise (based upon the strength parameter passed in), e.g. pure noise when strength = 1, or the init image itself at strength = 0.
* WIP: Added a unit test for the new strength parameter in the StableDiffusionInpaintingPipeline
still need to add correct regression values
* Created an `is_strength_max` flag to initialise from pure random noise
* Updated unit tests w.r.t new strength parameter + fixed new strength unit test
* renamed parameter to avoid confusion with variable of same name
* Updated regression values for new strength test - now passes
* removed 'copied from' comment as this method is now different and divergent from the copy
* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
Co-authored-by: Patrick von Platen
* Ensure backwards compatibility for prepare_mask_and_masked_image
created a return_image boolean and initialised to false
* Ensure backwards compatibility for prepare_latents
* Fixed copy check typo
* Fixes w.r.t. backward compatibility changes
* make style
* keep function argument ordering the same for backwards compatibility in callees with 'copied from' statements
* make fix-copies
---------
Co-authored-by: Patrick von Platen
Co-authored-by: William Berman
---
.../controlnet/pipeline_controlnet_inpaint.py | 47 +++++++-
.../pipeline_stable_diffusion_inpaint.py | 93 +++++++++++++--
.../test_stable_diffusion_inpaint.py | 108 ++++++++++++++----
3 files changed, 211 insertions(+), 37 deletions(-)
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
index a146a1cc2908..27475dc5ef8b 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
@@ -99,7 +99,7 @@
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
-def prepare_mask_and_masked_image(image, mask, height, width):
+def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -209,6 +209,10 @@ def prepare_mask_and_masked_image(image, mask, height, width):
masked_image = image * (mask < 0.5)
+ # n.b. ensure backwards compatibility as old function does not return image
+ if return_image:
+ return mask, masked_image, image
+
return mask, masked_image
@@ -795,7 +799,20 @@ def prepare_control_image(
return image
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ def prepare_latents(
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ latents=None,
+ image=None,
+ timestep=None,
+ is_strength_max=True,
+ ):
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -803,13 +820,37 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
+ if (image is None or timestep is None) and not is_strength_max:
+ raise ValueError(
+ "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+ "However, either the image or the noise timestep has not been provided."
+ )
+
if latents is None:
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ if is_strength_max:
+ # if strength is 100% then simply initialise the latents to noise
+ latents = noise
+ else:
+ # otherwise initialise latents as init image + noise
+ image = image.to(device=device, dtype=dtype)
+ if isinstance(generator, list):
+ image_latents = [
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ else:
+ image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
+
+ image_latents = self.vae.config.scaling_factor * image_latents
+
+ latents = self.scheduler.add_noise(image_latents, noise, timestep)
else:
latents = latents.to(device)
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
+
return latents
def _default_height_width(self, height, width, image):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 518a9a3e9781..78ef11587b4d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -36,7 +36,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-def prepare_mask_and_masked_image(image, mask, height, width):
+def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -146,6 +146,10 @@ def prepare_mask_and_masked_image(image, mask, height, width):
masked_image = image * (mask < 0.5)
+ # n.b. ensure backwards compatibility as old function does not return image
+ if return_image:
+ return mask, masked_image, image
+
return mask, masked_image
@@ -552,17 +556,20 @@ def decode_latents(self, latents):
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
def check_inputs(
self,
prompt,
height,
width,
+ strength,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
):
+ if strength < 0 or strength > 1:
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -600,8 +607,20 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ def prepare_latents(
+ self,
+ batch_size,
+ num_channels_latents,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ latents=None,
+ image=None,
+ timestep=None,
+ is_strength_max=True,
+ ):
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -609,13 +628,37 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
+ if (image is None or timestep is None) and not is_strength_max:
+ raise ValueError(
+ "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+ "However, either the image or the noise timestep has not been provided."
+ )
+
if latents is None:
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ if is_strength_max:
+ # if strength is 100% then simply initialise the latents to noise
+ latents = noise
+ else:
+ # otherwise initialise latents as init image + noise
+ image = image.to(device=device, dtype=dtype)
+ if isinstance(generator, list):
+ image_latents = [
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ else:
+ image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
+
+ image_latents = self.vae.config.scaling_factor * image_latents
+
+ latents = self.scheduler.add_noise(image_latents, noise, timestep)
else:
latents = latents.to(device)
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
+
return latents
def prepare_mask_latents(
@@ -669,6 +712,16 @@ def prepare_mask_latents(
masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
return mask, masked_image_latents
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+ def get_timesteps(self, num_inference_steps, strength, device):
+ # get the original timestep using init_timestep
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+ return timesteps, num_inference_steps - t_start
+
@torch.no_grad()
def __call__(
self,
@@ -677,6 +730,7 @@ def __call__(
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
height: Optional[int] = None,
width: Optional[int] = None,
+ strength: float = 1.0,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -710,6 +764,13 @@ def __call__(
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The width in pixels of the generated image.
+ strength (`float`, *optional*, defaults to 1.):
+ Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
+ between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
+ `strength`. The number of denoising steps depends on the amount of noise initially added. When
+ `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
+ iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
+ portion of the reference `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
@@ -802,6 +863,7 @@ def __call__(
prompt,
height,
width,
+ strength,
callback_steps,
negative_prompt,
prompt_embeds,
@@ -833,12 +895,20 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
)
- # 4. Preprocess mask and image - resizes image and mask w.r.t height and width
- mask, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width)
-
- # 5. set timesteps
+ # 4. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
- timesteps = self.scheduler.timesteps
+ timesteps, num_inference_steps = self.get_timesteps(
+ num_inference_steps=num_inference_steps, strength=strength, device=device
+ )
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+ # create a boolean to check if the strength is set to 1.0; if so, initialise the latents with pure noise
+ is_strength_max = strength == 1.0
+
+ # 5. Preprocess mask and image
+ mask, masked_image, init_image = prepare_mask_and_masked_image(
+ image, mask_image, height, width, return_image=True
+ )
# 6. Prepare latent variables
num_channels_latents = self.vae.config.latent_channels
@@ -851,6 +921,9 @@ def __call__(
device,
generator,
latents,
+ image=init_image,
+ timestep=latent_timestep,
+ is_strength_max=is_strength_max,
)
# 7. Prepare mask latent variables
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index a215e4da6697..5c5e4c4590dc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -324,6 +324,26 @@ def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
# verify that the returned image has the same height and width as the input height and width
assert image.shape == (1, inputs["height"], inputs["width"], 3)
+ def test_stable_diffusion_inpaint_strength_test(self):
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
+ pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ pipe.enable_attention_slicing()
+
+ inputs = self.get_inputs(torch_device)
+ # change input strength
+ inputs["strength"] = 0.75
+ image = pipe(**inputs).images
+ # verify that the returned image has the same height and width as the input height and width
+ assert image.shape == (1, 512, 512, 3)
+
+ image_slice = image[0, 253:256, 253:256, -1].flatten()
+ expected_slice = np.array([0.0021, 0.2350, 0.3712, 0.0575, 0.2485, 0.3451, 0.1857, 0.3156, 0.3943])
+ assert np.abs(expected_slice - image_slice).max() < 3e-3
+
@nightly
@require_torch_gpu
@@ -427,24 +447,30 @@ def test_pil_inputs(self):
mask = np.random.randint(0, 255, (height, width), dtype=np.uint8) > 127.5
mask = Image.fromarray((mask * 255).astype(np.uint8))
- t_mask, t_masked = prepare_mask_and_masked_image(im, mask, height, width)
+ t_mask, t_masked, t_image = prepare_mask_and_masked_image(im, mask, height, width, return_image=True)
self.assertTrue(isinstance(t_mask, torch.Tensor))
self.assertTrue(isinstance(t_masked, torch.Tensor))
+ self.assertTrue(isinstance(t_image, torch.Tensor))
self.assertEqual(t_mask.ndim, 4)
self.assertEqual(t_masked.ndim, 4)
+ self.assertEqual(t_image.ndim, 4)
self.assertEqual(t_mask.shape, (1, 1, height, width))
self.assertEqual(t_masked.shape, (1, 3, height, width))
+ self.assertEqual(t_image.shape, (1, 3, height, width))
self.assertTrue(t_mask.dtype == torch.float32)
self.assertTrue(t_masked.dtype == torch.float32)
+ self.assertTrue(t_image.dtype == torch.float32)
self.assertTrue(t_mask.min() >= 0.0)
self.assertTrue(t_mask.max() <= 1.0)
self.assertTrue(t_masked.min() >= -1.0)
self.assertTrue(t_masked.min() <= 1.0)
+ self.assertTrue(t_image.min() >= -1.0)
+ self.assertTrue(t_image.max() <= 1.0)
self.assertTrue(t_mask.sum() > 0.0)
@@ -467,11 +493,16 @@ def test_np_inputs(self):
)
mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
- t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil, height, width)
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
+ )
+ t_mask_pil, t_masked_pil, t_image_pil = prepare_mask_and_masked_image(
+ im_pil, mask_pil, height, width, return_image=True
+ )
self.assertTrue((t_mask_np == t_mask_pil).all())
self.assertTrue((t_masked_np == t_masked_pil).all())
+ self.assertTrue((t_image_np == t_image_pil).all())
def test_torch_3D_2D_inputs(self):
height, width = 32, 32
@@ -501,13 +532,16 @@ def test_torch_3D_2D_inputs(self):
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
+ )
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_3D_3D_inputs(self):
height, width = 32, 32
@@ -538,13 +572,16 @@ def test_torch_3D_3D_inputs(self):
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
+ )
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_4D_2D_inputs(self):
height, width = 32, 32
@@ -575,13 +612,16 @@ def test_torch_4D_2D_inputs(self):
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
+ )
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_4D_3D_inputs(self):
height, width = 32, 32
@@ -613,13 +653,16 @@ def test_torch_4D_3D_inputs(self):
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
+ )
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_4D_4D_inputs(self):
height, width = 32, 32
@@ -652,13 +695,16 @@ def test_torch_4D_4D_inputs(self):
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0][0]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
+ )
+ t_mask_np, t_masked_np, t_image_np = prepare_mask_and_masked_image(
+ im_np, mask_np, height, width, return_image=True
)
- t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_batch_4D_3D(self):
height, width = 32, 32
@@ -691,15 +737,17 @@ def test_torch_batch_4D_3D(self):
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
)
- nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
+ nps = [prepare_mask_and_masked_image(i, m, height, width, return_image=True) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
+ t_image_np = torch.cat([n[2] for n in nps])
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_torch_batch_4D_4D(self):
height, width = 32, 32
@@ -733,15 +781,17 @@ def test_torch_batch_4D_4D(self):
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy()[0] for mask in mask_tensor]
- t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(
- im_tensor / 127.5 - 1, mask_tensor, height, width
+ t_mask_tensor, t_masked_tensor, t_image_tensor = prepare_mask_and_masked_image(
+ im_tensor / 127.5 - 1, mask_tensor, height, width, return_image=True
)
- nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
+ nps = [prepare_mask_and_masked_image(i, m, height, width, return_image=True) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
+ t_image_np = torch.cat([n[2] for n in nps])
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
+ self.assertTrue((t_image_tensor == t_image_np).all())
def test_shape_mismatch(self):
height, width = 32, 32
@@ -757,6 +807,7 @@ def test_shape_mismatch(self):
torch.randn(64, 64),
height,
width,
+ return_image=True,
)
# test batch dim
with self.assertRaises(AssertionError):
@@ -770,6 +821,7 @@ def test_shape_mismatch(self):
torch.randn(4, 64, 64),
height,
width,
+ return_image=True,
)
# test batch dim
with self.assertRaises(AssertionError):
@@ -783,6 +835,7 @@ def test_shape_mismatch(self):
torch.randn(4, 1, 64, 64),
height,
width,
+ return_image=True,
)
def test_type_mismatch(self):
@@ -803,6 +856,7 @@ def test_type_mismatch(self):
).numpy(),
height,
width,
+ return_image=True,
)
# test tensors-only
with self.assertRaises(TypeError):
@@ -819,6 +873,7 @@ def test_type_mismatch(self):
),
height,
width,
+ return_image=True,
)
def test_channels_first(self):
@@ -835,6 +890,7 @@ def test_channels_first(self):
),
height,
width,
+ return_image=True,
)
def test_tensor_range(self):
@@ -855,6 +911,7 @@ def test_tensor_range(self):
),
height,
width,
+ return_image=True,
)
# test im >= -1
with self.assertRaises(ValueError):
@@ -871,6 +928,7 @@ def test_tensor_range(self):
),
height,
width,
+ return_image=True,
)
# test mask <= 1
with self.assertRaises(ValueError):
@@ -887,6 +945,7 @@ def test_tensor_range(self):
* 2,
height,
width,
+ return_image=True,
)
# test mask >= 0
with self.assertRaises(ValueError):
@@ -903,4 +962,5 @@ def test_tensor_range(self):
* -1,
height,
width,
+ return_image=True,
)
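
To make the new parameter concrete, here is a hedged usage sketch of `StableDiffusionInpaintPipeline` with `strength`; the image URLs are placeholders, and the checkpoint is the one already used by the tests above.

import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("https://example.com/room.png")       # placeholder
mask_image = load_image("https://example.com/room_mask.png")  # placeholder, white = region to repaint

# strength=1.0 (the default) starts the masked region from pure noise; lower values keep part of
# the original content and run proportionally fewer denoising steps
image = pipe(
    prompt="a cozy reading chair by the window",
    image=init_image,
    mask_image=mask_image,
    strength=0.75,
    num_inference_steps=50,
).images[0]
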
From 415c616712d82fff64df739aae79ec5fce01f045 Mon Sep 17 00:00:00 2001
From: Vimarsh Chaturvedi
Date: Wed, 17 May 2023 12:05:33 +0200
Subject: [PATCH 092/206] [WIP] Bugfix - Pipeline.from_pretrained is broken
when the pipeline is partially downloaded (#3448)
Added bugfix using f-strings.
---
src/diffusers/pipelines/pipeline_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index a4d3dd1f1673..fa71a181f521 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1249,7 +1249,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
# allow all patterns from non-model folders
# this enables downloading schedulers, tokenizers, ...
- allow_patterns += [os.path.join(k, "*") for k in folder_names if k not in model_folder_names]
+ allow_patterns += [f"{k}/*" for k in folder_names if k not in model_folder_names]
# also allow downloading config.json files with the model
allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names]
From 15f1bab13bf3d9ca956d2398e1f550c840fa2bb1 Mon Sep 17 00:00:00 2001
From: 7eu7d7 <31194890+7eu7d7@users.noreply.github.com>
Date: Wed, 17 May 2023 18:06:04 +0800
Subject: [PATCH 093/206] Fix gradient checkpointing bugs in freezing part of
models (requires_grad=False) (#3404)
* gradient checkpointing bug fix
* bug fix; changes for reviews
* reformat
* reformat
---------
Co-authored-by: Patrick von Platen
---
src/diffusers/models/unet_2d_blocks.py | 173 ++++++++++++++----
src/diffusers/models/vae.py | 46 +++--
.../versatile_diffusion/modeling_text_unet.py | 76 ++++++--
3 files changed, 230 insertions(+), 65 deletions(-)
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 0004f074c563..7b76dd7e37bd 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -18,6 +18,7 @@
import torch.nn.functional as F
from torch import nn
+from ..utils import is_torch_version
from .attention import AdaGroupNorm
from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
from .dual_transformer_2d import DualTransformer2DModel
@@ -866,13 +867,27 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )[0]
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -957,7 +972,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -1361,7 +1383,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -1558,7 +1587,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -1653,14 +1689,29 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- )
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ attention_mask,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ attention_mask,
+ cross_attention_kwargs,
+ )
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -1874,13 +1925,27 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )[0]
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -1960,7 +2025,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -2388,7 +2460,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -2593,7 +2672,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -2714,14 +2800,29 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- )[0]
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ attention_mask,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )[0]
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ attention_mask,
+ cross_attention_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py
index 400c3030af90..6f8514f28d33 100644
--- a/src/diffusers/models/vae.py
+++ b/src/diffusers/models/vae.py
@@ -18,7 +18,7 @@
import torch
import torch.nn as nn
-from ..utils import BaseOutput, randn_tensor
+from ..utils import BaseOutput, is_torch_version, randn_tensor
from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
@@ -117,11 +117,20 @@ def custom_forward(*inputs):
return custom_forward
# down
- for down_block in self.down_blocks:
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
-
- # middle
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+ if is_torch_version(">=", "1.11.0"):
+ for down_block in self.down_blocks:
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(down_block), sample, use_reentrant=False
+ )
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(self.mid_block), sample, use_reentrant=False
+ )
+ else:
+ for down_block in self.down_blocks:
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
else:
# down
@@ -221,13 +230,26 @@ def custom_forward(*inputs):
return custom_forward
- # middle
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
- sample = sample.to(upscale_dtype)
+ if is_torch_version(">=", "1.11.0"):
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(self.mid_block), sample, use_reentrant=False
+ )
+ sample = sample.to(upscale_dtype)
- # up
- for up_block in self.up_blocks:
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample)
+ # up
+ for up_block in self.up_blocks:
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(up_block), sample, use_reentrant=False
+ )
+ else:
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+ sample = sample.to(upscale_dtype)
+
+ # up
+ for up_block in self.up_blocks:
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample)
else:
# middle
sample = self.mid_block(sample)
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index f0a210339c46..7aaa0e49e1da 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -18,7 +18,7 @@
from ...models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps
from ...models.transformer_2d import Transformer2DModel
from ...models.unet_2d_condition import UNet2DConditionOutput
-from ...utils import logging
+from ...utils import is_torch_version, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -1077,7 +1077,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -1198,13 +1205,27 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )[0]
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -1289,7 +1310,14 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
else:
hidden_states = resnet(hidden_states, temb)
@@ -1412,13 +1440,27 @@ def custom_forward(*inputs):
return custom_forward
- hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ if is_torch_version(">=", "1.11.0"):
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ use_reentrant=False,
+ )[0]
+ else:
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet), hidden_states, temb
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
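Every hunk in this patch applies the same version gate: on PyTorch 1.11 and newer, the checkpoint calls opt into the non-reentrant implementation via `use_reentrant=False`, while older versions keep the original call because that keyword does not exist there. A minimal, self-contained sketch of the pattern; the `TinyBlock` module and the simplified `is_torch_version` helper below are illustrative stand-ins, not code from the patch:

```py
import torch
import torch.nn as nn
import torch.utils.checkpoint
from packaging import version


def is_torch_version(operation, required):
    # Simplified stand-in for diffusers.utils.is_torch_version (handles ">=" only).
    return version.parse(torch.__version__.split("+")[0]) >= version.parse(required)


class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)

    def forward(self, hidden_states, temb):
        return self.proj(hidden_states) + temb


def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


block = TinyBlock()
hidden_states = torch.randn(2, 16, requires_grad=True)
temb = torch.randn(2, 16)

if is_torch_version(">=", "1.11.0"):
    # Non-reentrant checkpointing recomputes activations in the backward pass
    # without replaying the forward through autograd's reentrant machinery.
    out = torch.utils.checkpoint.checkpoint(
        create_custom_forward(block), hidden_states, temb, use_reentrant=False
    )
else:
    # Older PyTorch releases only ship the reentrant implementation.
    out = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, temb)

out.sum().backward()
```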
From 3ebd2d1f9ec97f8bc0fc5cc8c7313bdf5f0dc1d2 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 17 May 2023 12:20:13 +0200
Subject: [PATCH 094/206] Make dreambooth lora more robust to orig unet (#3462)
* Make dreambooth lora more robust to orig unet
* up
---
examples/dreambooth/train_dreambooth_lora.py | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 16adfe4b83fc..bfbf3603e8d0 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -31,7 +31,7 @@
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, model_info, upload_folder
+from huggingface_hub import create_repo, upload_folder
from packaging import version
from PIL import Image
from torch.utils.data import Dataset
@@ -589,16 +589,6 @@ def __getitem__(self, index):
return example
-def model_has_vae(args):
- config_file_name = os.path.join("vae", AutoencoderKL.config_name)
- if os.path.isdir(args.pretrained_model_name_or_path):
- config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
- return os.path.isfile(config_file_name)
- else:
- files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings
- return any(file.rfilename == config_file_name for file in files_in_repo)
-
-
def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
if tokenizer_max_length is not None:
max_length = tokenizer_max_length
@@ -753,11 +743,13 @@ def main(args):
text_encoder = text_encoder_cls.from_pretrained(
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
)
- if model_has_vae(args):
+ try:
vae = AutoencoderKL.from_pretrained(
args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
)
- else:
+ except OSError:
+ # IF does not have a VAE so let's just set it to None
+ # We don't have to error out here
vae = None
unet = UNet2DConditionModel.from_pretrained(
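The `try`/`except OSError` replaces the repo-inspection helper because `AutoencoderKL.from_pretrained` raises `OSError` when a repository or local folder has no `vae` subfolder, which is the case for pixel-space pipelines such as DeepFloyd IF. A minimal sketch of the same pattern; the helper name is mine, not from the script:

```py
from diffusers import AutoencoderKL


def load_optional_vae(pretrained_model_name_or_path, revision=None):
    # Pixel-space pipelines (e.g. DeepFloyd IF) ship no "vae" subfolder, so
    # from_pretrained raises OSError; treat that as "this model has no VAE".
    try:
        return AutoencoderKL.from_pretrained(
            pretrained_model_name_or_path, subfolder="vae", revision=revision
        )
    except OSError:
        return None
```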
From bd78f63a54e439a46f162f191618e3ba554aeef6 Mon Sep 17 00:00:00 2001
From: cmdr2
Date: Wed, 17 May 2023 15:54:59 +0530
Subject: [PATCH 095/206] Reduce peak VRAM by releasing large attention tensors
(as soon as they're unnecessary) (#3463)
Release large tensors in attention (as soon as they're no longer required). Reduces peak VRAM by nearly 2 GB for 1024x1024 (even after slicing), and the savings scale up with image size.
---
src/diffusers/models/attention_processor.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index f88400da0333..a489814c4787 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -344,11 +344,14 @@ def get_attention_scores(self, query, key, attention_mask=None):
beta=beta,
alpha=self.scale,
)
+ del baddbmm_input
if self.upcast_softmax:
attention_scores = attention_scores.float()
attention_probs = attention_scores.softmax(dim=-1)
+ del attention_scores
+
attention_probs = attention_probs.to(dtype)
return attention_probs
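The saving comes from dropping references to the largest intermediates, the `(batch, query_len, key_len)` score matrices (`baddbmm_input` and `attention_scores`), as soon as the softmax output exists, so the caching allocator can reuse those blocks instead of holding them until the method returns. A self-contained illustration of the idea with arbitrary shapes, not the real attention processor:

```py
import torch


def attention_probs_low_peak(query, key, scale):
    # The score matrix (batch, q_len, k_len) is the largest intermediate here.
    attention_scores = torch.baddbmm(
        torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
        query,
        key.transpose(-1, -2),
        beta=0,
        alpha=scale,
    )
    attention_probs = attention_scores.softmax(dim=-1)
    # Releasing the scores now lets the allocator reuse that block immediately
    # instead of keeping two full-size matrices alive at once.
    del attention_scores
    return attention_probs


query = torch.randn(1, 1024, 64)
key = torch.randn(1, 1024, 64)
probs = attention_probs_low_peak(query, key, scale=64**-0.5)
```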
From 2faf91dbdeb51ad41e8a398d16818932374cde0c Mon Sep 17 00:00:00 2001
From: wfng92 <43742196+wfng92@users.noreply.github.com>
Date: Wed, 17 May 2023 19:07:45 +0800
Subject: [PATCH 096/206] Add min snr to text2img lora training script (#3459)
add min snr to text2img lora training script
---
.../text_to_image/train_text_to_image_lora.py | 49 ++++++++++++++++++-
1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index c2a4e1aacdb7..806637f04c53 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -239,6 +239,13 @@ def parse_args():
parser.add_argument(
"--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
)
+ parser.add_argument(
+ "--snr_gamma",
+ type=float,
+ default=None,
+ help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+ "More details here: https://arxiv.org/abs/2303.09556.",
+ )
parser.add_argument(
"--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
)
@@ -472,6 +479,30 @@ def main():
else:
raise ValueError("xformers is not available. Make sure it is installed correctly")
+ def compute_snr(timesteps):
+ """
+ Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+ """
+ alphas_cumprod = noise_scheduler.alphas_cumprod
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+ # Expand the tensors.
+ # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+ while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+ sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+ # Compute SNR.
+ snr = (alpha / sigma) ** 2
+ return snr
+
lora_layers = AttnProcsLayers(unet.attn_processors)
# Enable TF32 for faster training on Ampere GPUs,
@@ -727,7 +758,23 @@ def collate_fn(examples):
# Predict the noise residual and compute loss
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
- loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+ if args.snr_gamma is None:
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
+ snr = compute_snr(timesteps)
+ mse_loss_weights = (
+ torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+ )
+                # We first compute the unreduced loss, then average it over the non-batch dimensions and
+                # rebalance the sample-wise losses with their respective loss weights.
+                # Finally, we take the mean of the rebalanced loss.
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+ loss = loss.mean()
# Gather the losses across all processes for logging (if we use distributed training).
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
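With `--snr_gamma` set, each sample's MSE term is scaled by `min(SNR, gamma) / SNR`, which caps the influence of low-noise timesteps whose SNR is very large. A standalone numeric sketch of just the weighting; the SNR values below are made up for illustration:

```py
import torch

snr_gamma = 5.0
# Pretend SNRs for a batch of four sampled timesteps: two low-noise (high SNR), two noisy.
snr = torch.tensor([100.0, 20.0, 4.0, 0.5])

mse_loss_weights = torch.stack([snr, snr_gamma * torch.ones_like(snr)], dim=1).min(dim=1)[0] / snr
print(mse_loss_weights)  # tensor([0.0500, 0.2500, 1.0000, 1.0000])

# The per-sample losses are rescaled before the final reduction:
per_sample_loss = torch.tensor([0.9, 0.7, 0.4, 0.2])
loss = (per_sample_loss * mse_loss_weights).mean()
```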
From 88295f92d963f414cc7adf93f30c694a4d100dd2 Mon Sep 17 00:00:00 2001
From: Glaceon-Hyy
Date: Wed, 17 May 2023 19:28:19 +0800
Subject: [PATCH 097/206] Add inpaint lora scale support (#3460)
* add inpaint lora scale support
* add inpaint lora scale test
---------
Co-authored-by: yueyang.hyy
---
.../pipeline_stable_diffusion_inpaint.py | 18 +++++++---
.../test_stable_diffusion_inpaint.py | 35 +++++++++++++++++++
2 files changed, 48 insertions(+), 5 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 78ef11587b4d..f09db016d956 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -14,7 +14,7 @@
import inspect
import warnings
-from typing import Callable, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL
@@ -744,6 +744,7 @@ def __call__(
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -815,7 +816,10 @@ def __call__(
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
-
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
Examples:
```py
@@ -966,9 +970,13 @@ def __call__(
latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
# predict the noise residual
- noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[
- 0
- ]
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )[0]
# perform guidance
if do_classifier_free_guidance:
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 5c5e4c4590dc..5c2d9d7c44f7 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -35,6 +35,7 @@
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu
+from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
@@ -155,6 +156,40 @@ def test_stable_diffusion_inpaint_image_tensor(self):
assert out_pil.shape == (1, 64, 64, 3)
assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 5e-2
+ def test_stable_diffusion_inpaint_lora(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+
+ components = self.get_dummy_components()
+ sd_pipe = StableDiffusionInpaintPipeline(**components)
+ sd_pipe = sd_pipe.to(torch_device)
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ # forward 1
+ inputs = self.get_dummy_inputs(device)
+ output = sd_pipe(**inputs)
+ image = output.images
+ image_slice = image[0, -3:, -3:, -1]
+
+ # set lora layers
+ lora_attn_procs = create_lora_layers(sd_pipe.unet)
+ sd_pipe.unet.set_attn_processor(lora_attn_procs)
+ sd_pipe = sd_pipe.to(torch_device)
+
+ # forward 2
+ inputs = self.get_dummy_inputs(device)
+ output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0})
+ image = output.images
+ image_slice_1 = image[0, -3:, -3:, -1]
+
+ # forward 3
+ inputs = self.get_dummy_inputs(device)
+ output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5})
+ image = output.images
+ image_slice_2 = image[0, -3:, -3:, -1]
+
+ assert np.abs(image_slice - image_slice_1).max() < 1e-2
+ assert np.abs(image_slice - image_slice_2).max() > 1e-2
+
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
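Once `cross_attention_kwargs` reaches the UNet, the LoRA contribution in the inpainting pipeline can be scaled at inference time just like in the text-to-image pipelines. A hedged usage sketch; the LoRA path, image URLs, and prompt are placeholders:

```py
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")
pipe.unet.load_attn_procs("path/to/lora")  # hypothetical LoRA attention weights

init_image = load_image("https://example.com/photo.png")  # placeholder URLs
mask_image = load_image("https://example.com/mask.png")

# "scale" interpolates between the base attention weights (0.0) and the full LoRA update (1.0).
image = pipe(
    prompt="a red park bench",
    image=init_image,
    mask_image=mask_image,
    cross_attention_kwargs={"scale": 0.5},
).images[0]
```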
From 2858d7e15eaf445ec37fc77b204a85f84affbeef Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 17 May 2023 14:26:53 +0200
Subject: [PATCH 098/206] [From ckpt] Fix from_ckpt (#3466)
* Correct from_ckpt
* make style
---
src/diffusers/loaders.py | 2 +-
.../stable_diffusion/convert_from_ckpt.py | 22 +++++++++++--------
2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index a1f0d8ec2a52..e50bc31a5c63 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -1326,7 +1326,7 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
from_safetensors = file_extension == "safetensors"
- if from_safetensors and use_safetensors is True:
+ if from_safetensors and use_safetensors is False:
raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
# TODO: For now we only support stable diffusion
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index 5961636dd197..42e8ae7cafd2 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -140,17 +140,17 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
new_item = new_item.replace("norm.weight", "group_norm.weight")
new_item = new_item.replace("norm.bias", "group_norm.bias")
- new_item = new_item.replace("q.weight", "query.weight")
- new_item = new_item.replace("q.bias", "query.bias")
+ new_item = new_item.replace("q.weight", "to_q.weight")
+ new_item = new_item.replace("q.bias", "to_q.bias")
- new_item = new_item.replace("k.weight", "key.weight")
- new_item = new_item.replace("k.bias", "key.bias")
+ new_item = new_item.replace("k.weight", "to_k.weight")
+ new_item = new_item.replace("k.bias", "to_k.bias")
- new_item = new_item.replace("v.weight", "value.weight")
- new_item = new_item.replace("v.bias", "value.bias")
+ new_item = new_item.replace("v.weight", "to_v.weight")
+ new_item = new_item.replace("v.bias", "to_v.bias")
- new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
- new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+ new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
+ new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
@@ -204,8 +204,12 @@ def assign_to_checkpoint(
new_path = new_path.replace(replacement["old"], replacement["new"])
# proj_attn.weight has to be converted from conv 1D to linear
- if "proj_attn.weight" in new_path:
+ is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path)
+ shape = old_checkpoint[path["old"]].shape
+ if is_attn_weight and len(shape) == 3:
checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+ elif is_attn_weight and len(shape) == 4:
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
else:
checkpoint[new_path] = old_checkpoint[path["old"]]
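Besides the corrected `use_safetensors` guard, the substantive change is that renaming `q/k/v/proj_out` to `to_q/to_k/to_v/to_out.0` also changes the layer type from a convolution to `nn.Linear`, so the conversion must drop the trailing singleton kernel dimensions from the old weights. A minimal sketch of just that reshaping step:

```py
import torch


def conv_attn_weight_to_linear(weight: torch.Tensor) -> torch.Tensor:
    # Old VAE checkpoints store attention projections as conv kernels:
    # (C, C, 1) for conv1d or (C, C, 1, 1) for a 1x1 conv2d. The new
    # Attention block uses nn.Linear, which expects a plain (C, C) matrix.
    if weight.ndim == 3:
        return weight[:, :, 0]
    if weight.ndim == 4:
        return weight[:, :, 0, 0]
    return weight  # biases and already-linear weights pass through unchanged


assert conv_attn_weight_to_linear(torch.randn(512, 512, 1, 1)).shape == (512, 512)
assert conv_attn_weight_to_linear(torch.randn(512, 512, 1)).shape == (512, 512)
```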
From c9f939bf9885de32ada828809410b4a6c1d9ff2a Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Wed, 17 May 2023 10:42:20 -0700
Subject: [PATCH 099/206] Update full dreambooth script to work with IF (#3425)
---
examples/dreambooth/train_dreambooth.py | 306 ++++++++++++++++++++----
examples/test_examples.py | 26 ++
src/diffusers/models/unet_2d_blocks.py | 69 ++++--
3 files changed, 344 insertions(+), 57 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 5d2107f024d1..efcfb39ab4c4 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
import argparse
+import gc
import hashlib
import itertools
import logging
@@ -30,7 +31,7 @@
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
+from huggingface_hub import create_repo, model_info, upload_folder
from packaging import version
from PIL import Image
from torch.utils.data import Dataset
@@ -93,31 +94,61 @@ def save_model_card(repo_id: str, images=None, base_model=str, train_text_encode
f.write(yaml + model_card)
-def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch):
+def log_validation(
+ text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch, prompt_embeds, negative_prompt_embeds
+):
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
f" {args.validation_prompt}."
)
+
+ pipeline_args = {}
+
+ if text_encoder is not None:
+ pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder)
+
+ if vae is not None:
+ pipeline_args["vae"] = vae
+
# create pipeline (note: unet and vae are loaded again in float32)
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
- text_encoder=accelerator.unwrap_model(text_encoder),
tokenizer=tokenizer,
unet=accelerator.unwrap_model(unet),
- vae=vae,
revision=args.revision,
torch_dtype=weight_dtype,
+ **pipeline_args,
)
- pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+ scheduler_args = {}
+
+ if "variance_type" in pipeline.scheduler.config:
+ variance_type = pipeline.scheduler.config.variance_type
+
+ if variance_type in ["learned", "learned_range"]:
+ variance_type = "fixed_small"
+
+ scheduler_args["variance_type"] = variance_type
+
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
pipeline = pipeline.to(accelerator.device)
pipeline.set_progress_bar_config(disable=True)
+ if args.pre_compute_text_embeddings:
+ pipeline_args = {
+ "prompt_embeds": prompt_embeds,
+ "negative_prompt_embeds": negative_prompt_embeds,
+ }
+ else:
+ pipeline_args = {"prompt": args.validation_prompt}
+
# run inference
generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed)
images = []
for _ in range(args.num_validation_images):
with torch.autocast("cuda"):
- image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+ image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0]
images.append(image)
for tracker in accelerator.trackers:
@@ -155,6 +186,10 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st
from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
return RobertaSeriesModelWithTransformation
+ elif model_class == "T5EncoderModel":
+ from transformers import T5EncoderModel
+
+ return T5EncoderModel
else:
raise ValueError(f"{model_class} is not supported.")
@@ -459,6 +494,27 @@ def parse_args(input_args=None):
" See: https://www.crosslabs.org//blog/diffusion-with-offset-noise for more information."
),
)
+ parser.add_argument(
+ "--pre_compute_text_embeddings",
+ action="store_true",
+ help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.",
+ )
+ parser.add_argument(
+ "--tokenizer_max_length",
+ type=int,
+ default=None,
+ required=False,
+ help="The maximum length of the tokenizer. If not set, will default to the tokenizer's max length.",
+ )
+ parser.add_argument(
+ "--text_encoder_use_attention_mask",
+ action="store_true",
+ required=False,
+ help="Whether to use attention mask for the text encoder",
+ )
+ parser.add_argument(
+ "--skip_save_text_encoder", action="store_true", required=False, help="Set to not save text encoder"
+ )
if input_args is not None:
args = parser.parse_args(input_args)
@@ -481,6 +537,9 @@ def parse_args(input_args=None):
if args.class_prompt is not None:
warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+ if args.train_text_encoder and args.pre_compute_text_embeddings:
+ raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`")
+
return args
@@ -500,10 +559,16 @@ def __init__(
class_num=None,
size=512,
center_crop=False,
+ encoder_hidden_states=None,
+ instance_prompt_encoder_hidden_states=None,
+ tokenizer_max_length=None,
):
self.size = size
self.center_crop = center_crop
self.tokenizer = tokenizer
+ self.encoder_hidden_states = encoder_hidden_states
+ self.instance_prompt_encoder_hidden_states = instance_prompt_encoder_hidden_states
+ self.tokenizer_max_length = tokenizer_max_length
self.instance_data_root = Path(instance_data_root)
if not self.instance_data_root.exists():
@@ -545,40 +610,52 @@ def __getitem__(self, index):
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
- example["instance_prompt_ids"] = self.tokenizer(
- self.instance_prompt,
- truncation=True,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- return_tensors="pt",
- ).input_ids
+
+ if self.encoder_hidden_states is not None:
+ example["instance_prompt_ids"] = self.encoder_hidden_states
+ else:
+ text_inputs = tokenize_prompt(
+ self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length
+ )
+ example["instance_prompt_ids"] = text_inputs.input_ids
+ example["instance_attention_mask"] = text_inputs.attention_mask
if self.class_data_root:
class_image = Image.open(self.class_images_path[index % self.num_class_images])
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
- example["class_prompt_ids"] = self.tokenizer(
- self.class_prompt,
- truncation=True,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- return_tensors="pt",
- ).input_ids
+
+ if self.instance_prompt_encoder_hidden_states is not None:
+ example["class_prompt_ids"] = self.instance_prompt_encoder_hidden_states
+ else:
+ class_text_inputs = tokenize_prompt(
+ self.tokenizer, self.class_prompt, tokenizer_max_length=self.tokenizer_max_length
+ )
+ example["class_prompt_ids"] = class_text_inputs.input_ids
+ example["class_attention_mask"] = class_text_inputs.attention_mask
return example
def collate_fn(examples, with_prior_preservation=False):
+ has_attention_mask = "instance_attention_mask" in examples[0]
+
input_ids = [example["instance_prompt_ids"] for example in examples]
pixel_values = [example["instance_images"] for example in examples]
+ if has_attention_mask:
+ attention_mask = [example["instance_attention_mask"] for example in examples]
+
# Concat class and instance examples for prior preservation.
# We do this to avoid doing two forward passes.
if with_prior_preservation:
input_ids += [example["class_prompt_ids"] for example in examples]
pixel_values += [example["class_images"] for example in examples]
+ if has_attention_mask:
+ attention_mask += [example["class_attention_mask"] for example in examples]
+
pixel_values = torch.stack(pixel_values)
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
@@ -588,6 +665,10 @@ def collate_fn(examples, with_prior_preservation=False):
"input_ids": input_ids,
"pixel_values": pixel_values,
}
+
+ if has_attention_mask:
+ batch["attention_mask"] = attention_mask
+
return batch
@@ -608,6 +689,50 @@ def __getitem__(self, index):
return example
+def model_has_vae(args):
+ config_file_name = os.path.join("vae", AutoencoderKL.config_name)
+ if os.path.isdir(args.pretrained_model_name_or_path):
+ config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
+ return os.path.isfile(config_file_name)
+ else:
+ files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings
+ return any(file.rfilename == config_file_name for file in files_in_repo)
+
+
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+ if tokenizer_max_length is not None:
+ max_length = tokenizer_max_length
+ else:
+ max_length = tokenizer.model_max_length
+
+ text_inputs = tokenizer(
+ prompt,
+ truncation=True,
+ padding="max_length",
+ max_length=max_length,
+ return_tensors="pt",
+ )
+
+ return text_inputs
+
+
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None):
+ text_input_ids = input_ids.to(text_encoder.device)
+
+ if text_encoder_use_attention_mask:
+ attention_mask = attention_mask.to(text_encoder.device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = text_encoder(
+ text_input_ids,
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ return prompt_embeds
+
+
def main(args):
logging_dir = Path(args.output_dir, args.logging_dir)
@@ -727,7 +852,14 @@ def main(args):
text_encoder = text_encoder_cls.from_pretrained(
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
)
- vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+
+ if model_has_vae(args):
+ vae = AutoencoderKL.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+ )
+ else:
+ vae = None
+
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
)
@@ -761,7 +893,9 @@ def load_model_hook(models, input_dir):
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)
- vae.requires_grad_(False)
+ if vae is not None:
+ vae.requires_grad_(False)
+
if not args.train_text_encoder:
text_encoder.requires_grad_(False)
@@ -835,6 +969,44 @@ def load_model_hook(models, input_dir):
eps=args.adam_epsilon,
)
+ if args.pre_compute_text_embeddings:
+
+ def compute_text_embeddings(prompt):
+ with torch.no_grad():
+ text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length)
+ prompt_embeds = encode_prompt(
+ text_encoder,
+ text_inputs.input_ids,
+ text_inputs.attention_mask,
+ text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+ )
+
+ return prompt_embeds
+
+ pre_computed_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+ validation_prompt_negative_prompt_embeds = compute_text_embeddings("")
+
+ if args.validation_prompt is not None:
+ validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt)
+ else:
+ validation_prompt_encoder_hidden_states = None
+
+ if args.instance_prompt is not None:
+ pre_computed_instance_prompt_encoder_hidden_states = compute_text_embeddings(args.instance_prompt)
+ else:
+ pre_computed_instance_prompt_encoder_hidden_states = None
+
+ text_encoder = None
+ tokenizer = None
+
+ gc.collect()
+ torch.cuda.empty_cache()
+ else:
+ pre_computed_encoder_hidden_states = None
+ validation_prompt_encoder_hidden_states = None
+ validation_prompt_negative_prompt_embeds = None
+ pre_computed_instance_prompt_encoder_hidden_states = None
+
# Dataset and DataLoaders creation:
train_dataset = DreamBoothDataset(
instance_data_root=args.instance_data_dir,
@@ -845,6 +1017,9 @@ def load_model_hook(models, input_dir):
tokenizer=tokenizer,
size=args.resolution,
center_crop=args.center_crop,
+ encoder_hidden_states=pre_computed_encoder_hidden_states,
+ instance_prompt_encoder_hidden_states=pre_computed_instance_prompt_encoder_hidden_states,
+ tokenizer_max_length=args.tokenizer_max_length,
)
train_dataloader = torch.utils.data.DataLoader(
@@ -890,8 +1065,10 @@ def load_model_hook(models, input_dir):
weight_dtype = torch.bfloat16
# Move vae and text_encoder to device and cast to weight_dtype
- vae.to(accelerator.device, dtype=weight_dtype)
- if not args.train_text_encoder:
+ if vae is not None:
+ vae.to(accelerator.device, dtype=weight_dtype)
+
+ if not args.train_text_encoder and text_encoder is not None:
text_encoder.to(accelerator.device, dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
@@ -961,37 +1138,55 @@ def load_model_hook(models, input_dir):
continue
with accelerator.accumulate(unet):
- # Convert images to latent space
- latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
- latents = latents * vae.config.scaling_factor
+ pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
- # Sample noise that we'll add to the latents
+ if vae is not None:
+ # Convert images to latent space
+ model_input = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+ model_input = model_input * vae.config.scaling_factor
+ else:
+ model_input = pixel_values
+
+ # Sample noise that we'll add to the model input
if args.offset_noise:
- noise = torch.randn_like(latents) + 0.1 * torch.randn(
- latents.shape[0], latents.shape[1], 1, 1, device=latents.device
+ noise = torch.randn_like(model_input) + 0.1 * torch.randn(
+ model_input.shape[0], model_input.shape[1], 1, 1, device=model_input.device
)
else:
- noise = torch.randn_like(latents)
- bsz = latents.shape[0]
+ noise = torch.randn_like(model_input)
+ bsz = model_input.shape[0]
# Sample a random timestep for each image
- timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+ timesteps = torch.randint(
+ 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+ )
timesteps = timesteps.long()
- # Add noise to the latents according to the noise magnitude at each timestep
+ # Add noise to the model input according to the noise magnitude at each timestep
# (this is the forward diffusion process)
- noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
# Get the text embedding for conditioning
- encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+ if args.pre_compute_text_embeddings:
+ encoder_hidden_states = batch["input_ids"]
+ else:
+ encoder_hidden_states = encode_prompt(
+ text_encoder,
+ batch["input_ids"],
+ batch["attention_mask"],
+ text_encoder_use_attention_mask=args.text_encoder_use_attention_mask,
+ )
# Predict the noise residual
- model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+ model_pred = unet(noisy_model_input, timesteps, encoder_hidden_states).sample
+
+ if model_pred.shape[1] == 6:
+ model_pred, _ = torch.chunk(model_pred, 2, dim=1)
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
- target = noise_scheduler.get_velocity(latents, noise, timesteps)
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
@@ -1037,7 +1232,16 @@ def load_model_hook(models, input_dir):
if args.validation_prompt is not None and global_step % args.validation_steps == 0:
images = log_validation(
- text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch
+ text_encoder,
+ tokenizer,
+ unet,
+ vae,
+ args,
+ accelerator,
+ weight_dtype,
+ epoch,
+ validation_prompt_encoder_hidden_states,
+ validation_prompt_negative_prompt_embeds,
)
logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
@@ -1050,12 +1254,34 @@ def load_model_hook(models, input_dir):
# Create the pipeline using using the trained modules and save it.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
+ pipeline_args = {}
+
+ if text_encoder is not None:
+ pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder)
+
+ if args.skip_save_text_encoder:
+ pipeline_args["text_encoder"] = None
+
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
unet=accelerator.unwrap_model(unet),
- text_encoder=accelerator.unwrap_model(text_encoder),
revision=args.revision,
+ **pipeline_args,
)
+
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
+ scheduler_args = {}
+
+ if "variance_type" in pipeline.scheduler.config:
+ variance_type = pipeline.scheduler.config.variance_type
+
+ if variance_type in ["learned", "learned_range"]:
+ variance_type = "fixed_small"
+
+ scheduler_args["variance_type"] = variance_type
+
+ pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+
pipeline.save_pretrained(args.output_dir)
if args.push_to_hub:
diff --git a/examples/test_examples.py b/examples/test_examples.py
index d9e7de717f47..59c96f44fe93 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -147,6 +147,32 @@ def test_dreambooth(self):
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.bin")))
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+ def test_dreambooth_if(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ test_args = f"""
+ examples/dreambooth/train_dreambooth.py
+ --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
+ --instance_data_dir docs/source/en/imgs
+ --instance_prompt photo
+ --resolution 64
+ --train_batch_size 1
+ --gradient_accumulation_steps 1
+ --max_train_steps 2
+ --learning_rate 5.0e-04
+ --scale_lr
+ --lr_scheduler constant
+ --lr_warmup_steps 0
+ --output_dir {tmpdir}
+ --pre_compute_text_embeddings
+ --tokenizer_max_length=77
+ --text_encoder_use_attention_mask
+ """.split()
+
+ run_command(self._launch_args + test_args)
+ # save_pretrained smoke test
+ self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.bin")))
+ self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
def test_dreambooth_checkpointing(self):
instance_prompt = "photo"
pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 7b76dd7e37bd..75d9eb3e03df 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -1507,16 +1507,33 @@ def forward(
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
for resnet, attn in zip(self.resnets, self.attentions):
- # resnet
- hidden_states = resnet(hidden_states, temb)
+ if self.training and self.gradient_checkpointing:
- # attn
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- **cross_attention_kwargs,
- )
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
+ else:
+ hidden_states = resnet(hidden_states, temb)
+
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
output_states = output_states + (hidden_states,)
@@ -2593,15 +2610,33 @@ def forward(
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
- hidden_states = resnet(hidden_states, temb)
+ if self.training and self.gradient_checkpointing:
- # attn
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- **cross_attention_kwargs,
- )
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ cross_attention_kwargs,
+ )[0]
+ else:
+ hidden_states = resnet(hidden_states, temb)
+
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
if self.upsamplers is not None:
for upsampler in self.upsamplers:
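The main memory lever for IF in this script is `--pre_compute_text_embeddings`: the T5 embeddings for the instance, class, and validation prompts are computed once under `torch.no_grad()`, the text encoder and tokenizer are then deleted, and the dataset hands out embeddings instead of token ids. A condensed sketch of that flow; it assumes `tokenizer` and `text_encoder` are already loaded and mirrors the script's helpers rather than reproducing them exactly:

```py
import gc

import torch


def precompute_prompt_embeddings(tokenizer, text_encoder, prompts, tokenizer_max_length=77, use_attention_mask=True):
    embeddings = {}
    with torch.no_grad():
        for name, prompt in prompts.items():
            text_inputs = tokenizer(
                prompt,
                truncation=True,
                padding="max_length",
                max_length=tokenizer_max_length,
                return_tensors="pt",
            )
            attention_mask = text_inputs.attention_mask.to(text_encoder.device) if use_attention_mask else None
            # The first output is the last hidden state used as conditioning.
            embeddings[name] = text_encoder(
                text_inputs.input_ids.to(text_encoder.device), attention_mask=attention_mask
            )[0]
    return embeddings


# embeds = precompute_prompt_embeddings(tokenizer, text_encoder,
#                                       {"instance": "a photo of sks dog", "negative": ""})
# del text_encoder, tokenizer
# gc.collect(); torch.cuda.empty_cache()
```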
From 7200985eab7126801fffcf8251fd149c1cf1f291 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Wed, 17 May 2023 11:56:10 -0700
Subject: [PATCH 100/206] Add IF dreambooth docs (#3470)
---
examples/dreambooth/README.md | 64 +++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 75d705f89e02..086100bd4a36 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -531,3 +531,67 @@ More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_
### Experimental results
You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that discusses some of DreamBooth experiments in detail. Specifically, it recommends a set of DreamBooth-specific tips and tricks that we have found to work well for a variety of subjects.
+
+## IF
+
+You can also use the LoRA and full Dreambooth scripts to train the text-to-image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). A few alternative CLI flags are needed to account for the model size, the expected input resolution, and the text encoder conventions.
+
+### LoRA Dreambooth
+This training configuration requires ~28 GB VRAM.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_dog_lora"
+
+accelerate launch train_dreambooth_lora.py \
+ --report_to wandb \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a sks dog" \
+ --resolution=64 \ # The input resolution of the IF unet is 64x64
+ --train_batch_size=4 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=5e-6 \
+ --scale_lr \
+ --max_train_steps=1200 \
+ --validation_prompt="a sks dog" \
+ --validation_epochs=25 \
+ --checkpointing_steps=100 \
+  --pre_compute_text_embeddings \ # Pre-compute text embeddings so that T5 doesn't have to be kept in memory
+ --tokenizer_max_length=77 \ # IF expects an override of the max token length
+ --text_encoder_use_attention_mask # IF expects attention mask for text embeddings
+```
+
+### Full Dreambooth
+Due to the size of the optimizer states, we recommend training the full XL IF model with 8-bit Adam.
+Using 8-bit Adam and the rest of the following config, the model can be trained with ~48 GB of VRAM.
+
+For full Dreambooth, IF requires very low learning rates. With higher learning rates, model quality will degrade.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_if"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a photo of sks dog" \
+ --resolution=64 \ # The input resolution of the IF unet is 64x64
+ --train_batch_size=4 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=1e-7 \
+ --max_train_steps=150 \
+ --validation_prompt "a photo of sks dog" \
+ --validation_steps 25 \
+ --text_encoder_use_attention_mask \ # IF expects attention mask for text embeddings
+ --tokenizer_max_length 77 \ # IF expects an override of the max token length
+  --pre_compute_text_embeddings \ # Pre-compute text embeddings so that T5 doesn't have to be kept in memory
+  --use_8bit_adam \ # Use 8-bit Adam to keep optimizer state memory low
+ --set_grads_to_none \
+ --skip_save_text_encoder # do not save the full T5 text encoder with the model
+```
From 49b7ccfb965ce77046477f292b8e9f9777bea0e9 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Thu, 18 May 2023 10:14:29 -0700
Subject: [PATCH 101/206] parameterize pass single args through tuple (#3477)
---
tests/models/test_models_vae.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index fd4cf0114f51..9a3e49cdfbc0 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -321,7 +321,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
- @parameterized.expand([13, 16, 27])
+ @parameterized.expand([(13,), (16,), (27,)])
@require_torch_gpu
@unittest.skipIf(not is_xformers_available(), reason="xformers is not required when using PyTorch 2.0.")
def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed):
@@ -339,7 +339,7 @@ def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed):
assert torch_all_close(sample, sample_2, atol=1e-1)
- @parameterized.expand([13, 16, 37])
+ @parameterized.expand([(13,), (16,), (37,)])
@require_torch_gpu
@unittest.skipIf(not is_xformers_available(), reason="xformers is not required when using PyTorch 2.0.")
def test_stable_diffusion_decode_xformers_vs_2_0(self, seed):
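`parameterized.expand` interprets each list entry as the positional arguments of one generated test case, so single-argument cases are written as one-element tuples; this matches how multi-argument cases are spelled and leaves no ambiguity about how the value is unpacked. A minimal sketch using the `parameterized` package, independent of the diffusers tests:

```py
import unittest

from parameterized import parameterized


class SeedTests(unittest.TestCase):
    @parameterized.expand([(13,), (16,), (27,)])
    def test_seed_is_positive(self, seed):
        # Each tuple supplies the positional arguments for one generated test.
        self.assertGreater(seed, 0)


if __name__ == "__main__":
    unittest.main()
```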
From 8917769499632c5539f81e9bae9e923825e5be69 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Thu, 18 May 2023 10:24:49 -0700
Subject: [PATCH 102/206] attend and excite tests disable determinism on the
class level (#3478)
---
...test_stable_diffusion_attend_and_excite.py | 27 ++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 898d5741043f..6cec2cce752d 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -34,7 +34,6 @@
torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(False)
@skip_mps
@@ -47,6 +46,19 @@ class StableDiffusionAttendAndExcitePipelineFastTests(
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+ # Attend and excite requires being able to run a backward pass at
+ # inference time. There's no deterministic backward operator for pad
+
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ torch.use_deterministic_algorithms(False)
+
+ @classmethod
+ def tearDownClass(cls):
+ super().tearDownClass()
+ torch.use_deterministic_algorithms(True)
+
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
@@ -171,6 +183,19 @@ def test_save_load_optional_components(self):
@require_torch_gpu
@slow
class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
+ # Attend and excite requires being able to run a backward pass at
+ # inference time. There's no deterministic backward operator for pad
+
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ torch.use_deterministic_algorithms(False)
+
+ @classmethod
+ def tearDownClass(cls):
+ super().tearDownClass()
+ torch.use_deterministic_algorithms(True)
+
def tearDown(self):
super().tearDown()
gc.collect()
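Because attend-and-excite backpropagates at inference time and the pad operator has no deterministic backward, determinism is switched off for the whole test class and restored afterwards so the rest of the suite keeps running deterministically. A minimal sketch of the same setUpClass/tearDownClass pattern, outside the diffusers test suite:

```py
import unittest

import torch
import torch.nn.functional as F


class NonDeterministicBackwardTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # These tests exercise an op whose backward has no deterministic kernel.
        torch.use_deterministic_algorithms(False)

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        # Restore the global flag so later test classes stay deterministic.
        torch.use_deterministic_algorithms(True)

    def test_pad_backward_runs(self):
        x = torch.randn(1, 3, 8, 8, requires_grad=True)
        y = F.pad(x, (1, 1, 1, 1), mode="replicate")
        y.sum().backward()
        self.assertIsNotNone(x.grad)


if __name__ == "__main__":
    unittest.main()
```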
From 8d646f229440999f8c20bf8cbaf016dc4b35441d Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Thu, 18 May 2023 19:10:14 -0700
Subject: [PATCH 103/206] dreambooth docs torch.compile note (#3471)
* dreambooth docs torch.compile note
* Update examples/dreambooth/README.md
Co-authored-by: Sayak Paul
* Update examples/dreambooth/README.md
Co-authored-by: Pedro Cuenca
---------
Co-authored-by: Sayak Paul
Co-authored-by: Pedro Cuenca
---
examples/dreambooth/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 086100bd4a36..83073210ac04 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -43,6 +43,8 @@ from accelerate.utils import write_basic_config
write_basic_config()
```
+When running `accelerate config`, setting the torch compile mode to True can give dramatic speedups.
+
### Dog toy example
Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
From e343443565d9dbbba026f563c35f0d4a0515a8d9 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 19 May 2023 07:47:28 +0530
Subject: [PATCH 104/206] add: if entry in the dreambooth training docs.
(#3472)
---
docs/source/en/training/dreambooth.mdx | 64 ++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx
index 38a3adf9c4f1..de93772abedd 100644
--- a/docs/source/en/training/dreambooth.mdx
+++ b/docs/source/en/training/dreambooth.mdx
@@ -496,3 +496,67 @@ image.save("dog-bucket.png")
```
You may also run inference from any of the [saved training checkpoints](#inference-from-a-saved-checkpoint).
+
+## IF
+
+You can also use the LoRA and full Dreambooth scripts to train the text-to-image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). A few alternative CLI flags are needed to account for the model size, the expected input resolution, and the text encoder conventions.
+
+### LoRA Dreambooth
+This training configuration requires ~28 GB VRAM.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_dog_lora"
+
+accelerate launch train_dreambooth_lora.py \
+ --report_to wandb \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a sks dog" \
+ --resolution=64 \ # The input resolution of the IF unet is 64x64
+ --train_batch_size=4 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=5e-6 \
+ --scale_lr \
+ --max_train_steps=1200 \
+ --validation_prompt="a sks dog" \
+ --validation_epochs=25 \
+ --checkpointing_steps=100 \
+  --pre_compute_text_embeddings \ # Pre-compute text embeddings so that the T5 text encoder doesn't have to be kept in memory
+ --tokenizer_max_length=77 \ # IF expects an override of the max token length
+ --text_encoder_use_attention_mask # IF expects attention mask for text embeddings
+```
+
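+After training, the LoRA weights can be loaded back into the stage-I IF pipeline for inference. The snippet below is a minimal sketch rather than an official recipe: it assumes the weights were saved to the `dreambooth_dog_lora` directory used above, that you have accepted the license for `DeepFloyd/IF-I-XL-v1.0`, and that the loaded pipeline supports `load_lora_weights`.
+
+```py
+import torch
+
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
+)
+pipe.load_lora_weights("dreambooth_dog_lora")  # OUTPUT_DIR from the command above
+pipe.to("cuda")
+
+# The stage-I IF pipeline generates 64x64 images
+image = pipe("a sks dog", num_inference_steps=50).images[0]
+image.save("sks_dog.png")
+```
+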
+### Full DreamBooth
+Due to the size of the optimizer states, we recommend training the full XL IF model with 8-bit Adam.
+Using 8-bit Adam and the rest of the following config, the model can be trained in ~48 GB VRAM.
+
+For full DreamBooth, IF requires very low learning rates; with higher learning rates, model quality will degrade.
+
+```sh
+export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
+
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="dreambooth_if"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a photo of sks dog" \
+ --resolution=64 \ # The input resolution of the IF unet is 64x64
+ --train_batch_size=4 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=1e-7 \
+ --max_train_steps=150 \
+ --validation_prompt "a photo of sks dog" \
+ --validation_steps 25 \
+ --text_encoder_use_attention_mask \ # IF expects attention mask for text embeddings
+ --tokenizer_max_length 77 \ # IF expects an override of the max token length
+  --pre_compute_text_embeddings \ # Pre-compute text embeddings so that the T5 text encoder doesn't have to be kept in memory
+  --use_8bit_adam \ # Use 8-bit Adam to reduce the memory needed for the optimizer states
+ --set_grads_to_none \
+ --skip_save_text_encoder # do not save the full T5 text encoder with the model
+```
\ No newline at end of file
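+
+Because `--skip_save_text_encoder` is passed, the saved pipeline does not contain the T5 text encoder, so it has to be supplied at inference time. The snippet below is a minimal sketch under that assumption: it reloads the encoder from the base checkpoint and passes it in when loading the trained pipeline from `dreambooth_if`.
+
+```py
+import torch
+
+from transformers import T5EncoderModel
+from diffusers import DiffusionPipeline
+
+# Reload the text encoder from the base checkpoint, since it was not saved with the model
+text_encoder = T5EncoderModel.from_pretrained(
+    "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", torch_dtype=torch.float16
+)
+
+pipe = DiffusionPipeline.from_pretrained(
+    "dreambooth_if", text_encoder=text_encoder, torch_dtype=torch.float16
+)
+pipe.to("cuda")
+
+image = pipe("a photo of sks dog", num_inference_steps=50).images[0]
+image.save("sks_dog.png")
+```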
From 00c76f6ff19a9667594597c37b4e3da15e9a56db Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Fri, 19 May 2023 09:47:27 -0700
Subject: [PATCH 105/206] [docs] Textual inversion inference (#3473)
* add textual inversion inference to docs
* add to toctree
---------
Co-authored-by: Sayak Paul
---
docs/source/en/_toctree.yml | 2 +
.../textual_inversion_inference.mdx | 80 +++++++++++++++++++
2 files changed, 82 insertions(+)
create mode 100644 docs/source/en/using-diffusers/textual_inversion_inference.mdx
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 645cbb04c1d0..926a3ea716e8 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -44,6 +44,8 @@
title: Text-guided image-inpainting
- local: using-diffusers/depth2img
title: Text-guided depth-to-image
+ - local: using-diffusers/textual_inversion_inference
+ title: Textual inversion
- local: using-diffusers/reusing_seeds
title: Improve image quality with deterministic generation
- local: using-diffusers/reproducibility
diff --git a/docs/source/en/using-diffusers/textual_inversion_inference.mdx b/docs/source/en/using-diffusers/textual_inversion_inference.mdx
new file mode 100644
index 000000000000..9eca3e7e465c
--- /dev/null
+++ b/docs/source/en/using-diffusers/textual_inversion_inference.mdx
@@ -0,0 +1,80 @@
+# Textual inversion
+
+[[open-in-colab]]
+
+The [`StableDiffusionPipeline`] supports textual inversion, a technique that enables a model like Stable Diffusion to learn a new concept from just a few sample images. This gives you more control over the generated images and allows you to tailor the model towards specific concepts. You can get started quickly with a collection of community created concepts in the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer).
+
+This guide will show you how to run inference with textual inversion using a pre-learned concept from the Stable Diffusion Conceptualizer. If you're interested in teaching a model new concepts with textual inversion, take a look at the [Textual Inversion](./training/text_inversion) training guide.
+
+Log in to your Hugging Face account:
+
+```py
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+Import the necessary libraries, and create a helper function to visualize the generated images:
+
+```py
+import torch
+
+from PIL import Image
+
+from diffusers import StableDiffusionPipeline
+
+
+def image_grid(imgs, rows, cols):
+ assert len(imgs) == rows * cols
+
+ w, h = imgs[0].size
+ grid = Image.new("RGB", size=(cols * w, rows * h))
+ grid_w, grid_h = grid.size
+
+ for i, img in enumerate(imgs):
+ grid.paste(img, box=(i % cols * w, i // cols * h))
+ return grid
+```
+
+Pick a Stable Diffusion checkpoint and a pre-learned concept from the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer):
+
+```py
+pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"
+repo_id_embeds = "sd-concepts-library/cat-toy"
+```
+
+Now you can load a pipeline, and pass the pre-learned concept to it:
+
+```py
+pipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16).to("cuda")
+
+pipeline.load_textual_inversion(repo_id_embeds)
+```
+
+Create a prompt with the pre-learned concept by using the special placeholder token `<cat-toy>`, and choose the number of samples and rows of images you'd like to generate:
+
+```py
+prompt = "a grafitti in a favela wall with a <cat-toy> on it"
+
+num_samples = 2
+num_rows = 2
+```
+
+Then run the pipeline (feel free to adjust the parameters like `num_inference_steps` and `guidance_scale` to see how they affect image quality), save the generated images and visualize them with the helper function you created at the beginning:
+
+```py
+all_images = []
+for _ in range(num_rows):
+    images = pipeline(prompt, num_images_per_prompt=num_samples, num_inference_steps=50, guidance_scale=7.5).images
+ all_images.extend(images)
+
+grid = image_grid(all_images, num_rows, num_samples)
+grid
+```
+
From e589bdb956c9be33fc73e1d4614d8d1c1ad95544 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Fri, 19 May 2023 10:07:33 -0700
Subject: [PATCH 106/206] [docs] Distributed inference (#3376)
* distributed inference
* move to inference section
* apply feedback
* update with split_between_processes
* apply feedback
---
docs/source/en/_toctree.yml | 2 +
.../en/training/distributed_inference.mdx | 91 +++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 docs/source/en/training/distributed_inference.mdx
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 926a3ea716e8..aa2d907da4bd 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -46,6 +46,8 @@
title: Text-guided depth-to-image
- local: using-diffusers/textual_inversion_inference
title: Textual inversion
+ - local: training/distributed_inference
+ title: Distributed inference with multiple GPUs
- local: using-diffusers/reusing_seeds
title: Improve image quality with deterministic generation
- local: using-diffusers/reproducibility
diff --git a/docs/source/en/training/distributed_inference.mdx b/docs/source/en/training/distributed_inference.mdx
new file mode 100644
index 000000000000..e85b3f11e238
--- /dev/null
+++ b/docs/source/en/training/distributed_inference.mdx
@@ -0,0 +1,91 @@
+# Distributed inference with multiple GPUs
+
+On distributed setups, you can run inference across multiple GPUs with 🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) or [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html), which is useful for generating with multiple prompts in parallel.
+
+This guide will show you how to use 🤗 Accelerate and PyTorch Distributed for distributed inference.
+
+## 🤗 Accelerate
+
+🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) is a library designed to make it easy to train or run inference across distributed setups. It simplifies the process of setting up the distributed environment, allowing you to focus on your PyTorch code.
+
+To begin, create a Python file and initialize an [`accelerate.PartialState`] to create a distributed environment; your setup is automatically detected so you don't need to explicitly define the `rank` or `world_size`. Move the [`DiffusionPipeline`] to `distributed_state.device` to assign a GPU to each process.
+
+Now use the [`~accelerate.PartialState.split_between_processes`] utility as a context manager to automatically distribute the prompts between the number of processes.
+
+```py
+import torch
+
+from accelerate import PartialState
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+distributed_state = PartialState()
+pipeline.to(distributed_state.device)
+
+with distributed_state.split_between_processes(["a dog", "a cat"]) as prompt:
+ result = pipeline(prompt).images[0]
+ result.save(f"result_{distributed_state.process_index}.png")
+```
+
+Use the `--num_processes` argument to specify the number of GPUs to use, and call `accelerate launch` to run the script:
+
+```bash
+accelerate launch --num_processes=2 run_distributed.py
+```
+
+<Tip>
+
+To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
+
+</Tip>
+
+## PyTorch Distributed
+
+PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html), which enables data parallelism.
+
+To start, create a Python file and import `torch.distributed` and `torch.multiprocessing` to set up the distributed process group and to spawn the processes for inference on each GPU. You should also initialize a [`DiffusionPipeline`]:
+
+```py
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from diffusers import DiffusionPipeline
+
+sd = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+```
+
+You'll want to create a function to run inference; [`init_process_group`](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group) handles creating a distributed environment with the type of backend to use, the `rank` of the current process, and the `world_size` or the number of processes participating. If you're running inference in parallel over 2 GPUs, then the `world_size` is 2.
+
+Move the [`DiffusionPipeline`] to `rank` and use `get_rank` to assign a GPU to each process, where each process handles a different prompt:
+
+```py
+def run_inference(rank, world_size):
+ dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+ sd.to(rank)
+
+ if torch.distributed.get_rank() == 0:
+ prompt = "a dog"
+ elif torch.distributed.get_rank() == 1:
+ prompt = "a cat"
+
+ image = sd(prompt).images[0]
+    image.save(f"./{prompt.replace(' ', '_')}.png")
+```
+
+To run the distributed inference, call [`mp.spawn`](https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn) to run the `run_inference` function on the number of GPUs defined in `world_size`:
+
+```py
+def main():
+ world_size = 2
+ mp.spawn(run_inference, args=(world_size,), nprocs=world_size, join=True)
+
+
+if __name__ == "__main__":
+ main()
+```
+
+Once you've completed the inference script, use the `--nproc_per_node` argument to specify the number of GPUs to use and call `torchrun` to run the script:
+
+```bash
+torchrun --nproc_per_node=2 run_distributed.py
+```
\ No newline at end of file
From 85eff637aad1106f593d7535ec41cdb736b0b2ea Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Fri, 19 May 2023 10:45:56 -0700
Subject: [PATCH 107/206] [{Up,Down}sample1d] explicit view kernel size as
number elements in flattened indices (#3479)
explicit view kernel size as number elements in flattened indices
---
src/diffusers/models/unet_1d_blocks.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py
index a0f0e58f9103..934a4a4a7dcb 100644
--- a/src/diffusers/models/unet_1d_blocks.py
+++ b/src/diffusers/models/unet_1d_blocks.py
@@ -300,7 +300,8 @@ def forward(self, hidden_states):
hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
- weight[indices, indices] = self.kernel.to(weight)
+ kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1)
+ weight[indices, indices] = kernel
return F.conv1d(hidden_states, weight, stride=2)
@@ -316,7 +317,8 @@ def forward(self, hidden_states, temb=None):
hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
- weight[indices, indices] = self.kernel.to(weight)
+ kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1)
+ weight[indices, indices] = kernel
return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
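For context, an editorial sketch (not part of the patch): both formulations write the same values into `weight`; the new code only makes the kernel's expanded shape explicit instead of relying on implicit broadcasting. A quick self-contained check:

```py
import torch

channels, kernel_size = 4, 3
kernel = torch.tensor([1.0, 2.0, 1.0])
weight = torch.zeros(channels, channels, kernel_size)
indices = torch.arange(channels)

# Old behavior: implicitly broadcast the 1-D kernel over the selected rows
w_old = weight.clone()
w_old[indices, indices] = kernel

# New behavior: explicitly expand the kernel to (channels, kernel_size) first
w_new = weight.clone()
w_new[indices, indices] = kernel[None, :].expand(channels, -1)

assert torch.equal(w_old, w_new)
```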
From f7b4f51cc2a423c96cb2a4c2282e55feba0be506 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Sat, 20 May 2023 13:43:07 +0200
Subject: [PATCH 108/206] mps & onnx tests rework (#3449)
* Remove ONNX tests from PR.
They are already a part of push_tests.yml.
* Remove mps tests from PRs.
They are already performed on push.
* Fix workflow name for fast push tests.
* Extract mps tests to a workflow.
For better control/filtering.
* Remove --extra-index-url from mps tests
* Increase tolerance of mps test
This test passes in my Mac (Ventura 13.3) but fails in the CI hardware
(Ventura 13.2). I ran the local tests following the same steps that
exist in the CI workflow.
* Temporarily run mps tests on pr
So we can test.
* Revert "Temporarily run mps tests on pr"
Tests passed, go back to running on push.
---
.github/workflows/pr_tests.yml | 66 -------------------------
.github/workflows/push_tests_fast.yml | 55 +--------------------
.github/workflows/push_tests_mps.yml | 68 ++++++++++++++++++++++++++
tests/schedulers/test_scheduler_lms.py | 2 +-
4 files changed, 70 insertions(+), 121 deletions(-)
create mode 100644 .github/workflows/push_tests_mps.yml
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 23a7659166c0..162b1ba83d66 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -36,11 +36,6 @@ jobs:
runner: docker-cpu
image: diffusers/diffusers-flax-cpu
report: flax_cpu
- - name: Fast ONNXRuntime CPU tests
- framework: onnxruntime
- runner: docker-cpu
- image: diffusers/diffusers-onnxruntime-cpu
- report: onnx_cpu
- name: PyTorch Example CPU tests
framework: pytorch_examples
runner: docker-cpu
@@ -98,14 +93,6 @@ jobs:
--make-reports=tests_${{ matrix.config.report }} \
tests
- - name: Run fast ONNXRuntime CPU tests
- if: ${{ matrix.config.framework == 'onnxruntime' }}
- run: |
- python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
- -s -v -k "Onnx" \
- --make-reports=tests_${{ matrix.config.report }} \
- tests/
-
- name: Run example PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch_examples' }}
run: |
@@ -123,56 +110,3 @@ jobs:
with:
name: pr_${{ matrix.config.report }}_test_reports
path: reports
-
- run_fast_tests_apple_m1:
- name: Fast PyTorch MPS tests on MacOS
- runs-on: [ self-hosted, apple-m1 ]
-
- steps:
- - name: Checkout diffusers
- uses: actions/checkout@v3
- with:
- fetch-depth: 2
-
- - name: Clean checkout
- shell: arch -arch arm64 bash {0}
- run: |
- git clean -fxd
-
- - name: Setup miniconda
- uses: ./.github/actions/setup-miniconda
- with:
- python-version: 3.9
-
- - name: Install dependencies
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python -m pip install --upgrade pip
- ${CONDA_RUN} python -m pip install -e .[quality,test]
- ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- ${CONDA_RUN} python -m pip install accelerate --upgrade
- ${CONDA_RUN} python -m pip install transformers --upgrade
-
- - name: Environment
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python utils/print_env.py
-
- - name: Run fast PyTorch tests on M1 (MPS)
- shell: arch -arch arm64 bash {0}
- env:
- HF_HOME: /System/Volumes/Data/mnt/cache
- HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- run: |
- ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
-
- - name: Failure short reports
- if: ${{ failure() }}
- run: cat reports/tests_torch_mps_failures_short.txt
-
- - name: Test suite reports artifacts
- if: ${{ always() }}
- uses: actions/upload-artifact@v2
- with:
- name: pr_torch_mps_test_reports
- path: reports
diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml
index 50ef729161d3..adf4fc8a87bc 100644
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -1,4 +1,4 @@
-name: Slow tests on main
+name: Fast tests on main
on:
push:
@@ -108,56 +108,3 @@ jobs:
with:
name: pr_${{ matrix.config.report }}_test_reports
path: reports
-
- run_fast_tests_apple_m1:
- name: Fast PyTorch MPS tests on MacOS
- runs-on: [ self-hosted, apple-m1 ]
-
- steps:
- - name: Checkout diffusers
- uses: actions/checkout@v3
- with:
- fetch-depth: 2
-
- - name: Clean checkout
- shell: arch -arch arm64 bash {0}
- run: |
- git clean -fxd
-
- - name: Setup miniconda
- uses: ./.github/actions/setup-miniconda
- with:
- python-version: 3.9
-
- - name: Install dependencies
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python -m pip install --upgrade pip
- ${CONDA_RUN} python -m pip install -e .[quality,test]
- ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- ${CONDA_RUN} python -m pip install accelerate --upgrade
- ${CONDA_RUN} python -m pip install transformers --upgrade
-
- - name: Environment
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python utils/print_env.py
-
- - name: Run fast PyTorch tests on M1 (MPS)
- shell: arch -arch arm64 bash {0}
- env:
- HF_HOME: /System/Volumes/Data/mnt/cache
- HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- run: |
- ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
-
- - name: Failure short reports
- if: ${{ failure() }}
- run: cat reports/tests_torch_mps_failures_short.txt
-
- - name: Test suite reports artifacts
- if: ${{ always() }}
- uses: actions/upload-artifact@v2
- with:
- name: pr_torch_mps_test_reports
- path: reports
diff --git a/.github/workflows/push_tests_mps.yml b/.github/workflows/push_tests_mps.yml
new file mode 100644
index 000000000000..6b95815f1ea5
--- /dev/null
+++ b/.github/workflows/push_tests_mps.yml
@@ -0,0 +1,68 @@
+name: Fast mps tests on main
+
+on:
+ push:
+ branches:
+ - main
+
+env:
+ DIFFUSERS_IS_CI: yes
+ HF_HOME: /mnt/cache
+ OMP_NUM_THREADS: 8
+ MKL_NUM_THREADS: 8
+ PYTEST_TIMEOUT: 600
+ RUN_SLOW: no
+
+jobs:
+ run_fast_tests_apple_m1:
+ name: Fast PyTorch MPS tests on MacOS
+ runs-on: [ self-hosted, apple-m1 ]
+
+ steps:
+ - name: Checkout diffusers
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 2
+
+ - name: Clean checkout
+ shell: arch -arch arm64 bash {0}
+ run: |
+ git clean -fxd
+
+ - name: Setup miniconda
+ uses: ./.github/actions/setup-miniconda
+ with:
+ python-version: 3.9
+
+ - name: Install dependencies
+ shell: arch -arch arm64 bash {0}
+ run: |
+ ${CONDA_RUN} python -m pip install --upgrade pip
+ ${CONDA_RUN} python -m pip install -e .[quality,test]
+ ${CONDA_RUN} python -m pip install torch torchvision torchaudio
+ ${CONDA_RUN} python -m pip install accelerate --upgrade
+ ${CONDA_RUN} python -m pip install transformers --upgrade
+
+ - name: Environment
+ shell: arch -arch arm64 bash {0}
+ run: |
+ ${CONDA_RUN} python utils/print_env.py
+
+ - name: Run fast PyTorch tests on M1 (MPS)
+ shell: arch -arch arm64 bash {0}
+ env:
+ HF_HOME: /System/Volumes/Data/mnt/cache
+ HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+ run: |
+ ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
+
+ - name: Failure short reports
+ if: ${{ failure() }}
+ run: cat reports/tests_torch_mps_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: pr_torch_mps_test_reports
+ path: reports
diff --git a/tests/schedulers/test_scheduler_lms.py b/tests/schedulers/test_scheduler_lms.py
index 3f31f9696de2..2682886a788d 100644
--- a/tests/schedulers/test_scheduler_lms.py
+++ b/tests/schedulers/test_scheduler_lms.py
@@ -136,5 +136,5 @@ def test_full_loop_device_karras_sigmas(self):
result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))
- assert abs(result_sum.item() - 3812.9927) < 1e-2
+ assert abs(result_sum.item() - 3812.9927) < 2e-2
assert abs(result_mean.item() - 4.9648) < 1e-3
From 4bbc51d94d08a0c74cb28a036e120a32b5237b9a Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sun, 21 May 2023 15:26:47 +0530
Subject: [PATCH 109/206] [Attention processor] Better warning message when
shifting to `AttnProcessor2_0` (#3457)
* add: debugging to enabling memory efficient processing
* add: better warning message.
---
src/diffusers/models/attention_processor.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index a489814c4787..86997632cac1 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -191,7 +191,10 @@ def set_use_memory_efficient_attention_xformers(
elif hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
warnings.warn(
"You have specified using flash attention using xFormers but you have PyTorch 2.0 already installed. "
- "We will default to PyTorch's native efficient flash attention implementation provided by PyTorch 2.0."
+ "We will default to PyTorch's native efficient flash attention implementation (`F.scaled_dot_product_attention`) "
+ "introduced in PyTorch 2.0. In case you are using LoRA or Custom Diffusion, we will fall "
+ "back to their respective attention processors i.e., we will NOT use the PyTorch 2.0 "
+ "native efficient flash attention."
)
else:
try:
@@ -213,6 +216,9 @@ def set_use_memory_efficient_attention_xformers(
)
processor.load_state_dict(self.processor.state_dict())
processor.to(self.processor.to_q_lora.up.weight.device)
+ print(
+ f"is_lora is set to {is_lora}, type: LoRAXFormersAttnProcessor: {isinstance(processor, LoRAXFormersAttnProcessor)}"
+ )
elif is_custom_diffusion:
processor = CustomDiffusionXFormersAttnProcessor(
train_kv=self.processor.train_kv,
@@ -250,6 +256,7 @@ def set_use_memory_efficient_attention_xformers(
# We use the AttnProcessor2_0 by default when torch 2.x is used which uses
# torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
# but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ print("Still defaulting to: AttnProcessor2_0 :O")
processor = (
AttnProcessor2_0()
if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
From 49ad61c2045a3278ea0b6648546c0824e9d89c0f Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sun, 21 May 2023 15:26:56 +0530
Subject: [PATCH 110/206] [Docs] add note on local directory path. (#3397)
add note on local directory path.
Co-authored-by: Patrick von Platen
---
docs/source/en/training/lora.mdx | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx
index 04eff7af11f8..748d99d5020d 100644
--- a/docs/source/en/training/lora.mdx
+++ b/docs/source/en/training/lora.mdx
@@ -146,6 +146,7 @@ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.
+
## DreamBooth
[DreamBooth](https://arxiv.org/abs/2208.12242) is a finetuning technique for personalizing a text-to-image model like Stable Diffusion to generate photorealistic images of a subject in different contexts, given a few images of the subject. However, DreamBooth is very sensitive to hyperparameters and it is easy to overfit. Some important hyperparameters to consider include those that affect the training time (learning rate, number of training steps), and inference time (number of steps, scheduler type).
@@ -268,4 +269,7 @@ Note that the use of [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] is
pipe.load_lora_weights(lora_model_path)
```
-* LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).
\ No newline at end of file
+* LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).
+
+**Note** that it is possible to provide a local directory path to [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] as well as [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`]. To learn about the supported inputs,
+refer to the respective docstrings.
\ No newline at end of file
From 51843fd7d043428b5ef3bb77cc683e5339b2d95e Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 12:15:11 +0200
Subject: [PATCH 111/206] Refactor full determinism (#3485)
* up
* fix more
* Apply suggestions from code review
* fix more
* fix more
* Check it
* Remove 16:8
* fix more
* fix more
* fix more
* up
* up
* Test only stable diffusion
* Test only two files
* up
* Try out spinning up processes that can be killed
* up
* Apply suggestions from code review
* up
* up
---
src/diffusers/training_utils.py | 23 +-----------------
src/diffusers/utils/testing_utils.py | 18 ++++++++++++++
tests/models/test_layers_utils.py | 3 ---
tests/models/test_models_unet_1d.py | 3 ---
tests/models/test_models_unet_2d.py | 5 ++--
tests/models/test_models_unet_2d_condition.py | 5 ++--
tests/models/test_models_unet_3d_condition.py | 5 ++--
tests/models/test_models_vae.py | 4 ++--
tests/models/test_models_vq.py | 4 ++--
tests/others/test_ema.py | 5 ++--
.../altdiffusion/test_alt_diffusion.py | 5 ++--
.../test_alt_diffusion_img2img.py | 5 ++--
.../audio_diffusion/test_audio_diffusion.py | 5 ++--
tests/pipelines/audioldm/test_audioldm.py | 4 ++--
tests/pipelines/controlnet/test_controlnet.py | 5 ++--
.../controlnet/test_controlnet_img2img.py | 5 ++--
.../controlnet/test_controlnet_inpaint.py | 5 ++--
.../dance_diffusion/test_dance_diffusion.py | 4 ++--
tests/pipelines/ddim/test_ddim.py | 4 ++--
tests/pipelines/ddpm/test_ddpm.py | 4 ++--
tests/pipelines/dit/test_dit.py | 4 ++--
tests/pipelines/karras_ve/test_karras_ve.py | 4 ++--
.../latent_diffusion/test_latent_diffusion.py | 11 +++++++--
.../test_latent_diffusion_superresolution.py | 4 ++--
.../test_latent_diffusion_uncond.py | 4 ++--
.../paint_by_example/test_paint_by_example.py | 5 ++--
tests/pipelines/pndm/test_pndm.py | 4 ++--
tests/pipelines/repaint/test_repaint.py | 13 +++++++---
.../score_sde_ve/test_score_sde_ve.py | 4 ++--
.../test_semantic_diffusion.py | 4 ++--
.../test_spectrogram_diffusion.py | 4 ++--
.../stable_diffusion/test_cycle_diffusion.py | 5 ++--
.../stable_diffusion/test_stable_diffusion.py | 24 +++++++++++++++----
.../test_stable_diffusion_image_variation.py | 5 ++--
.../test_stable_diffusion_img2img.py | 5 ++--
.../test_stable_diffusion_inpaint.py | 5 ++--
.../test_stable_diffusion_inpaint_legacy.py | 5 ++--
...st_stable_diffusion_instruction_pix2pix.py | 5 ++--
.../test_stable_diffusion_k_diffusion.py | 4 ++--
.../test_stable_diffusion_model_editing.py | 5 ++--
.../test_stable_diffusion_panorama.py | 5 ++--
.../test_stable_diffusion_pix2pix_zero.py | 5 ++--
.../test_stable_diffusion_sag.py | 5 ++--
.../test_stable_diffusion.py | 5 ++--
.../test_stable_diffusion_depth.py | 5 ++--
.../test_stable_diffusion_diffedit.py | 5 ++--
.../test_stable_diffusion_inpaint.py | 5 ++--
.../test_stable_diffusion_latent_upscale.py | 4 ++--
.../test_stable_diffusion_upscale.py | 4 ++--
.../test_stable_diffusion_v_pred.py | 5 ++--
.../test_safe_diffusion.py | 3 ---
.../stable_unclip/test_stable_unclip.py | 5 ++--
.../test_stable_unclip_img2img.py | 4 ++--
tests/pipelines/test_pipelines.py | 6 ++---
tests/pipelines/test_pipelines_common.py | 3 ---
.../text_to_video/test_text_to_video.py | 4 ++--
tests/pipelines/unclip/test_unclip.py | 5 ++--
.../unclip/test_unclip_image_variation.py | 5 ++--
58 files changed, 158 insertions(+), 170 deletions(-)
diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py
index 1a3abb49a065..df9c7e882682 100644
--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -1,7 +1,6 @@
import contextlib
import copy
-import os
import random
from typing import Any, Dict, Iterable, Optional, Union
import numpy as np
@@ -14,26 +13,6 @@
import transformers
-def enable_full_determinism(seed: int):
- """
- Helper function for reproducible behavior during distributed training. See
- - https://pytorch.org/docs/stable/notes/randomness.html for pytorch
- """
- # set seed first
- set_seed(seed)
-
- # Enable PyTorch deterministic mode. This potentially requires either the environment
- # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
- # depending on the CUDA version, so we set them both here
- os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
- torch.use_deterministic_algorithms(True)
-
- # Enable CUDNN deterministic mode
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
-
-
def set_seed(seed: int):
"""
Args:
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 4ad7d97b4462..93d0ef5b7b5f 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -514,3 +514,21 @@ def __exit__(self, *exc):
def __repr__(self):
return f"captured: {self.out}\n"
+
+
+def enable_full_determinism():
+ """
+    Helper function for reproducible behavior in tests. See
+    https://pytorch.org/docs/stable/notes/randomness.html for details.
+ """
+ # Enable PyTorch deterministic mode. This potentially requires either the environment
+ # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
+ # depending on the CUDA version, so we set them both here
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+ torch.use_deterministic_algorithms(True)
+
+ # Enable CUDNN deterministic mode
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+ torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/models/test_layers_utils.py b/tests/models/test_layers_utils.py
index 98fa1afcbb9d..b438b2ddb4af 100644
--- a/tests/models/test_layers_utils.py
+++ b/tests/models/test_layers_utils.py
@@ -27,9 +27,6 @@
from diffusers.utils import torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
class EmbeddingsTests(unittest.TestCase):
def test_timestep_embeddings(self):
embedding_dim = 256
diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
index 78f759cb1a24..9fb1a61011e3 100644
--- a/tests/models/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -23,9 +23,6 @@
from .test_modeling_common import ModelTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
class UNet1DModelTests(ModelTesterMixin, unittest.TestCase):
model_class = UNet1DModel
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 8f9a6b813f19..92a5664daa2b 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,13 +21,14 @@
from diffusers import UNet2DModel
from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism
from .test_modeling_common import ModelTesterMixin
logger = logging.get_logger(__name__)
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+
+enable_full_determinism()
class Unet2DModelTests(ModelTesterMixin, unittest.TestCase):
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index d3ca5ea3048e..43a487a32b43 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -33,13 +33,14 @@
torch_device,
)
from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism
from .test_modeling_common import ModelTesterMixin
logger = logging.get_logger(__name__)
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+
+enable_full_determinism()
def create_lora_layers(model, mock_weights: bool = True):
diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py
index 08863adfeaac..928f6bcbe960 100644
--- a/tests/models/test_models_unet_3d_condition.py
+++ b/tests/models/test_models_unet_3d_condition.py
@@ -29,13 +29,14 @@
torch_device,
)
from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism
from .test_modeling_common import ModelTesterMixin
+enable_full_determinism()
+
logger = logging.get_logger(__name__)
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
def create_lora_layers(model, mock_weights: bool = True):
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 9a3e49cdfbc0..fe27e138f5fa 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -22,12 +22,12 @@
from diffusers import AutoencoderKL
from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device
from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.testing_utils import enable_full_determinism
from .test_modeling_common import ModelTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase):
diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py
index f0be6f6a6d64..8ea6ef77ce63 100644
--- a/tests/models/test_models_vq.py
+++ b/tests/models/test_models_vq.py
@@ -19,12 +19,12 @@
from diffusers import VQModel
from diffusers.utils import floats_tensor, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism
from .test_modeling_common import ModelTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class VQModelTests(ModelTesterMixin, unittest.TestCase):
diff --git a/tests/others/test_ema.py b/tests/others/test_ema.py
index 5526aadc4757..32f7ae8a9a8e 100644
--- a/tests/others/test_ema.py
+++ b/tests/others/test_ema.py
@@ -20,11 +20,10 @@
from diffusers import UNet2DConditionModel
from diffusers.training_utils import EMAModel
-from diffusers.utils.testing_utils import skip_mps, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class EMAModelTests(unittest.TestCase):
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index 9237f7435b95..6842d29dc6c0 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -26,14 +26,13 @@
RobertaSeriesModelWithTransformation,
)
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
index 35a4e91284cd..61457e6ca01f 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -33,11 +33,10 @@
RobertaSeriesModelWithTransformation,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py
index a848bd031797..8c20f011cb86 100644
--- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py
+++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py
@@ -30,11 +30,10 @@
UNet2DModel,
)
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class PipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py
index 566b2c2d2cd0..0825fc36a266 100644
--- a/tests/pipelines/audioldm/test_audioldm.py
+++ b/tests/pipelines/audioldm/test_audioldm.py
@@ -37,13 +37,13 @@
UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism
from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
index 0453bb38e1ee..97b5e20f3c14 100644
--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -32,7 +32,7 @@
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import (
TEXT_TO_IMAGE_BATCH_PARAMS,
@@ -41,8 +41,7 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py
index b83a8af2778b..9d3b10aa8283 100644
--- a/tests/pipelines/controlnet/test_controlnet_img2img.py
+++ b/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -35,7 +35,7 @@
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import (
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
@@ -44,8 +44,7 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class ControlNetImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py
index 786b0e608ef0..155286630c04 100644
--- a/tests/pipelines/controlnet/test_controlnet_inpaint.py
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py
@@ -35,7 +35,7 @@
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import (
TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
@@ -44,8 +44,7 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class ControlNetInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 361839043c9f..0ba86daa61fc 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -21,13 +21,13 @@
from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index e997ae45d975..0861d7daab29 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -19,13 +19,13 @@
import torch
from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index 5e3e47cb74fb..a3c290215114 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -19,10 +19,10 @@
import torch
from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class DDPMPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/dit/test_dit.py b/tests/pipelines/dit/test_dit.py
index d8098178f339..4937915696b4 100644
--- a/tests/pipelines/dit/test_dit.py
+++ b/tests/pipelines/dit/test_dit.py
@@ -21,7 +21,7 @@
from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel
from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import (
CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
@@ -30,7 +30,7 @@
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/karras_ve/test_karras_ve.py b/tests/pipelines/karras_ve/test_karras_ve.py
index 391e61a2b9c9..142058bcd710 100644
--- a/tests/pipelines/karras_ve/test_karras_ve.py
+++ b/tests/pipelines/karras_ve/test_karras_ve.py
@@ -19,10 +19,10 @@
import torch
from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class KarrasVePipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
index 05ff4162e5c6..88dc8ef9b17b 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -21,13 +21,20 @@
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
-from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ load_numpy,
+ nightly,
+ require_torch_gpu,
+ slow,
+ torch_device,
+)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index f1aa2f08efba..d21ead543af8 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -21,10 +21,10 @@
from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import require_torch
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class LDMSuperResolutionPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
index aa7b33730d18..ff8670ea2950 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
@@ -20,10 +20,10 @@
from transformers import CLIPTextConfig, CLIPTextModel
from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class LDMPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py
index 80ba3f5ed37f..14c16644889e 100644
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -25,14 +25,13 @@
from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
from diffusers.utils import floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/pndm/test_pndm.py b/tests/pipelines/pndm/test_pndm.py
index bed5fea561dc..c2595713933c 100644
--- a/tests/pipelines/pndm/test_pndm.py
+++ b/tests/pipelines/pndm/test_pndm.py
@@ -19,10 +19,10 @@
import torch
from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class PNDMPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py
index 59968eaf101c..e372cf979ebb 100644
--- a/tests/pipelines/repaint/test_repaint.py
+++ b/tests/pipelines/repaint/test_repaint.py
@@ -20,14 +20,21 @@
import torch
from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel
-from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ load_image,
+ load_numpy,
+ nightly,
+ require_torch_gpu,
+ skip_mps,
+ torch_device,
+)
from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
index 036ecc3f6bf3..32505253f6c7 100644
--- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py
+++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -19,10 +19,10 @@
import torch
from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class ScoreSdeVeipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
index ba42b1fe9c5f..9e810616dc56 100644
--- a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
+++ b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
@@ -25,10 +25,10 @@
from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
from diffusers.utils import floats_tensor, nightly, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class SafeDiffusionPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
index 3ec6f681be79..cc8690eb87ca 100644
--- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
+++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py
@@ -22,13 +22,13 @@
from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline
from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder
from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device
-from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime
+from diffusers.utils.testing_utils import enable_full_determinism, require_note_seq, require_onnxruntime
from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
MIDI_FILE = "./tests/fixtures/elise_format0.mid"
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 3d6bfff1bbd1..a1ae3d2d0e7c 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -23,14 +23,13 @@
from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 1f52a09b672b..aec4436710b9 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -15,11 +15,16 @@
import gc
+import os
+import signal
+import subprocess
+import sys
import tempfile
import time
import unittest
import numpy as np
+import pytest
import torch
from huggingface_hub import hf_hub_download
from packaging import version
@@ -39,15 +44,25 @@
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
+from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu
from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+@pytest.fixture(autouse=True)
+def process_fixture():
+ # This will be run before each test
+ command = [sys.executable, os.path.abspath(__file__)]
+ process = subprocess.Popen(command)
+ enable_full_determinism()
+ yield process
+ # This will be run after each test
+ try:
+ os.kill(process.pid, signal.SIGTERM) # or signal.SIGKILL
+ except ProcessLookupError:
+ pass
class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -551,8 +566,7 @@ def test_inference_batch_single_identical(self):
@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
- def tearDown(self):
- super().tearDown()
+ def setUp(self):
gc.collect()
torch.cuda.empty_cache()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 0ce55ae78ae0..c35d84de9802 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -30,14 +30,13 @@
UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionImageVariationPipelineFastTests(
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 4afc16d9b65f..8ab252b9be80 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -34,7 +34,7 @@
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
@@ -44,8 +44,7 @@
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 5c2d9d7c44f7..44de277ead07 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -33,15 +33,14 @@
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 8647041fbb6f..fa00a0d201af 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -34,11 +34,10 @@
VQModel,
)
from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device
-from diffusers.utils.testing_utils import load_numpy, preprocess_image, require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, preprocess_image, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 99a069493885..fbff6c554967 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -32,14 +32,13 @@
UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionInstructPix2PixPipelineFastTests(
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py
index 546b1d21252c..4eccb871a0cb 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py
@@ -21,10 +21,10 @@
from diffusers import StableDiffusionKDiffusionPipeline
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
@slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
index b448dbef1ebe..cba20417bca0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
@@ -29,14 +29,13 @@
UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
@skip_mps
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 61708b36bfee..02a15b2a29dc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -30,14 +30,13 @@
UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
@skip_mps
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
index 90cc85646462..98f5910ab313 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
@@ -33,14 +33,13 @@
UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, load_image, load_pt, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
@skip_mps
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
index 7cb8ab409a9b..2b0f0bfc11a6 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -27,14 +27,13 @@
UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index bc4ab7d66431..3f9867783b33 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -33,14 +33,13 @@
logging,
)
from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
+from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index ae1eefa68242..08ac29868971 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -49,14 +49,13 @@
slow,
torch_device,
)
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
@skip_mps
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index c9da7b06893f..8df5b6da846c 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -33,14 +33,13 @@
UNet2DConditionModel,
)
from diffusers.utils import load_image, slow
-from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusionDiffEditPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index 77242add93e9..10d8561f0126 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -24,14 +24,13 @@
from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, slow
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 539b4b1cc350..561536a44ea0 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -29,13 +29,13 @@
UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
index 747809a4fb2e..7100e5023a5d 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -24,10 +24,10 @@
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
+enable_full_determinism()
class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index a874cbb7e0c5..d1a2c856659f 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -30,11 +30,10 @@
UNet2DConditionModel,
)
from diffusers.utils import load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
index c614fa48055e..09e31aacfbc9 100644
--- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
+++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -28,9 +28,6 @@
from diffusers.utils.testing_utils import require_torch_gpu
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
class SafeDiffusionPipelineFastTests(unittest.TestCase):
def tearDown(self):
# clean up the VRAM after each test
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py
index 78775a938b5b..8b4a065cd4bf 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -13,14 +13,13 @@
UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
-from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, require_torch_gpu, slow, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index dcd4300b85c1..35cae61242c4 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -18,6 +18,7 @@
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
+ enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
@@ -35,8 +36,7 @@
)
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index df1a3b6ac7bb..a9abb0b4fb62 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -65,6 +65,7 @@
)
from diffusers.utils.testing_utils import (
CaptureLogger,
+ enable_full_determinism,
get_tests_dir,
load_numpy,
require_compel,
@@ -73,8 +74,7 @@
)
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class DownloadTests(unittest.TestCase):
@@ -700,7 +700,6 @@ def test_local_custom_pipeline_file(self):
def test_download_from_git(self):
# Because adaptive_avg_pool2d_backward_cuda
# does not have a deterministic implementation.
- torch.use_deterministic_algorithms(False)
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
@@ -722,7 +721,6 @@ def test_download_from_git(self):
image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
assert image.shape == (512, 512, 3)
- torch.use_deterministic_algorithms(True)
def test_save_pipeline_change_config(self):
pipe = DiffusionPipeline.from_pretrained(
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index f23e850f4d54..3984ed76edce 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -18,9 +18,6 @@
from diffusers.utils.testing_utils import require_torch, torch_device
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
def to_np(tensor):
if isinstance(tensor, torch.Tensor):
tensor = tensor.detach().cpu().numpy()
diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py
index 212becbb6729..8b4bae2275e5 100644
--- a/tests/pipelines/text_to_video/test_text_to_video.py
+++ b/tests/pipelines/text_to_video/test_text_to_video.py
@@ -27,13 +27,13 @@
UNet3DConditionModel,
)
from diffusers.utils import load_numpy, skip_mps, slow
+from diffusers.utils.testing_utils import enable_full_determinism
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
@skip_mps
diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py
index 5357e5b0e7ef..393c3ba1635d 100644
--- a/tests/pipelines/unclip/test_unclip.py
+++ b/tests/pipelines/unclip/test_unclip.py
@@ -23,14 +23,13 @@
from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py
index ded162102dd6..75a26250807b 100644
--- a/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -37,14 +37,13 @@
)
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, skip_mps
from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
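The repeated two-line determinism setup above is folded into a single `enable_full_determinism()` call imported from `diffusers.utils.testing_utils`. A minimal sketch of what such a helper plausibly bundles; the real implementation may set additional or slightly different flags:

```python
import os

import torch


def enable_full_determinism():
    # Approximation of the shared helper: force deterministic kernels and disable TF32
    # so test outputs are reproducible across runs and devices.
    torch.use_deterministic_algorithms(True)

    # Deterministic cuBLAS matmuls require this workspace configuration on CUDA.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cuda.matmul.allow_tf32 = False
```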
From 6dd3871ae05bd2a7d52c637e14db17887868aee9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 15:32:39 +0200
Subject: [PATCH 112/206] Fix DPM single (#3413)
* Fix DPM single
* add test
* fix one more bug
* Apply suggestions from code review
Co-authored-by: StAlKeR7779
---------
Co-authored-by: StAlKeR7779
---
.../scheduling_dpmsolver_singlestep.py | 19 ++++++++++++++++++-
tests/schedulers/test_scheduler_dpm_single.py | 16 ++++++++++++++++
2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 9307db89d8d7..8ddd30b0a192 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -21,9 +21,13 @@
import torch
from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import logging
from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
"""
@@ -251,7 +255,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.timesteps = torch.from_numpy(timesteps).to(device)
self.model_outputs = [None] * self.config.solver_order
self.sample = None
- self.orders = self.get_order_list(num_inference_steps)
+
+ if not self.config.lower_order_final and num_inference_steps % self.config.solver_order != 0:
+            logger.warning(
+                f"Changing scheduler {self.config} to have `lower_order_final` set to True to handle an uneven number of inference steps. Please make sure to always use an even number of `num_inference_steps` when using `lower_order_final=True`."
+ )
+ self.register_to_config(lower_order_final=True)
+
+ self.order_list = self.get_order_list(num_inference_steps)
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
@@ -597,6 +608,12 @@ def step(
self.model_outputs[-1] = model_output
order = self.order_list[step_index]
+
+ # For img2img denoising might start with order>1 which is not possible
+ # In this case make sure that the first two steps are both order=1
+ while self.model_outputs[-order] is None:
+ order -= 1
+
# For single-step solvers, we use the initial value at each time with order = 1.
if order == 1:
self.sample = sample
diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py
index fd7395e794c7..18a706a1f59b 100644
--- a/tests/schedulers/test_scheduler_dpm_single.py
+++ b/tests/schedulers/test_scheduler_dpm_single.py
@@ -116,6 +116,22 @@ def full_loop(self, scheduler=None, **config):
return sample
+ def test_full_uneven_loop(self):
+ scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config())
+ num_inference_steps = 50
+ model = self.dummy_model()
+ sample = self.dummy_sample_deter
+ scheduler.set_timesteps(num_inference_steps)
+
+ # make sure that the first t is uneven
+ for i, t in enumerate(scheduler.timesteps[3:]):
+ residual = model(sample, t)
+ sample = scheduler.step(residual, t, sample).prev_sample
+
+ result_mean = torch.mean(torch.abs(sample))
+
+ assert abs(result_mean.item() - 0.2574) < 1e-3
+
def test_timesteps(self):
for timesteps in [25, 50, 100, 999, 1000]:
self.check_over_configs(num_train_timesteps=timesteps)
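The guard added to `set_timesteps` switches `lower_order_final` on whenever the number of inference steps is not divisible by `solver_order`. A hypothetical sketch of how that behaves from a user's perspective (the step count below is made up for illustration):

```python
from diffusers import DPMSolverSinglestepScheduler

# 25 steps with solver_order=2 is "uneven" (25 % 2 != 0), so set_timesteps() now flips
# lower_order_final to True and logs a warning instead of building an invalid order list.
scheduler = DPMSolverSinglestepScheduler(solver_order=2, lower_order_final=False)
scheduler.set_timesteps(25)
print(scheduler.config.lower_order_final)  # expected: True after this patch
```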
From 194b0a425dfa0bcdb048ab8f37d1668682c1a91b Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Mon, 22 May 2023 22:43:56 +0800
Subject: [PATCH 113/206] Add `use_Karras_sigmas` to
DPMSolverSinglestepScheduler (#3476)
* add use_karras_sigmas
* add karras test
* add doc
---
.../scheduling_dpmsolver_singlestep.py | 52 +++++++++++++++++++
tests/schedulers/test_scheduler_dpm_single.py | 12 +++++
2 files changed, 64 insertions(+)
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 8ddd30b0a192..7fa8eabb5a15 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -117,6 +117,10 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
lower_order_final (`bool`, default `True`):
whether to use lower-order solvers in the final steps. For singlestep schedulers, we recommend to enable
this to use up all the function evaluations.
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+ This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
+ noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
+ of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
lambda_min_clipped (`float`, default `-inf`):
the clipping threshold for the minimum value of lambda(t) for numerical stability. This is critical for
cosine (squaredcos_cap_v2) noise schedule.
@@ -150,6 +154,7 @@ def __init__(
algorithm_type: str = "dpmsolver++",
solver_type: str = "midpoint",
lower_order_final: bool = True,
+ use_karras_sigmas: Optional[bool] = False,
lambda_min_clipped: float = -float("inf"),
variance_type: Optional[str] = None,
):
@@ -197,6 +202,7 @@ def __init__(
self.model_outputs = [None] * solver_order
self.sample = None
self.order_list = self.get_order_list(num_train_timesteps)
+ self.use_karras_sigmas = use_karras_sigmas
def get_order_list(self, num_inference_steps: int) -> List[int]:
"""
@@ -252,6 +258,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
.copy()
.astype(np.int64)
)
+
+ if self.use_karras_sigmas:
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ log_sigmas = np.log(sigmas)
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+ timesteps = np.flip(timesteps).copy().astype(np.int64)
+
self.timesteps = torch.from_numpy(timesteps).to(device)
self.model_outputs = [None] * self.config.solver_order
self.sample = None
@@ -299,6 +313,44 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
return sample
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+ def _sigma_to_t(self, sigma, log_sigmas):
+ # get log sigma
+ log_sigma = np.log(sigma)
+
+ # get distribution
+ dists = log_sigma - log_sigmas[:, np.newaxis]
+
+ # get sigmas range
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+ high_idx = low_idx + 1
+
+ low = log_sigmas[low_idx]
+ high = log_sigmas[high_idx]
+
+ # interpolate sigmas
+ w = (low - log_sigma) / (low - high)
+ w = np.clip(w, 0, 1)
+
+ # transform interpolation to time range
+ t = (1 - w) * low_idx + w * high_idx
+ t = t.reshape(sigma.shape)
+ return t
+
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
+ """Constructs the noise schedule of Karras et al. (2022)."""
+
+ sigma_min: float = in_sigmas[-1].item()
+ sigma_max: float = in_sigmas[0].item()
+
+ rho = 7.0 # 7.0 is the value used in the paper
+ ramp = np.linspace(0, 1, num_inference_steps)
+ min_inv_rho = sigma_min ** (1 / rho)
+ max_inv_rho = sigma_max ** (1 / rho)
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+ return sigmas
+
def convert_model_output(
self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
) -> torch.FloatTensor:
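As a standalone check of the `_convert_to_karras` formula copied in above: the schedule interpolates between `sigma_max` and `sigma_min` in `sigma**(1/rho)` space with `rho = 7` and raises the result back to the power `rho`. A small sketch with placeholder sigma values (the real range comes from the scheduler's `alphas_cumprod`):

```python
import numpy as np

sigma_min, sigma_max, rho, num_steps = 0.03, 14.6, 7.0, 10

ramp = np.linspace(0, 1, num_steps)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

assert np.isclose(sigmas[0], sigma_max) and np.isclose(sigmas[-1], sigma_min)
assert np.all(np.diff(sigmas) < 0)  # noise levels decrease monotonically
```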
diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py
index 18a706a1f59b..66be3d5d00ad 100644
--- a/tests/schedulers/test_scheduler_dpm_single.py
+++ b/tests/schedulers/test_scheduler_dpm_single.py
@@ -215,12 +215,24 @@ def test_full_loop_no_noise(self):
assert abs(result_mean.item() - 0.2791) < 1e-3
+ def test_full_loop_with_karras(self):
+ sample = self.full_loop(use_karras_sigmas=True)
+ result_mean = torch.mean(torch.abs(sample))
+
+ assert abs(result_mean.item() - 0.2248) < 1e-3
+
def test_full_loop_with_v_prediction(self):
sample = self.full_loop(prediction_type="v_prediction")
result_mean = torch.mean(torch.abs(sample))
assert abs(result_mean.item() - 0.1453) < 1e-3
+ def test_full_loop_with_karras_and_v_prediction(self):
+ sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True)
+ result_mean = torch.mean(torch.abs(sample))
+
+ assert abs(result_mean.item() - 0.0649) < 1e-3
+
def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
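A hedged usage sketch of the new flag: an existing pipeline's scheduler can be swapped for a `DPMSolverSinglestepScheduler` with `use_karras_sigmas=True` via `from_config`. The model id and prompt below are placeholders:

```python
import torch
from diffusers import DPMSolverSinglestepScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
pipe = pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse on mars", num_inference_steps=20).images[0]
```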
From 0160e5146f00ad541a857a16ecc1512e4f6e39bb Mon Sep 17 00:00:00 2001
From: w4ffl35
Date: Mon, 22 May 2023 08:44:36 -0600
Subject: [PATCH 114/206] Adds local_files_only bool to prevent forced online
connection (#3486)
---
.../pipelines/stable_diffusion/convert_from_ckpt.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index 42e8ae7cafd2..ff9e03d29347 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -727,8 +727,8 @@ def _copy_layers(hf_layers, pt_layers):
return hf_model
-def convert_ldm_clip_checkpoint(checkpoint):
- text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False):
+ text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
keys = list(checkpoint.keys())
@@ -992,6 +992,7 @@ def download_from_original_stable_diffusion_ckpt(
controlnet: Optional[bool] = None,
load_safety_checker: bool = True,
pipeline_class: DiffusionPipeline = None,
+ local_files_only=False
) -> DiffusionPipeline:
"""
Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -1037,6 +1038,8 @@ def download_from_original_stable_diffusion_ckpt(
Whether to load the safety checker or not. Defaults to `True`.
pipeline_class (`str`, *optional*, defaults to `None`):
The pipeline class to use. Pass `None` to determine automatically.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
"""
@@ -1292,7 +1295,7 @@ def download_from_original_stable_diffusion_ckpt(
feature_extractor=feature_extractor,
)
elif model_type == "FrozenCLIPEmbedder":
- text_model = convert_ldm_clip_checkpoint(checkpoint)
+ text_model = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
if load_safety_checker:
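A hedged sketch of how the new `local_files_only` flag might be used to convert a local checkpoint without contacting the Hub; the paths are placeholders, and the CLIP text encoder weights still need to be present in the local cache:

```python
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)

# Placeholder checkpoint path; with local_files_only=True the CLIP text model is loaded
# from the local cache instead of being fetched from the Hub.
pipe = download_from_original_stable_diffusion_ckpt(
    "./checkpoints/my-model.ckpt",
    local_files_only=True,
)
pipe.save_pretrained("./my-model-diffusers")
```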
From a2874af2971d1b262371d9a6fae653662c4a5e95 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 16:44:48 +0200
Subject: [PATCH 115/206] make style
---
src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index ff9e03d29347..7ba1bbd996db 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -992,7 +992,7 @@ def download_from_original_stable_diffusion_ckpt(
controlnet: Optional[bool] = None,
load_safety_checker: bool = True,
pipeline_class: DiffusionPipeline = None,
- local_files_only=False
+ local_files_only=False,
) -> DiffusionPipeline:
"""
Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
From 229fd8cbca989b675ed9ad30676b323eebc24fbc Mon Sep 17 00:00:00 2001
From: Seongsu Park
Date: Mon, 22 May 2023 23:46:16 +0900
Subject: [PATCH 116/206] [Docs] Korean translation (optimization, training)
(#3488)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* feat) optimization kr translation
* fix) typo, italic setting
* feat) dreambooth, text2image kr
* feat) lora kr
* fix) LoRA
* fix) fp16 fix
* fix) doc-builder style
* fix) fp16 일부 단어 수정
* fix) fp16 style fix
* fix) opt, training docs update
* feat) toctree update
* feat) toctree update
---------
Co-authored-by: Chanran Kim
---
docs/source/ko/_toctree.yml | 207 ++--------
docs/source/ko/optimization/fp16.mdx | 410 +++++++++++++++++++
docs/source/ko/optimization/habana.mdx | 71 ++++
docs/source/ko/optimization/mps.mdx | 71 ++++
docs/source/ko/optimization/onnx.mdx | 65 +++
docs/source/ko/optimization/open_vino.mdx | 39 ++
docs/source/ko/optimization/xformers.mdx | 36 ++
docs/source/ko/training/dreambooth.mdx | 475 ++++++++++++++++++++++
docs/source/ko/training/lora.mdx | 128 ++++++
docs/source/ko/training/text2image.mdx | 224 ++++++++++
10 files changed, 1550 insertions(+), 176 deletions(-)
create mode 100644 docs/source/ko/optimization/fp16.mdx
create mode 100644 docs/source/ko/optimization/habana.mdx
create mode 100644 docs/source/ko/optimization/mps.mdx
create mode 100644 docs/source/ko/optimization/onnx.mdx
create mode 100644 docs/source/ko/optimization/open_vino.mdx
create mode 100644 docs/source/ko/optimization/xformers.mdx
create mode 100644 docs/source/ko/training/dreambooth.mdx
create mode 100644 docs/source/ko/training/lora.mdx
create mode 100644 docs/source/ko/training/text2image.mdx
diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml
index a1c0c690eb94..2fec3af66525 100644
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@@ -3,191 +3,46 @@
title: "🧨 Diffusers"
- local: quicktour
title: "훑어보기"
+ - local: in_translation
+ title: Stable Diffusion
- local: installation
title: "설치"
title: "시작하기"
+
- sections:
- sections:
- local: in_translation
- title: "Loading Pipelines, Models, and Schedulers"
- - local: in_translation
- title: "Using different Schedulers"
- - local: in_translation
- title: "Configuring Pipelines, Models, and Schedulers"
- - local: in_translation
- title: "Loading and Adding Custom Pipelines"
- title: "불러오기 & 허브 (번역 예정)"
- - sections:
- - local: in_translation
- title: "Unconditional Image Generation"
- - local: in_translation
- title: "Text-to-Image Generation"
- - local: in_translation
- title: "Text-Guided Image-to-Image"
+ title: 개요
- local: in_translation
- title: "Text-Guided Image-Inpainting"
+ title: Unconditional 이미지 생성
- local: in_translation
- title: "Text-Guided Depth-to-Image"
+ title: Textual Inversion
+ - local: training/dreambooth
+ title: DreamBooth
+ - local: training/text2image
+ title: Text-to-image
+ - local: training/lora
+ title: Low-Rank Adaptation of Large Language Models (LoRA)
- local: in_translation
- title: "Reusing seeds for deterministic generation"
+ title: ControlNet
- local: in_translation
- title: "Community Pipelines"
- - local: in_translation
- title: "How to contribute a Pipeline"
- title: "추론을 위한 파이프라인 (번역 예정)"
- - sections:
- - local: in_translation
- title: "Reinforcement Learning"
- - local: in_translation
- title: "Audio"
- - local: in_translation
- title: "Other Modalities"
- title: "Taking Diffusers Beyond Images"
- title: "Diffusers 사용법 (번역 예정)"
-- sections:
- - local: in_translation
- title: "Memory and Speed"
- - local: in_translation
- title: "xFormers"
- - local: in_translation
- title: "ONNX"
- - local: in_translation
- title: "OpenVINO"
- - local: in_translation
- title: "MPS"
- - local: in_translation
- title: "Habana Gaudi"
- title: "최적화/특수 하드웨어 (번역 예정)"
-- sections:
- - local: in_translation
- title: "Overview"
- - local: in_translation
- title: "Unconditional Image Generation"
- - local: in_translation
- title: "Textual Inversion"
- - local: in_translation
- title: "Dreambooth"
- - local: in_translation
- title: "Text-to-image fine-tuning"
- title: "학습 (번역 예정)"
+ title: InstructPix2Pix 학습
+ title: 학습
- sections:
- local: in_translation
- title: "Stable Diffusion"
- - local: in_translation
- title: "Philosophy"
- - local: in_translation
- title: "How to contribute?"
- title: "개념 설명 (번역 예정)"
-- sections:
- - sections:
- - local: in_translation
- title: "Models"
- - local: in_translation
- title: "Diffusion Pipeline"
- - local: in_translation
- title: "Logging"
- - local: in_translation
- title: "Configuration"
- - local: in_translation
- title: "Outputs"
- title: "Main Classes"
-
- - sections:
- - local: in_translation
- title: "Overview"
- - local: in_translation
- title: "AltDiffusion"
- - local: in_translation
- title: "Cycle Diffusion"
- - local: in_translation
- title: "DDIM"
- - local: in_translation
- title: "DDPM"
- - local: in_translation
- title: "Latent Diffusion"
- - local: in_translation
- title: "Unconditional Latent Diffusion"
- - local: in_translation
- title: "PaintByExample"
- - local: in_translation
- title: "PNDM"
- - local: in_translation
- title: "Score SDE VE"
- - sections:
- - local: in_translation
- title: "Overview"
- - local: in_translation
- title: "Text-to-Image"
- - local: in_translation
- title: "Image-to-Image"
- - local: in_translation
- title: "Inpaint"
- - local: in_translation
- title: "Depth-to-Image"
- - local: in_translation
- title: "Image-Variation"
- - local: in_translation
- title: "Super-Resolution"
- title: "Stable Diffusion"
- - local: in_translation
- title: "Stable Diffusion 2"
- - local: in_translation
- title: "Safe Stable Diffusion"
- - local: in_translation
- title: "Stochastic Karras VE"
- - local: in_translation
- title: "Dance Diffusion"
- - local: in_translation
- title: "UnCLIP"
- - local: in_translation
- title: "Versatile Diffusion"
- - local: in_translation
- title: "VQ Diffusion"
- - local: in_translation
- title: "RePaint"
- - local: in_translation
- title: "Audio Diffusion"
- title: "파이프라인 (번역 예정)"
- - sections:
- - local: in_translation
- title: "Overview"
- - local: in_translation
- title: "DDIM"
- - local: in_translation
- title: "DDPM"
- - local: in_translation
- title: "Singlestep DPM-Solver"
- - local: in_translation
- title: "Multistep DPM-Solver"
- - local: in_translation
- title: "Heun Scheduler"
- - local: in_translation
- title: "DPM Discrete Scheduler"
- - local: in_translation
- title: "DPM Discrete Scheduler with ancestral sampling"
- - local: in_translation
- title: "Stochastic Kerras VE"
- - local: in_translation
- title: "Linear Multistep"
- - local: in_translation
- title: "PNDM"
- - local: in_translation
- title: "VE-SDE"
- - local: in_translation
- title: "IPNDM"
- - local: in_translation
- title: "VP-SDE"
- - local: in_translation
- title: "Euler scheduler"
- - local: in_translation
- title: "Euler Ancestral Scheduler"
- - local: in_translation
- title: "VQDiffusionScheduler"
- - local: in_translation
- title: "RePaint Scheduler"
- title: "스케줄러 (번역 예정)"
- - sections:
- - local: in_translation
- title: "RL Planning"
- title: "Experimental Features"
- title: "API (번역 예정)"
+ title: 개요
+ - local: optimization/fp16
+ title: 메모리와 속도
+ - local: in_translation
+ title: Torch2.0 지원
+ - local: optimization/xformers
+ title: xFormers
+ - local: optimization/onnx
+ title: ONNX
+ - local: optimization/open_vino
+ title: OpenVINO
+ - local: optimization/mps
+ title: MPS
+ - local: optimization/habana
+ title: Habana Gaudi
+ title: 최적화/특수 하드웨어
\ No newline at end of file
diff --git a/docs/source/ko/optimization/fp16.mdx b/docs/source/ko/optimization/fp16.mdx
new file mode 100644
index 000000000000..593860581be3
--- /dev/null
+++ b/docs/source/ko/optimization/fp16.mdx
@@ -0,0 +1,410 @@
+
+
+# 메모리와 속도
+
+메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
+일반적으로 memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하므로, 권장 [설치 방법](xformers)을 참고해 설치해 보세요.
+
+다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
+
+| | 지연시간 | 속도 향상 |
+| ---------------- | ------- | ------- |
+| 별도 설정 없음 | 9.50s | x1 |
+| cuDNN auto-tuner | 9.37s | x1.01 |
+| fp16 | 3.61s | x2.63 |
+| Channels Last 메모리 형식 | 3.30s | x2.88 |
+| traced UNet | 3.21s | x2.96 |
+| memory-efficient attention | 2.63s | x3.61 |
+
+
+ NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
+
+
+## cuDNN auto-tuner 활성화하기
+
+[NVIDIA cuDNN](https://developer.nvidia.com/cudnn)은 컨볼루션을 계산하는 많은 알고리즘을 지원합니다. Autotuner는 짧은 벤치마크를 실행하고 주어진 입력 크기에 대해 주어진 하드웨어에서 최고의 성능을 가진 커널을 선택합니다.
+
+**컨볼루션 네트워크**를 활용하고 있기 때문에 (다른 유형들은 현재 지원되지 않음), 다음 설정을 통해 추론 전에 cuDNN autotuner를 활성화할 수 있습니다:
+
+```python
+import torch
+
+torch.backends.cudnn.benchmark = True
+```
+
+### fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)
+
+Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
+기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
+네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
+이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
+그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
+추론하기 전에 다음을 추가하기만 하면 됩니다:
+
+```python
+import torch
+
+torch.backends.cuda.matmul.allow_tf32 = True
+```
+
+## 반정밀도 가중치
+
+더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 로드하고 실행할 수 있습니다.
+여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.
+
+```Python
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+
+ torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+```
+
+
+ 어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
+
+
+## 추가 메모리 절약을 위한 슬라이스 어텐션
+
+추가 메모리 절약을 위해, 한 번에 모두 계산하는 대신 단계적으로 계산을 수행하는 슬라이스 버전의 어텐션(attention)을 사용할 수 있습니다.
+
+
+ Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다.
+ 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다.
+
+
+각 헤드에 대해 순차적으로 어텐션 계산을 수행하려면, 다음과 같이 추론 전에 파이프라인에서 [`~StableDiffusionPipeline.enable_attention_slicing`]를 호출하면 됩니다:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+
+ torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_attention_slicing()
+image = pipe(prompt).images[0]
+```
+
+추론 시간이 약 10% 느려지는 약간의 성능 저하가 있지만 이 방법을 사용하면 3.2GB 정도의 작은 VRAM으로도 Stable Diffusion을 사용할 수 있습니다!
+
+
+## 더 큰 배치를 위한 sliced VAE 디코드
+
+제한된 VRAM에서 대규모 이미지 배치를 디코딩하거나 32개 이상의 이미지가 포함된 배치를 활성화하기 위해, 배치의 latent 이미지를 한 번에 하나씩 디코딩하는 슬라이스 VAE 디코드를 사용할 수 있습니다.
+
+이를 [`~StableDiffusionPipeline.enable_attention_slicing`] 또는 [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`]과 결합하여 메모리 사용을 추가로 최소화할 수 있습니다.
+
+VAE 디코드를 한 번에 하나씩 수행하려면 추론 전에 파이프라인에서 [`~StableDiffusionPipeline.enable_vae_slicing`]을 호출합니다. 예를 들어:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+
+ torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_vae_slicing()
+images = pipe([prompt] * 32).images
+```
+
+다중 이미지 배치에서는 VAE 디코드 성능이 약간 향상되며, 단일 이미지 배치에서는 성능에 영향이 없습니다.
+
+
+
+## 메모리 절약을 위해 가속 기능을 사용하여 CPU로 오프로딩
+
+추가 메모리 절약을 위해 가중치를 CPU로 오프로드하고 순방향 전달을 수행할 때만 GPU로 로드할 수 있습니다.
+
+CPU 오프로딩을 수행하려면 [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]를 호출하기만 하면 됩니다:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+
+ torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+그러면 메모리 소비를 3GB 미만으로 줄일 수 있습니다.
+
+참고로 이 방법은 전체 모델이 아닌 서브모듈 수준에서 작동합니다. 이는 메모리 소비를 최소화하는 가장 좋은 방법이지만 프로세스의 반복적 특성으로 인해 추론 속도가 훨씬 느립니다. 파이프라인의 UNet 구성 요소는 여러 번 실행됩니다('num_inference_steps' 만큼). 매번 UNet의 서로 다른 서브모듈이 순차적으로 온로드된 다음 필요에 따라 오프로드되므로 메모리 이동 횟수가 많습니다.
+
+
+또 다른 최적화 방법인 모델 오프로딩을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
+
+
+또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.
+
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+
+ torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
+**참고**: `enable_sequential_cpu_offload()`를 사용할 때, 미리 파이프라인을 CUDA로 이동하지 **않는** 것이 중요합니다. 그렇지 않으면 메모리 소비의 이득이 최소화됩니다. 더 많은 정보를 위해 [이 이슈](https://github.com/huggingface/diffusers/issues/1934)를 보세요.
+
+
+## 빠른 추론과 메모리 절약을 위한 모델 오프로딩
+
+[순차적 CPU 오프로딩](#sequential_offloading)은 이전 섹션에서 설명한 것처럼 많은 메모리를 보존하지만 필요에 따라 서브모듈을 GPU로 이동하고 새 모듈이 실행될 때 즉시 CPU로 반환되기 때문에 추론 속도가 느려집니다.
+
+전체 모델 오프로딩은 각 모델의 구성 요소인 _modules_을 처리하는 대신, 전체 모델을 GPU로 이동하는 대안입니다. 이로 인해 추론 시간에 미치는 영향은 미미하지만(파이프라인을 'cuda'로 이동하는 것과 비교하여) 여전히 약간의 메모리를 절약할 수 있습니다.
+
+이 시나리오에서는 파이프라인의 주요 구성 요소 중 하나만(일반적으로 텍스트 인코더, unet 및 vae) GPU에 있고, 나머지는 CPU에서 대기할 것입니다.
+여러 반복을 위해 실행되는 UNet과 같은 구성 요소는 더 이상 필요하지 않을 때까지 GPU에 남아 있습니다.
+
+이 기능은 아래와 같이 파이프라인에서 `enable_model_cpu_offload()`를 호출하여 활성화할 수 있습니다.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+이는 추가적인 메모리 절약을 위한 attention slicing과도 호환됩니다.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
+
+이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다.
+
+
+## Channels Last 메모리 형식 사용하기
+
+Channels Last 메모리 형식은 차원 순서를 보존하는 메모리에서 NCHW 텐서 배열을 대체하는 방법입니다.
+Channels Last 텐서는 채널이 가장 조밀한 차원이 되는 방식으로 정렬됩니다(일명 픽셀당 이미지를 저장).
+현재 모든 연산자가 Channels Last 형식을 지원하는 것은 아니며 성능이 저하될 수 있으므로, 사용해보고 모델에 잘 작동하는지 확인하는 것이 좋습니다.
+
+
+예를 들어 파이프라인의 UNet 모델이 channels Last 형식을 사용하도록 설정하려면 다음을 사용할 수 있습니다:
+
+```python
+print(pipe.unet.conv_out.state_dict()["weight"].stride()) # (2880, 9, 3, 1)
+pipe.unet.to(memory_format=torch.channels_last) # in-place 연산
+# 2번째 차원에서 스트라이드 1을 가지는 (2880, 1, 960, 320)로, 연산이 작동함을 증명합니다.
+print(pipe.unet.conv_out.state_dict()["weight"].stride())
+```
+
+## 추적(tracing)
+
+추적은 예제 입력 텐서를 모델에 통과시키며 실행되는데, 해당 입력이 모델의 레이어를 통과할 때 호출되는 연산을 캡처하여 실행 파일 또는 `ScriptFunction`을 반환하고, 이는 just-in-time 컴파일로 최적화됩니다.
+
+UNet 모델을 추적하기 위해 다음을 사용할 수 있습니다:
+
+```python
+import time
+import torch
+from diffusers import StableDiffusionPipeline
+import functools
+
+# torch 기울기 비활성화
+torch.set_grad_enabled(False)
+
+# 변수 설정
+n_experiments = 2
+unet_runs_per_experiment = 50
+
+
+# 입력 불러오기
+def generate_inputs():
+ sample = torch.randn(2, 4, 64, 64).half().cuda()
+ timestep = torch.rand(1).half().cuda() * 999
+ encoder_hidden_states = torch.randn(2, 77, 768).half().cuda()
+ return sample, timestep, encoder_hidden_states
+
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+).to("cuda")
+unet = pipe.unet
+unet.eval()
+unet.to(memory_format=torch.channels_last) # Channels Last 메모리 형식 사용
+unet.forward = functools.partial(unet.forward, return_dict=False) # return_dict=False을 기본값으로 설정
+
+# 워밍업
+for _ in range(3):
+ with torch.inference_mode():
+ inputs = generate_inputs()
+ orig_output = unet(*inputs)
+
+# 추적
+print("tracing..")
+unet_traced = torch.jit.trace(unet, inputs)
+unet_traced.eval()
+print("done tracing")
+
+
+# 워밍업 및 그래프 최적화
+for _ in range(5):
+ with torch.inference_mode():
+ inputs = generate_inputs()
+ orig_output = unet_traced(*inputs)
+
+
+# 벤치마킹
+with torch.inference_mode():
+ for _ in range(n_experiments):
+ torch.cuda.synchronize()
+ start_time = time.time()
+ for _ in range(unet_runs_per_experiment):
+ orig_output = unet_traced(*inputs)
+ torch.cuda.synchronize()
+ print(f"unet traced inference took {time.time() - start_time:.2f} seconds")
+ for _ in range(n_experiments):
+ torch.cuda.synchronize()
+ start_time = time.time()
+ for _ in range(unet_runs_per_experiment):
+ orig_output = unet(*inputs)
+ torch.cuda.synchronize()
+ print(f"unet inference took {time.time() - start_time:.2f} seconds")
+
+# 모델 저장
+unet_traced.save("unet_traced.pt")
+```
+
+그 다음, 파이프라인의 `unet` 특성을 다음과 같이 추적된 모델로 바꿀 수 있습니다.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+from dataclasses import dataclass
+
+
+@dataclass
+class UNet2DConditionOutput:
+ sample: torch.FloatTensor
+
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+).to("cuda")
+
+# jitted unet 사용
+unet_traced = torch.jit.load("unet_traced.pt")
+
+
+# pipe.unet 삭제
+class TracedUNet(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.in_channels = pipe.unet.in_channels
+ self.device = pipe.unet.device
+
+ def forward(self, latent_model_input, t, encoder_hidden_states):
+ sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
+ return UNet2DConditionOutput(sample=sample)
+
+
+pipe.unet = TracedUNet()
+
+with torch.inference_mode():
+ image = pipe([prompt] * 1, num_inference_steps=50).images[0]
+```
+
+
+## Memory-efficient attention
+
+어텐션 블록의 대역폭을 최적화하는 최근 작업으로 속도와 GPU 메모리 사용량이 크게 개선되었습니다.
+@tridao의 가장 최근의 플래시 어텐션: [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf).
+
+배치 크기 1(프롬프트 1개)의 512x512 크기로 추론을 실행할 때 몇 가지 Nvidia GPU에서 얻은 속도 향상은 다음과 같습니다:
+
+| GPU | 기준 어텐션 FP16 | 메모리 효율적인 어텐션 FP16 |
+|------------------ |--------------------- |--------------------------------- |
+| NVIDIA Tesla T4 | 3.5it/s | 5.5it/s |
+| NVIDIA 3060 RTX | 4.6it/s | 7.8it/s |
+| NVIDIA A10G | 8.88it/s | 15.6it/s |
+| NVIDIA RTX A6000 | 11.7it/s | 21.09it/s |
+| NVIDIA TITAN RTX | 12.51it/s | 18.22it/s |
+| A100-SXM4-40GB | 18.6it/s | 29.0it/s |
+| A100-SXM-80GB | 18.7it/s | 29.5it/s |
+
+이를 활용하려면 다음을 만족해야 합니다:
+ - PyTorch > 1.12
+ - Cuda 사용 가능
+ - [xformers 라이브러리를 설치함](xformers)
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+).to("cuda")
+
+pipe.enable_xformers_memory_efficient_attention()
+
+with torch.inference_mode():
+ sample = pipe("a small cat")
+
+# 선택: 이를 비활성화 하기 위해 다음을 사용할 수 있습니다.
+# pipe.disable_xformers_memory_efficient_attention()
+```
diff --git a/docs/source/ko/optimization/habana.mdx b/docs/source/ko/optimization/habana.mdx
new file mode 100644
index 000000000000..0f076245fb1c
--- /dev/null
+++ b/docs/source/ko/optimization/habana.mdx
@@ -0,0 +1,71 @@
+
+
+# Habana Gaudi에서 Stable Diffusion을 사용하는 방법
+
+🤗 Diffusers는 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion)를 통해서 Habana Gaudi와 호환됩니다.
+
+## 요구 사항
+
+- Optimum Habana 1.4 이상. 설치 방법은 [여기](https://huggingface.co/docs/optimum/habana/installation)를 참고하세요.
+- SynapseAI 1.8.
+
+
+## 추론 파이프라인
+
+Gaudi에서 Stable Diffusion 1 및 2로 이미지를 생성하려면 두 인스턴스를 인스턴스화해야 합니다:
+- [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline)이 포함된 파이프라인. 이 파이프라인은 *텍스트-이미지 생성*을 지원합니다.
+- [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler)이 포함된 스케줄러. 이 스케줄러는 Habana Gaudi에 최적화되어 있습니다.
+
+파이프라인을 초기화할 때, HPU에 배포하기 위해 `use_habana=True`를 지정해야 합니다.
+또한 가능한 가장 빠른 생성을 위해 `use_hpu_graphs=True`로 **HPU 그래프**를 활성화해야 합니다.
+마지막으로, [Hugging Face Hub](https://huggingface.co/Habana)에서 다운로드할 수 있는 [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config)을 지정해야 합니다.
+
+```python
+from optimum.habana import GaudiConfig
+from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline
+
+model_name = "stabilityai/stable-diffusion-2-base"
+scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
+pipeline = GaudiStableDiffusionPipeline.from_pretrained(
+ model_name,
+ scheduler=scheduler,
+ use_habana=True,
+ use_hpu_graphs=True,
+ gaudi_config="Habana/stable-diffusion",
+)
+```
+
+파이프라인을 호출하여 하나 이상의 프롬프트에서 배치별로 이미지를 생성할 수 있습니다.
+
+```python
+outputs = pipeline(
+ prompt=[
+ "High quality photo of an astronaut riding a horse in space",
+ "Face of a yellow cat, high resolution, sitting on a park bench",
+ ],
+ num_images_per_prompt=10,
+ batch_size=4,
+)
+```
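+
+Assuming the pipeline follows the usual Diffusers output convention of returning PIL images in `outputs.images` (an assumption on our part, so double-check against the Optimum Habana documentation), the generated batch can then be saved like this:
+
+```python
+# 2 prompts x 10 images per prompt = 20 images in total
+for i, image in enumerate(outputs.images):
+    image.save(f"gaudi_image_{i}.png")
+```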
+
+For more information, check out Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official GitHub repository.
+
+
+## Benchmark
+
+Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32):
+
+|                        | Latency (batch size = 1) | Throughput (batch size = 8) |
+| ---------------------- |:------------------------:|:---------------------------:|
+| first-generation Gaudi | 4.29s | 0.283 images/s |
+| Gaudi2 | 1.54s | 0.904 images/s |
diff --git a/docs/source/ko/optimization/mps.mdx b/docs/source/ko/optimization/mps.mdx
new file mode 100644
index 000000000000..cd04d6d1103d
--- /dev/null
+++ b/docs/source/ko/optimization/mps.mdx
@@ -0,0 +1,71 @@
+
+
+# How to use Stable Diffusion on Apple Silicon (M1/M2)
+
+Diffusers is compatible with Apple silicon for Stable Diffusion inference, using the PyTorch `mps` device. These are the steps you need to follow to use an M1 or M2 computer with Stable Diffusion.
+
+## Requirements
+
+- Mac computer with Apple silicon (M1/M2) hardware.
+- macOS 12.6 or later (13.0 or later recommended).
+- arm64 version of Python.
+- PyTorch 2.0 (recommended) or 1.13 (minimum version supported for `mps`). You can install it with `pip` or `conda` using the instructions at https://pytorch.org/get-started/locally/.
+
+
+## Inference pipeline
+
+The snippet below demonstrates how to use the `mps` backend using the familiar `to()` interface to move the Stable Diffusion pipeline to your M1 or M2 device.
+
+
+
+
+**If you are using PyTorch 1.13,** we recommend you "prime" the pipeline with an additional one-time pass through it. This is a temporary workaround for an issue we detected: the first inference pass produces slightly different results than subsequent ones. You only need to do this pass once, and it's fine to use just one inference step and discard the result.
+
+
+
+We strongly recommend using PyTorch 2 or later, as it solves a number of problems, including the one described in the previous tip.
+
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe = pipe.to("mps")
+
+# Recommended if your computer has < 64 GB of RAM
+pipe.enable_attention_slicing()
+
+prompt = "a photo of an astronaut riding a horse on mars"
+
+# First-time "warmup" pass (see explanation above)
+_ = pipe(prompt, num_inference_steps=1)
+
+# Results match those from the CPU device after the warmup pass
+image = pipe(prompt).images[0]
+```
+
+## Performance recommendations
+
+M1/M2 performance is very sensitive to memory pressure. The system will automatically swap if it needs to, but performance degrades significantly when it does.
+
+
+We recommend you use *attention slicing* to reduce memory pressure during inference and prevent swapping, particularly if your computer has less than 64 GB of system RAM, or if you generate images at non-standard resolutions larger than 512 × 512 pixels. Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually has a performance impact of about 20% on computers without universal memory, but we have observed *better performance* on most Apple silicon computers unless you have 64 GB or more.
+
+```python
+pipeline.enable_attention_slicing()
+```
+
+## Known Issues
+
+- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). This is being resolved, but for now we recommend iterating instead of batching, as shown in the sketch below.
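+
+As a workaround, the loop below is a minimal sketch of generating several prompts one at a time instead of passing them as a single batch (the prompts and file names are only illustrative):
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("mps")
+pipe.enable_attention_slicing()
+
+prompts = ["a photo of an astronaut riding a horse on mars", "a watercolor painting of a fox"]
+
+# Iterate instead of batching: one forward pass per prompt
+for i, prompt in enumerate(prompts):
+    image = pipe(prompt).images[0]
+    image.save(f"mps_result_{i}.png")
+```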
\ No newline at end of file
diff --git a/docs/source/ko/optimization/onnx.mdx b/docs/source/ko/optimization/onnx.mdx
new file mode 100644
index 000000000000..d52110b8c1fb
--- /dev/null
+++ b/docs/source/ko/optimization/onnx.mdx
@@ -0,0 +1,65 @@
+
+
+
+# How to use ONNX Runtime for inference
+
+🤗 Diffusers provides a Stable Diffusion pipeline compatible with ONNX Runtime. This allows you to run Stable Diffusion on any hardware that supports ONNX (including CPUs), and where an accelerated version of PyTorch is not available.
+
+## Installation
+
+Install 🤗 Optimum with ONNX Runtime support using the following command:
+
+```
+pip install optimum["onnxruntime"]
+```
+
+## Stable Diffusion inference
+
+The snippet below demonstrates how to use ONNX Runtime. Note that it uses `ORTStableDiffusionPipeline` instead of `StableDiffusionPipeline`.
+In case you want to load a PyTorch model and convert it to the ONNX format on the fly, set `export=True`.
+
+```python
+from optimum.onnxruntime import ORTStableDiffusionPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images[0]
+pipe.save_pretrained("./onnx-stable-diffusion-v1-5")
+```
+
+If you want to export the pipeline in the ONNX format offline and use it later for inference,
+you can use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command:
+
+```bash
+optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 sd_v15_onnx/
+```
+
+Then perform inference:
+
+```python
+from optimum.onnxruntime import ORTStableDiffusionPipeline
+
+model_id = "sd_v15_onnx"
+pipe = ORTStableDiffusionPipeline.from_pretrained(model_id)
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images[0]
+```
+
+Notice that we didn't have to specify `export=True` above.
+
+You can find more examples in the [Optimum documentation](https://huggingface.co/docs/optimum/).
+
+## Known issues
+
+- Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching.
diff --git a/docs/source/ko/optimization/open_vino.mdx b/docs/source/ko/optimization/open_vino.mdx
new file mode 100644
index 000000000000..cb279909f618
--- /dev/null
+++ b/docs/source/ko/optimization/open_vino.mdx
@@ -0,0 +1,39 @@
+
+
+# How to use OpenVINO for inference
+
+🤗 [Optimum](https://github.com/huggingface/optimum-intel) provides a Stable Diffusion pipeline compatible with OpenVINO.
+You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices).
+
+## Installation
+
+Install 🤗 Optimum with the following command:
+
+```
+pip install optimum["openvino"]
+```
+
+## Stable Diffusion inference
+
+To load an OpenVINO model and run inference with OpenVINO Runtime, you need to replace `StableDiffusionPipeline` with `OVStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the OpenVINO format on the fly, set `export=True`.
+
+```python
+from optimum.intel.openvino import OVStableDiffusionPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True)
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images[0]
+```
+
+You can find more examples (such as static reshaping and model compilation) in the [Optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export-and-inference-of-stable-diffusion-models); a hedged sketch of the static-reshaping workflow follows below.
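+
+The method names and arguments below follow the Optimum documentation linked above; treat them as an assumption and verify against your installed version:
+
+```python
+# Fix the input shapes once, then compile for faster repeated inference
+pipe.reshape(batch_size=1, height=512, width=512, num_images_per_prompt=1)
+pipe.compile()
+
+images = pipe(prompt, height=512, width=512).images
+```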
diff --git a/docs/source/ko/optimization/xformers.mdx b/docs/source/ko/optimization/xformers.mdx
new file mode 100644
index 000000000000..a8b9408fbe50
--- /dev/null
+++ b/docs/source/ko/optimization/xformers.mdx
@@ -0,0 +1,36 @@
+
+
+# Installing xFormers
+
+We recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for both inference and training.
+In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption.
+
+Starting with version `0.0.16` of xFormers, released in January 2023, installation can easily be performed using pre-built pip wheels:
+
+```bash
+pip install xformers
+```
+
+
+
+The xFormers pip package requires the latest version of PyTorch (1.13.1 as of xFormers 0.0.16). If you need to use a previous version of PyTorch, we recommend you install xFormers from source using [the project's instructions](https://github.com/facebookresearch/xformers#installing-xformers).
+
+
+
+After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption, as described [here](fp16#memory-efficient-attention).
+
+
+
+According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tuning or DreamBooth) on some GPUs. If you observe this problem, please install a development version as indicated in that comment.
+
+
diff --git a/docs/source/ko/training/dreambooth.mdx b/docs/source/ko/training/dreambooth.mdx
new file mode 100644
index 000000000000..cc282d9d24f8
--- /dev/null
+++ b/docs/source/ko/training/dreambooth.mdx
@@ -0,0 +1,475 @@
+
+
+# DreamBooth
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like Stable Diffusion given just a few (3~5) images of a subject. It allows the model to generate contextualized images of the subject in different scenes, poses, and views.
+
+
+Dreambooth examples from the project's blog.
+
+
+This guide will show you how to finetune DreamBooth with the [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) model for various GPU sizes, and with Flax. All the training scripts for DreamBooth used in this guide can be found [here](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) if you're interested in digging deeper and seeing how things work.
+
+Before running the scripts, make sure you install the library's training dependencies. We also recommend installing 🧨 Diffusers from the `main` GitHub branch:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers
+pip install -U -r diffusers/examples/dreambooth/requirements.txt
+```
+
+xFormers is not part of the training requirements, but we recommend you [install](../optimization/xformers) it if you can, because it can make training faster and less memory intensive.
+
+After all the dependencies have been set up, initialize a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+To set up a default 🤗 Accelerate environment without choosing any configurations, run:
+
+```bash
+accelerate config default
+```
+
+Or, if your environment doesn't support an interactive shell, such as a notebook, you can use:
+
+```py
+from accelerate.utils import write_basic_config
+
+write_basic_config()
+```
+
+## Finetuning
+
+
+
+DreamBooth finetuning is very sensitive to hyperparameters and it's easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects to help you choose appropriate hyperparameters.
+
+
+
+
+
+Let's try DreamBooth with [a few images of a dog](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ).
+Download them, save them to a directory, and then set the `INSTANCE_DIR` environment variable to that path:
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export OUTPUT_DIR="path_to_saved_model"
+```
+
+Then you can launch the training script with the following command (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py)):
+
+```bash
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a photo of sks dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=5e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --max_train_steps=400
+```
+
+
+
+If you have access to TPUs or want to train even faster, you can try out the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_flax.py). The Flax training script doesn't support gradient checkpointing or gradient accumulation, so you'll need a GPU with at least 30GB of memory.
+
+Before running the script, make sure you have the requirements installed:
+
+```bash
+pip install -U -r requirements.txt
+```
+
+Then you can launch the training script with the following command:
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --instance_prompt="a photo of sks dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --learning_rate=5e-6 \
+ --max_train_steps=400
+```
+
+
+
+### Finetuning with prior-preserving loss
+
+Prior preservation is used to avoid overfitting and language drift (check out the [paper](https://arxiv.org/abs/2208.12242) to learn more if you're interested). For prior preservation, you use other images of the same class as part of the training process. The nice thing is that you can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path you specify.
+
+The authors recommend generating `num_epochs * num_samples` images for prior preservation. In most cases, 200-300 images work well.
+
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=1 \
+ --learning_rate=5e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --learning_rate=5e-6 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+
+
+## Finetuning with the text encoder and UNet
+
+The script also lets you finetune the `text_encoder` along with the `unet`. In our experiments (check out the [Training Stable Diffusion with DreamBooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) post for more details), this yields much better results, especially when generating images of faces.
+
+
+
+Training the text encoder requires additional memory and won't fit on a 16GB GPU. You'll need at least 24GB of VRAM to use this option.
+
+
+
+Pass the `--train_text_encoder` argument to the training script to enable finetuning the `text_encoder` and `unet`:
+
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --train_text_encoder \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+  --use_8bit_adam \
+ --gradient_checkpointing \
+ --learning_rate=2e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --train_text_encoder \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --learning_rate=2e-6 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+
+
+## Finetuning with LoRA
+
+You can also use Low-Rank Adaptation of Large Language Models (LoRA), a fine-tuning technique for accelerating the training of large models, on DreamBooth. For more details, take a look at the [LoRA training](training/lora#dreambooth) guide.
+
+### Saving checkpoints while training
+
+It's easy to overfit while training with DreamBooth, so sometimes it's useful to save regular checkpoints during training. One of the intermediate checkpoints might actually work better than the final model! Pass the following argument to the training script to enable saving checkpoints:
+
+```bash
+ --checkpointing_steps=500
+```
+
+This saves the full training state in subfolders of your `output_dir`. Subfolder names begin with the prefix `checkpoint-`, followed by the number of steps performed so far; for example, `checkpoint-1500` is a checkpoint saved after 1500 training steps.
+
+#### Resuming training from a saved checkpoint
+
+If you want to resume training from any of the saved checkpoints, pass the `--resume_from_checkpoint` argument and specify the name of the checkpoint you want to use. You can also use the special string `"latest"` to resume from the last saved checkpoint (i.e., the one with the largest number of steps). For example, the following resumes training from the checkpoint saved after 1500 steps:
+
+```bash
+ --resume_from_checkpoint="checkpoint-1500"
+```
+
+If you want, you can also adjust some hyperparameters when resuming.
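+
+For illustration, the snippet below is a minimal sketch of how you could locate the most recent `checkpoint-*` subfolder yourself (the `output_dir` path is hypothetical):
+
+```python
+from pathlib import Path
+
+output_dir = Path("path_to_saved_model")  # hypothetical; use your --output_dir
+
+# Checkpoint folders are named "checkpoint-<steps>"; pick the one with the most steps
+checkpoints = sorted(output_dir.glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[1]))
+print(checkpoints[-1].name if checkpoints else "no checkpoints found")
+```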
+
+#### Performing inference using a saved checkpoint
+
+Saved checkpoints are stored in a format suitable for resuming training. They include not only the model weights, but also the state of the optimizer, data loaders, and learning rate.
+
+If you have **`"accelerate>=0.16.0"`** installed, use the following code to run inference from an intermediate checkpoint.
+
+```python
+from diffusers import DiffusionPipeline, UNet2DConditionModel
+from transformers import CLIPTextModel
+import torch
+
+# Load the pipeline with the same arguments (model, revision) that were used for training
+model_id = "CompVis/stable-diffusion-v1-4"
+
+unet = UNet2DConditionModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/unet")
+
+# If you have trained with `--train_text_encoder`, make sure to also load the text encoder
+text_encoder = CLIPTextModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/text_encoder")
+
+pipeline = DiffusionPipeline.from_pretrained(model_id, unet=unet, text_encoder=text_encoder, torch_dtype=torch.float16)
+pipeline.to("cuda")
+
+# Perform inference, or save, or push to the hub
+pipeline.save_pretrained("dreambooth-pipeline")
+```
+
+If you have **`"accelerate<0.16.0"`** installed, you need to convert it to an inference pipeline first:
+
+```python
+from accelerate import Accelerator
+from diffusers import DiffusionPipeline
+
+# Load the pipeline with the same arguments (model, revision) that were used for training
+model_id = "CompVis/stable-diffusion-v1-4"
+pipeline = DiffusionPipeline.from_pretrained(model_id)
+
+accelerator = Accelerator()
+
+# Use text_encoder if `--train_text_encoder` was used for the initial training
+unet, text_encoder = accelerator.prepare(pipeline.unet, pipeline.text_encoder)
+
+# Restore state from a checkpoint path. You have to use the absolute path here.
+accelerator.load_state("/sddata/dreambooth/daruma-v2-1/checkpoint-100")
+
+# Rebuild the pipeline with the unwrapped models (assignment to .unet and .text_encoder should work too)
+pipeline = DiffusionPipeline.from_pretrained(
+ model_id,
+ unet=accelerator.unwrap_model(unet),
+ text_encoder=accelerator.unwrap_model(text_encoder),
+)
+
+# Perform inference, or save, or push to the hub
+pipeline.save_pretrained("dreambooth-pipeline")
+```
+
+## Optimizations for different GPU sizes
+
+Depending on your hardware, there are a few different ways to optimize DreamBooth on GPUs from 16GB all the way down to 8GB!
+
+### xFormers
+
+[xFormers](https://github.com/facebookresearch/xformers) is a toolbox for optimizing Transformers, and it includes the [memory-efficient attention](https://facebookresearch.github.io/xformers/components/ops.html#module-xformers.ops) mechanism used in 🧨 Diffusers. [Install xFormers](./optimization/xformers) and then add the following argument to your training script:
+
+```bash
+ --enable_xformers_memory_efficient_attention
+```
+
+xFormers is not available in Flax.
+
+### Set gradients to none
+
+Another way to lower your memory footprint is to [set the gradients](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html) to `None` instead of zero. However, this may change certain behaviors, so if you run into any issues, try removing this argument. Add the following argument to the training script to set gradients to `None`:
+
+```bash
+ --set_grads_to_none
+```
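+
+For context, this option corresponds to PyTorch's `Optimizer.zero_grad(set_to_none=True)`; the snippet below is a minimal, standalone sketch of the difference (the tiny model and data are only illustrative):
+
+```python
+import torch
+
+model = torch.nn.Linear(4, 1)
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
+
+loss = model(torch.randn(8, 4)).mean()
+loss.backward()
+optimizer.step()
+
+# Instead of filling .grad with zero tensors, release them entirely.
+# This saves memory, but .grad stays None until the next backward pass.
+optimizer.zero_grad(set_to_none=True)
+print(model.weight.grad)  # None
+```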
+
+### 16GB GPU
+
+With the help of gradient checkpointing and the 8-bit optimizer from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), it's possible to train DreamBooth on a 16GB GPU. Make sure bitsandbytes is installed:
+
+```bash
+pip install bitsandbytes
+```
+
+Then pass the `--use_8bit_adam` option to the training script:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=2 --gradient_checkpointing \
+ --use_8bit_adam \
+ --learning_rate=5e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+### 12GB GPU
+
+To run DreamBooth on a 12GB GPU, you'll need to enable gradient checkpointing, the 8-bit optimizer, and xFormers, and set the gradients to `None`:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=1 --gradient_checkpointing \
+ --use_8bit_adam \
+ --enable_xformers_memory_efficient_attention \
+ --set_grads_to_none \
+ --learning_rate=2e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --num_class_images=200 \
+ --max_train_steps=800
+```
+
+### Training on an 8GB GPU
+
+For 8GB GPUs, you can use [DeepSpeed](https://www.deepspeed.ai/) to offload some tensors from VRAM to either the CPU or NVME, allowing training with less GPU memory.
+
+Run the following command to configure your 🤗 Accelerate environment:
+
+```bash
+accelerate config
+```
+
+During configuration, confirm that you want to use DeepSpeed.
+Combining DeepSpeed stage 2, fp16 mixed precision, and offloading both the model parameters and the optimizer state to the CPU makes it possible to train with less than 8GB of VRAM.
+The drawback is that this requires more system RAM (about 25 GB). See the [DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
+
+You should also change the default Adam optimizer to DeepSpeed's optimized version of Adam,
+[`deepspeed.ops.adam.DeepSpeedCPUAdam`](https://deepspeed.readthedocs.io/en/latest/optimizers.html#adam-cpu), for a substantial speedup.
+Enabling `DeepSpeedCPUAdam` requires your system's CUDA toolchain version to match the one installed with PyTorch.
+
+8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
+
+Launch training with the following command:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --instance_data_dir=$INSTANCE_DIR \
+ --class_data_dir=$CLASS_DIR \
+ --output_dir=$OUTPUT_DIR \
+ --with_prior_preservation --prior_loss_weight=1.0 \
+ --instance_prompt="a photo of sks dog" \
+ --class_prompt="a photo of dog" \
+ --resolution=512 \
+ --train_batch_size=1 \
+ --sample_batch_size=1 \
+ --gradient_accumulation_steps=1 --gradient_checkpointing \
+ --learning_rate=5e-6 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --num_class_images=200 \
+ --max_train_steps=800 \
+ --mixed_precision=fp16
+```
+
+## Inference
+
+Once you have trained a model, specify the path to where the model was saved and use it for inference with the [`StableDiffusionPipeline`]. Make sure your prompts include the special `identifier` used during training (`sks` in the previous examples).
+
+If you have **`"accelerate>=0.16.0"`** installed, you can use the following code to run inference from an intermediate checkpoint:
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
+
+You may also run inference from any of the [saved training checkpoints](#inference-from-a-saved-checkpoint).
\ No newline at end of file
diff --git a/docs/source/ko/training/lora.mdx b/docs/source/ko/training/lora.mdx
new file mode 100644
index 000000000000..9aebb0fa3109
--- /dev/null
+++ b/docs/source/ko/training/lora.mdx
@@ -0,0 +1,128 @@
+
+
+# Low-Rank Adaptation of Large Language Models (LoRA)
+
+[[open-in-colab]]
+
+
+
+Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionModel`].
+
+
+
+[Low-Rank Adaptation of Large Language Models (LoRA)](https://arxiv.org/abs/2106.09685) is a training method that accelerates the training of large models while consuming less memory. It adds pairs of rank-decomposition weight matrices (called **update matrices**) to existing weights, and **only** trains those newly added weights (see the minimal sketch after the list below). This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen, so the model is not as prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
+- LoRA matrices are generally added to the attention layers of the original model. 🧨 Diffusers provides the [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method to load the LoRA weights into a model's attention layers. You can control the extent to which the model is adapted toward new training images via a `scale` parameter.
+- The greater memory efficiency allows you to run fine-tuning on consumer GPUs like the Tesla T4, RTX 3080, or even the RTX 2080 Ti! GPUs like the T4 are free and easily accessible in Kaggle or Google Colab notebooks.
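+
+The sketch below is a minimal, self-contained illustration of the idea (not the 🧨 Diffusers implementation): a frozen linear layer is augmented with a trainable low-rank update, and only the two small matrices receive gradients.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class LoRALinear(nn.Module):
+    """A frozen linear layer plus a trainable low-rank update B @ A."""
+
+    def __init__(self, base: nn.Linear, rank: int = 4, scale: float = 1.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)  # pretrained weights stay frozen
+        self.lora_a = nn.Linear(base.in_features, rank, bias=False)   # A: d_in -> r
+        self.lora_b = nn.Linear(rank, base.out_features, bias=False)  # B: r -> d_out
+        nn.init.zeros_(self.lora_b.weight)  # the update starts as a no-op
+        self.scale = scale
+
+    def forward(self, x):
+        return self.base(x) + self.scale * self.lora_b(self.lora_a(x))
+
+
+layer = LoRALinear(nn.Linear(768, 768), rank=4)
+out = layer(torch.randn(1, 77, 768))  # only lora_a and lora_b are trainable
+```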
+
+
+
+
+💡 LoRA is not limited to attention layers. The authors found that amending the attention layers of a language model is sufficient to obtain good downstream performance with great efficiency. This is why it's common to just add the LoRA weights to the attention layers of a model. Check out the [Using LoRA for efficient Stable Diffusion fine-tuning](https://huggingface.co/blog/lora) blog for more information about how LoRA works!
+
+
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository. 🧨 Diffusers supports LoRA for [text-to-image generation](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) and [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora). This guide will show you how to do both.
+
+If you'd like to store or share your model with the community, login to your Hugging Face account (or [create one](https://hf.co/join) if you don't have one already):
+
+```bash
+huggingface-cli login
+```
+
+## Text-to-image
+
+Finetuning a model like Stable Diffusion, which has billions of parameters, can be slow and difficult. With LoRA, it is much easier and faster to finetune a diffusion model. It can run on hardware with as little as 11GB of GPU RAM, without resorting to tricks such as 8-bit optimizers.
+
+
+### Training [[text-to-image-training]]
+
+Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon.
+
+To get started, make sure the `MODEL_NAME` and `DATASET_NAME` environment variables are set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model on the Hub:
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
+export HUB_MODEL_ID="pokemon-lora"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+```
+
+There are some flags to be aware of before you start training:
+
+* `--push_to_hub` stores the trained LoRA embeddings on the Hub.
+* `--report_to=wandb` reports and logs the training results to your Weights & Biases dashboard (as an example, take a look at this [report](https://wandb.ai/pcuenq/text2image-fine-tune/run/b4k1w0tn?workspace=user-pcuenq)).
+* `--learning_rate=1e-04`; you can afford to use a higher learning rate than you normally would with LoRA.
+
+Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)):
+
+```bash
+accelerate launch train_text_to_image_lora.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME \
+  --output_dir=$OUTPUT_DIR \
+  --hub_model_id=$HUB_MODEL_ID \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --checkpointing_steps=100 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A pokemon with blue eyes" \
+  --validation_epochs=50 \
+  --seed="0" \
+  --push_to_hub
+```
+
+### Inference [[dreambooth-inference]]
+
+Now you can use the model for inference by loading the base model in the [`StableDiffusionPipeline`]:
+
+```py
+>>> import torch
+>>> from diffusers import StableDiffusionPipeline
+
+>>> model_base = "runwayml/stable-diffusion-v1-5"
+
+>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16)
+```
+
+Load the LoRA weights from your finetuned DreamBooth model *on top of the base model weights*, and then move the pipeline to a GPU for faster inference. When you merge the LoRA weights with the frozen pretrained model weights, you can optionally adjust how much of the weights to merge with the `scale` parameter:
+
+
+
+💡 A `scale` value of `0` is the same as not using your LoRA weights, so you're only using the base model weights, while a `scale` value of `1` means you're only using the fully finetuned LoRA weights. Values between 0 and 1 interpolate between the two sets of weights.
+
+
+
+```py
+>>> model_path = "path_to_your_lora_weights"  # hypothetical placeholder: your training output_dir or a Hub repo id
+>>> pipe.unet.load_attn_procs(model_path)
+>>> pipe.to("cuda")
+# use half of the weights from the LoRA finetuned model and half of the weights from the base model
+
+>>> image = pipe(
+... "A picture of a sks dog in a bucket.",
+... num_inference_steps=25,
+... guidance_scale=7.5,
+... cross_attention_kwargs={"scale": 0.5},
+... ).images[0]
+# use the weights from the fully finetuned LoRA model
+
+>>> image = pipe("A picture of a sks dog in a bucket.", num_inference_steps=25, guidance_scale=7.5).images[0]
+>>> image.save("bucket-dog.png")
+```
\ No newline at end of file
diff --git a/docs/source/ko/training/text2image.mdx b/docs/source/ko/training/text2image.mdx
new file mode 100644
index 000000000000..069388603124
--- /dev/null
+++ b/docs/source/ko/training/text2image.mdx
@@ -0,0 +1,224 @@
+
+
+
+# Text-to-image
+
+
+
+The text-to-image fine-tuning script is experimental. It's easy to overfit and run into issues like catastrophic forgetting. We recommend you explore different hyperparameters to get the best results on your dataset.
+
+
+
+Text-to-image models like Stable Diffusion generate an image from a text prompt. This guide shows you how to finetune the [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) model on your own dataset with PyTorch and Flax. All the training scripts for text-to-image finetuning used in this guide can be found in this [repository](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) if you're interested in taking a closer look.
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers.git
+pip install -U -r requirements.txt
+```
+
+And initialize a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment:
+
+```bash
+accelerate config
+```
+
+If you have already cloned the repository, you don't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
+
+### Hardware requirements
+
+Using `gradient_checkpointing` and `mixed_precision`, it should be possible to finetune the model on a single 24GB GPU. For a higher `batch_size` and faster training, it's better to use GPUs with more than 30GB of GPU memory. You can also use JAX or Flax for finetuning on TPUs or GPUs; see [below](#flax-jax-finetuning) for details.
+
+You can reduce your memory footprint even more by enabling memory-efficient attention with xFormers. Make sure you have [xFormers installed](./optimization/xformers) and pass the `--enable_xformers_memory_efficient_attention` flag to the training script.
+
+xFormers is not available for Flax.
+
+## Upload the model to the Hub
+
+Store your model on the Hub by adding the following argument to the training script:
+
+```bash
+ --push_to_hub
+```
+
+
+## Save and load checkpoints
+
+It is a good idea to regularly save checkpoints in case anything happens during training. To save a checkpoint, pass the following argument to the training script:
+
+```bash
+ --checkpointing_steps=500
+```
+
+Every 500 steps, the full training state is saved in a subfolder of `output_dir`. Checkpoint folder names begin with the prefix `checkpoint-` followed by the number of steps trained so far; for example, `checkpoint-1500` is a checkpoint saved after 1500 training steps.
+
+To load a checkpoint and resume training, pass the `--resume_from_checkpoint` argument to the training script and specify the checkpoint you want to resume from. For example, the following argument resumes training from the checkpoint saved after 1500 training steps:
+
+```bash
+ --resume_from_checkpoint="checkpoint-1500"
+```
+
+## Finetuning
+
+
+
+Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this:
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+accelerate launch train_text_to_image.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --dataset_name=$dataset_name \
+ --use_ema \
+ --resolution=512 --center_crop --random_flip \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --gradient_checkpointing \
+ --mixed_precision="fp16" \
+ --max_train_steps=15000 \
+ --learning_rate=1e-05 \
+ --max_grad_norm=1 \
+ --lr_scheduler="constant" --lr_warmup_steps=0 \
+ --output_dir="sd-pokemon-model"
+```
+
+To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder); a minimal sketch of loading such a folder follows below.
+
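+As a minimal sketch (the directory path and metadata file are illustrative), a local folder of images with captions can be loaded with the 🤗 Datasets `imagefolder` builder:
+
+```python
+from datasets import load_dataset
+
+# Hypothetical layout: the folder contains the images plus a metadata.jsonl file
+# with one {"file_name": ..., "text": ...} entry per image.
+dataset = load_dataset("imagefolder", data_dir="path_to_your_dataset", split="train")
+
+print(dataset[0]["image"], dataset[0]["text"])  # PIL image and its caption
+```
+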
+If you want to use your own custom loading logic, modify the script instead; we have left pointers in the appropriate places in the code to help you. 🤗 The example script below shows how to finetune on a local dataset in `TRAIN_DIR` and where to save the model with `OUTPUT_DIR`:
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+export OUTPUT_DIR="path_to_save_model"
+
+accelerate launch train_text_to_image.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --train_data_dir=$TRAIN_DIR \
+ --use_ema \
+ --resolution=512 --center_crop --random_flip \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --gradient_checkpointing \
+ --mixed_precision="fp16" \
+ --max_train_steps=15000 \
+ --learning_rate=1e-05 \
+ --max_grad_norm=1 \
+ --lr_scheduler="constant" --lr_warmup_steps=0 \
+ --output_dir=${OUTPUT_DIR}
+```
+
+
+
+Thanks to [@duongna211](https://github.com/duongna21)'s contribution, it's now possible to finetune a Stable Diffusion model faster on TPUs and GPUs using Flax. This is very efficient on TPU hardware but works great on GPUs too. The Flax training script doesn't yet support features like gradient checkpointing or gradient accumulation, so you'll need a GPU with at least 30GB of memory or a TPU v3.
+
+Before running the script, make sure you have the requirements installed:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+Then you can launch the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this:
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+python train_text_to_image_flax.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --dataset_name=$dataset_name \
+ --resolution=512 --center_crop --random_flip \
+ --train_batch_size=1 \
+ --max_train_steps=15000 \
+ --learning_rate=1e-05 \
+ --max_grad_norm=1 \
+ --output_dir="sd-pokemon-model"
+```
+
+To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
+
+If you want to use your own custom loading logic, modify the script instead; we have left pointers in the appropriate places in the code to help you. 🤗 The example script below shows how to finetune on a local dataset in `TRAIN_DIR`:
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export TRAIN_DIR="path_to_your_dataset"
+
+python train_text_to_image_flax.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --train_data_dir=$TRAIN_DIR \
+ --resolution=512 --center_crop --random_flip \
+ --train_batch_size=1 \
+ --mixed_precision="fp16" \
+ --max_train_steps=15000 \
+ --learning_rate=1e-05 \
+ --max_grad_norm=1 \
+ --output_dir="sd-pokemon-model"
+```
+
+
+
+## LoRA
+
+You can also use Low-Rank Adaptation of Large Language Models (LoRA), a fine-tuning technique for accelerating the training of large models, for text-to-image finetuning. For more details, take a look at the [LoRA training](lora#text-to-image) guide.
+
+## Inference
+
+Now you can load the finetuned model for inference by passing the model path, or the model name on the Hub, to the [`StableDiffusionPipeline`]:
+
+
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+
+```python
+import jax
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+from diffusers import FlaxStableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
+
+prompt = "yoda pokemon"
+prng_seed = jax.random.PRNGKey(0)
+num_inference_steps = 50
+
+num_samples = jax.device_count()
+prompt = num_samples * [prompt]
+prompt_ids = pipeline.prepare_inputs(prompt)
+
+# shard inputs and rng
+params = replicate(params)
+prng_seed = jax.random.split(prng_seed, jax.device_count())
+prompt_ids = shard(prompt_ids)
+
+images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+images[0].save("yoda-pokemon.png")
+```
+
+
\ No newline at end of file
From b8b5daaee30ecbecd7b901020008ffead443665d Mon Sep 17 00:00:00 2001
From: Ambrosiussen
Date: Mon, 22 May 2023 16:49:35 +0200
Subject: [PATCH 117/206] DataLoader respecting EXIF data in Training Images
(#3465)
* DataLoader will now bake in any transforms or image manipulations contained in the EXIF
Images may have rotations stored in EXIF. Training using such images will cause those transforms to be ignored while training and thus produce unexpected results
* Fixed the Dataloading EXIF issue in main DreamBooth training as well
* Run make style (black & isort)
---
examples/dreambooth/train_dreambooth.py | 23 ++++++++++++--------
examples/dreambooth/train_dreambooth_lora.py | 23 ++++++++++++--------
2 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index efcfb39ab4c4..53d9c269f3e7 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -27,19 +27,13 @@
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import diffusers
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, model_info, upload_folder
-from packaging import version
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
-
-import diffusers
from diffusers import (
AutoencoderKL,
DDPMScheduler,
@@ -50,6 +44,13 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available
+from huggingface_hub import create_repo, model_info, upload_folder
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
if is_wandb_available():
@@ -607,6 +608,8 @@ def __len__(self):
def __getitem__(self, index):
example = {}
instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+ instance_image = exif_transpose(instance_image)
+
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
@@ -622,6 +625,8 @@ def __getitem__(self, index):
if self.class_data_root:
class_image = Image.open(self.class_images_path[index % self.num_class_images])
+ class_image = exif_transpose(class_image)
+
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index bfbf3603e8d0..659b0d3e1d88 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -27,19 +27,13 @@
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import diffusers
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
-from packaging import version
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
-
-import diffusers
from diffusers import (
AutoencoderKL,
DDPMScheduler,
@@ -59,6 +53,13 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
@@ -508,6 +509,8 @@ def __len__(self):
def __getitem__(self, index):
example = {}
instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+ instance_image = exif_transpose(instance_image)
+
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
@@ -523,6 +526,8 @@ def __getitem__(self, index):
if self.class_data_root:
class_image = Image.open(self.class_images_path[index % self.num_class_images])
+ class_image = exif_transpose(class_image)
+
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
From 2b56e8ca6810d40b820592a9673b538705141482 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 16:49:46 +0200
Subject: [PATCH 118/206] make style
---
examples/dreambooth/train_dreambooth.py | 20 ++++++++++----------
examples/dreambooth/train_dreambooth_lora.py | 20 ++++++++++----------
2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 53d9c269f3e7..ad43ee7aeee2 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -27,13 +27,20 @@
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import diffusers
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, model_info, upload_folder
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
from diffusers import (
AutoencoderKL,
DDPMScheduler,
@@ -44,13 +51,6 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import create_repo, model_info, upload_folder
-from packaging import version
-from PIL import Image
-from PIL.ImageOps import exif_transpose
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
if is_wandb_available():
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index 659b0d3e1d88..e640542e36da 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -27,13 +27,20 @@
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import diffusers
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
from diffusers import (
AutoencoderKL,
DDPMScheduler,
@@ -53,13 +60,6 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import create_repo, upload_folder
-from packaging import version
-from PIL import Image
-from PIL.ImageOps import exif_transpose
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
From f3d570c273561b7f92a1ab55e6c846bb73c19a29 Mon Sep 17 00:00:00 2001
From: Hari Krishna <37787894+hari10599@users.noreply.github.com>
Date: Mon, 22 May 2023 20:41:08 +0530
Subject: [PATCH 119/206] feat: allow disk offload for diffuser models (#3285)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* allow disk offload for diffuser models
* sort import
* add max_memory argument
* Changed sample[0] to images[0] (#3304)
A pipeline object stores the results in `images` not in `sample`.
Current code blocks don't work.
* Typo in tutorial (#3295)
* Torch compile graph fix (#3286)
* fix more
* Fix more
* fix more
* Apply suggestions from code review
* fix
* make style
* make fix-copies
* fix
* make sure torch compile
* Clean
* fix test
* Postprocessing refactor img2img (#3268)
* refactor img2img VaeImageProcessor.postprocess
* remove copy from for init, run_safety_checker, decode_latents
Co-authored-by: Sayak Paul
---------
Co-authored-by: yiyixuxu
Co-authored-by: Sayak Paul
* [Torch 2.0 compile] Fix more torch compile breaks (#3313)
* Fix more torch compile breaks
* add tests
* Fix all
* fix controlnet
* fix more
* Add Horace He as co-author.
>
>
Co-authored-by: Horace He
* Add Horace He as co-author.
Co-authored-by: Horace He
---------
Co-authored-by: Horace He
* fix: scale_lr and sync example readme and docs. (#3299)
* fix: scale_lr and sync example readme and docs.
* fix doc link.
* Update stable_diffusion.mdx (#3310)
fixed import statement
* Fix missing variable assign in DeepFloyd-IF-II (#3315)
Fix missing variable assign
lol
* Correct doc build for patch releases (#3316)
Update build_documentation.yml
* Add Stable Diffusion RePaint to community pipelines (#3320)
* Add Stable Diffsuion RePaint to community pipelines
- Adds Stable Diffsuion RePaint to community pipelines
- Add Readme enty for pipeline
* Fix: Remove wrong import
- Remove wrong import
- Minor change in comments
* Fix: Code formatting of stable_diffusion_repaint
* Fix: ruff errors in stable_diffusion_repaint
* Fix multistep dpmsolver for cosine schedule (suitable for deepfloyd-if) (#3314)
* fix multistep dpmsolver for cosine schedule (deepfloy-if)
* fix a typo
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
Co-authored-by: Patrick von Platen
* update all dpmsolver (singlestep, multistep, dpm, dpm++) for cosine noise schedule
* add test, fix style
---------
Co-authored-by: Patrick von Platen
* [docs] Improve LoRA docs (#3311)
* update docs
* add to toctree
* apply feedback
* Added input pretubation (#3292)
* Added input pretubation
* Fixed spelling
* Update write_own_pipeline.mdx (#3323)
* update controlling generation doc with latest goodies. (#3321)
* [Quality] Make style (#3341)
* Fix config dpm (#3343)
* Add the SDE variant of DPM-Solver and DPM-Solver++ (#3344)
* add SDE variant of DPM-Solver and DPM-Solver++
* add test
* fix typo
* fix typo
* Add upsample_size to AttnUpBlock2D, AttnDownBlock2D (#3275)
The argument `upsample_size` needs to be added to these modules to allow compatibility with other blocks that require this argument.
* Rename --only_save_embeds to --save_as_full_pipeline (#3206)
* Set --only_save_embeds to False by default
Due to how the option is named, it makes more sense to behave like this.
* Refactor only_save_embeds to save_as_full_pipeline
* [AudioLDM] Generalise conversion script (#3328)
Co-authored-by: Patrick von Platen
* Fix TypeError when using prompt_embeds and negative_prompt (#2982)
* test: Added test case
* fix: fixed type checking issue on _encode_prompt
* fix: fixed copies consistency
* fix: one copy was not sufficient
* Fix pipeline class on README (#3345)
Update README.md
* Inpainting: typo in docs (#3331)
Typo in docs
Co-authored-by: Patrick von Platen
* Add `use_Karras_sigmas` to LMSDiscreteScheduler (#3351)
* add karras sigma to lms discrete scheduler
* add test for lms_scheduler karras
* reformat test lms
* Batched load of textual inversions (#3277)
* Batched load of textual inversions
- Only call resize_token_embeddings once per batch as it is the most expensive operation
- Allow pretrained_model_name_or_path and token to be an optional list
- Remove Dict from type annotation pretrained_model_name_or_path as it was not supported in this function
- Add comment that single files (e.g. .pt/.safetensors) are supported
- Add comment for token parameter
- Convert token override log message from warning to info
* Update src/diffusers/loaders.py
Check for duplicate tokens
Co-authored-by: Patrick von Platen
* Update condition for None tokens
---------
Co-authored-by: Patrick von Platen
* make fix-copies
* [docs] Fix docstring (#3334)
fix docstring
Co-authored-by: Patrick von Platen
* if dreambooth lora (#3360)
* update IF stage I pipelines
add fixed variance schedulers and lora loading
* added kv lora attn processor
* allow loading into alternative lora attn processor
* make vae optional
* throw away predicted variance
* allow loading into added kv lora layer
* allow load T5
* allow pre compute text embeddings
* set new variance type in schedulers
* fix copies
* refactor all prompt embedding code
class prompts are now included in pre-encoding code
max tokenizer length is now configurable
embedding attention mask is now configurable
* fix for when variance type is not defined on scheduler
* do not pre compute validation prompt if not present
* add example test for if lora dreambooth
* add check for train text encoder and pre compute text embeddings
* Postprocessing refactor all others (#3337)
* add text2img
* fix-copies
* add
* add all other pipelines
* add
* add
* add
* add
* add
* make style
* style + fix copies
---------
Co-authored-by: yiyixuxu
* [docs] Improve safetensors docstring (#3368)
* clarify safetensor docstring
* fix typo
* apply feedback
* add: a warning message when using xformers in a PT 2.0 env. (#3365)
* add: a warning message when using xformers in a PT 2.0 env.
* Apply suggestions from code review
Co-authored-by: Patrick von Platen
---------
Co-authored-by: Patrick von Platen
* StableDiffusionInpaintingPipeline - resize image w.r.t height and width (#3322)
* StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t to passed input height and width. Default is already set to 512. This addresses the common tensor mismatch error. Also moved type check into relevant funciton to keep main pipeline body tidy.
* Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests
Due to previous commit these tests were failing as height and width need to be passed into the prepare_mask_and_masked_image function, I have updated the code and added a height/width variable per unit test as it seemed more appropriate than the current hard coded solution
* Added a resolution test to StableDiffusionInpaintPipelineSlowTests
this unit test simply gets the input and resizes it into some that would fail (e.g. would throw a tensor mismatch error/not a mult of 8). Then passes it through the pipeline and verifies it produces output with correct dims w.r.t the passed height and width
---------
Co-authored-by: Patrick von Platen
* make style
* [docs] Adapt a model (#3326)
* first draft
* apply feedback
* conv_in.weight thrown away
* [docs] Load safetensors (#3333)
* safetensors
* apply feedback
* apply feedback
* Apply suggestions from code review
---------
Co-authored-by: Patrick von Platen
* make style
* [Docs] Fix stable_diffusion.mdx typo (#3398)
Fix typo in last code block. Correct "prommpts" to "prompt"
* Support ControlNet v1.1 shuffle properly (#3340)
* add inferring_controlnet_cond_batch
* Revert "add inferring_controlnet_cond_batch"
This reverts commit abe8d6311d4b7f5b9409ca709c7fabf80d06c1a9.
* set guess_mode to True
whenever global_pool_conditions is True
Co-authored-by: Patrick von Platen
* nit
* add integration test
---------
Co-authored-by: Patrick von Platen
* [Tests] better determinism (#3374)
* enable deterministic pytorch and cuda operations.
* disable manual seeding.
* make style && make quality for unet_2d tests.
* enable determinism for the unet2dconditional model.
* add CUBLAS_WORKSPACE_CONFIG for better reproducibility.
* relax tolerance (very weird issue, though).
* revert to torch manual_seed() where needed.
* relax more tolerance.
* better placement of the cuda variable and relax more tolerance.
* enable determinism for 3d condition model.
* relax tolerance.
* add: determinism to alt_diffusion.
* relax tolerance for alt diffusion.
* dance diffusion.
* dance diffusion is flaky.
* test_dict_tuple_outputs_equivalent edit.
* fix two more tests.
* fix more ddim tests.
* fix: argument.
* change to diff in place of difference.
* fix: test_save_load call.
* test_save_load_float16 call.
* fix: expected_max_diff
* fix: paint by example.
* relax tolerance.
* add determinism to 1d unet model.
* torch 2.0 regressions seem to be brutal
* determinism to vae.
* add reason to skipping.
* up tolerance.
* determinism to vq.
* determinism to cuda.
* determinism to the generic test pipeline file.
* refactor general pipelines testing a bit.
* determinism to alt diffusion i2i
* up tolerance for alt diff i2i and audio diff
* up tolerance.
* determinism to audioldm
* increase tolerance for audioldm lms.
* increase tolerance for paint by paint.
* increase tolerance for repaint.
* determinism to cycle diffusion and sd 1.
* relax tol for cycle diffusion 🚲
* relax tol for sd 1.0
* relax tol for controlnet.
* determinism to img var.
* relax tol for img variation.
* tolerance to i2i sd
* make style
* determinism to inpaint.
* relax tolerance for inpaiting.
* determinism for inpainting legacy
* relax tolerance.
* determinism to instruct pix2pix
* determinism to model editing.
* model editing tolerance.
* panorama determinism
* determinism to pix2pix zero.
* determinism to sag.
* sd 2. determinism
* sd. tolerance
* disallow tf32 matmul.
* relax tolerance is all you need.
* make style and determinism to sd 2 depth
* relax tolerance for depth.
* tolerance to diffedit.
* tolerance to sd 2 inpaint.
* up tolerance.
* determinism in upscaling.
* tolerance in upscaler.
* more tolerance relaxation.
* determinism to v pred.
* up tol for v_pred
* unclip determinism
* determinism to unclip img2img
* determinism to text to video.
* determinism to last set of tests
* up tol.
* vq cumsum doesn't have a deterministic kernel
* relax tol
* relax tol
* [docs] Add transformers to install (#3388)
add transformers to install
* [deepspeed] partial ZeRO-3 support (#3076)
* [deepspeed] partial ZeRO-3 support
* cleanup
* improve deepspeed fixes
* Improve
* make style
---------
Co-authored-by: Patrick von Platen
* Add omegaconf for tests (#3400)
Add omegaconfg
* Fix various bugs with LoRA Dreambooth and Dreambooth script (#3353)
* Improve checkpointing lora
* fix more
* Improve doc string
* Update src/diffusers/loaders.py
* make stytle
* Apply suggestions from code review
* Update src/diffusers/loaders.py
* Apply suggestions from code review
* Apply suggestions from code review
* better
* Fix all
* Fix multi-GPU dreambooth
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* Fix all
* make style
* make style
---------
Co-authored-by: Pedro Cuenca
* Fix docker file (#3402)
* up
* up
* fix: deepseepd_plugin retrieval from accelerate state (#3410)
* [Docs] Add `sigmoid` beta_scheduler to docstrings of relevant Schedulers (#3399)
* Add `sigmoid` beta scheduler to `DDPMScheduler` docstring
* Add `sigmoid` beta scheduler to `RePaintScheduler` docstring
---------
Co-authored-by: Patrick von Platen
* Don't install accelerate and transformers from source (#3415)
* Don't install transformers and accelerate from source (#3414)
* Improve fast tests (#3416)
Update pr_tests.yml
* attention refactor: the trilogy (#3387)
* Replace `AttentionBlock` with `Attention`
* use _from_deprecated_attn_block check re: @patrickvonplaten
* [Docs] update the PT 2.0 optimization doc with latest findings (#3370)
* add: benchmarking stats for A100 and V100.
* Apply suggestions from code review
Co-authored-by: Patrick von Platen
* address patrick's comments.
* add: rtx 4090 stats
* ⚔ benchmark reports done
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* 3313 pr link.
* add: plots.
Co-authored-by: Pedro
* fix formattimg
* update number percent.
---------
Co-authored-by: Patrick von Platen
Co-authored-by: Pedro Cuenca
* Fix style rendering (#3433)
* Fix style rendering.
* Fix typo
* unCLIP scheduler do not use note (#3417)
* Replace deprecated command with environment file (#3409)
Co-authored-by: Patrick von Platen
* fix warning message pipeline loading (#3446)
* add stable diffusion tensorrt img2img pipeline (#3419)
* add stable diffusion tensorrt img2img pipeline
Signed-off-by: Asfiya Baig
* update docstrings
Signed-off-by: Asfiya Baig
---------
Signed-off-by: Asfiya Baig
* Refactor controlnet and add img2img and inpaint (#3386)
* refactor controlnet and add img2img and inpaint
* First draft to get pipelines to work
* make style
* Fix more
* Fix more
* More tests
* Fix more
* Make inpainting work
* make style and more tests
* Apply suggestions from code review
* up
* make style
* Fix imports
* Fix more
* Fix more
* Improve examples
* add test
* Make sure import is correctly deprecated
* Make sure everything works in compile mode
* make sure authorship is correctly attributed
* [Scheduler] DPM-Solver (++) Inverse Scheduler (#3335)
* Add DPM-Solver Multistep Inverse Scheduler
* Add draft tests for DiffEdit
* Add inverse sde-dpmsolver steps to tune image diversity from inverted latents
* Fix tests
---------
Co-authored-by: Patrick von Platen
* [Docs] Fix incomplete docstring for resnet.py (#3438)
Fix incomplete docstrings for resnet.py
* fix tiled vae blend extent range (#3384)
fix tiled vae blend extent range
* Small update to "Next steps" section (#3443)
Small update to "Next steps" section:
- PyTorch 2 is recommended.
- Updated improvement figures.
* Allow arbitrary aspect ratio in IFSuperResolutionPipeline (#3298)
* Update pipeline_if_superresolution.py
Allow arbitrary aspect ratio in IFSuperResolutionPipeline by using the input image shape
* IFSuperResolutionPipeline: allow the user to override the height and width through the arguments
* update IFSuperResolutionPipeline width/height doc string to match StableDiffusionInpaintPipeline conventions
---------
Co-authored-by: Patrick von Platen
* Adding 'strength' parameter to StableDiffusionInpaintingPipeline (#3424)
* Added explanation of 'strength' parameter
* Added get_timesteps function which relies on new strength parameter
* Added `strength` parameter which defaults to 1.
* Swapped ordering so `noise_timestep` can be calculated before masking the image
this is required when you aren't applying 100% noise to the masked region, e.g. strength < 1.
* Added strength to check_inputs, throws error if out of range
* Changed `prepare_latents` to initialise latents w.r.t strength
Inspired by the stable diffusion img2img pipeline: the init latents are initialised by converting the init image into a VAE latent and adding noise based upon the strength parameter passed in, i.e. pure random noise when strength = 1 and the init image itself at strength = 0 (see the sketch after this commit's message).
* WIP: Added a unit test for the new strength parameter in the StableDiffusionInpaintingPipeline
still need to add correct regression values
* Created an is_strength_max flag to initialise from pure random noise
* Updated unit tests w.r.t new strength parameter + fixed new strength unit test
* renamed parameter to avoid confusion with variable of same name
* Updated regression values for new strength test - now passes
* removed 'copied from' comment as this method is now different and divergent from the copy
* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
Co-authored-by: Patrick von Platen
* Ensure backwards compatibility for prepare_mask_and_masked_image
created a return_image boolean and initialised to false
* Ensure backwards compatibility for prepare_latents
* Fixed copy check typo
* Fixes w.r.t backward compatibility changes
* make style
* keep function argument ordering same for backwards compatibility in callees with copied from statements
* make fix-copies
---------
Co-authored-by: Patrick von Platen
Co-authored-by: William Berman
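A hedged sketch of the strength-based initialisation described in the commit above (the function and argument names are illustrative, not the pipeline's actual helpers; `scheduler.set_timesteps(num_inference_steps)` is assumed to have been called already):
```py
import torch

def prepare_latents_with_strength(image, strength, num_inference_steps, vae, scheduler):
    # Trim the schedule exactly as img2img does: only the last `strength` fraction of steps is run.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    timesteps = scheduler.timesteps[t_start:]

    # Encode the init image and noise it proportionally to `strength`.
    init_latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
    noise = torch.randn_like(init_latents)  # the real pipeline draws this via diffusers' randn_tensor and a user generator

    if strength >= 1.0:
        latents = noise * scheduler.init_noise_sigma                       # previous behaviour: start from pure noise
    else:
        latents = scheduler.add_noise(init_latents, noise, timesteps[:1])  # partially noised init image
    return latents, timesteps
```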
* [WIP] Bugfix - Pipeline.from_pretrained is broken when the pipeline is partially downloaded (#3448)
Added bugfix using f strings.
* Fix gradient checkpointing bugs in freezing part of models (requires_grad=False) (#3404)
* gradient checkpointing bug fix
* bug fix; changes for reviews
* reformat
* reformat
---------
Co-authored-by: Patrick von Platen
* Make dreambooth lora more robust to orig unet (#3462)
* Make dreambooth lora more robust to orig unet
* up
* Reduce peak VRAM by releasing large attention tensors (as soon as they're unnecessary) (#3463)
Release large tensors in attention (as soon as they're no longer required). Reduces peak VRAM by nearly 2 GB for 1024x1024 (even after slicing), and the savings scale up with image size.
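A minimal sketch of the pattern (not the library's exact `AttnProcessor` code): score-sized intermediates are `del`eted as soon as they have been consumed, so the allocator can reuse their memory before the next large allocation.
```py
import torch

def get_attention_probs_low_peak(query, key, scale):
    # baddbmm needs an `input` tensor only for its shape/dtype/device; beta=0 ignores its values.
    baddbmm_input = torch.empty(
        query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
    )
    scores = torch.baddbmm(baddbmm_input, query, key.transpose(-1, -2), beta=0, alpha=scale)
    del baddbmm_input  # no longer needed once the scores exist; free it before the softmax allocates
    probs = scores.softmax(dim=-1)
    del scores         # the raw scores are the largest intermediate; release them before the value matmul
    return probs
```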
* Add min snr to text2img lora training script (#3459)
add min snr to text2img lora training script
* Add inpaint lora scale support (#3460)
* add inpaint lora scale support
* add inpaint lora scale test
---------
Co-authored-by: yueyang.hyy
* [From ckpt] Fix from_ckpt (#3466)
* Correct from_ckpt
* make style
* Update full dreambooth script to work with IF (#3425)
* Add IF dreambooth docs (#3470)
* parameterize pass single args through tuple (#3477)
* attend and excite tests disable determinism on the class level (#3478)
* dreambooth docs torch.compile note (#3471)
* dreambooth docs torch.compile note
* Update examples/dreambooth/README.md
Co-authored-by: Sayak Paul
* Update examples/dreambooth/README.md
Co-authored-by: Pedro Cuenca
---------
Co-authored-by: Sayak Paul
Co-authored-by: Pedro Cuenca
* add: if entry in the dreambooth training docs. (#3472)
* [docs] Textual inversion inference (#3473)
* add textual inversion inference to docs
* add to toctree
---------
Co-authored-by: Sayak Paul
* [docs] Distributed inference (#3376)
* distributed inference
* move to inference section
* apply feedback
* update with split_between_processes
* apply feedback
* [{Up,Down}sample1d] explicit view kernel size as number elements in flattened indices (#3479)
explicit view kernel size as number elements in flattened indices
* mps & onnx tests rework (#3449)
* Remove ONNX tests from PR.
They are already a part of push_tests.yml.
* Remove mps tests from PRs.
They are already performed on push.
* Fix workflow name for fast push tests.
* Extract mps tests to a workflow.
For better control/filtering.
* Remove --extra-index-url from mps tests
* Increase tolerance of mps test
This test passes in my Mac (Ventura 13.3) but fails in the CI hardware
(Ventura 13.2). I ran the local tests following the same steps that
exist in the CI workflow.
* Temporarily run mps tests on pr
So we can test.
* Revert "Temporarily run mps tests on pr"
Tests passed, go back to running on push.
---------
Signed-off-by: Asfiya Baig
Co-authored-by: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com>
Co-authored-by: Patrick von Platen
Co-authored-by: YiYi Xu
Co-authored-by: yiyixuxu
Co-authored-by: Sayak Paul
Co-authored-by: Horace He
Co-authored-by: Umar <55330742+mu94-csl@users.noreply.github.com>
Co-authored-by: Mylo <36931363+gitmylo@users.noreply.github.com>
Co-authored-by: Markus Pobitzer
Co-authored-by: Cheng Lu
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Isamu Isozaki
Co-authored-by: Cesar Aybar
Co-authored-by: Will Rice
Co-authored-by: Adrià Arrufat <1671644+arrufat@users.noreply.github.com>
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Co-authored-by: At-sushi
Co-authored-by: Lucca Zenóbio
Co-authored-by: Lysandre Debut
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: pdoane
Co-authored-by: Will Berman
Co-authored-by: yiyixuxu
Co-authored-by: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com>
Co-authored-by: sudowind
Co-authored-by: Takuma Mori
Co-authored-by: Stas Bekman
Co-authored-by: Pedro Cuenca
Co-authored-by: Laureηt
Co-authored-by: Jongwoo Han
Co-authored-by: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com>
Co-authored-by: clarencechen
Co-authored-by: Laureηt
Co-authored-by: superlabs-dev <133080491+superlabs-dev@users.noreply.github.com>
Co-authored-by: Dev Aggarwal
Co-authored-by: Vimarsh Chaturvedi
Co-authored-by: 7eu7d7 <31194890+7eu7d7@users.noreply.github.com>
Co-authored-by: cmdr2
Co-authored-by: wfng92 <43742196+wfng92@users.noreply.github.com>
Co-authored-by: Glaceon-Hyy
Co-authored-by: yueyang.hyy
---
src/diffusers/models/modeling_utils.py | 25 ++++++++++++++++++++++-
src/diffusers/pipelines/pipeline_utils.py | 21 +++++++++++++++++++
2 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index e7cfcd71062f..c9fabf93253b 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -398,6 +398,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
more information about each option see [designing a device
map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+ max_memory (`Dict`, *optional*):
+ A dictionary mapping device identifiers to their maximum memory. Will default to the maximum memory available for each
+ GPU and the available CPU RAM if unset.
+ offload_folder (`str` or `os.PathLike`, *optional*):
+ If the `device_map` contains any value `"disk"`, the folder where the weights will be offloaded.
+ offload_state_dict (`bool`, *optional*):
+ If `True`, will temporarily offload the CPU state dict to the hard drive to avoid running out of CPU
+ RAM if the weight of the CPU state dict plus the biggest shard of the checkpoint does not fit. Defaults to
+ `True` when there is some disk offload.
low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
@@ -439,6 +448,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
torch_dtype = kwargs.pop("torch_dtype", None)
subfolder = kwargs.pop("subfolder", None)
device_map = kwargs.pop("device_map", None)
+ max_memory = kwargs.pop("max_memory", None)
+ offload_folder = kwargs.pop("offload_folder", None)
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None)
@@ -510,6 +522,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
revision=revision,
subfolder=subfolder,
device_map=device_map,
+ max_memory=max_memory,
+ offload_folder=offload_folder,
+ offload_state_dict=offload_state_dict,
user_agent=user_agent,
**kwargs,
)
@@ -614,7 +629,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
else: # else let accelerate handle loading and dispatching.
# Load weights and dispatch according to the device_map
# by default the device_map is None and the weights are loaded on the CPU
- accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)
+ accelerate.load_checkpoint_and_dispatch(
+ model,
+ model_file,
+ device_map,
+ max_memory=max_memory,
+ offload_folder=offload_folder,
+ offload_state_dict=offload_state_dict,
+ dtype=torch_dtype,
+ )
loading_info = {
"missing_keys": [],
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index fa71a181f521..aed1139a2a16 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -354,6 +354,9 @@ def load_sub_model(
provider: Any,
sess_options: Any,
device_map: Optional[Union[Dict[str, torch.device], str]],
+ max_memory: Optional[Dict[Union[int, str], Union[int, str]]],
+ offload_folder: Optional[Union[str, os.PathLike]],
+ offload_state_dict: bool,
model_variants: Dict[str, str],
name: str,
from_flax: bool,
@@ -416,6 +419,9 @@ def load_sub_model(
# This makes sure that the weights won't be initialized which significantly speeds up loading.
if is_diffusers_model or is_transformers_model:
loading_kwargs["device_map"] = device_map
+ loading_kwargs["max_memory"] = max_memory
+ loading_kwargs["offload_folder"] = offload_folder
+ loading_kwargs["offload_state_dict"] = offload_state_dict
loading_kwargs["variant"] = model_variants.pop(name, None)
if from_flax:
loading_kwargs["from_flax"] = True
@@ -808,6 +814,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
more information about each option see [designing a device
map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+ max_memory (`Dict`, *optional*):
+ A dictionary mapping device identifiers to their maximum memory. Will default to the maximum memory available for each
+ GPU and the available CPU RAM if unset.
+ offload_folder (`str` or `os.PathLike`, *optional*):
+ If the `device_map` contains any value `"disk"`, the folder where the weights will be offloaded.
+ offload_state_dict (`bool`, *optional*):
+ If `True`, will temporarily offload the CPU state dict to the hard drive to avoid running out of CPU
+ RAM if the weight of the CPU state dict plus the biggest shard of the checkpoint does not fit. Defaults to
+ `True` when there is some disk offload.
low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
@@ -873,6 +888,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
provider = kwargs.pop("provider", None)
sess_options = kwargs.pop("sess_options", None)
device_map = kwargs.pop("device_map", None)
+ max_memory = kwargs.pop("max_memory", None)
+ offload_folder = kwargs.pop("offload_folder", None)
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
@@ -1046,6 +1064,9 @@ def load_module(name, value):
provider=provider,
sess_options=sess_options,
device_map=device_map,
+ max_memory=max_memory,
+ offload_folder=offload_folder,
+ offload_state_dict=offload_state_dict,
model_variants=model_variants,
name=name,
from_flax=from_flax,
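And at the pipeline level, where `load_sub_model` now passes the same kwargs to every diffusers/transformers sub-model. A sketch under the assumption that a single low-memory GPU plus disk offload is the target:
```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory={0: "4GiB", "cpu": "12GiB"},
    offload_folder="./sd-offload",
    offload_state_dict=True,
)
image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```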
From c4359d63e32119081be877ec0affa61df5e6630b Mon Sep 17 00:00:00 2001
From: takuoko
Date: Tue, 23 May 2023 00:21:54 +0900
Subject: [PATCH 120/206] [Community] reference only control (#3435)
* add reference only control
* add reference only control
* add reference only control
* fix lint
* fix lint
* reference adain
* bugfix EulerAncestralDiscreteScheduler
* fix style fidelity rule
* fix default output size
* del unused line
* fix deterministic
---
examples/community/README.md | 43 +
.../community/stable_diffusion_reference.py | 774 ++++++++++++++++++
2 files changed, 817 insertions(+)
create mode 100644 examples/community/stable_diffusion_reference.py
diff --git a/examples/community/README.md b/examples/community/README.md
index 47b129ce9e7e..974f77fd1011 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -1320,3 +1320,46 @@ prompt = "photorealistic new zealand hills"
image = pipe(prompt, image=input_image, strength=0.75,).images[0]
image.save('tensorrt_img2img_new_zealand_hills.png')
```
+
+### Stable Diffusion Reference
+
+This pipeline uses Reference-only Control. Refer to the [sd-webui-controlnet discussion](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236).
+
+
+```py
+import torch
+from diffusers import UniPCMultistepScheduler
+from diffusers.utils import load_image
+
+input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
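+# StableDiffusionReferencePipeline is implemented in examples/community/stable_diffusion_reference.py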
+pipe = StableDiffusionReferencePipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ safety_checker=None,
+ torch_dtype=torch.float16
+ ).to('cuda:0')
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+result_img = pipe(ref_image=input_image,
+ prompt="1girl",
+ num_inference_steps=20,
+ reference_attn=True,
+ reference_adain=True).images[0]
+```
+
+Reference Image
+
+
+
+Output Image of `reference_attn=True` and `reference_adain=False`
+
+
+
+Output Image of `reference_attn=False` and `reference_adain=True`
+
+
+
+Output Image of `reference_attn=True` and `reference_adain=True`
+
+
diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py
new file mode 100644
index 000000000000..5e8051cdcdb2
--- /dev/null
+++ b/examples/community/stable_diffusion_reference.py
@@ -0,0 +1,774 @@
+# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from diffusers import StableDiffusionPipeline
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import PIL_INTERPOLATION, logging, randn_tensor
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers import UniPCMultistepScheduler
+ >>> from diffusers.utils import load_image
+
+ >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+ >>> pipe = StableDiffusionReferencePipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ safety_checker=None,
+ torch_dtype=torch.float16
+ ).to('cuda:0')
+
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+ >>> result_img = pipe(ref_image=input_image,
+ prompt="1girl",
+ num_inference_steps=20,
+ reference_attn=True,
+ reference_adain=True).images[0]
+
+ >>> result_img.show()
+ ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
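+ # Depth-first traversal: return `model` followed by every descendant module (used below to find the blocks whose forwards get patched).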
+ result = [model]
+ for child in model.children():
+ result += torch_dfs(child)
+ return result
+
+
+class StableDiffusionReferencePipeline(StableDiffusionPipeline):
+ def _default_height_width(self, height, width, image):
+ # NOTE: It is possible that a list of images have different
+ # dimensions for each image, so just checking the first image
+ # is not _exactly_ correct, but it is simple.
+ while isinstance(image, list):
+ image = image[0]
+
+ if height is None:
+ if isinstance(image, PIL.Image.Image):
+ height = image.height
+ elif isinstance(image, torch.Tensor):
+ height = image.shape[2]
+
+ height = (height // 8) * 8 # round down to nearest multiple of 8
+
+ if width is None:
+ if isinstance(image, PIL.Image.Image):
+ width = image.width
+ elif isinstance(image, torch.Tensor):
+ width = image.shape[3]
+
+ width = (width // 8) * 8 # round down to nearest multiple of 8
+
+ return height, width
+
+ def prepare_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ if not isinstance(image, torch.Tensor):
+ if isinstance(image, PIL.Image.Image):
+ image = [image]
+
+ if isinstance(image[0], PIL.Image.Image):
+ images = []
+
+ for image_ in image:
+ image_ = image_.convert("RGB")
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+ image_ = np.array(image_)
+ image_ = image_[None, :]
+ images.append(image_)
+
+ image = images
+
+ image = np.concatenate(image, axis=0)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = (image - 0.5) / 0.5
+ image = image.transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ elif isinstance(image[0], torch.Tensor):
+ image = torch.cat(image, dim=0)
+
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, dim=0)
+
+ image = image.to(device=device, dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = torch.cat([image] * 2)
+
+ return image
+
+ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
+ refimage = refimage.to(device=device, dtype=dtype)
+
+ # encode the reference image into latent space so it can be reused during denoising
+ if isinstance(generator, list):
+ ref_image_latents = [
+ self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ ref_image_latents = torch.cat(ref_image_latents, dim=0)
+ else:
+ ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+ ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+ # duplicate ref_image_latents for each generation per prompt, using mps friendly method
+ if ref_image_latents.shape[0] < batch_size:
+ if not batch_size % ref_image_latents.shape[0] == 0:
+ raise ValueError(
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+ f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
+ )
+ ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
+
+ ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
+
+ # aligning device to prevent device errors when concatenating it with the latent model input
+ ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+ return ref_image_latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ ref_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ attention_auto_machine_weight: float = 1.0,
+ gn_auto_machine_weight: float = 1.0,
+ style_fidelity: float = 0.5,
+ reference_attn: bool = True,
+ reference_adain: bool = True,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ ref_image (`torch.FloatTensor`, `PIL.Image.Image`):
+ The Reference Control input condition. Reference Control uses this input condition to generate guidance for the UNet. If
+ the type is specified as `torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can
+ also be accepted as an image.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ attention_auto_machine_weight (`float`):
+ Weight of using the reference query for the self-attention context.
+ If attention_auto_machine_weight=1.0, the reference query is used for every self-attention layer.
+ gn_auto_machine_weight (`float`):
+ Weight of using reference AdaIN. If gn_auto_machine_weight=2.0, all reference AdaIN plugins are used.
+ style_fidelity (`float`):
+ Style fidelity of ref_uncond_xt. If style_fidelity=1.0 the control is more important;
+ if style_fidelity=0.0 the prompt is more important; values in between balance the two.
+ reference_attn (`bool`):
+ Whether to use reference query for self attention's context.
+ reference_adain (`bool`):
+ Whether to use reference adain.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True."
+
+ # 0. Default height and width to unet
+ height, width = self._default_height_width(height, width, ref_image)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Preprocess reference image
+ ref_image = self.prepare_image(
+ image=ref_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=prompt_embeds.dtype,
+ )
+
+ # 5. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 6. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 7. Prepare reference latent variables
+ ref_image_latents = self.prepare_ref_latents(
+ ref_image,
+ batch_size * num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ do_classifier_free_guidance,
+ )
+
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 9. Modify self attention and group norm
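+ # The patched forwards below implement the reference-only trick: during a "write" pass the UNet is run
+ # on the noised reference latents and each block caches its self-attention hidden states (and, for AdaIN,
+ # its group-norm mean/var statistics); during the following "read" pass those caches are injected to steer
+ # the denoising of the actual latents (`style_fidelity` controls how the cached features are blended in
+ # when classifier-free guidance is used).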
+ MODE = "write"
+ uc_mask = (
+ torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
+ .type_as(ref_image_latents)
+ .bool()
+ )
+
+ def hacked_basic_transformer_inner_forward(
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ class_labels=None,
+ ):
+ if self.use_ada_layer_norm:
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.use_ada_layer_norm_zero:
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ else:
+ norm_hidden_states = self.norm1(hidden_states)
+
+ # 1. Self-Attention
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+ if self.only_cross_attention:
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ else:
+ if MODE == "write":
+ self.bank.append(norm_hidden_states.detach().clone())
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ if MODE == "read":
+ if attention_auto_machine_weight > self.attn_weight:
+ attn_output_uc = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1),
+ # attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ attn_output_c = attn_output_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ attn_output_c[uc_mask] = self.attn1(
+ norm_hidden_states[uc_mask],
+ encoder_hidden_states=norm_hidden_states[uc_mask],
+ **cross_attention_kwargs,
+ )
+ attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc
+ self.bank.clear()
+ else:
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ if self.use_ada_layer_norm_zero:
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ hidden_states = attn_output + hidden_states
+
+ if self.attn2 is not None:
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
+
+ # 2. Cross-Attention
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+
+ # 3. Feed-forward
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.use_ada_layer_norm_zero:
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.use_ada_layer_norm_zero:
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+ hidden_states = ff_output + hidden_states
+
+ return hidden_states
+
+ def hacked_mid_forward(self, *args, **kwargs):
+ eps = 1e-6
+ x = self.original_forward(*args, **kwargs)
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
+ var_acc = sum(self.var_bank) / float(len(self.var_bank))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ x_uc = (((x - mean) / std) * std_acc) + mean_acc
+ x_c = x_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ x_c[uc_mask] = x[uc_mask]
+ x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc
+ self.mean_bank = []
+ self.var_bank = []
+ return x
+
+ def hack_CrossAttnDownBlock2D_forward(
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ ):
+ eps = 1e-6
+
+ # TODO(Patrick, William) - attention mask is not used
+ output_states = ()
+
+ for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )[0]
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ output_states = output_states + (hidden_states,)
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states = output_states + (hidden_states,)
+
+ return hidden_states, output_states
+
+ def hacked_DownBlock2D_forward(self, hidden_states, temb=None):
+ eps = 1e-6
+
+ output_states = ()
+
+ for i, resnet in enumerate(self.resnets):
+ hidden_states = resnet(hidden_states, temb)
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ output_states = output_states + (hidden_states,)
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states = output_states + (hidden_states,)
+
+ return hidden_states, output_states
+
+ def hacked_CrossAttnUpBlock2D_forward(
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ cross_attention_kwargs=None,
+ upsample_size=None,
+ attention_mask=None,
+ ):
+ eps = 1e-6
+ # TODO(Patrick, William) - attention mask is not used
+ for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )[0]
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+ eps = 1e-6
+ for i, resnet in enumerate(self.resnets):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+ hidden_states = resnet(hidden_states, temb)
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+ if reference_attn:
+ attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)]
+ attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+ for i, module in enumerate(attn_modules):
+ module._original_inner_forward = module.forward
+ module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
+ module.bank = []
+ module.attn_weight = float(i) / float(len(attn_modules))
+
+ if reference_adain:
+ gn_modules = [self.unet.mid_block]
+ self.unet.mid_block.gn_weight = 0
+
+ down_blocks = self.unet.down_blocks
+ for w, module in enumerate(down_blocks):
+ module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
+ gn_modules.append(module)
+
+ up_blocks = self.unet.up_blocks
+ for w, module in enumerate(up_blocks):
+ module.gn_weight = float(w) / float(len(up_blocks))
+ gn_modules.append(module)
+
+ for i, module in enumerate(gn_modules):
+ if getattr(module, "original_forward", None) is None:
+ module.original_forward = module.forward
+ if i == 0:
+ # mid_block
+ module.forward = hacked_mid_forward.__get__(module, torch.nn.Module)
+ elif isinstance(module, CrossAttnDownBlock2D):
+ module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D)
+ elif isinstance(module, DownBlock2D):
+ module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D)
+ elif isinstance(module, CrossAttnUpBlock2D):
+ module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
+ elif isinstance(module, UpBlock2D):
+ module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D)
+ module.mean_bank = []
+ module.var_bank = []
+ module.gn_weight *= 2
+
+ # 10. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # ref only part
+ noise = randn_tensor(
+ ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
+ )
+ ref_xt = self.scheduler.add_noise(
+ ref_image_latents,
+ noise,
+ t.reshape(
+ 1,
+ ),
+ )
+ ref_xt = self.scheduler.scale_model_input(ref_xt, t)
+
+ MODE = "write"
+ self.unet(
+ ref_xt,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )
+
+ # predict the noise residual
+ MODE = "read"
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
+
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
From 64bf5d33b7ef1b1deac256bed7bd99b55020c4e0 Mon Sep 17 00:00:00 2001
From: Birch-san
Date: Mon, 22 May 2023 17:27:15 +0100
Subject: [PATCH 121/206] Support for cross-attention bias / mask (#2634)
* Cross-attention masks
prefer qualified symbol, fix accidental Optional
prefer qualified symbol in AttentionProcessor
prefer qualified symbol in embeddings.py
qualified symbol in transformed_2d
qualify FloatTensor in unet_2d_blocks
move new transformer_2d params attention_mask, encoder_attention_mask to the end of the section which is assumed (e.g. by functions such as checkpoint()) to have a stable positional param interface. regard return_dict as a special-case which is assumed to be injected separately from positional params (e.g. by create_custom_forward()).
move new encoder_attention_mask param to end of CrossAttn block interfaces and Unet2DCondition interface, to maintain positional param interface.
regenerate modeling_text_unet.py
remove unused import
unet_2d_condition encoder_attention_mask docs
Co-authored-by: Pedro Cuenca
versatile_diffusion/modeling_text_unet.py encoder_attention_mask docs
Co-authored-by: Pedro Cuenca
transformer_2d encoder_attention_mask docs
Co-authored-by: Pedro Cuenca
unet_2d_blocks.py: add parameter name comments
Co-authored-by: Pedro Cuenca
revert description. bool-to-bias treatment happens in unet_2d_condition only.
comment parameter names
fix copies, style
* encoder_attention_mask for SimpleCrossAttnDownBlock2D, SimpleCrossAttnUpBlock2D
* encoder_attention_mask for UNetMidBlock2DSimpleCrossAttn
* support attention_mask, encoder_attention_mask in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D, KAttentionBlock. fix binding of attention_mask, cross_attention_kwargs params in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D checkpoint invocations.
* fix mistake made during merge conflict resolution
* regenerate versatile_diffusion
* pass time embedding into checkpointed attention invocation
* always assume encoder_attention_mask is a mask (i.e. not a bias).
* style, fix-copies
* add tests for cross-attention masks
* add test for padding of attention mask
* explain mask's query_tokens dim. fix explanation about broadcasting over channels; we actually broadcast over query tokens
* support both masks and biases in Transformer2DModel#forward. document behaviour
* fix-copies
* delete attention_mask docs on the basis I never tested self-attention masking myself. not comfortable explaining it, since I don't actually understand how a self-attn mask can work in its current form: the key length will be different in every ResBlock (we don't downsample the mask when we downsample the image).
* review feedback: the standard Unet blocks shouldn't pass temb to attn (only to resnet). remove from KCrossAttnDownBlock2D,KCrossAttnUpBlock2D#forward.
* remove encoder_attention_mask param from SimpleCrossAttn{Up,Down}Block2D,UNetMidBlock2DSimpleCrossAttn, and mask-choice in those blocks' #forward, on the basis that they only do one type of attention, so the consumer can pass whichever type of attention_mask is appropriate.
* put attention mask padding back to how it was (since the SD use-case it enabled wasn't important, and it breaks the original unclip use-case). disable the test which was added.
* fix-copies
* style
* fix-copies
* put encoder_attention_mask param back into Simple block forward interfaces, to ensure consistency of forward interface.
* restore passing of emb to KAttentionBlock#forward, on the basis that removal caused test failures. restore also the passing of emb to checkpointed calls to KAttentionBlock#forward.
* make simple unet2d blocks use encoder_attention_mask, but only when attention_mask is None. this should fix UnCLIP compatibility.
* fix copies
---
src/diffusers/models/attention.py | 18 +-
src/diffusers/models/attention_processor.py | 33 +-
src/diffusers/models/embeddings.py | 2 +-
src/diffusers/models/transformer_2d.py | 47 ++-
src/diffusers/models/unet_2d_blocks.py | 315 +++++++++++-------
src/diffusers/models/unet_2d_condition.py | 26 +-
.../versatile_diffusion/modeling_text_unet.py | 167 ++++++----
tests/models/test_models_unet_2d_condition.py | 71 ++++
8 files changed, 473 insertions(+), 206 deletions(-)
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 0b313b83d360..a7a9a472d9e9 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Optional
+from typing import Any, Dict, Optional
import torch
import torch.nn.functional as F
@@ -120,13 +120,13 @@ def __init__(
def forward(
self,
- hidden_states,
- attention_mask=None,
- encoder_hidden_states=None,
- encoder_attention_mask=None,
- timestep=None,
- cross_attention_kwargs=None,
- class_labels=None,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
):
# Notice that normalization is always applied before the real computation in the following blocks.
# 1. Self-Attention
@@ -155,8 +155,6 @@ def forward(
norm_hidden_states = (
self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
)
- # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
- # prepare attention mask here
attn_output = self.attn2(
norm_hidden_states,
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 86997632cac1..d0e2e7bd2dac 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -380,7 +380,13 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None,
if attention_mask is None:
return attention_mask
- if attention_mask.shape[-1] != target_length:
+ current_length: int = attention_mask.shape[-1]
+ if current_length > target_length:
+ # we *could* trim the mask with:
+ # attention_mask = attention_mask[:,:target_length]
+ # but this is weird enough that it's more likely to be a mistake than a shortcut
+ raise ValueError(f"mask's length ({current_length}) exceeds the sequence length ({target_length}).")
+ elif current_length < target_length:
if attention_mask.device.type == "mps":
# HACK: MPS: Does not support padding by greater than dimension of input tensor.
# Instead, we can manually construct the padding tensor.
@@ -388,6 +394,10 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None,
padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
attention_mask = torch.cat([attention_mask, padding], dim=2)
else:
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
+ # remaining_length: int = target_length - current_length
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
if out_dim == 3:
@@ -820,7 +830,13 @@ class XFormersAttnProcessor:
def __init__(self, attention_op: Optional[Callable] = None):
self.attention_op = attention_op
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ ):
residual = hidden_states
input_ndim = hidden_states.ndim
@@ -829,11 +845,20 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
batch_size, channel, height, width = hidden_states.shape
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
- batch_size, sequence_length, _ = (
+ batch_size, key_tokens, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
+ if attention_mask is not None:
+ # expand our mask's singleton query_tokens dimension:
+ # [batch*heads, 1, key_tokens] ->
+ # [batch*heads, query_tokens, key_tokens]
+ # so that it can be added as a bias onto the attention scores that xformers computes:
+ # [batch*heads, query_tokens, key_tokens]
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
+ _, query_tokens, _ = hidden_states.shape
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index fa88bce305e6..fb803039b268 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -352,7 +352,7 @@ def token_drop(self, labels, force_drop_ids=None):
labels = torch.where(drop_ids, self.num_classes, labels)
return labels
- def forward(self, labels, force_drop_ids=None):
+ def forward(self, labels: torch.LongTensor, force_drop_ids=None):
use_dropout = self.dropout_prob > 0
if (self.training and use_dropout) or (force_drop_ids is not None):
labels = self.token_drop(labels, force_drop_ids)
diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
index fde1014bd2e7..ec4cb371845f 100644
--- a/src/diffusers/models/transformer_2d.py
+++ b/src/diffusers/models/transformer_2d.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Dict, Optional
import torch
import torch.nn.functional as F
@@ -213,11 +213,13 @@ def __init__(
def forward(
self,
- hidden_states,
- encoder_hidden_states=None,
- timestep=None,
- class_labels=None,
- cross_attention_kwargs=None,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
return_dict: bool = True,
):
"""
@@ -228,11 +230,17 @@ def forward(
encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
self-attention.
- timestep ( `torch.long`, *optional*):
+ timestep ( `torch.LongTensor`, *optional*):
Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
conditioning.
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
+ Cross-attention mask, applied to encoder_hidden_states. Two formats are supported:
+ a mask of shape `(batch, sequence_length)` where True = keep, False = discard, or a bias of shape
+ `(batch, 1, sequence_length)` where 0 = keep, -10000 = discard.
+ If ndim == 2: will be interpreted as a mask, then converted into a bias consistent with the format
+ above. This bias will be added to the cross-attention scores.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
@@ -241,6 +249,29 @@ def forward(
[`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
returning a tuple, the first element is the sample tensor.
"""
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ if attention_mask is not None and attention_mask.ndim == 2:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+ attention_mask = attention_mask.unsqueeze(1)
+
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
# 1. Input
if self.is_input_continuous:
batch, _, height, width = hidden_states.shape
@@ -264,7 +295,9 @@ def forward(
for block in self.transformer_blocks:
hidden_states = block(
hidden_states,
+ attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
timestep=timestep,
cross_attention_kwargs=cross_attention_kwargs,
class_labels=class_labels,
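The mask-to-bias conversion added to `Transformer2DModel.forward` above can be sketched in isolation; the shapes and values below are illustrative only:

```python
import torch

# an assumed 2-D keep/discard mask of shape [batch, key_tokens]: 1 = keep, 0 = discard
encoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
dtype = torch.float16  # stands in for hidden_states.dtype

# keep -> +0, discard -> -10000, then add a singleton query_tokens dimension
bias = (1 - encoder_attention_mask.to(dtype)) * -10000.0
bias = bias.unsqueeze(1)

# the resulting [batch, 1, key_tokens] bias broadcasts over attention scores of shape
# [batch, heads, query_tokens, key_tokens] or [batch*heads, query_tokens, key_tokens]
assert bias.shape == (1, 1, 5)
```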
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 75d9eb3e03df..6f8e3d0f5500 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Optional
+from typing import Any, Dict, Optional, Tuple
import numpy as np
import torch
@@ -558,14 +558,22 @@ def __init__(
self.resnets = nn.ModuleList(resnets)
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
- ):
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
hidden_states = resnet(hidden_states, temb)
@@ -659,16 +667,34 @@ def __init__(
self.resnets = nn.ModuleList(resnets)
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+ if attention_mask is None:
+ # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+ mask = None if encoder_hidden_states is None else encoder_attention_mask
+ else:
+ # when attention_mask is defined: we don't even check for encoder_attention_mask.
+ # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+ # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+ # then we can simplify this whole if/else block to:
+ # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+ mask = attention_mask
+
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
# attn
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=mask,
**cross_attention_kwargs,
)
@@ -850,9 +876,14 @@ def __init__(
self.gradient_checkpointing = False
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
- # TODO(Patrick, William) - attention mask is not used
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
@@ -867,33 +898,32 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- use_reentrant=False,
- )[0]
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ None, # timestep
+ None, # class_labels
+ cross_attention_kwargs,
+ attention_mask,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
@@ -1501,11 +1531,28 @@ def __init__(
self.gradient_checkpointing = False
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
output_states = ()
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+ if attention_mask is None:
+ # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+ mask = None if encoder_hidden_states is None else encoder_attention_mask
+ else:
+ # when attention_mask is defined: we don't even check for encoder_attention_mask.
+ # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+ # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+ # then we can simplify this whole if/else block to:
+ # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+ mask = attention_mask
+
for resnet, attn in zip(self.resnets, self.attentions):
if self.training and self.gradient_checkpointing:
@@ -1523,6 +1570,7 @@ def custom_forward(*inputs):
create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
+ mask,
cross_attention_kwargs,
)[0]
else:
@@ -1531,7 +1579,7 @@ def custom_forward(*inputs):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=mask,
**cross_attention_kwargs,
)
@@ -1690,7 +1738,13 @@ def __init__(
self.gradient_checkpointing = False
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
output_states = ()
@@ -1706,29 +1760,23 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- use_reentrant=False,
- )
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- )
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ temb,
+ attention_mask,
+ cross_attention_kwargs,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -1737,6 +1785,7 @@ def custom_forward(*inputs):
emb=temb,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
if self.downsamplers is None:
@@ -1916,15 +1965,15 @@ def __init__(
def forward(
self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
- # TODO(Patrick, William) - attention mask is not used
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
@@ -1942,33 +1991,32 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- use_reentrant=False,
- )[0]
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ None, # timestep
+ None, # class_labels
+ cross_attention_kwargs,
+ attention_mask,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
@@ -2594,15 +2642,28 @@ def __init__(
def forward(
self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- upsample_size=None,
- attention_mask=None,
- cross_attention_kwargs=None,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+ if attention_mask is None:
+ # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+ mask = None if encoder_hidden_states is None else encoder_attention_mask
+ else:
+ # when attention_mask is defined: we don't even check for encoder_attention_mask.
+ # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+ # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+ # then we can simplify this whole if/else block to:
+ # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+ mask = attention_mask
+
for resnet, attn in zip(self.resnets, self.attentions):
# resnet
# pop res hidden states
@@ -2626,6 +2687,7 @@ def custom_forward(*inputs):
create_custom_forward(attn, return_dict=False),
hidden_states,
encoder_hidden_states,
+ mask,
cross_attention_kwargs,
)[0]
else:
@@ -2634,7 +2696,7 @@ def custom_forward(*inputs):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=mask,
**cross_attention_kwargs,
)
@@ -2811,13 +2873,14 @@ def __init__(
def forward(
self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
res_hidden_states_tuple = res_hidden_states_tuple[-1]
if res_hidden_states_tuple is not None:
@@ -2835,29 +2898,23 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- use_reentrant=False,
- )[0]
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- attention_mask,
- cross_attention_kwargs,
- )[0]
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ temb,
+ attention_mask,
+ cross_attention_kwargs,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
@@ -2866,6 +2923,7 @@ def custom_forward(*inputs):
emb=temb,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
if self.upsamplers is not None:
@@ -2944,11 +3002,14 @@ def _to_4d(self, hidden_states, height, weight):
def forward(
self,
- hidden_states,
- encoder_hidden_states=None,
- emb=None,
- attention_mask=None,
- cross_attention_kwargs=None,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ # TODO: mark emb as non-optional (self.norm2 requires it).
+ # requires assessing impact of change to positional param interface.
+ emb: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
@@ -2962,6 +3023,7 @@ def forward(
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=None,
+ attention_mask=attention_mask,
**cross_attention_kwargs,
)
attn_output = self._to_4d(attn_output, height, weight)
@@ -2976,6 +3038,7 @@ def forward(
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask,
**cross_attention_kwargs,
)
attn_output = self._to_4d(attn_output, height, weight)
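The repeated checkpointing refactor above collapses the two torch-version branches into a single `ckpt_kwargs` dict. A condensed, standalone sketch of that pattern (using `packaging` directly where diffusers uses its `is_torch_version` helper; the tiny module is made up for the demo):

```python
from typing import Any, Dict

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn


def checkpointed(module: nn.Module, *args) -> torch.Tensor:
    # torch >= 1.11 accepts use_reentrant; older versions do not, so pass it conditionally
    ckpt_kwargs: Dict[str, Any] = (
        {"use_reentrant": False}
        if version.parse(torch.__version__) >= version.parse("1.11.0")
        else {}
    )
    return torch.utils.checkpoint.checkpoint(module, *args, **ckpt_kwargs)


class TinyResnet(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, hidden_states, temb):
        return self.proj(hidden_states) + temb


hidden_states = torch.randn(2, 8, requires_grad=True)
temb = torch.randn(2, 8)
out = checkpointed(TinyResnet(), hidden_states, temb)
out.sum().backward()
```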
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 2a4c9fd72c1b..76a40ffa1ec5 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -618,6 +618,7 @@ def forward(
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
mid_block_additional_residual: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
return_dict: bool = True,
) -> Union[UNet2DConditionOutput, Tuple]:
r"""
@@ -625,6 +626,10 @@ def forward(
sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+ encoder_attention_mask (`torch.Tensor`):
+ (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False =
+ discard. Mask will be converted into a bias, which adds large negative values to attention scores
+ corresponding to "discard" tokens.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
cross_attention_kwargs (`dict`, *optional*):
@@ -651,11 +656,27 @@ def forward(
logger.info("Forward upsample size to force interpolation output size.")
forward_upsample_size = True
- # prepare attention_mask
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
if attention_mask is not None:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
attention_mask = attention_mask.unsqueeze(1)
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None:
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
# 0. center input if necessary
if self.config.center_input_sample:
sample = 2 * sample - 1.0
@@ -727,6 +748,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
else:
sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
@@ -752,6 +774,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
if mid_block_additional_residual is not None:
@@ -778,6 +801,7 @@ def forward(
cross_attention_kwargs=cross_attention_kwargs,
upsample_size=upsample_size,
attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
)
else:
sample = upsample_block(
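With the `UNet2DConditionModel` changes above, a boolean `(batch, sequence_length)` cross-attention mask can be passed straight to `forward`. A small usage sketch; the tiny config values mirror the ones used in the model tests and are otherwise assumptions:

```python
import torch
from diffusers import UNet2DConditionModel

# a deliberately small UNet; config values are illustrative
unet = UNet2DConditionModel(
    sample_size=32,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=32,
    attention_head_dim=8,
    layers_per_block=1,
)

sample = torch.randn(1, 4, 32, 32)
encoder_hidden_states = torch.randn(1, 77, 32)
# keep the first 10 text tokens, discard the rest (True = keep, False = discard)
encoder_attention_mask = torch.arange(77).unsqueeze(0) < 10

with torch.no_grad():
    out = unet(
        sample,
        timestep=0,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
    ).sample
print(out.shape)  # torch.Size([1, 4, 32, 32])
```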
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 7aaa0e49e1da..29cde43337d2 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -721,6 +721,7 @@ def forward(
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
mid_block_additional_residual: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
return_dict: bool = True,
) -> Union[UNet2DConditionOutput, Tuple]:
r"""
@@ -728,6 +729,10 @@ def forward(
sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+ encoder_attention_mask (`torch.Tensor`):
+ (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False =
+ discard. Mask will be converted into a bias, which adds large negative values to attention scores
+ corresponding to "discard" tokens.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
cross_attention_kwargs (`dict`, *optional*):
@@ -754,11 +759,27 @@ def forward(
logger.info("Forward upsample size to force interpolation output size.")
forward_upsample_size = True
- # prepare attention_mask
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
if attention_mask is not None:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
attention_mask = attention_mask.unsqueeze(1)
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None:
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
# 0. center input if necessary
if self.config.center_input_sample:
sample = 2 * sample - 1.0
@@ -830,6 +851,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
else:
sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
@@ -855,6 +877,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
)
if mid_block_additional_residual is not None:
@@ -881,6 +904,7 @@ def forward(
cross_attention_kwargs=cross_attention_kwargs,
upsample_size=upsample_size,
attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
)
else:
sample = upsample_block(
@@ -1188,9 +1212,14 @@ def __init__(
self.gradient_checkpointing = False
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
- # TODO(Patrick, William) - attention mask is not used
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
@@ -1205,33 +1234,32 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- use_reentrant=False,
- )[0]
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ None, # timestep
+ None, # class_labels
+ cross_attention_kwargs,
+ attention_mask,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
@@ -1414,15 +1442,15 @@ def __init__(
def forward(
self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
- # TODO(Patrick, William) - attention mask is not used
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
@@ -1440,33 +1468,32 @@ def custom_forward(*inputs):
return custom_forward
- if is_torch_version(">=", "1.11.0"):
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- use_reentrant=False,
- )[0]
- else:
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(resnet), hidden_states, temb
- )
- hidden_states = torch.utils.checkpoint.checkpoint(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- )[0]
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(resnet),
+ hidden_states,
+ temb,
+ **ckpt_kwargs,
+ )
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ None, # timestep
+ None, # class_labels
+ cross_attention_kwargs,
+ attention_mask,
+ encoder_attention_mask,
+ **ckpt_kwargs,
+ )[0]
else:
hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
@@ -1564,14 +1591,22 @@ def __init__(
self.resnets = nn.ModuleList(resnets)
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
- ):
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
hidden_states = resnet(hidden_states, temb)
@@ -1666,16 +1701,34 @@ def __init__(
self.resnets = nn.ModuleList(resnets)
def forward(
- self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+ if attention_mask is None:
+ # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+ mask = None if encoder_hidden_states is None else encoder_attention_mask
+ else:
+ # when attention_mask is defined: we don't even check for encoder_attention_mask.
+ # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+ # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+ # then we can simplify this whole if/else block to:
+ # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+ mask = attention_mask
+
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
# attn
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=mask,
**cross_attention_kwargs,
)
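The same `attention_mask` / `encoder_attention_mask` selection appears in each of the cross-attention blocks above. A standalone sketch of the decision, plus the simplification the TODO comments point to once UnCLIP stops routing its cross-attention mask through `attention_mask`:

```python
from typing import Optional

import torch


def select_mask(
    attention_mask: Optional[torch.Tensor],
    encoder_attention_mask: Optional[torch.Tensor],
    encoder_hidden_states: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
    if attention_mask is None:
        # cross-attention: use the cross-attention mask (if any); self-attention: no mask
        return None if encoder_hidden_states is None else encoder_attention_mask
    # attention_mask takes precedence, for compatibility with UnCLIP, which passes
    # its cross-attention mask through the `attention_mask` parameter
    return attention_mask


def select_mask_simplified(attention_mask, encoder_attention_mask, encoder_hidden_states):
    # the form the TODO suggests once UnCLIP uses encoder_attention_mask
    return attention_mask if encoder_hidden_states is None else encoder_attention_mask


assert select_mask(None, None, None) is None
cross_mask = torch.ones(1, 77, dtype=torch.bool)
assert select_mask(None, cross_mask, torch.randn(1, 77, 32)) is cross_mask
```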
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 43a487a32b43..8a3d9dd16fd5 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -20,6 +20,7 @@
import torch
from parameterized import parameterized
+from pytest import mark
from diffusers import UNet2DConditionModel
from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor
@@ -418,6 +419,76 @@ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_ma
assert processor.is_run
assert processor.number == 123
+ @parameterized.expand(
+ [
+ # fmt: off
+ [torch.bool],
+ [torch.long],
+ [torch.float],
+ # fmt: on
+ ]
+ )
+ def test_model_xattn_mask(self, mask_dtype):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16)})
+ model.to(torch_device)
+ model.eval()
+
+ cond = inputs_dict["encoder_hidden_states"]
+ with torch.no_grad():
+ full_cond_out = model(**inputs_dict).sample
+ assert full_cond_out is not None
+
+ keepall_mask = torch.ones(*cond.shape[:-1], device=cond.device, dtype=mask_dtype)
+ full_cond_keepallmask_out = model(**{**inputs_dict, "encoder_attention_mask": keepall_mask}).sample
+ assert full_cond_keepallmask_out.allclose(
+ full_cond_out
+ ), "a 'keep all' mask should give the same result as no mask"
+
+ trunc_cond = cond[:, :-1, :]
+ trunc_cond_out = model(**{**inputs_dict, "encoder_hidden_states": trunc_cond}).sample
+ assert not trunc_cond_out.allclose(
+ full_cond_out
+ ), "discarding the last token from our cond should change the result"
+
+ batch, tokens, _ = cond.shape
+ mask_last = (torch.arange(tokens) < tokens - 1).expand(batch, -1).to(cond.device, mask_dtype)
+ masked_cond_out = model(**{**inputs_dict, "encoder_attention_mask": mask_last}).sample
+ assert masked_cond_out.allclose(
+ trunc_cond_out
+ ), "masking the last token from our cond should be equivalent to truncating that token out of the condition"
+
+ # see diffusers.models.attention_processor::Attention#prepare_attention_mask
+ # note: we may not need to fix mask padding to work for stable-diffusion cross-attn masks.
+ # since the use-case (somebody passes in a too-short cross-attn mask) is pretty esoteric.
+ # maybe it's fine that this only works for the unclip use-case.
+ @mark.skip(
+ reason="we currently pad mask by target_length tokens (what unclip needs), whereas stable-diffusion's cross-attn needs to instead pad by remaining_length."
+ )
+ def test_model_xattn_padding(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16)})
+ model.to(torch_device)
+ model.eval()
+
+ cond = inputs_dict["encoder_hidden_states"]
+ with torch.no_grad():
+ full_cond_out = model(**inputs_dict).sample
+ assert full_cond_out is not None
+
+ batch, tokens, _ = cond.shape
+ keeplast_mask = (torch.arange(tokens) == tokens - 1).expand(batch, -1).to(cond.device, torch.bool)
+ keeplast_out = model(**{**inputs_dict, "encoder_attention_mask": keeplast_mask}).sample
+ assert not keeplast_out.allclose(full_cond_out), "a 'keep last token' mask should change the result"
+
+ trunc_mask = torch.zeros(batch, tokens - 1, device=cond.device, dtype=torch.bool)
+ trunc_mask_out = model(**{**inputs_dict, "encoder_attention_mask": trunc_mask}).sample
+ assert trunc_mask_out.allclose(
+ keeplast_out
+ ), "a mask with fewer tokens than condition, will be padded with 'keep' tokens. a 'discard-all' mask missing the final token is thus equivalent to a 'keep last' mask."
+
def test_lora_processors(self):
# enable deterministic behavior for gradient checkpointing
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
From 67cd46015455a08f7dcf60d70a0609a2a020d0b3 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Mon, 22 May 2023 15:19:56 -0700
Subject: [PATCH 122/206] do not scale the initial global step by gradient
accumulation steps when loading from checkpoint (#3506)
---
examples/controlnet/train_controlnet.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index 9754c25b81e9..b6eb98db711b 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -979,7 +979,7 @@ def load_model_hook(models, input_dir):
accelerator.load_state(os.path.join(args.output_dir, path))
global_step = int(path.split("-")[1])
- initial_global_step = global_step * args.gradient_accumulation_steps
+ initial_global_step = global_step
first_epoch = global_step // num_update_steps_per_epoch
else:
initial_global_step = 0
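A worked example of what the one-line fix above changes when resuming from a checkpoint; the numbers are made up for illustration:

```python
# checkpoint-100 was saved after 100 optimizer updates
global_step = 100
gradient_accumulation_steps = 4

# before the fix: the resume counter was scaled by the accumulation factor,
# so training resumed as if 400 updates had already happened
old_initial_global_step = global_step * gradient_accumulation_steps  # 400

# after the fix: global_step already counts optimizer updates, so it is used directly
initial_global_step = global_step  # 100
```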
From 2f997f30ab660472561f5e1b5232d4f116315b1b Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Tue, 23 May 2023 11:25:15 +0800
Subject: [PATCH 123/206] Fix bug in panorama pipeline when using dpmsolver
scheduler (#3499)
fix panorama pipeline with dpmsolver scheduler
---
.../pipeline_stable_diffusion_panorama.py | 21 +++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index 22c22b56c7ee..223f8a236efa 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -612,6 +612,7 @@ def __call__(
# 6. Define panorama grid and initialize views for synthesis.
views = self.get_views(height, width)
+ blocks_model_outputs = [None] * len(views)
count = torch.zeros_like(latents)
value = torch.zeros_like(latents)
@@ -632,7 +633,7 @@ def __call__(
# denoised (latent) crops are then averaged to produce the final latent
# for the current timestep via MultiDiffusion. Please see Sec. 4.1 in the
# MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113
- for h_start, h_end, w_start, w_end in views:
+ for j, (h_start, h_end, w_start, w_end) in enumerate(views):
# get the latents corresponding to the current view coordinates
latents_for_view = latents[:, :, h_start:h_end, w_start:w_end]
@@ -656,9 +657,21 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- latents_view_denoised = self.scheduler.step(
- noise_pred, t, latents_for_view, **extra_step_kwargs
- ).prev_sample
+ if hasattr(self.scheduler, "model_outputs"):
+ # rematch model_outputs in each block
+ if i >= 1:
+ self.scheduler.model_outputs = blocks_model_outputs[j]
+ latents_view_denoised = self.scheduler.step(
+ noise_pred, t, latents_for_view, **extra_step_kwargs
+ ).prev_sample
+ # collect model_outputs
+ blocks_model_outputs[j] = [
+ output if output is not None else None for output in self.scheduler.model_outputs
+ ]
+ else:
+ latents_view_denoised = self.scheduler.step(
+ noise_pred, t, latents_for_view, **extra_step_kwargs
+ ).prev_sample
value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
count[:, :, h_start:h_end, w_start:w_end] += 1
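Multistep schedulers such as DPMSolver keep a short history of previous model outputs in `scheduler.model_outputs`, and each panorama view is denoised independently, so the patch above saves and restores that history per view. A self-contained sketch of the bookkeeping, with a fake scheduler standing in for the real one:

```python
from typing import List, Optional


class FakeMultistepScheduler:
    """Stand-in for a multistep scheduler (e.g. DPMSolverMultistepScheduler)."""

    def __init__(self, solver_order: int = 2):
        self.model_outputs: List[Optional[str]] = [None] * solver_order

    def step(self, tag: str) -> None:
        # shift the newest "model output" into the history, as multistep solvers do
        self.model_outputs = self.model_outputs[1:] + [tag]


scheduler = FakeMultistepScheduler()
views = [(0, 64, 0, 64), (0, 64, 32, 96)]  # illustrative view coordinates
blocks_model_outputs: List[Optional[list]] = [None] * len(views)

for i in range(3):  # three illustrative timesteps
    for j, _view in enumerate(views):
        if hasattr(scheduler, "model_outputs") and i >= 1:
            # restore the history that belongs to this view before stepping it
            scheduler.model_outputs = blocks_model_outputs[j]
        scheduler.step(f"t{i}-view{j}")
        # stash the updated history so the next timestep of this view continues it
        blocks_model_outputs[j] = list(scheduler.model_outputs)

print(blocks_model_outputs)
# [['t1-view0', 't2-view0'], ['t1-view1', 't2-view1']]
```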
From edc65051937f4a71a68ac3da31b2f27a7e422114 Mon Sep 17 00:00:00 2001
From: yingjieh
Date: Tue, 23 May 2023 16:55:14 +0800
Subject: [PATCH 124/206] [Community Pipelines]Accelerate inference of stable
diffusion by IPEX on CPU (#3105)
* add stable_diffusion_ipex community pipeline
* Update readme.md
* reformat
* reformat
* Update examples/community/README.md
Co-authored-by: Pedro Cuenca
* Update examples/community/README.md
Co-authored-by: Pedro Cuenca
* Update examples/community/README.md
Co-authored-by: Pedro Cuenca
* Update examples/community/README.md
Co-authored-by: Pedro Cuenca
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* Update README.md
* Update README.md
* Apply suggestions from code review
Co-authored-by: Pedro Cuenca
* style
---------
Co-authored-by: Pedro Cuenca
---
examples/community/README.md | 100 +++
examples/community/stable_diffusion_ipex.py | 848 ++++++++++++++++++++
2 files changed, 948 insertions(+)
create mode 100644 examples/community/stable_diffusion_ipex.py
diff --git a/examples/community/README.md b/examples/community/README.md
index 974f77fd1011..7cb53cf6c564 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -35,6 +35,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.0986) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
| TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) |
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
```py
@@ -1363,3 +1364,102 @@ Output Image of `reference_attn=False` and `reference_adain=True`
Output Image of `reference_attn=True` and `reference_adain=True`

+
+### Stable Diffusion on IPEX
+
+This diffusion pipeline aims to accelerate the inference of Stable Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
+
+To use this pipeline, you need to:
+1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
+
+**Note:** Each PyTorch release has a corresponding IPEX release; the mapping is shown below. Installing PyTorch/IPEX 2.0 is recommended for the best performance.
+
+|PyTorch Version|IPEX Version|
+|--|--|
+|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
+|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
+
+You can install the latest IPEX release with pip:
+```sh
+python -m pip install intel_extension_for_pytorch
+```
+**Note:** To install a specific version, run with the following command:
+```
+python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+2. After pipeline initialization, call `prepare_for_ipex()` to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
+
+**Note:** The image height/width passed to `prepare_for_ipex()` should be the same as the height/width used at pipeline inference time.
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+prompt = "sailing ship in storm by Rembrandt"
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
+# For Float32
+pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) #value of image height/width should be consistent with the pipeline inference
+# For BFloat16
+pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #value of image height/width should be consistent with the pipeline inference
+```
+
+Then you can use the IPEX pipeline in a similar way to the default Stable Diffusion pipeline.
+```python
+# For Float32
+image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+# For BFloat16
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+ image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+```
+
+The following code compares the performance of the original Stable Diffusion pipeline with the IPEX-optimized pipeline.
+
+```python
+import torch
+import intel_extension_for_pytorch as ipex
+from diffusers import DiffusionPipeline, StableDiffusionPipeline
+import time
+
+prompt = "sailing ship in storm by Rembrandt"
+model_id = "runwayml/stable-diffusion-v1-5"
+# Helper function for time evaluation
+def elapsed_time(pipeline, nb_pass=3, num_inference_steps=20):
+ # warmup
+ for _ in range(2):
+ images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images
+ #time evaluation
+ start = time.time()
+ for _ in range(nb_pass):
+ pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512)
+ end = time.time()
+ return (end - start) / nb_pass
+
+############## bf16 inference performance ###############
+
+# 1. IPEX Pipeline initialization
+pipe = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex")
+pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe2 = StableDiffusionPipeline.from_pretrained(model_id)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+ latency = elapsed_time(pipe)
+ print("Latency of StableDiffusionIPEXPipeline--bf16", latency)
+ latency = elapsed_time(pipe2)
+ print("Latency of StableDiffusionPipeline--bf16",latency)
+
+############## fp32 inference performance ###############
+
+# 1. IPEX Pipeline initialization
+pipe3 = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex")
+pipe3.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512)
+
+# 2. Original Pipeline initialization
+pipe4 = StableDiffusionPipeline.from_pretrained(model_id)
+
+# 3. Compare performance between Original Pipeline and IPEX Pipeline
+latency = elapsed_time(pipe3)
+print("Latency of StableDiffusionIPEXPipeline--fp32", latency)
+latency = elapsed_time(pipe4)
+print("Latency of StableDiffusionPipeline--fp32",latency)
+
+```
+
diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py
new file mode 100644
index 000000000000..9abe16d56f10
--- /dev/null
+++ b/examples/community/stable_diffusion_ipex.py
@@ -0,0 +1,848 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import intel_extension_for_pytorch as ipex
+import torch
+from packaging import version
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+ deprecate,
+ is_accelerate_available,
+ is_accelerate_version,
+ logging,
+ randn_tensor,
+ replace_example_docstring,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers import DiffusionPipeline
+
+ >>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
+
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
+ >>> num_inference_steps = 20
+
+ >>> # For Float32
+ >>> pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) # value of image height/width should be consistent with the pipeline inference
+ >>> # For BFloat16
+ >>> pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) # value of image height/width should be consistent with the pipeline inference
+ >>> # For Float32
+ >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+ >>> # For BFloat16
+ >>> with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+ >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
+ ```
+"""
+
+
+class StableDiffusionIPEXPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion on IPEX.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ _optional_components = ["safety_checker", "feature_extractor"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: KarrasDiffusionSchedulers,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ requires_safety_checker: bool = True,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None and requires_safety_checker:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ if safety_checker is not None and feature_extractor is None:
+ raise ValueError(
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ )
+
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+ version.parse(unet.config._diffusers_version).base_version
+ ) < version.parse("0.9.0.dev0")
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+ deprecation_message = (
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+ " the `unet/config.json` file"
+ )
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(unet.config)
+ new_config["sample_size"] = 64
+ unet._internal_dict = FrozenDict(new_config)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+ def get_input_example(self, prompt, height=None, width=None, guidance_scale=7.5, num_images_per_prompt=1):
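+ """Build example UNet and VAE-decoder inputs for the given prompt.
+
+ The returned inputs mirror the shapes and dtypes used at inference time; they are only
+ consumed as `sample_input` for `ipex.optimize` and as example inputs for `torch.jit.trace`
+ in `prepare_for_ipex`.
+ """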
+ prompt_embeds = None
+ negative_prompt_embeds = None
+ negative_prompt = None
+ callback_steps = 1
+ generator = None
+ latents = None
+
+ # 0. Default height and width to unet
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+
+ device = "cpu"
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 5. Prepare latent variables
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ self.unet.in_channels,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+ dummy = torch.ones(1, dtype=torch.int32)
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, dummy)
+
+ unet_input_example = (latent_model_input, dummy, prompt_embeds)
+ vae_decoder_input_example = latents
+
+ return unet_input_example, vae_decoder_input_example
+
+ def prepare_for_ipex(self, prompt, dtype=torch.float32, height=None, width=None, guidance_scale=7.5):
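+ """Optimize the pipeline's sub-models with IPEX and JIT-trace the UNet and VAE decoder.
+
+ The modules are first converted to the channels-last memory format, then optimized with
+ `ipex.optimize` for the requested `dtype` (`torch.bfloat16` or `torch.float32`), and
+ finally the UNet and the VAE decoder are traced and frozen with TorchScript so that
+ subsequent calls run the optimized graphs.
+ """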
+ self.unet = self.unet.to(memory_format=torch.channels_last)
+ self.vae.decoder = self.vae.decoder.to(memory_format=torch.channels_last)
+ self.text_encoder = self.text_encoder.to(memory_format=torch.channels_last)
+ if self.safety_checker is not None:
+ self.safety_checker = self.safety_checker.to(memory_format=torch.channels_last)
+
+ unet_input_example, vae_decoder_input_example = self.get_input_example(prompt, height, width, guidance_scale)
+
+ # optimize with ipex
+ if dtype == torch.bfloat16:
+ self.unet = ipex.optimize(
+ self.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=unet_input_example
+ )
+ self.vae.decoder = ipex.optimize(self.vae.decoder.eval(), dtype=torch.bfloat16, inplace=True)
+ self.text_encoder = ipex.optimize(self.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+ if self.safety_checker is not None:
+ self.safety_checker = ipex.optimize(self.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+ elif dtype == torch.float32:
+ self.unet = ipex.optimize(
+ self.unet.eval(),
+ dtype=torch.float32,
+ inplace=True,
+ sample_input=unet_input_example,
+ level="O1",
+ weights_prepack=True,
+ auto_kernel_selection=False,
+ )
+ self.vae.decoder = ipex.optimize(
+ self.vae.decoder.eval(),
+ dtype=torch.float32,
+ inplace=True,
+ level="O1",
+ weights_prepack=True,
+ auto_kernel_selection=False,
+ )
+ self.text_encoder = ipex.optimize(
+ self.text_encoder.eval(),
+ dtype=torch.float32,
+ inplace=True,
+ level="O1",
+ weights_prepack=True,
+ auto_kernel_selection=False,
+ )
+ if self.safety_checker is not None:
+ self.safety_checker = ipex.optimize(
+ self.safety_checker.eval(),
+ dtype=torch.float32,
+ inplace=True,
+ level="O1",
+ weights_prepack=True,
+ auto_kernel_selection=False,
+ )
+ else:
+ raise ValueError("The value of 'dtype' should be 'torch.bfloat16' or 'torch.float32'!")
+
+ # trace unet model to get better performance on IPEX
+ with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+ unet_trace_model = torch.jit.trace(self.unet, unet_input_example, check_trace=False, strict=False)
+ unet_trace_model = torch.jit.freeze(unet_trace_model)
+ self.unet.forward = unet_trace_model.forward
+
+ # trace vae.decoder model to get better performance on IPEX
+ with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad():
+ vae_decoder_trace_model = torch.jit.trace(
+ self.vae.decoder, vae_decoder_input_example, check_trace=False, strict=False
+ )
+ vae_decoder_trace_model = torch.jit.freeze(vae_decoder_trace_model)
+ self.vae.decoder.forward = vae_decoder_trace_model.forward
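+
+ # A minimal usage sketch (assuming this file is loaded as a community pipeline, e.g. via
+ # `custom_pipeline="stable_diffusion_ipex"`; adjust the identifier to your setup):
+ #
+ #     pipe = DiffusionPipeline.from_pretrained(
+ #         "runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex"
+ #     )
+ #     pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512)
+ #     image = pipe(prompt, num_inference_steps=20).images[0]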
+
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+ steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding.
+
+ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+ several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+ """
+ self.vae.enable_tiling()
+
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def enable_sequential_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+ `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ Note that offloading happens on a submodule basis. Memory savings are higher than with
+ `enable_model_cpu_offload`, but performance is lower.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+ from accelerate import cpu_offload
+ else:
+ raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ cpu_offload(cpu_offloaded_model, device)
+
+ if self.safety_checker is not None:
+ cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+ def enable_model_cpu_offload(self, gpu_id=0):
+ r"""
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ """
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate import cpu_offload_with_hook
+ else:
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+ device = torch.device(f"cuda:{gpu_id}")
+
+ if self.device.type != "cpu":
+ self.to("cpu", silence_dtype_warnings=True)
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+
+ hook = None
+ for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+ if self.safety_checker is not None:
+ _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+ # We'll offload the last model manually.
+ self.final_offload_hook = hook
+
+ @property
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead.
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ """
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if prompt_embeds is None:
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = text_inputs.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ prompt_embeds = self.text_encoder(
+ text_input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ prompt_embeds = prompt_embeds[0]
+
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+ attention_mask = uncond_input.attention_mask.to(device)
+ else:
+ attention_mask = None
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ attention_mask=attention_mask,
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ return prompt_embeds
+
+ def run_safety_checker(self, image, device, dtype):
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+ )
+ else:
+ has_nsfw_concept = None
+ return image, has_nsfw_concept
+
+ def decode_latents(self, latents):
+ latents = 1 / self.vae.config.scaling_factor * latents
+ image = self.vae.decode(latents).sample
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+ return image
+
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass
+ `prompt_embeds` instead.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead.
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 5. Prepare latent variables
+ num_channels_latents = self.unet.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 7. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds)["sample"]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ if output_type == "latent":
+ image = latents
+ has_nsfw_concept = None
+ elif output_type == "pil":
+ # 8. Post-processing
+ image = self.decode_latents(latents)
+
+ # 9. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # 10. Convert to PIL
+ image = self.numpy_to_pil(image)
+ else:
+ # 8. Post-processing
+ image = self.decode_latents(latents)
+
+ # 9. Run safety checker
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
From b134f6a8b6b3d75af45a0b918b4006d2a06e0f91 Mon Sep 17 00:00:00 2001
From: takuoko
Date: Tue, 23 May 2023 21:20:34 +0900
Subject: [PATCH 125/206] [Community] ControlNet Reference (#3508)
add controlnet reference and bugfix
Co-authored-by: Patrick von Platen
---
examples/community/README.md | 51 +-
.../stable_diffusion_controlnet_reference.py | 822 ++++++++++++++++++
.../community/stable_diffusion_reference.py | 51 +-
3 files changed, 900 insertions(+), 24 deletions(-)
create mode 100644 examples/community/stable_diffusion_controlnet_reference.py
diff --git a/examples/community/README.md b/examples/community/README.md
index 7cb53cf6c564..0211287d4ebb 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -1324,7 +1324,7 @@ image.save('tensorrt_img2img_new_zealand_hills.png')
### Stable Diffusion Reference
-This pipeline uses the Reference only Control. Refer to the [sd-webui-controlnet discussion](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236).
+This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
```py
@@ -1365,6 +1365,54 @@ Output Image of `reference_attn=True` and `reference_adain=True`

+### Stable Diffusion ControlNet Reference
+
+This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
+
+
+```py
+import cv2
+import torch
+import numpy as np
+from PIL import Image
+from diffusers import ControlNetModel, UniPCMultistepScheduler
+from diffusers.utils import load_image
+
+input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+# get canny image
+image = cv2.Canny(np.array(input_image), 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetReferencePipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ controlnet=controlnet,
+ safety_checker=None,
+ torch_dtype=torch.float16
+ ).to('cuda:0')
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+result_img = pipe(ref_image=input_image,
+ prompt="1girl",
+ image=canny_image,
+ num_inference_steps=20,
+ reference_attn=True,
+ reference_adain=True).images[0]
+```
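+
+Note that `StableDiffusionControlNetReferencePipeline` is defined in `examples/community/stable_diffusion_controlnet_reference.py`, so it needs to be importable from your working directory (or loaded as a community pipeline, e.g. via `custom_pipeline="stable_diffusion_controlnet_reference"`) for the snippet above to run.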
+
+Reference Image
+
+
+
+Output Image
+
+
+
+
### Stable Diffusion on IPEX
This diffusion pipeline aims to accelerate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
@@ -1462,4 +1510,3 @@ latency = elapsed_time(pipe4)
print("Latency of StableDiffusionPipeline--fp32",latency)
```
-
diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py
new file mode 100644
index 000000000000..606fe09c68fc
--- /dev/null
+++ b/examples/community/stable_diffusion_controlnet_reference.py
@@ -0,0 +1,822 @@
+# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL.Image
+import torch
+
+from diffusers import StableDiffusionControlNetPipeline
+from diffusers.models import ControlNetModel
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import is_compiled_module, logging, randn_tensor
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import cv2
+ >>> import torch
+ >>> import numpy as np
+ >>> from PIL import Image
+ >>> from diffusers import ControlNetModel, UniPCMultistepScheduler
+ >>> from diffusers.utils import load_image
+
+ >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
+
+ >>> # get canny image
+ >>> image = cv2.Canny(np.array(input_image), 100, 200)
+ >>> image = image[:, :, None]
+ >>> image = np.concatenate([image, image, image], axis=2)
+ >>> canny_image = Image.fromarray(image)
+
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+ >>> pipe = StableDiffusionControlNetReferencePipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ controlnet=controlnet,
+ safety_checker=None,
+ torch_dtype=torch.float16
+ ).to('cuda:0')
+
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+ >>> result_img = pipe(ref_image=input_image,
+ prompt="1girl",
+ image=canny_image,
+ num_inference_steps=20,
+ reference_attn=True,
+ reference_adain=True).images[0]
+
+ >>> result_img.show()
+ ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
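+ """Return `model` followed by all of its sub-modules, collected via a depth-first traversal."""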
+ result = [model]
+ for child in model.children():
+ result += torch_dfs(child)
+ return result
+
+
+class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeline):
+ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
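+ """Encode `refimage` into the VAE latent space and broadcast it to the requested batch size.
+
+ The latents are scaled by the VAE `scaling_factor` and, when classifier-free guidance is
+ used, duplicated so that they match the doubled latent model input.
+ """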
+ refimage = refimage.to(device=device, dtype=dtype)
+
+ # encode the reference image into the latent space
+ if isinstance(generator, list):
+ ref_image_latents = [
+ self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+ for i in range(batch_size)
+ ]
+ ref_image_latents = torch.cat(ref_image_latents, dim=0)
+ else:
+ ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+ ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+ # duplicate ref_image_latents for each generation per prompt, using mps friendly method
+ if ref_image_latents.shape[0] < batch_size:
+ if batch_size % ref_image_latents.shape[0] != 0:
+ raise ValueError(
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+ f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
+ )
+ ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
+
+ ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
+
+ # aligning device to prevent device errors when concatenating it with the latent model input
+ ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+ return ref_image_latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+ ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+ guess_mode: bool = False,
+ attention_auto_machine_weight: float = 1.0,
+ gn_auto_machine_weight: float = 1.0,
+ style_fidelity: float = 0.5,
+ reference_attn: bool = True,
+ reference_adain: bool = True,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass
+ `prompt_embeds` instead.
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+ `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance to the UNet. If
+ the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+ also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
+ height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+ specified in init, images must be passed as a list such that each element of the list can be correctly
+ batched for input to a single controlnet.
+ ref_image (`torch.FloatTensor`, `PIL.Image.Image`):
+ The Reference Control input condition. Reference Control uses this input condition to generate guidance to the UNet. If
+ the type is specified as `torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can
+ also be accepted as an image.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+ corresponding scale as a list.
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+ you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+ attention_auto_machine_weight (`float`):
+ Weight threshold for using the reference query for the self-attention context.
+ If `attention_auto_machine_weight=1.0`, the reference query is used for the context of all self-attention blocks.
+ gn_auto_machine_weight (`float`):
+ Weight threshold for using reference AdaIN. If `gn_auto_machine_weight=2.0`, all reference AdaIN plugins are used.
+ style_fidelity (`float`):
+ Style fidelity of `ref_uncond_xt`. If `style_fidelity=1.0`, the reference control is prioritized;
+ if `style_fidelity=0.0`, the prompt is prioritized; values in between balance the two.
+ reference_attn (`bool`):
+ Whether to use reference query for self attention's context.
+ reference_adain (`bool`):
+ Whether to use reference adain.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ # 0. Default height and width to unet
+ height, width = self._default_height_width(height, width, image)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ image,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ controlnet_conditioning_scale,
+ )
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ global_pool_conditions = (
+ controlnet.config.global_pool_conditions
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
+ # 3. Encode input prompt
+ prompt_embeds = self._encode_prompt(
+ prompt,
+ device,
+ num_images_per_prompt,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ )
+
+ # 4. Prepare image
+ if isinstance(controlnet, ControlNetModel):
+ image = self.prepare_image(
+ image=image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+ elif isinstance(controlnet, MultiControlNetModel):
+ images = []
+
+ for image_ in image:
+ image_ = self.prepare_image(
+ image=image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+
+ images.append(image_)
+
+ image = images
+ else:
+ assert False
+
+ # 5. Preprocess reference image
+ ref_image = self.prepare_image(
+ image=ref_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=prompt_embeds.dtype,
+ )
+
+ # 6. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 7. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 8. Prepare reference latent variables
+ ref_image_latents = self.prepare_ref_latents(
+ ref_image,
+ batch_size * num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ do_classifier_free_guidance,
+ )
+
+ # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 10. Modify self attention and group norm
+ MODE = "write"
+ uc_mask = (
+ torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
+ .type_as(ref_image_latents)
+ .bool()
+ )
+
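+ # The hacked forwards below implement the reference trick: during an extra "write" pass over
+ # the reference latents, the patched blocks store their normalized hidden states (`self.bank`)
+ # and GroupNorm statistics (`self.mean_bank` / `self.var_bank`); during the normal "read"
+ # denoising pass those banks are consumed to condition self-attention and to re-normalize the
+ # activations (AdaIN). `uc_mask` marks the unconditional half of the classifier-free-guidance
+ # batch so that `style_fidelity` can blend reference-conditioned and plain outputs.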
+ def hacked_basic_transformer_inner_forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ ):
+ if self.use_ada_layer_norm:
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.use_ada_layer_norm_zero:
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ else:
+ norm_hidden_states = self.norm1(hidden_states)
+
+ # 1. Self-Attention
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+ if self.only_cross_attention:
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ else:
+ if MODE == "write":
+ self.bank.append(norm_hidden_states.detach().clone())
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ if MODE == "read":
+ if attention_auto_machine_weight > self.attn_weight:
+ attn_output_uc = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1),
+ # attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ attn_output_c = attn_output_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ attn_output_c[uc_mask] = self.attn1(
+ norm_hidden_states[uc_mask],
+ encoder_hidden_states=norm_hidden_states[uc_mask],
+ **cross_attention_kwargs,
+ )
+ attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc
+ self.bank.clear()
+ else:
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask,
+ **cross_attention_kwargs,
+ )
+ if self.use_ada_layer_norm_zero:
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ hidden_states = attn_output + hidden_states
+
+ if self.attn2 is not None:
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
+
+ # 2. Cross-Attention
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+
+ # 3. Feed-forward
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.use_ada_layer_norm_zero:
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.use_ada_layer_norm_zero:
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+ hidden_states = ff_output + hidden_states
+
+ return hidden_states
+
+ def hacked_mid_forward(self, *args, **kwargs):
+ eps = 1e-6
+ x = self.original_forward(*args, **kwargs)
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
+ var_acc = sum(self.var_bank) / float(len(self.var_bank))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
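+ # AdaIN: re-normalize the current activations with the accumulated reference statistics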
+ x_uc = (((x - mean) / std) * std_acc) + mean_acc
+ x_c = x_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ x_c[uc_mask] = x[uc_mask]
+ x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc
+ self.mean_bank = []
+ self.var_bank = []
+ return x
+
+ def hack_CrossAttnDownBlock2D_forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ ):
+ eps = 1e-6
+
+ # TODO(Patrick, William) - attention mask is not used
+ output_states = ()
+
+ for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
+ return_dict=False,
+ )[0]
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ output_states = output_states + (hidden_states,)
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states = output_states + (hidden_states,)
+
+ return hidden_states, output_states
+
+ def hacked_DownBlock2D_forward(self, hidden_states, temb=None):
+ eps = 1e-6
+
+ output_states = ()
+
+ for i, resnet in enumerate(self.resnets):
+ hidden_states = resnet(hidden_states, temb)
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ output_states = output_states + (hidden_states,)
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states = output_states + (hidden_states,)
+
+ return hidden_states, output_states
+
+ def hacked_CrossAttnUpBlock2D_forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ ):
+ eps = 1e-6
+ # TODO(Patrick, William) - attention mask is not used
+ for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
+ return_dict=False,
+ )[0]
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+ eps = 1e-6
+ for i, resnet in enumerate(self.resnets):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+ hidden_states = resnet(hidden_states, temb)
+
+ if MODE == "write":
+ if gn_auto_machine_weight >= self.gn_weight:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ self.mean_bank.append(mean)
+ self.var_bank.append(var)
+ if MODE == "read":
+ if len(self.mean_bank) > 0 and len(self.var_bank) > 0:
+ var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0)
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
+ mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i]))
+ var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i]))
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
+ hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc
+ hidden_states_c = hidden_states_uc.clone()
+ if do_classifier_free_guidance and style_fidelity > 0:
+ hidden_states_c[uc_mask] = hidden_states[uc_mask]
+ hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc
+
+ if MODE == "read":
+ self.mean_bank = []
+ self.var_bank = []
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+ if reference_attn:
+ attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)]
+ attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
+
+ for i, module in enumerate(attn_modules):
+ module._original_inner_forward = module.forward
+ module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
+ module.bank = []
+ module.attn_weight = float(i) / float(len(attn_modules))
+
+ if reference_adain:
+ gn_modules = [self.unet.mid_block]
+ self.unet.mid_block.gn_weight = 0
+
+ down_blocks = self.unet.down_blocks
+ for w, module in enumerate(down_blocks):
+ module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
+ gn_modules.append(module)
+
+ up_blocks = self.unet.up_blocks
+ for w, module in enumerate(up_blocks):
+ module.gn_weight = float(w) / float(len(up_blocks))
+ gn_modules.append(module)
+
+ for i, module in enumerate(gn_modules):
+ if getattr(module, "original_forward", None) is None:
+ module.original_forward = module.forward
+ if i == 0:
+ # mid_block
+ module.forward = hacked_mid_forward.__get__(module, torch.nn.Module)
+ elif isinstance(module, CrossAttnDownBlock2D):
+ module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D)
+ elif isinstance(module, DownBlock2D):
+ module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D)
+ elif isinstance(module, CrossAttnUpBlock2D):
+ module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
+ elif isinstance(module, UpBlock2D):
+ module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D)
+ module.mean_bank = []
+ module.var_bank = []
+ module.gn_weight *= 2
+
+ # 11. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # controlnet(s) inference
+ if guess_mode and do_classifier_free_guidance:
+ # Infer ControlNet only for the conditional batch.
+ controlnet_latent_model_input = latents
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+ else:
+ controlnet_latent_model_input = latent_model_input
+ controlnet_prompt_embeds = prompt_embeds
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ controlnet_latent_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=image,
+ conditioning_scale=controlnet_conditioning_scale,
+ guess_mode=guess_mode,
+ return_dict=False,
+ )
+
+ if guess_mode and do_classifier_free_guidance:
+                    # Inferred ControlNet only for the conditional batch.
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
+ # add 0 to the unconditional batch to keep it unchanged.
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+ # ref only part
+ noise = randn_tensor(
+ ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
+ )
+ ref_xt = self.scheduler.add_noise(
+ ref_image_latents,
+ noise,
+ t.reshape(
+ 1,
+ ),
+ )
+ ref_xt = self.scheduler.scale_model_input(ref_xt, t)
+
+ MODE = "write"
+ self.unet(
+ ref_xt,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ return_dict=False,
+ )
+
+ # predict the noise residual
+ MODE = "read"
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ # If we do sequential model offloading, let's offload unet and controlnet
+ # manually for max memory savings
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.unet.to("cpu")
+ self.controlnet.to("cpu")
+ torch.cuda.empty_cache()
+
+ if not output_type == "latent":
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+ else:
+ image = latents
+ has_nsfw_concept = None
+
+ if has_nsfw_concept is None:
+ do_denormalize = [True] * image.shape[0]
+ else:
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+ # Offload last model to CPU
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.final_offload_hook.offload()
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
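For orientation, here is a minimal usage sketch for the reference-style ControlNet community pipeline whose `__call__` appears above. The `custom_pipeline` name and the checkpoint ids are assumptions for illustration (they are not fixed by this patch); the call arguments mirror the code above (`ref_image`, `reference_attn`, `reference_adain`, `style_fidelity`, `guess_mode`). During each denoising step the UNet first runs in "write" mode on the noised reference latents to fill the attention and GroupNorm banks, then the actual prediction runs in "read" mode and consumes them.

```python
import torch
from diffusers import ControlNetModel, DiffusionPipeline
from diffusers.utils import load_image

# Assumed checkpoints: any SD 1.5 base model plus a matching ControlNet should work.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="stable_diffusion_controlnet_reference",  # assumed community file name
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")

cond = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
)

image = pipe(
    prompt="bird",
    image=cond,      # ControlNet conditioning image (here a canny edge map)
    ref_image=cond,  # reference image whose attention/AdaIN statistics are injected
    reference_attn=True,
    reference_adain=True,
    style_fidelity=0.5,
    guess_mode=False,
    num_inference_steps=20,
).images[0]
```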
diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py
index 5e8051cdcdb2..22e0b40f60a3 100644
--- a/examples/community/stable_diffusion_reference.py
+++ b/examples/community/stable_diffusion_reference.py
@@ -1,5 +1,5 @@
-# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236
-from typing import Any, Callable, Dict, List, Optional, Union
+# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import PIL.Image
@@ -162,7 +162,7 @@ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do
def __call__(
self,
prompt: Union[str, List[str]] = None,
- ref_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+ ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
@@ -356,12 +356,13 @@ def __call__(
def hacked_basic_transformer_inner_forward(
self,
- hidden_states,
- encoder_hidden_states=None,
- timestep=None,
- attention_mask=None,
- cross_attention_kwargs=None,
- class_labels=None,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
):
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
@@ -427,7 +428,7 @@ def hacked_basic_transformer_inner_forward(
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states = attn_output + hidden_states
@@ -473,11 +474,12 @@ def hacked_mid_forward(self, *args, **kwargs):
def hack_CrossAttnDownBlock2D_forward(
self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- cross_attention_kwargs=None,
+ hidden_states: torch.FloatTensor,
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
eps = 1e-6
@@ -490,6 +492,8 @@ def hack_CrossAttnDownBlock2D_forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
if MODE == "write":
@@ -566,13 +570,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None):
def hacked_CrossAttnUpBlock2D_forward(
self,
- hidden_states,
- res_hidden_states_tuple,
- temb=None,
- encoder_hidden_states=None,
- cross_attention_kwargs=None,
- upsample_size=None,
- attention_mask=None,
+ hidden_states: torch.FloatTensor,
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+ temb: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ upsample_size: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
eps = 1e-6
# TODO(Patrick, William) - attention mask is not used
@@ -586,6 +591,8 @@ def hacked_CrossAttnUpBlock2D_forward(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
return_dict=False,
)[0]
From d4197bf4d72f04d4927ff1e7be2f8ee46efebe47 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 23 May 2023 14:20:55 +0200
Subject: [PATCH 126/206] Allow custom pipeline loading (#3504)
---
src/diffusers/pipelines/pipeline_utils.py | 10 ++++++---
tests/pipelines/test_pipelines.py | 27 +++++++++++++++++++++++
2 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index aed1139a2a16..2f56f650ea33 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -491,15 +491,19 @@ def register_modules(self, **kwargs):
library = module.__module__.split(".")[0]
# check if the module is a pipeline module
- pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None
+ module_path_items = module.__module__.split(".")
+ pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None
+
path = module.__module__.split(".")
is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
# if library is not in LOADABLE_CLASSES, then it is a custom module.
# Or if it's a pipeline module, then the module is inside the pipeline
# folder so we set the library to module name.
- if library not in LOADABLE_CLASSES or is_pipeline_module:
+ if is_pipeline_module:
library = pipeline_dir
+ elif library not in LOADABLE_CLASSES:
+ library = module.__module__
# retrieve class_name
class_name = module.__class__.__name__
@@ -1039,7 +1043,7 @@ def load_module(name, value):
# 6.2 Define all importable classes
is_pipeline_module = hasattr(pipelines, library_name)
- importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name]
+ importable_classes = ALL_IMPORTABLE_CLASSES
loaded_sub_model = None
# 6.3 Use passed sub model or load class_name from library_name
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index a9abb0b4fb62..6ec9ff0346a6 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -35,6 +35,7 @@
from diffusers import (
AutoencoderKL,
+ ConfigMixin,
DDIMPipeline,
DDIMScheduler,
DDPMPipeline,
@@ -44,6 +45,7 @@
EulerAncestralDiscreteScheduler,
EulerDiscreteScheduler,
LMSDiscreteScheduler,
+ ModelMixin,
PNDMScheduler,
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipelineLegacy,
@@ -77,6 +79,17 @@
enable_full_determinism()
+class CustomEncoder(ModelMixin, ConfigMixin):
+ def __init__(self):
+ super().__init__()
+
+
+class CustomPipeline(DiffusionPipeline):
+ def __init__(self, encoder: CustomEncoder, scheduler: DDIMScheduler):
+ super().__init__()
+ self.register_modules(encoder=encoder, scheduler=scheduler)
+
+
class DownloadTests(unittest.TestCase):
def test_one_request_upon_cached(self):
# TODO: For some reason this test fails on MPS where no HEAD call is made.
@@ -695,6 +708,20 @@ def test_local_custom_pipeline_file(self):
# compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102
assert output_str == "This is a local test"
+ def test_custom_model_and_pipeline(self):
+ pipe = CustomPipeline(
+ encoder=CustomEncoder(),
+ scheduler=DDIMScheduler(),
+ )
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ pipe.save_pretrained(tmpdirname)
+
+ pipe_new = CustomPipeline.from_pretrained(tmpdirname)
+ pipe_new.save_pretrained(tmpdirname)
+
+ assert dict(pipe_new.config) == dict(pipe.config)
+
@slow
@require_torch_gpu
def test_download_from_git(self):
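To make the loading change above concrete, here is a small hedged sketch of the new library-resolution rule in `register_modules` (an illustrative standalone helper, not the real implementation): pipeline-internal modules keep their pipeline folder as the library name, while unknown custom modules now record their full dotted import path so `from_pretrained()` can re-import them later.

```python
# Illustrative only: mirrors the branching added to register_modules above.
def resolve_library(module, loadable_classes, pipelines_namespace):
    module_path_items = module.__module__.split(".")
    library = module_path_items[0]
    pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None
    is_pipeline_module = pipeline_dir in module_path_items and hasattr(pipelines_namespace, str(pipeline_dir))

    if is_pipeline_module:
        # the module lives inside a pipeline folder -> record the folder name
        return pipeline_dir
    elif library not in loadable_classes:
        # custom module (like CustomEncoder in the new test) -> record its full dotted path
        return module.__module__
    # regular diffusers/transformers/... component -> keep the top-level package name
    return library
```

Recording the full dotted path is what allows `CustomPipeline.from_pretrained(tmpdirname)` in `test_custom_model_and_pipeline` to reload the custom encoder component even though its package is not listed in `LOADABLE_CLASSES`.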
From 9e2734a710fcc73a9790ec4b15a3cb4fb229cf55 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 23 May 2023 15:22:43 +0200
Subject: [PATCH 127/206] Make sure Diffusers works even if Hub is down (#3447)
* Make sure Diffusers works even if Hub is down
* Make sure hub down is well tested
---
src/diffusers/pipelines/pipeline_utils.py | 17 ++++++++++++-----
tests/pipelines/test_pipelines.py | 2 +-
2 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 2f56f650ea33..d5fa22548a15 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -30,6 +30,7 @@
import torch
from huggingface_hub import hf_hub_download, model_info, snapshot_download
from packaging import version
+from requests.exceptions import HTTPError
from tqdm.auto import tqdm
import diffusers
@@ -1228,6 +1229,17 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
allow_patterns = None
ignore_patterns = None
+ if not local_files_only:
+ try:
+ info = model_info(
+ pretrained_model_name,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ )
+ except HTTPError as e:
+ logger.warn(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.")
+ local_files_only = True
+
if not local_files_only:
config_file = hf_hub_download(
pretrained_model_name,
@@ -1239,11 +1251,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
resume_download=resume_download,
use_auth_token=use_auth_token,
)
- info = model_info(
- pretrained_model_name,
- use_auth_token=use_auth_token,
- revision=revision,
- )
config_dict = cls._dict_from_json_file(config_file)
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 6ec9ff0346a6..d05785a31315 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -353,7 +353,7 @@ def test_cached_files_are_used_when_no_internet(self):
with mock.patch("requests.request", return_value=response_mock):
# Download this model to make sure it's in the cache.
pipe = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, local_files_only=True
+ "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
)
comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")}
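Extracted as a standalone hedged sketch (hypothetical helper name; the `model_info` call and `HTTPError` handling mirror the hunk above), the fallback pattern looks like this: probe the Hub once, and on a connection error switch to `local_files_only` instead of failing the whole download.

```python
from huggingface_hub import model_info
from requests.exceptions import HTTPError


def hub_info_or_go_offline(pretrained_model_name, revision=None, use_auth_token=None):
    """Return (info, local_files_only); fall back to the local cache when the Hub is unreachable."""
    try:
        info = model_info(pretrained_model_name, use_auth_token=use_auth_token, revision=revision)
        return info, False
    except HTTPError as e:
        print(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.")
        return None, True
```

This is also why the regression test above drops `local_files_only=True`: with `requests.request` mocked to fail, the download path is now expected to fall back to the cache on its own.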
From 84ce50f08e8a99e91e838fe96d1993789b03511e Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 23 May 2023 17:53:34 +0200
Subject: [PATCH 128/206] Improve README (#3524)
Update README.md
---
README.md | 130 ++++++++++++++++++++++++++++++++++--------------------
1 file changed, 82 insertions(+), 48 deletions(-)
diff --git a/README.md b/README.md
index 76d7df79c813..17c883519b39 100644
--- a/README.md
+++ b/README.md
@@ -99,55 +99,11 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l
| **Documentation** | **What can I learn?** |
|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Tutorial | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. |
-| Loading | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
-| Pipelines for inference | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
-| Optimization | Guides for how to optimize your diffusion model to run faster and consume less memory. |
+| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. |
+| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
+| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
+| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview) | Guides for how to optimize your diffusion model to run faster and consume less memory. |
| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. |
-
-## Supported pipelines
-
-| Pipeline | Paper | Tasks |
-|---|---|:---:|
-| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
-| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation |
-| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
-| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
-| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
-| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
-| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
-| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
-| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
-| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
-| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
-| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation |
-| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation |
-| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation |
-| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting |
-| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
-| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing|
-| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
-| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation |
-| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation |
-| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
-| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation |
-| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
-| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
-| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
-| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
-| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
-
## Contribution
We ❤️ contributions from the open-source community!
@@ -160,6 +116,84 @@ You can look out for [issues](https://github.com/huggingface/diffusers/issues) y
Also, say 👋 in our public Discord channel
. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or
just hang out ☕.
+
+## Popular Tasks & Pipelines
+
+
+
+## ❤️ Popular repos building on 🧨 Diffusers
+
+- https://github.com/microsoft/TaskMatrix
+- https://github.com/invoke-ai/InvokeAI
+- https://github.com/apple/ml-stable-diffusion
+- https://github.com/Sanster/lama-cleaner
+- https://github.com/IDEA-Research/Grounded-Segment-Anything
+- https://github.com/ashawkey/stable-dreamfusion
+- https://github.com/deep-floyd/IF
+- https://github.com/bentoml/BentoML
+- https://github.com/bmaltais/kohya_ss
+
## Credits
This library concretizes previous work by many different authors and would not have been possible without their great research and implementations. We'd like to thank, in particular, the following implementations which have helped us in our development and without which the API could not have been as polished today:
From b402604de4c2ea4f4bb689201d848b0e73513430 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 23 May 2023 18:28:39 +0200
Subject: [PATCH 129/206] Update README.md (#3525)
---
README.md | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 17c883519b39..709abaff8e65 100644
--- a/README.md
+++ b/README.md
@@ -59,8 +59,9 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from
```python
from diffusers import DiffusionPipeline
+import torch
-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipeline.to("cuda")
pipeline("An image of a squirrel in Picasso style").images[0]
```
@@ -182,7 +183,7 @@ just hang out ☕.
-## ❤️ Popular repos building on 🧨 Diffusers
+## Popular using 🧨 Diffusers
- https://github.com/microsoft/TaskMatrix
- https://github.com/invoke-ai/InvokeAI
@@ -193,6 +194,9 @@ just hang out ☕.
- https://github.com/deep-floyd/IF
- https://github.com/bentoml/BentoML
- https://github.com/bmaltais/kohya_ss
+- +3000 other amazing GitHub repositories 💪
+
+Thank you for using us ❤️
## Credits
From abab61d49ea2aad144f70fb30700d07942d30872 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 23 May 2023 17:29:18 +0100
Subject: [PATCH 130/206] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 709abaff8e65..cb6e29ee1406 100644
--- a/README.md
+++ b/README.md
@@ -183,7 +183,7 @@ just hang out ☕.
-## Popular using 🧨 Diffusers
+## Popular libraries using 🧨 Diffusers
- https://github.com/microsoft/TaskMatrix
- https://github.com/invoke-ai/InvokeAI
From bde2cb5d9b335aa87ff989445cf2e2e9607ad400 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Tue, 23 May 2023 19:24:17 +0200
Subject: [PATCH 131/206] Run `torch.compile` tests in separate subprocesses
(#3503)
* Run ControlNet compile test in a separate subprocess
`torch.compile()` spawns several subprocesses and the GPU memory used
was not reclaimed after the test ran. This approach was taken from
`transformers`.
* Style
* Prepare a couple more compile tests to run in subprocess.
* Use require_torch_2 decorator.
* Test inpaint_compile in subprocess.
* Run img2img compile test in subprocess.
* Run stable diffusion compile test in subprocess.
* style
* Temporarily trigger on pr to test.
* Revert "Temporarily trigger on pr to test."
This reverts commit 82d76868ddf9cc634a9f14b2b0aef1d5433cd750.
---
src/diffusers/utils/testing_utils.py | 45 ++++++++++
tests/models/test_modeling_common.py | 44 ++++++---
tests/pipelines/controlnet/test_controlnet.py | 90 +++++++++++--------
.../stable_diffusion/test_stable_diffusion.py | 86 ++++++++++--------
.../test_stable_diffusion_img2img.py | 70 ++++++++++-----
.../test_stable_diffusion_inpaint.py | 73 ++++++++++-----
tests/pipelines/test_pipelines.py | 73 +++++++++------
7 files changed, 318 insertions(+), 163 deletions(-)
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 93d0ef5b7b5f..7d5e6bcacecd 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1,5 +1,6 @@
import inspect
import logging
+import multiprocessing
import os
import random
import re
@@ -477,6 +478,50 @@ def summary_failures_short(tr):
config.option.tbstyle = orig_tbstyle
+# Taken from: https://github.com/huggingface/transformers/blob/3658488ff77ff8d45101293e749263acf437f4d5/src/transformers/testing_utils.py#L1787
+def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
+ """
+    To run a test in a subprocess. In particular, this can avoid (GPU) memory issues.
+
+ Args:
+ test_case (`unittest.TestCase`):
+ The test that will run `target_func`.
+ target_func (`Callable`):
+ The function implementing the actual testing logic.
+ inputs (`dict`, *optional*, defaults to `None`):
+ The inputs that will be passed to `target_func` through an (input) queue.
+ timeout (`int`, *optional*, defaults to `None`):
+            The timeout (in seconds) that will be passed to the input and output queues. If not specified, the
+            environment variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`.
+ """
+ if timeout is None:
+ timeout = int(os.environ.get("PYTEST_TIMEOUT", 600))
+
+    start_method = "spawn"
+    ctx = multiprocessing.get_context(start_method)
+
+ input_queue = ctx.Queue(1)
+ output_queue = ctx.JoinableQueue(1)
+
+ # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle.
+ input_queue.put(inputs, timeout=timeout)
+
+ process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout))
+ process.start()
+    # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents
+    # the test from exiting properly.
+ try:
+ results = output_queue.get(timeout=timeout)
+ output_queue.task_done()
+ except Exception as e:
+ process.terminate()
+ test_case.fail(e)
+ process.join(timeout=timeout)
+
+ if results["error"] is not None:
+ test_case.fail(f'{results["error"]}')
+
+
class CaptureLogger:
"""
Args:
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index b2c5f2d79d4f..adc18e003a56 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -15,6 +15,7 @@
import inspect
import tempfile
+import traceback
import unittest
import unittest.mock as mock
from typing import Dict, List, Tuple
@@ -27,7 +28,31 @@
from diffusers.models import UNet2DConditionModel
from diffusers.training_utils import EMAModel
from diffusers.utils import logging, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
+from diffusers.utils.testing_utils import CaptureLogger, require_torch_2, run_test_in_subprocess
+
+
+# Will be run via run_test_in_subprocess
+def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
+ error = None
+ try:
+ init_dict, model_class = in_queue.get(timeout=timeout)
+
+ model = model_class(**init_dict)
+ model.to(torch_device)
+ model = torch.compile(model)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+ new_model = model_class.from_pretrained(tmpdirname)
+ new_model.to(torch_device)
+
+ assert new_model.__class__ == model_class
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
class ModelUtilsTest(unittest.TestCase):
@@ -235,20 +260,11 @@ def test_from_save_pretrained_variant(self):
max_diff = (image - new_image).abs().sum().item()
self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes")
- @require_torch_gpu
+ @require_torch_2
def test_from_save_pretrained_dynamo(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- model = self.model_class(**init_dict)
- model.to(torch_device)
- model = torch.compile(model)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_pretrained(tmpdirname)
- new_model = self.model_class.from_pretrained(tmpdirname)
- new_model.to(torch_device)
-
- assert new_model.__class__ == self.model_class
+ init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+ inputs = [init_dict, self.model_class]
+ run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=inputs)
def test_from_save_pretrained_dtype(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
index 97b5e20f3c14..ee6f8fce2508 100644
--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -15,11 +15,11 @@
import gc
import tempfile
+import traceback
import unittest
import numpy as np
import torch
-from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -32,7 +32,12 @@
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ require_torch_2,
+ require_torch_gpu,
+ run_test_in_subprocess,
+)
from ..pipeline_params import (
TEXT_TO_IMAGE_BATCH_PARAMS,
@@ -44,6 +49,51 @@
enable_full_determinism()
+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+ error = None
+ try:
+ _ = in_queue.get(timeout=timeout)
+
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+ )
+ pipe.to("cuda")
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ pipe.controlnet.to(memory_format=torch.channels_last)
+ pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+ generator = torch.Generator(device="cpu").manual_seed(0)
+ prompt = "bird"
+ image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+ )
+
+ output = pipe(prompt, image, generator=generator, output_type="np")
+ image = output.images[0]
+
+ assert image.shape == (768, 512, 3)
+
+ expected_image = load_numpy(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+ )
+
+ assert np.abs(expected_image - image).max() < 1.0
+
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
+
+
class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionControlNetPipeline
params = TEXT_TO_IMAGE_PARAMS
@@ -594,41 +644,9 @@ def test_canny_guess_mode(self):
expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+ @require_torch_2
def test_stable_diffusion_compile(self):
- if version.parse(torch.__version__) < version.parse("2.0"):
- print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0")
- return
-
- controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
-
- pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
- )
- pipe.to("cuda")
- pipe.set_progress_bar_config(disable=None)
-
- pipe.unet.to(memory_format=torch.channels_last)
- pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
- pipe.controlnet.to(memory_format=torch.channels_last)
- pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
-
- generator = torch.Generator(device="cpu").manual_seed(0)
- prompt = "bird"
- image = load_image(
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
- )
-
- output = pipe(prompt, image, generator=generator, output_type="np")
- image = output.images[0]
-
- assert image.shape == (768, 512, 3)
-
- expected_image = load_numpy(
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
- )
-
- assert np.abs(expected_image - image).max() < 1.0
+ run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
def test_v11_shuffle_global_pool_conditions(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index aec4436710b9..6140bf771e65 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -15,19 +15,14 @@
import gc
-import os
-import signal
-import subprocess
-import sys
import tempfile
import time
+import traceback
import unittest
import numpy as np
-import pytest
import torch
from huggingface_hub import hf_hub_download
-from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -44,25 +39,52 @@
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu
+from diffusers.utils.testing_utils import (
+ CaptureLogger,
+ enable_full_determinism,
+ require_torch_2,
+ require_torch_gpu,
+ run_test_in_subprocess,
+)
from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
-@pytest.fixture(autouse=True)
-def process_fixture():
- # This will be run before each test
- command = [sys.executable, os.path.abspath(__file__)]
- process = subprocess.Popen(command)
- enable_full_determinism()
- yield process
- # This will be run after each test
+enable_full_determinism()
+
+
+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+ error = None
try:
- os.kill(process.pid, signal.SIGTERM) # or signal.SIGKILL
- except ProcessLookupError:
- pass
+ inputs = in_queue.get(timeout=timeout)
+ torch_device = inputs.pop("torch_device")
+ seed = inputs.pop("seed")
+ inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+ sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+ sd_pipe = sd_pipe.to(torch_device)
+
+ sd_pipe.unet.to(memory_format=torch.channels_last)
+ sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ image = sd_pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1].flatten()
+
+ assert image.shape == (1, 512, 512, 3)
+ expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+ assert np.abs(image_slice - expected_slice).max() < 5e-3
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
@@ -927,27 +949,15 @@ def test_stable_diffusion_textual_inversion(self):
max_diff = np.abs(expected_image - image).max()
assert max_diff < 8e-1
+ @require_torch_2
def test_stable_diffusion_compile(self):
- if version.parse(torch.__version__) < version.parse("2.0"):
- print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0")
- return
-
- sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
- sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
- sd_pipe = sd_pipe.to(torch_device)
-
- sd_pipe.unet.to(memory_format=torch.channels_last)
- sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
-
- sd_pipe.set_progress_bar_config(disable=None)
-
- inputs = self.get_inputs(torch_device)
- image = sd_pipe(**inputs).images
- image_slice = image[0, -3:, -3:, -1].flatten()
-
- assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
- assert np.abs(image_slice - expected_slice).max() < 5e-3
+ seed = 0
+ inputs = self.get_inputs(torch_device, seed=seed)
+ # Can't pickle a Generator object
+ del inputs["generator"]
+ inputs["torch_device"] = torch_device
+ inputs["seed"] = seed
+ run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
@slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 8ab252b9be80..33305d5980be 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -15,11 +15,11 @@
import gc
import random
+import traceback
import unittest
import numpy as np
import torch
-from packaging import version
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -34,7 +34,13 @@
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ require_torch_2,
+ require_torch_gpu,
+ run_test_in_subprocess,
+ skip_mps,
+)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
@@ -47,6 +53,38 @@
enable_full_determinism()
+# Will be run via run_test_in_subprocess
+def _test_img2img_compile(in_queue, out_queue, timeout):
+ error = None
+ try:
+ inputs = in_queue.get(timeout=timeout)
+ torch_device = inputs.pop("torch_device")
+ seed = inputs.pop("seed")
+ inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ image = pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1].flatten()
+
+ assert image.shape == (1, 512, 768, 3)
+ expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])
+
+ assert np.abs(expected_slice - image_slice).max() < 1e-3
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
+
+
class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
@@ -464,27 +502,15 @@ def test_img2img_safety_checker_works(self):
assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros
+ @require_torch_2
def test_img2img_compile(self):
- if version.parse(torch.__version__) < version.parse("2.0"):
- print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0")
- return
-
- pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
- pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
- pipe.to(torch_device)
- pipe.set_progress_bar_config(disable=None)
-
- pipe.unet.to(memory_format=torch.channels_last)
- pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
- inputs = self.get_inputs(torch_device)
- image = pipe(**inputs).images
- image_slice = image[0, -3:, -3:, -1].flatten()
-
- assert image.shape == (1, 512, 768, 3)
- expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])
-
- assert np.abs(expected_slice - image_slice).max() < 1e-3
+ seed = 0
+ inputs = self.get_inputs(torch_device, seed=seed)
+ # Can't pickle a Generator object
+ del inputs["generator"]
+ inputs["torch_device"] = torch_device
+ inputs["seed"] = seed
+ run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
@nightly
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 44de277ead07..eb1c097dfba0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -15,11 +15,11 @@
import gc
import random
+import traceback
import unittest
import numpy as np
import torch
-from packaging import version
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -33,7 +33,12 @@
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ require_torch_2,
+ require_torch_gpu,
+ run_test_in_subprocess,
+)
from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
@@ -43,6 +48,40 @@
enable_full_determinism()
+# Will be run via run_test_in_subprocess
+def _test_inpaint_compile(in_queue, out_queue, timeout):
+ error = None
+ try:
+ inputs = in_queue.get(timeout=timeout)
+ torch_device = inputs.pop("torch_device")
+ seed = inputs.pop("seed")
+ inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting", safety_checker=None
+ )
+ pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ image = pipe(**inputs).images
+ image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+ assert image.shape == (1, 512, 512, 3)
+ expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
+
+ assert np.abs(expected_slice - image_slice).max() < 3e-3
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
+
+
class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
@@ -315,29 +354,15 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
+ @require_torch_2
def test_inpaint_compile(self):
- if version.parse(torch.__version__) < version.parse("2.0"):
- print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0")
- return
-
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting", safety_checker=None
- )
- pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
- pipe.to(torch_device)
- pipe.set_progress_bar_config(disable=None)
-
- pipe.unet.to(memory_format=torch.channels_last)
- pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
- inputs = self.get_inputs(torch_device)
- image = pipe(**inputs).images
- image_slice = image[0, 253:256, 253:256, -1].flatten()
-
- assert image.shape == (1, 512, 512, 3)
- expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272])
-
- assert np.abs(expected_slice - image_slice).max() < 3e-3
+ seed = 0
+ inputs = self.get_inputs(torch_device, seed=seed)
+ # Can't pickle a Generator object
+ del inputs["generator"]
+ inputs["torch_device"] = torch_device
+ inputs["seed"] = seed
+ run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index d05785a31315..8eaee0915a4f 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -20,6 +20,7 @@
import shutil
import sys
import tempfile
+import traceback
import unittest
import unittest.mock as mock
@@ -73,12 +74,54 @@
require_compel,
require_flax,
require_torch_gpu,
+ run_test_in_subprocess,
)
enable_full_determinism()
+# Will be run via run_test_in_subprocess
+def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
+ error = None
+ try:
+ # 1. Load models
+ model = UNet2DModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=3,
+ out_channels=3,
+ down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+ up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+ )
+ model = torch.compile(model)
+ scheduler = DDPMScheduler(num_train_timesteps=10)
+
+ ddpm = DDPMPipeline(model, scheduler)
+ ddpm.to(torch_device)
+ ddpm.set_progress_bar_config(disable=None)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ ddpm.save_pretrained(tmpdirname)
+ new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
+ new_ddpm.to(torch_device)
+
+ generator = torch.Generator(device=torch_device).manual_seed(0)
+ image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
+
+ generator = torch.Generator(device=torch_device).manual_seed(0)
+ new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
+
+ assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
+ except Exception:
+ error = f"{traceback.format_exc()}"
+
+ results = {"error": error}
+ out_queue.put(results, timeout=timeout)
+ out_queue.join()
+
+
class CustomEncoder(ModelMixin, ConfigMixin):
def __init__(self):
super().__init__()
@@ -1342,35 +1385,7 @@ def test_from_save_pretrained(self):
@require_torch_2
def test_from_save_pretrained_dynamo(self):
- # 1. Load models
- model = UNet2DModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
- in_channels=3,
- out_channels=3,
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
- )
- model = torch.compile(model)
- scheduler = DDPMScheduler(num_train_timesteps=10)
-
- ddpm = DDPMPipeline(model, scheduler)
- ddpm.to(torch_device)
- ddpm.set_progress_bar_config(disable=None)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- ddpm.save_pretrained(tmpdirname)
- new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
- new_ddpm.to(torch_device)
-
- generator = torch.Generator(device=torch_device).manual_seed(0)
- image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
-
- generator = torch.Generator(device=torch_device).manual_seed(0)
- new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images
-
- assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
+ run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
def test_from_pretrained_hub(self):
model_path = "google/ddpm-cifar10-32"
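For reference, a minimal hedged sketch of the worker/queue protocol that every `_test_*` helper added above follows (the dummy test body is illustrative, not from the patch): the worker pulls its inputs from `in_queue`, reports any traceback as a string through `out_queue`, and the parent process fails the test when `results["error"]` is not `None`.

```python
import traceback
import unittest

from diffusers.utils.testing_utils import run_test_in_subprocess


# Runs in a spawned child process, so it must be a picklable module-level function.
def _test_dummy(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        assert inputs["x"] + 1 == inputs["expected"]
    except Exception:
        error = f"{traceback.format_exc()}"
    out_queue.put({"error": error}, timeout=timeout)
    out_queue.join()


class DummySubprocessTests(unittest.TestCase):
    def test_dummy(self):
        run_test_in_subprocess(test_case=self, target_func=_test_dummy, inputs={"x": 1, "expected": 2})
```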
From c13dbd5c3a53017d27de35ad77b8d57f04c8ec7c Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Tue, 23 May 2023 13:11:53 -0700
Subject: [PATCH 132/206] fix attention mask pad check (#3531)
---
src/diffusers/models/attention_processor.py | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index d0e2e7bd2dac..13c7afc8e922 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -381,12 +381,7 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None,
return attention_mask
current_length: int = attention_mask.shape[-1]
- if current_length > target_length:
- # we *could* trim the mask with:
- # attention_mask = attention_mask[:,:target_length]
- # but this is weird enough that it's more likely to be a mistake than a shortcut
- raise ValueError(f"mask's length ({current_length}) exceeds the sequence length ({target_length}).")
- elif current_length < target_length:
+ if current_length != target_length:
if attention_mask.device.type == "mps":
# HACK: MPS: Does not support padding by greater than dimension of input tensor.
# Instead, we can manually construct the padding tensor.
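The relaxed check above pads the attention mask whenever its length differs from the key length instead of raising for longer masks. A standalone sketch of the pad-to-length idea for the shorter-mask case, not the exact `prepare_attention_mask` code:

```python
import torch
import torch.nn.functional as F

# A mask shorter than the key/sequence length is right-padded with zeros
# along its last dimension.
attention_mask = torch.ones(2, 5)  # (batch, current_length)
target_length = 8

current_length = attention_mask.shape[-1]
if current_length != target_length:
    attention_mask = F.pad(attention_mask, (0, target_length - current_length), value=0.0)

print(attention_mask.shape)  # torch.Size([2, 8])
```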
From db56f8a4f5b433ec600d0acf69026e8de375f3a4 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Wed, 24 May 2023 03:17:41 -0700
Subject: [PATCH 133/206] explicit broadcasts for assignments (#3535)
---
src/diffusers/models/resnet.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index debe120e8ead..92bc89c80099 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -433,7 +433,8 @@ def forward(self, x):
x = F.pad(x, (self.pad,) * 4, self.pad_mode)
weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
indices = torch.arange(x.shape[1], device=x.device)
- weight[indices, indices] = self.kernel.to(weight)
+ kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1)
+ weight[indices, indices] = kernel
return F.conv2d(x, weight, stride=2)
@@ -449,7 +450,8 @@ def forward(self, x):
x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode)
weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
indices = torch.arange(x.shape[1], device=x.device)
- weight[indices, indices] = self.kernel.to(weight)
+ kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1)
+ weight[indices, indices] = kernel
return F.conv_transpose2d(x, weight, stride=2, padding=self.pad * 2 + 1)
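The two assignments above write a 2D kernel into a `(C, C, kH, kW)` weight through advanced indexing, where `weight[indices, indices]` selects a `(C, kH, kW)` view; the patch makes the required broadcast explicit with `expand`. A small, self-contained illustration of that shape bookkeeping:

```python
import torch

# `weight[indices, indices]` selects a (C, kH, kW) view, so the (kH, kW)
# kernel is expanded to (C, kH, kW) before the assignment.
C, kH, kW = 4, 3, 3
kernel = torch.randn(kH, kW)

weight = torch.zeros(C, C, kH, kW)
indices = torch.arange(C)

expanded = kernel[None, :].expand(C, -1, -1)  # (C, kH, kW), no data copy
weight[indices, indices] = expanded           # one kernel per (i, i) channel pair

print(weight[1, 1].equal(kernel))  # True
print(weight[0, 1].abs().sum())    # tensor(0.) -- off-diagonal entries stay zero
```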
From 8e69708b0d4f2784676cbfd9bfefa487d9f1ebb3 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 24 May 2023 16:16:28 +0530
Subject: [PATCH 134/206] [Examples/DreamBooth] refactor save_model_card
utility in dreambooth examples (#3543)
refactor save_model_card utility in dreambooth examples.
---
examples/dreambooth/train_dreambooth.py | 16 +++++++++++++---
examples/dreambooth/train_dreambooth_lora.py | 17 +++++++++++++----
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index ad43ee7aeee2..158d03185a54 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -46,6 +46,7 @@
DDPMScheduler,
DiffusionPipeline,
DPMSolverMultistepScheduler,
+ StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
@@ -62,7 +63,15 @@
logger = get_logger(__name__)
-def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None):
+def save_model_card(
+ repo_id: str,
+ images=None,
+ base_model=str,
+ train_text_encoder=False,
+ prompt=str,
+ repo_folder=None,
+ pipeline: DiffusionPipeline = None,
+):
img_str = ""
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
@@ -74,8 +83,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, train_text_encode
base_model: {base_model}
instance_prompt: {prompt}
tags:
-- stable-diffusion
-- stable-diffusion-diffusers
+- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'}
+- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'}
- text-to-image
- diffusers
- dreambooth
@@ -1297,6 +1306,7 @@ def compute_text_embeddings(prompt):
train_text_encoder=args.train_text_encoder,
prompt=args.instance_prompt,
repo_folder=args.output_dir,
+ pipeline=pipeline,
)
upload_folder(
repo_id=repo_id,
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index e640542e36da..4ff759dcd6d4 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -68,7 +68,15 @@
logger = get_logger(__name__)
-def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None):
+def save_model_card(
+ repo_id: str,
+ images=None,
+ base_model=str,
+ train_text_encoder=False,
+ prompt=str,
+ repo_folder=None,
+ pipeline: DiffusionPipeline = None,
+):
img_str = ""
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
@@ -80,8 +88,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, train_text_encode
base_model: {base_model}
instance_prompt: {prompt}
tags:
-- stable-diffusion
-- stable-diffusion-diffusers
+- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'}
+- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'}
- text-to-image
- diffusers
- lora
@@ -844,7 +852,7 @@ def main(args):
hidden_size=module.out_features, cross_attention_dim=None
)
text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
- temp_pipeline = StableDiffusionPipeline.from_pretrained(
+ temp_pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path, text_encoder=text_encoder
)
temp_pipeline._modify_text_encoder(text_lora_attn_procs)
@@ -1332,6 +1340,7 @@ def compute_text_embeddings(prompt):
train_text_encoder=args.train_text_encoder,
prompt=args.instance_prompt,
repo_folder=args.output_dir,
+ pipeline=pipeline,
)
upload_folder(
repo_id=repo_id,
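With the `pipeline` argument added above, the model card tags are chosen by pipeline class (Stable Diffusion vs. DeepFloyd IF). The helper below, `resolve_model_card_tags`, is a hypothetical sketch of that selection for the non-LoRA script; the function name is illustrative and not part of the patch.

```python
from diffusers import DiffusionPipeline, StableDiffusionPipeline


def resolve_model_card_tags(pipeline: DiffusionPipeline) -> list:
    # Hypothetical helper mirroring the tag selection above: Stable Diffusion runs
    # get `stable-diffusion*` tags, other pipelines such as DeepFloyd IF get
    # `if*` tags in the generated DreamBooth model card.
    is_sd = isinstance(pipeline, StableDiffusionPipeline)
    return [
        "stable-diffusion" if is_sd else "if",
        "stable-diffusion-diffusers" if is_sd else "if-diffusers",
        "text-to-image",
        "diffusers",
        "dreambooth",
    ]
```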
From a94977b8b32b94ccd00d2f8f812aadb46764baba Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Wed, 24 May 2023 20:28:08 +0800
Subject: [PATCH 135/206] Fix panorama to support all schedulers (#3546)
* refactor blocks init
* refactor blocks loop
* remove unused function and warnings
* fix scheduler update location
* reformat code
* reformat code again
* fix PNDM test case
* reformat pndm test case
---
.../pipeline_stable_diffusion_panorama.py | 33 ++++++++-----------
.../test_stable_diffusion_panorama.py | 15 ++++++---
2 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index 223f8a236efa..66706c806a81 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -11,6 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import copy
import inspect
import warnings
from typing import Any, Callable, Dict, List, Optional, Union
@@ -21,7 +22,7 @@
from ...image_processor import VaeImageProcessor
from ...loaders import TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import DDIMScheduler, PNDMScheduler
+from ...schedulers import DDIMScheduler
from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
@@ -96,9 +97,6 @@ def __init__(
):
super().__init__()
- if isinstance(scheduler, PNDMScheduler):
- logger.error("PNDMScheduler for this pipeline is currently not supported.")
-
if safety_checker is None and requires_safety_checker:
logger.warning(
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
@@ -612,7 +610,7 @@ def __call__(
# 6. Define panorama grid and initialize views for synthesis.
views = self.get_views(height, width)
- blocks_model_outputs = [None] * len(views)
+ views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views)
count = torch.zeros_like(latents)
value = torch.zeros_like(latents)
@@ -637,6 +635,9 @@ def __call__(
# get the latents corresponding to the current view coordinates
latents_for_view = latents[:, :, h_start:h_end, w_start:w_end]
+ # rematch block's scheduler status
+ self.scheduler.__dict__.update(views_scheduler_status[j])
+
# expand the latents if we are doing classifier free guidance
latent_model_input = (
torch.cat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view
@@ -657,21 +658,13 @@ def __call__(
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
- if hasattr(self.scheduler, "model_outputs"):
- # rematch model_outputs in each block
- if i >= 1:
- self.scheduler.model_outputs = blocks_model_outputs[j]
- latents_view_denoised = self.scheduler.step(
- noise_pred, t, latents_for_view, **extra_step_kwargs
- ).prev_sample
- # collect model_outputs
- blocks_model_outputs[j] = [
- output if output is not None else None for output in self.scheduler.model_outputs
- ]
- else:
- latents_view_denoised = self.scheduler.step(
- noise_pred, t, latents_for_view, **extra_step_kwargs
- ).prev_sample
+ latents_view_denoised = self.scheduler.step(
+ noise_pred, t, latents_for_view, **extra_step_kwargs
+ ).prev_sample
+
+ # save views scheduler status after sample
+ views_scheduler_status[j] = copy.deepcopy(self.scheduler.__dict__)
+
value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
count[:, :, h_start:h_end, w_start:w_end] += 1
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 02a15b2a29dc..021065416838 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -174,15 +174,22 @@ def test_stable_diffusion_panorama_euler(self):
def test_stable_diffusion_panorama_pndm(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
- components["scheduler"] = PNDMScheduler()
+ components["scheduler"] = PNDMScheduler(
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+ )
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
- # the pipeline does not expect pndm so test if it raises error.
- with self.assertRaises(ValueError):
- _ = sd_pipe(**inputs).images
+ image = sd_pipe(**inputs).images
+ image_slice = image[0, -3:, -3:, -1]
+
+ assert image.shape == (1, 64, 64, 3)
+
+ expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539])
+
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@slow
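The panorama fix above works by giving every view its own snapshot of the scheduler's internal state, since multi-step schedulers such as PNDM carry history between `step` calls. A minimal, self-contained sketch of that save/restore loop, with random tensors standing in for the UNet output (not the pipeline itself):

```python
import copy

import torch
from diffusers import PNDMScheduler

scheduler = PNDMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
)
scheduler.set_timesteps(10)

views = [(0, 32, 0, 32), (0, 32, 16, 48)]  # (h_start, h_end, w_start, w_end) in latent space
latents = torch.randn(1, 4, 32, 48)
views_scheduler_status = [copy.deepcopy(scheduler.__dict__) for _ in views]

for t in scheduler.timesteps:
    for j, (h_start, h_end, w_start, w_end) in enumerate(views):
        scheduler.__dict__.update(views_scheduler_status[j])           # restore this view's state
        latents_view = latents[:, :, h_start:h_end, w_start:w_end]
        noise_pred = torch.randn_like(latents_view)                    # stand-in for the UNet output
        latents_view = scheduler.step(noise_pred, t, latents_view).prev_sample
        views_scheduler_status[j] = copy.deepcopy(scheduler.__dict__)  # save it back
```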
From f19f1287358beb31a71bc1bf0ef680a2c6155964 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 25 May 2023 12:11:20 +0200
Subject: [PATCH 136/206] Add open parti prompts to docs (#3549)
* Add open parti prompts
* More changes
---
docs/source/en/conceptual/evaluation.mdx | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/docs/source/en/conceptual/evaluation.mdx b/docs/source/en/conceptual/evaluation.mdx
index 2721adea0c16..6e5c14acad4e 100644
--- a/docs/source/en/conceptual/evaluation.mdx
+++ b/docs/source/en/conceptual/evaluation.mdx
@@ -37,7 +37,8 @@ We cover Diffusion models with the following pipelines:
## Qualitative Evaluation
-Qualitative evaluation typically involves human assessment of generated images. Quality is measured across aspects such as compositionality, image-text alignment, and spatial relations. Common prompts provide a degree of uniformity for subjective metrics. DrawBench and PartiPrompts are prompt datasets used for qualitative benchmarking. DrawBench and PartiPrompts were introduced by [Imagen](https://imagen.research.google/) and [Parti](https://parti.research.google/) respectively.
+Qualitative evaluation typically involves human assessment of generated images. Quality is measured across aspects such as compositionality, image-text alignment, and spatial relations. Common prompts provide a degree of uniformity for subjective metrics.
+DrawBench and PartiPrompts are prompt datasets used for qualitative benchmarking. DrawBench and PartiPrompts were introduced by [Imagen](https://imagen.research.google/) and [Parti](https://parti.research.google/) respectively.
From the [official Parti website](https://parti.research.google/):
@@ -51,7 +52,13 @@ PartiPrompts has the following columns:
- Category of the prompt (such as “Abstract”, “World Knowledge”, etc.)
- Challenge reflecting the difficulty (such as “Basic”, “Complex”, “Writing & Symbols”, etc.)
-These benchmarks allow for side-by-side human evaluation of different image generation models. Let’s see how we can use `diffusers` on a couple of PartiPrompts.
+These benchmarks allow for side-by-side human evaluation of different image generation models.
+
+For this, the 🧨 Diffusers team has built **Open Parti Prompts**, which is a community-driven qualitative benchmark based on Parti Prompts to compare state-of-the-art open-source diffusion models:
+- [Open Parti Prompts Game](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts): For 10 Parti prompts, 4 generated images are shown, and the user selects the image that suits the prompt best.
+- [Open Parti Prompts Leaderboard](https://huggingface.co/spaces/OpenGenAI/parti-prompts-leaderboard): A leaderboard that compares the best currently available open-source diffusion models to each other.
+
+To manually compare images, let’s see how we can use `diffusers` on a couple of PartiPrompts.
Below we show some prompts sampled across different challenges: Basic, Complex, Linguistic Structures, Imagination, and Writing & Symbols. Here we are using PartiPrompts as a [dataset](https://huggingface.co/datasets/nateraw/parti-prompts).
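To sample prompts for such a manual comparison, the PartiPrompts dataset can be loaded with 🤗 Datasets. A short sketch, assuming the `nateraw/parti-prompts` mirror linked above and its `Prompt`, `Category`, and `Challenge` columns:

```python
from datasets import load_dataset

# Sample one prompt together with its category and difficulty.
parti_prompts = load_dataset("nateraw/parti-prompts", split="train")
sample = parti_prompts.shuffle(seed=0)[0]
print(sample["Prompt"], "|", sample["Category"], "|", sample["Challenge"])
```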
From 03b7a84cbee11fa1cff98e5275050f284da168df Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Thu, 25 May 2023 11:28:34 -1000
Subject: [PATCH 137/206] Add Kandinsky 2.1 (#3308)
add kandinsky2.1
---------
Co-authored-by: yiyixuxu
Co-authored-by: Ayush Mangal <43698245+ayushtues@users.noreply.github.com>
Co-authored-by: ayushmangal
Co-authored-by: Patrick von Platen
Co-authored-by: Sayak Paul
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/api/pipelines/kandinsky.mdx | 306 ++++
scripts/convert_kandinsky_to_diffusers.py | 1400 +++++++++++++++++
src/diffusers/__init__.py | 4 +
src/diffusers/models/attention_processor.py | 45 +-
src/diffusers/models/embeddings.py | 45 +
src/diffusers/models/resnet.py | 13 +-
src/diffusers/models/unet_2d_blocks.py | 30 +-
src/diffusers/models/unet_2d_condition.py | 78 +-
src/diffusers/models/vae.py | 37 +-
src/diffusers/models/vq_model.py | 6 +-
src/diffusers/pipelines/__init__.py | 6 +
src/diffusers/pipelines/kandinsky/__init__.py | 19 +
.../pipelines/kandinsky/pipeline_kandinsky.py | 463 ++++++
.../kandinsky/pipeline_kandinsky_img2img.py | 547 +++++++
.../kandinsky/pipeline_kandinsky_inpaint.py | 672 ++++++++
.../kandinsky/pipeline_kandinsky_prior.py | 563 +++++++
.../pipelines/kandinsky/text_encoder.py | 27 +
.../versatile_diffusion/modeling_text_unet.py | 80 +-
.../dummy_torch_and_transformers_objects.py | 60 +
tests/pipelines/kandinsky/__init__.py | 0
tests/pipelines/kandinsky/test_kandinsky.py | 282 ++++
.../kandinsky/test_kandinsky_img2img.py | 303 ++++
.../kandinsky/test_kandinsky_inpaint.py | 313 ++++
.../kandinsky/test_kandinsky_prior.py | 236 +++
tests/pipelines/test_pipelines_common.py | 2 +-
26 files changed, 5497 insertions(+), 42 deletions(-)
create mode 100644 docs/source/en/api/pipelines/kandinsky.mdx
create mode 100644 scripts/convert_kandinsky_to_diffusers.py
create mode 100644 src/diffusers/pipelines/kandinsky/__init__.py
create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py
create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
create mode 100644 src/diffusers/pipelines/kandinsky/text_encoder.py
create mode 100644 tests/pipelines/kandinsky/__init__.py
create mode 100644 tests/pipelines/kandinsky/test_kandinsky.py
create mode 100644 tests/pipelines/kandinsky/test_kandinsky_img2img.py
create mode 100644 tests/pipelines/kandinsky/test_kandinsky_inpaint.py
create mode 100644 tests/pipelines/kandinsky/test_kandinsky_prior.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index aa2d907da4bd..368ea30a2690 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -166,6 +166,8 @@
title: DiT
- local: api/pipelines/if
title: IF
+ - local: api/pipelines/kandinsky
+ title: Kandinsky
- local: api/pipelines/latent_diffusion
title: Latent Diffusion
- local: api/pipelines/paint_by_example
diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx
new file mode 100644
index 000000000000..b5b4f0f06400
--- /dev/null
+++ b/docs/source/en/api/pipelines/kandinsky.mdx
@@ -0,0 +1,306 @@
+
+
+# Kandinsky
+
+## Overview
+
+Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas.
+
+It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.
+
+The Kandinsky model was created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov), and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab |
+|---|---|:---:|
+| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | - |
+| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - |
+| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | - |
+
+## Usage example
+
+In the following, we will walk you through some cool examples of using the Kandinsky pipelines to create visually appealing artwork.
+
+### Text-to-Image Generation
+
+For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to encode text prompts with CLIP and then diffuse the CLIP text embeddings to CLIP image embeddings, as first proposed in [DALL-E 2](https://cdn.openai.com/papers/dall-e-2.pdf). Let's throw a fun prompt at Kandinsky to see what it comes up with :)
+
+```python
+prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
+negative_prompt = "low quality, bad quality"
+```
+
+We will pass both the `prompt` and `negative_prompt` to our prior diffusion pipeline. In contrast to other diffusion pipelines, such as Stable Diffusion, the `prompt` and `negative_prompt` are passed separately so that we can retrieve a CLIP image embedding for each prompt input. You can use the `guidance_scale` and `num_inference_steps` arguments to guide this process, just as you would with the other pipelines in diffusers.
+
+```python
+from diffusers import KandinskyPriorPipeline
+import torch
+
+# create prior
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
+pipe_prior.to("cuda")
+
+generator = torch.Generator(device="cuda").manual_seed(12)
+image_emb = pipe_prior(
+ prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
+).images
+
+zero_image_emb = pipe_prior(
+ negative_prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
+).images
+```
+
+Once we have created the image embeddings, we can use [`KandinskyPipeline`] to generate images.
+
+```python
+from PIL import Image
+from diffusers import KandinskyPipeline
+
+
+def image_grid(imgs, rows, cols):
+ assert len(imgs) == rows * cols
+
+ w, h = imgs[0].size
+ grid = Image.new("RGB", size=(cols * w, rows * h))
+ grid_w, grid_h = grid.size
+
+ for i, img in enumerate(imgs):
+ grid.paste(img, box=(i % cols * w, i // cols * h))
+ return grid
+
+
+# create diffuser pipeline
+pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+images = pipe(
+ prompt,
+ image_embeds=image_emb,
+ negative_image_embeds=zero_image_emb,
+ num_images_per_prompt=2,
+ height=768,
+ width=768,
+ num_inference_steps=100,
+ guidance_scale=4.0,
+ generator=generator,
+).images
+```
+
+One cheeseburger monster coming up! Enjoy!
+
+
+
+The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts.
+
+```python
+prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background"
+```
+
+
+```python
+prompt = "A car exploding into colorful dust"
+```
+
+
+```python
+prompt = "editorial photography of an organic, almost liquid smoke style armchair"
+```
+
+
+```python
+prompt = "birds eye view of a quilted paper style alien planet landscape, vibrant colours, Cinematic lighting"
+```
+
+
+
+### Text Guided Image-to-Image Generation
+
+The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline.
+
+**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines
+without loading them twice by making use of the [`~DiffusionPipeline.components`] function as explained [here](#converting-between-different-pipelines).
+
+Let's download an image.
+
+```python
+from PIL import Image
+import requests
+from io import BytesIO
+
+# download image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+response = requests.get(url)
+original_image = Image.open(BytesIO(response.content)).convert("RGB")
+original_image = original_image.resize((768, 512))
+```
+
+
+
+```python
+import torch
+from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
+
+# create prior
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
+pipe_prior.to("cuda")
+
+# create img2img pipeline
+pipe = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+prompt = "A fantasy landscape, Cinematic lighting"
+negative_prompt = "low quality, bad quality"
+
+generator = torch.Generator(device="cuda").manual_seed(30)
+image_emb = pipe_prior(
+ prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
+).images
+
+zero_image_emb = pipe_prior(
+ negative_prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
+).images
+
+out = pipe(
+ prompt,
+ image=original_image,
+ image_embeds=image_emb,
+ negative_image_embeds=zero_image_emb,
+ height=768,
+ width=768,
+ num_inference_steps=500,
+ strength=0.3,
+)
+
+out.images[0].save("fantasy_land.png")
+```
+
+
+
+
+### Text Guided Inpainting Generation
+
+You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat.
+
+```python
+from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline
+from diffusers.utils import load_image
+import torch
+import numpy as np
+
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
+pipe_prior.to("cuda")
+
+prompt = "a hat"
+image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+
+pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+init_image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+)
+
+mask = np.ones((768, 768), dtype=np.float32)
+# Let's mask out an area above the cat's head
+mask[:250, 250:-250] = 0
+
+out = pipe(
+ prompt,
+ image=init_image,
+ mask_image=mask,
+ image_embeds=image_emb,
+ negative_image_embeds=zero_image_emb,
+ height=768,
+ width=768,
+ num_inference_steps=150,
+)
+
+image = out.images[0]
+image.save("cat_with_hat.png")
+```
+
+
+### Interpolate
+
+The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night".
+
+Note that you can interpolate between texts and images - in the example below, we pass a text prompt "a cat" and two images to the `interpolate` function, along with a `weights` variable containing the corresponding weights for each condition we interpolate.
+
+```python
+from diffusers import KandinskyPriorPipeline, KandinskyPipeline
+from diffusers.utils import load_image
+import PIL
+
+import torch
+from torchvision import transforms
+
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
+pipe_prior.to("cuda")
+
+img1 = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+)
+
+img2 = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg"
+)
+
+# add all the conditions we want to interpolate, can be either text or image
+images_texts = ["a cat", img1, img2]
+# specify the weights for each condition in images_texts
+weights = [0.3, 0.3, 0.4]
+image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
+
+pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(
+ "", image_embeds=image_emb, negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150
+).images[0]
+
+image.save("starry_cat.png")
+```
+
+
+
+## KandinskyPriorPipeline
+
+[[autodoc]] KandinskyPriorPipeline
+ - all
+ - __call__
+ - interpolate
+
+## KandinskyPipeline
+
+[[autodoc]] KandinskyPipeline
+ - all
+ - __call__
+
+## KandinskyInpaintPipeline
+
+[[autodoc]] KandinskyInpaintPipeline
+ - all
+ - __call__
+
+## KandinskyImg2ImgPipeline
+
+[[autodoc]] KandinskyImg2ImgPipeline
+ - all
+ - __call__
+
diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py
new file mode 100644
index 000000000000..de9879f7f03b
--- /dev/null
+++ b/scripts/convert_kandinsky_to_diffusers.py
@@ -0,0 +1,1400 @@
+import argparse
+import os
+import tempfile
+
+import torch
+from accelerate import load_checkpoint_and_dispatch
+
+from diffusers import UNet2DConditionModel
+from diffusers.models.prior_transformer import PriorTransformer
+from diffusers.models.vq_model import VQModel
+from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel
+
+
+"""
+Example - From the diffusers root directory:
+
+Download weights:
+```sh
+$ wget https://huggingface.co/ai-forever/Kandinsky_2.1/blob/main/prior_fp16.ckpt
+```
+
+Convert the model:
+```sh
+python scripts/convert_kandinsky_to_diffusers.py \
+ --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \
+ --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \
+ --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \
+ --inpaint_text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/inpainting_fp16.ckpt \
+ --movq_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/movq_final.ckpt \
+ --dump_path /home/yiyi_huggingface_co/dump \
+ --debug decoder
+```
+"""
+
+
+# prior
+
+PRIOR_ORIGINAL_PREFIX = "model"
+
+# Uses default arguments
+PRIOR_CONFIG = {}
+
+
+def prior_model_from_original_config():
+ model = PriorTransformer(**PRIOR_CONFIG)
+
+ return model
+
+
+def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint, clip_stats_checkpoint):
+ diffusers_checkpoint = {}
+
+ # .time_embed.0 -> .time_embedding.linear_1
+ diffusers_checkpoint.update(
+ {
+ "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.weight"],
+ "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.bias"],
+ }
+ )
+
+ # .clip_img_proj -> .proj_in
+ diffusers_checkpoint.update(
+ {
+ "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.weight"],
+ "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.bias"],
+ }
+ )
+
+ # .text_emb_proj -> .embedding_proj
+ diffusers_checkpoint.update(
+ {
+ "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.weight"],
+ "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.bias"],
+ }
+ )
+
+ # .text_enc_proj -> .encoder_hidden_states_proj
+ diffusers_checkpoint.update(
+ {
+ "encoder_hidden_states_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.weight"],
+ "encoder_hidden_states_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.bias"],
+ }
+ )
+
+ # .positional_embedding -> .positional_embedding
+ diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.positional_embedding"]})
+
+ # .prd_emb -> .prd_embedding
+ diffusers_checkpoint.update({"prd_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.prd_emb"]})
+
+ # .time_embed.2 -> .time_embedding.linear_2
+ diffusers_checkpoint.update(
+ {
+ "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.weight"],
+ "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.bias"],
+ }
+ )
+
+ # .resblocks. -> .transformer_blocks.
+ for idx in range(len(model.transformer_blocks)):
+ diffusers_transformer_prefix = f"transformer_blocks.{idx}"
+ original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.transformer.resblocks.{idx}"
+
+ # .attn -> .attn1
+ diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1"
+ original_attention_prefix = f"{original_transformer_prefix}.attn"
+ diffusers_checkpoint.update(
+ prior_attention_to_diffusers(
+ checkpoint,
+ diffusers_attention_prefix=diffusers_attention_prefix,
+ original_attention_prefix=original_attention_prefix,
+ attention_head_dim=model.attention_head_dim,
+ )
+ )
+
+ # .mlp -> .ff
+ diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff"
+ original_ff_prefix = f"{original_transformer_prefix}.mlp"
+ diffusers_checkpoint.update(
+ prior_ff_to_diffusers(
+ checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix
+ )
+ )
+
+ # .ln_1 -> .norm1
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[
+ f"{original_transformer_prefix}.ln_1.weight"
+ ],
+ f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"],
+ }
+ )
+
+ # .ln_2 -> .norm3
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[
+ f"{original_transformer_prefix}.ln_2.weight"
+ ],
+ f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"],
+ }
+ )
+
+ # .final_ln -> .norm_out
+ diffusers_checkpoint.update(
+ {
+ "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.weight"],
+ "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.bias"],
+ }
+ )
+
+ # .out_proj -> .proj_to_clip_embeddings
+ diffusers_checkpoint.update(
+ {
+ "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.weight"],
+ "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.bias"],
+ }
+ )
+
+ # clip stats
+ clip_mean, clip_std = clip_stats_checkpoint
+ clip_mean = clip_mean[None, :]
+ clip_std = clip_std[None, :]
+
+ diffusers_checkpoint.update({"clip_mean": clip_mean, "clip_std": clip_std})
+
+ return diffusers_checkpoint
+
+
+def prior_attention_to_diffusers(
+ checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim
+):
+ diffusers_checkpoint = {}
+
+ # .c_qkv -> .{to_q, to_k, to_v}
+ [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+ weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"],
+ bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"],
+ split=3,
+ chunk_size=attention_head_dim,
+ )
+
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+ f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+ f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+ f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+ f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+ f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+ }
+ )
+
+ # .c_proj -> .to_out.0
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"],
+ f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix):
+ diffusers_checkpoint = {
+ # .c_fc -> .net.0.proj
+ f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"],
+ f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"],
+ # .c_proj -> .net.2
+ f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"],
+ f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"],
+ }
+
+ return diffusers_checkpoint
+
+
+# done prior
+
+# unet
+
+# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can
+# update then.
+
+UNET_CONFIG = {
+ "act_fn": "silu",
+ "attention_head_dim": 64,
+ "block_out_channels": (384, 768, 1152, 1536),
+ "center_input_sample": False,
+ "class_embed_type": "identity",
+ "cross_attention_dim": 768,
+ "down_block_types": (
+ "ResnetDownsampleBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ ),
+ "downsample_padding": 1,
+ "dual_cross_attention": False,
+ "flip_sin_to_cos": True,
+ "freq_shift": 0,
+ "in_channels": 4,
+ "layers_per_block": 3,
+ "mid_block_scale_factor": 1,
+ "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+ "norm_eps": 1e-05,
+ "norm_num_groups": 32,
+ "only_cross_attention": False,
+ "out_channels": 8,
+ "resnet_time_scale_shift": "scale_shift",
+ "sample_size": 64,
+ "up_block_types": (
+ "SimpleCrossAttnUpBlock2D",
+ "SimpleCrossAttnUpBlock2D",
+ "SimpleCrossAttnUpBlock2D",
+ "ResnetUpsampleBlock2D",
+ ),
+ "upcast_attention": False,
+ "use_linear_projection": False,
+}
+
+
+def unet_model_from_original_config():
+ model = UNet2DConditionModel(**UNET_CONFIG)
+
+ return model
+
+
+def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+ diffusers_checkpoint = {}
+
+ num_head_channels = UNET_CONFIG["attention_head_dim"]
+
+ diffusers_checkpoint.update(unet_time_embeddings(checkpoint))
+ diffusers_checkpoint.update(unet_conv_in(checkpoint))
+
+ # .input_blocks -> .down_blocks
+
+ original_down_block_idx = 1
+
+ for diffusers_down_block_idx in range(len(model.down_blocks)):
+ checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ diffusers_down_block_idx=diffusers_down_block_idx,
+ original_down_block_idx=original_down_block_idx,
+ num_head_channels=num_head_channels,
+ )
+
+ original_down_block_idx += num_original_down_blocks
+
+ diffusers_checkpoint.update(checkpoint_update)
+
+ # done .input_blocks -> .down_blocks
+
+ diffusers_checkpoint.update(
+ unet_midblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ num_head_channels=num_head_channels,
+ )
+ )
+
+ # .output_blocks -> .up_blocks
+
+ original_up_block_idx = 0
+
+ for diffusers_up_block_idx in range(len(model.up_blocks)):
+ checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ diffusers_up_block_idx=diffusers_up_block_idx,
+ original_up_block_idx=original_up_block_idx,
+ num_head_channels=num_head_channels,
+ )
+
+ original_up_block_idx += num_original_up_blocks
+
+ diffusers_checkpoint.update(checkpoint_update)
+
+ # done .output_blocks -> .up_blocks
+
+ diffusers_checkpoint.update(unet_conv_norm_out(checkpoint))
+ diffusers_checkpoint.update(unet_conv_out(checkpoint))
+
+ return diffusers_checkpoint
+
+
+# done unet
+
+# inpaint unet
+
+# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can
+# update then.
+
+INPAINT_UNET_CONFIG = {
+ "act_fn": "silu",
+ "attention_head_dim": 64,
+ "block_out_channels": (384, 768, 1152, 1536),
+ "center_input_sample": False,
+ "class_embed_type": "identity",
+ "cross_attention_dim": 768,
+ "down_block_types": (
+ "ResnetDownsampleBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ "SimpleCrossAttnDownBlock2D",
+ ),
+ "downsample_padding": 1,
+ "dual_cross_attention": False,
+ "flip_sin_to_cos": True,
+ "freq_shift": 0,
+ "in_channels": 9,
+ "layers_per_block": 3,
+ "mid_block_scale_factor": 1,
+ "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+ "norm_eps": 1e-05,
+ "norm_num_groups": 32,
+ "only_cross_attention": False,
+ "out_channels": 8,
+ "resnet_time_scale_shift": "scale_shift",
+ "sample_size": 64,
+ "up_block_types": (
+ "SimpleCrossAttnUpBlock2D",
+ "SimpleCrossAttnUpBlock2D",
+ "SimpleCrossAttnUpBlock2D",
+ "ResnetUpsampleBlock2D",
+ ),
+ "upcast_attention": False,
+ "use_linear_projection": False,
+}
+
+
+def inpaint_unet_model_from_original_config():
+ model = UNet2DConditionModel(**INPAINT_UNET_CONFIG)
+
+ return model
+
+
+def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+ diffusers_checkpoint = {}
+
+ num_head_channels = UNET_CONFIG["attention_head_dim"]
+
+ diffusers_checkpoint.update(unet_time_embeddings(checkpoint))
+ diffusers_checkpoint.update(unet_conv_in(checkpoint))
+
+ # .input_blocks -> .down_blocks
+
+ original_down_block_idx = 1
+
+ for diffusers_down_block_idx in range(len(model.down_blocks)):
+ checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ diffusers_down_block_idx=diffusers_down_block_idx,
+ original_down_block_idx=original_down_block_idx,
+ num_head_channels=num_head_channels,
+ )
+
+ original_down_block_idx += num_original_down_blocks
+
+ diffusers_checkpoint.update(checkpoint_update)
+
+ # done .input_blocks -> .down_blocks
+
+ diffusers_checkpoint.update(
+ unet_midblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ num_head_channels=num_head_channels,
+ )
+ )
+
+ # .output_blocks -> .up_blocks
+
+ original_up_block_idx = 0
+
+ for diffusers_up_block_idx in range(len(model.up_blocks)):
+ checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint(
+ model,
+ checkpoint,
+ diffusers_up_block_idx=diffusers_up_block_idx,
+ original_up_block_idx=original_up_block_idx,
+ num_head_channels=num_head_channels,
+ )
+
+ original_up_block_idx += num_original_up_blocks
+
+ diffusers_checkpoint.update(checkpoint_update)
+
+ # done .output_blocks -> .up_blocks
+
+ diffusers_checkpoint.update(unet_conv_norm_out(checkpoint))
+ diffusers_checkpoint.update(unet_conv_out(checkpoint))
+
+ return diffusers_checkpoint
+
+
+# done inpaint unet
+
+# text proj
+
+TEXT_PROJ_CONFIG = {}
+
+
+def text_proj_from_original_config():
+ model = KandinskyTextProjModel(**TEXT_PROJ_CONFIG)
+ return model
+
+
+# Note that the input checkpoint is the original text2img model checkpoint
+def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint):
+ diffusers_checkpoint = {
+ # .text_seq_proj.0 -> .encoder_hidden_states_proj
+ "encoder_hidden_states_proj.weight": checkpoint["to_model_dim_n.weight"],
+ "encoder_hidden_states_proj.bias": checkpoint["to_model_dim_n.bias"],
+ # .clip_tok_proj -> .clip_extra_context_tokens_proj
+ "clip_extra_context_tokens_proj.weight": checkpoint["clip_to_seq.weight"],
+ "clip_extra_context_tokens_proj.bias": checkpoint["clip_to_seq.bias"],
+ # .proj_n -> .embedding_proj
+ "embedding_proj.weight": checkpoint["proj_n.weight"],
+ "embedding_proj.bias": checkpoint["proj_n.bias"],
+ # .ln_model_n -> .embedding_norm
+ "embedding_norm.weight": checkpoint["ln_model_n.weight"],
+ "embedding_norm.bias": checkpoint["ln_model_n.bias"],
+ # .clip_emb -> .clip_image_embeddings_project_to_time_embeddings
+ "clip_image_embeddings_project_to_time_embeddings.weight": checkpoint["img_layer.weight"],
+ "clip_image_embeddings_project_to_time_embeddings.bias": checkpoint["img_layer.bias"],
+ }
+
+ return diffusers_checkpoint
+
+
+# unet utils
+
+
+# .time_embed -> .time_embedding
+def unet_time_embeddings(checkpoint):
+ diffusers_checkpoint = {}
+
+ diffusers_checkpoint.update(
+ {
+ "time_embedding.linear_1.weight": checkpoint["time_embed.0.weight"],
+ "time_embedding.linear_1.bias": checkpoint["time_embed.0.bias"],
+ "time_embedding.linear_2.weight": checkpoint["time_embed.2.weight"],
+ "time_embedding.linear_2.bias": checkpoint["time_embed.2.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+# .input_blocks.0 -> .conv_in
+def unet_conv_in(checkpoint):
+ diffusers_checkpoint = {}
+
+ diffusers_checkpoint.update(
+ {
+ "conv_in.weight": checkpoint["input_blocks.0.0.weight"],
+ "conv_in.bias": checkpoint["input_blocks.0.0.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+# .out.0 -> .conv_norm_out
+def unet_conv_norm_out(checkpoint):
+ diffusers_checkpoint = {}
+
+ diffusers_checkpoint.update(
+ {
+ "conv_norm_out.weight": checkpoint["out.0.weight"],
+ "conv_norm_out.bias": checkpoint["out.0.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+# .out.2 -> .conv_out
+def unet_conv_out(checkpoint):
+ diffusers_checkpoint = {}
+
+ diffusers_checkpoint.update(
+ {
+ "conv_out.weight": checkpoint["out.2.weight"],
+ "conv_out.bias": checkpoint["out.2.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+# .input_blocks -> .down_blocks
+def unet_downblock_to_diffusers_checkpoint(
+ model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, num_head_channels
+):
+ diffusers_checkpoint = {}
+
+ diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.resnets"
+ original_down_block_prefix = "input_blocks"
+
+ down_block = model.down_blocks[diffusers_down_block_idx]
+
+ num_resnets = len(down_block.resnets)
+
+ if down_block.downsamplers is None:
+ downsampler = False
+ else:
+ assert len(down_block.downsamplers) == 1
+ downsampler = True
+ # The downsample block is also a resnet
+ num_resnets += 1
+
+ for resnet_idx_inc in range(num_resnets):
+ full_resnet_prefix = f"{original_down_block_prefix}.{original_down_block_idx + resnet_idx_inc}.0"
+
+ if downsampler and resnet_idx_inc == num_resnets - 1:
+ # this is a downsample block
+ full_diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.downsamplers.0"
+ else:
+ # this is a regular resnet block
+ full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+ diffusers_checkpoint.update(
+ resnet_to_diffusers_checkpoint(
+ checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+ )
+ )
+
+ if hasattr(down_block, "attentions"):
+ num_attentions = len(down_block.attentions)
+ diffusers_attention_prefix = f"down_blocks.{diffusers_down_block_idx}.attentions"
+
+ for attention_idx_inc in range(num_attentions):
+ full_attention_prefix = f"{original_down_block_prefix}.{original_down_block_idx + attention_idx_inc}.1"
+ full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+ diffusers_checkpoint.update(
+ attention_to_diffusers_checkpoint(
+ checkpoint,
+ attention_prefix=full_attention_prefix,
+ diffusers_attention_prefix=full_diffusers_attention_prefix,
+ num_head_channels=num_head_channels,
+ )
+ )
+
+ num_original_down_blocks = num_resnets
+
+ return diffusers_checkpoint, num_original_down_blocks
+
+
+# .middle_block -> .mid_block
+def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, num_head_channels):
+ diffusers_checkpoint = {}
+
+ # block 0
+
+ original_block_idx = 0
+
+ diffusers_checkpoint.update(
+ resnet_to_diffusers_checkpoint(
+ checkpoint,
+ diffusers_resnet_prefix="mid_block.resnets.0",
+ resnet_prefix=f"middle_block.{original_block_idx}",
+ )
+ )
+
+ original_block_idx += 1
+
+ # optional block 1
+
+ if hasattr(model.mid_block, "attentions") and model.mid_block.attentions[0] is not None:
+ diffusers_checkpoint.update(
+ attention_to_diffusers_checkpoint(
+ checkpoint,
+ diffusers_attention_prefix="mid_block.attentions.0",
+ attention_prefix=f"middle_block.{original_block_idx}",
+ num_head_channels=num_head_channels,
+ )
+ )
+ original_block_idx += 1
+
+ # block 1 or block 2
+
+ diffusers_checkpoint.update(
+ resnet_to_diffusers_checkpoint(
+ checkpoint,
+ diffusers_resnet_prefix="mid_block.resnets.1",
+ resnet_prefix=f"middle_block.{original_block_idx}",
+ )
+ )
+
+ return diffusers_checkpoint
+
+
+# .output_blocks -> .up_blocks
+def unet_upblock_to_diffusers_checkpoint(
+ model, checkpoint, *, diffusers_up_block_idx, original_up_block_idx, num_head_channels
+):
+ diffusers_checkpoint = {}
+
+ diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.resnets"
+ original_up_block_prefix = "output_blocks"
+
+ up_block = model.up_blocks[diffusers_up_block_idx]
+
+ num_resnets = len(up_block.resnets)
+
+ if up_block.upsamplers is None:
+ upsampler = False
+ else:
+ assert len(up_block.upsamplers) == 1
+ upsampler = True
+ # The upsample block is also a resnet
+ num_resnets += 1
+
+ has_attentions = hasattr(up_block, "attentions")
+
+ for resnet_idx_inc in range(num_resnets):
+ if upsampler and resnet_idx_inc == num_resnets - 1:
+ # this is an upsample block
+ if has_attentions:
+ # There is a middle attention block that we skip
+ original_resnet_block_idx = 2
+ else:
+ original_resnet_block_idx = 1
+
+ # we add the `minus 1` because the last two resnets are stuck together in the same output block
+ full_resnet_prefix = (
+ f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc - 1}.{original_resnet_block_idx}"
+ )
+
+ full_diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.upsamplers.0"
+ else:
+ # this is a regular resnet block
+ full_resnet_prefix = f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc}.0"
+ full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}"
+
+ diffusers_checkpoint.update(
+ resnet_to_diffusers_checkpoint(
+ checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix
+ )
+ )
+
+ if has_attentions:
+ num_attentions = len(up_block.attentions)
+ diffusers_attention_prefix = f"up_blocks.{diffusers_up_block_idx}.attentions"
+
+ for attention_idx_inc in range(num_attentions):
+ full_attention_prefix = f"{original_up_block_prefix}.{original_up_block_idx + attention_idx_inc}.1"
+ full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}"
+
+ diffusers_checkpoint.update(
+ attention_to_diffusers_checkpoint(
+ checkpoint,
+ attention_prefix=full_attention_prefix,
+ diffusers_attention_prefix=full_diffusers_attention_prefix,
+ num_head_channels=num_head_channels,
+ )
+ )
+
+ num_original_down_blocks = num_resnets - 1 if upsampler else num_resnets
+
+ return diffusers_checkpoint, num_original_down_blocks
+
+
+def resnet_to_diffusers_checkpoint(checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+ diffusers_checkpoint = {
+ f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.in_layers.0.weight"],
+ f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.in_layers.0.bias"],
+ f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.in_layers.2.weight"],
+ f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.in_layers.2.bias"],
+ f"{diffusers_resnet_prefix}.time_emb_proj.weight": checkpoint[f"{resnet_prefix}.emb_layers.1.weight"],
+ f"{diffusers_resnet_prefix}.time_emb_proj.bias": checkpoint[f"{resnet_prefix}.emb_layers.1.bias"],
+ f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.out_layers.0.weight"],
+ f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.out_layers.0.bias"],
+ f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.out_layers.3.weight"],
+ f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.out_layers.3.bias"],
+ }
+
+ skip_connection_prefix = f"{resnet_prefix}.skip_connection"
+
+ if f"{skip_connection_prefix}.weight" in checkpoint:
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{skip_connection_prefix}.weight"],
+ f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{skip_connection_prefix}.bias"],
+ }
+ )
+
+ return diffusers_checkpoint
+
+
+def attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix, num_head_channels):
+ diffusers_checkpoint = {}
+
+ # .norm -> .group_norm
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+ f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+ }
+ )
+
+ # .qkv -> .{query, key, value}
+ [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions(
+ weight=checkpoint[f"{attention_prefix}.qkv.weight"][:, :, 0],
+ bias=checkpoint[f"{attention_prefix}.qkv.bias"],
+ split=3,
+ chunk_size=num_head_channels,
+ )
+
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_attention_prefix}.to_q.weight": q_weight,
+ f"{diffusers_attention_prefix}.to_q.bias": q_bias,
+ f"{diffusers_attention_prefix}.to_k.weight": k_weight,
+ f"{diffusers_attention_prefix}.to_k.bias": k_bias,
+ f"{diffusers_attention_prefix}.to_v.weight": v_weight,
+ f"{diffusers_attention_prefix}.to_v.bias": v_bias,
+ }
+ )
+
+ # .encoder_kv -> .{context_key, context_value}
+ [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions(
+ weight=checkpoint[f"{attention_prefix}.encoder_kv.weight"][:, :, 0],
+ bias=checkpoint[f"{attention_prefix}.encoder_kv.bias"],
+ split=2,
+ chunk_size=num_head_channels,
+ )
+
+ diffusers_checkpoint.update(
+ {
+ f"{diffusers_attention_prefix}.add_k_proj.weight": encoder_k_weight,
+ f"{diffusers_attention_prefix}.add_k_proj.bias": encoder_k_bias,
+ f"{diffusers_attention_prefix}.add_v_proj.weight": encoder_v_weight,
+ f"{diffusers_attention_prefix}.add_v_proj.bias": encoder_v_bias,
+ }
+ )
+
+ #