##  StableDiffusionUniPipeline
### Prepare

```bash
# 1. Install diffusers
pip install diffusers==0.20.2

# 2. Move .py to diffusers install path
mv pipeline_stable_diffusion_uni.py xxx/diffusers/pipelines/stable_diffusion/
mv pipeline_stable_diffusion_uni_parallel.py xxx/diffusers/pipelines/stable_diffusion/

## 3. Edit __init__.py
vim xxx/diffusers/pipelines/stable_diffusion/__init__.py
#line:210 + StableDiffusionUniPipeline
#line:211 + StableDiffusionUniParallelPipeline

vim xxx/diffusers/pipelines/stable_diffusion/pipelines/__init__.py
#line:110 + StableDiffusionUniPipeline
#line:111 + StableDiffusionUniParallelPipeline

vim xxx/diffusers/pipelines/stable_diffusion/pipelines/stable_diffusion/__init__.py
#line64 + from .pipeline_stable_diffusion_uni import StableDiffusionUniPipeline
#line64 + from .pipeline_stable_diffusion_uni_parallel import StableDiffusionUniParallelPipeline
```

In [None]:
!pip freeze|grep diffusers

In [None]:
import torch
model_path = 'runwayml/stable-diffusion-v1-5'

prompt = ["a photograph of an astronaut riding a horse"]
height = 512                        # default height of Stable Diffusion
width = 512                         # default width of Stable Diffusion
num_inference_steps = 20            # Number of denoising steps
guidance_scale = 7.5                # Scale for classifier-free guidance
generator = torch.manual_seed(32)   # Seed generator to create the inital latent noise

### case1: 原txt2img

In [None]:
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(model_path)
pipe = pipe.to("cuda")

image = pipe(
    prompt,
    height=height,
    width=width,
    generator=generator,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    ).images[0]

image.save(f"case1.png")

del pipe
torch.cuda.empty_cache()

### case2: 原img2img


In [None]:
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_path)
pipe = pipe.to("cuda")

img = Image.open("case1.png")
image = pipe(
    prompt,
    generator=generator,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    image=img
    ).images[0]

image.save(f"case2.png")

del pipe
torch.cuda.empty_cache()

### case3: 合并txt2img和img2img

In [None]:
from PIL import Image
from diffusers import StableDiffusionUniPipeline
pipe = StableDiffusionUniPipeline.from_pretrained(model_path)
pipe = pipe.to("cuda")

image = pipe(prompt).images[0]
image.save(f"case3_1.png")

del pipe
torch.cuda.empty_cache()

img = Image.open("case3_1.png")
image = pipe(prompt, img).images[0]
image.save(f"case3_2.png")

del pipe
torch.cuda.empty_cache()

### case4：合并txt2img和img2img，同时CFG并行计算加速
StableDiffusionUniParallelPipeline类准备工作同上

In [None]:
from PIL import Image
from diffusers import StableDiffusionUniParallelPipeline
pipe = StableDiffusionUniParallelPipeline.from_pretrained(model_path, single_gpu_parallel=False)

image = pipe(prompt).images[0]
image.save(f"case4_1.png")

del pipe
torch.cuda.empty_cache()

img = Image.open("case4_1.png")
image = pipe(prompt, img).images[0]
image.save(f"case4_2.png")

del pipe
torch.cuda.empty_cache()

### BenchMark

In [1]:
import torch
model_path = 'runwayml/stable-diffusion-v1-5'
prompt = ["a photograph of an astronaut riding a horse"]
num_inference_steps = 20

from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionUniPipeline
from diffusers import StableDiffusionUniParallelPipeline

# Function execution time statistics（decorator）
def timer(f):
    import time
    def inner(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        print(f"==> time: {round(end-start, 3)}s")
        return result
    return inner

@timer
def benchmark(pipe, prompt, num_inference_steps):
    image = pipe(prompt, num_inference_steps=num_inference_steps).images[0]
    return image

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pipe = StableDiffusionPipeline.from_pretrained(model_path, dtype=torch.float16).to("cuda")
frame = benchmark(pipe, prompt, num_inference_steps)

del pipe
torch.cuda.empty_cache()

Keyword arguments {'dtype': torch.float16} are not expected by StableDiffusionPipeline and will be ignored.
Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 31.17it/s]
100%|██████████| 20/20 [00:06<00:00,  3.07it/s]


==> time: 6.839s


In [7]:
pipe = StableDiffusionUniPipeline.from_pretrained(model_path, dtype=torch.float16).to("cuda")
frame = benchmark(pipe, prompt, num_inference_steps)

del pipe
torch.cuda.empty_cache()

Keyword arguments {'dtype': torch.float16} are not expected by StableDiffusionUniPipeline and will be ignored.
Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 31.55it/s]
100%|██████████| 20/20 [00:06<00:00,  3.06it/s]


==> time: 6.869s


In [13]:
pipe = StableDiffusionUniParallelPipeline.from_pretrained(model_path, single_gpu_parallel=True) #单卡CFG并行
frame = benchmark(pipe, prompt, num_inference_steps)

del pipe
torch.cuda.empty_cache()

text_encoders: [device(type='cuda', index=0), device(type='cuda', index=0)]
unets: [device(type='cuda', index=0), device(type='cuda', index=0)]


100%|██████████| 20/20 [00:03<00:00,  6.18it/s]


==> time: 3.483s


In [10]:
pipe = StableDiffusionUniParallelPipeline.from_pretrained(model_path, single_gpu_parallel=False) #双卡CFG并行
frame = benchmark(pipe, prompt, num_inference_steps)

del pipe
torch.cuda.empty_cache()

text_encoders: [device(type='cuda', index=0), device(type='cuda', index=1)]
unets: [device(type='cuda', index=0), device(type='cuda', index=1)]


100%|██████████| 20/20 [00:01<00:00, 12.49it/s]


==> time: 1.828s
