In [9]:
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from torchvision import transforms
from PIL import Image
import torch

# --- Generation settings ---
prompt = ["a photograph of an astronaut riding a horse"]
height = 512                        # default height of Stable Diffusion
width = 512                         # default width of Stable Diffusion
num_inference_steps = 100           # Number of denoising steps
guidance_scale = 7.5                # Scale for classifier-free guidance
torch.manual_seed(0)                # Fix the random seed for reproducibility (optional)
batch_size = len(prompt)            # Derived from the prompt list as defined at this point
device = 'cuda'                     # Device the models below are moved to

# --- Model components, all loaded from the local Stable Diffusion v1-4 checkpoint in half precision ---
vae = AutoencoderKL.from_pretrained("pretrain_model/stable-diffusion-v1-4", subfolder="vae", torch_dtype=torch.float16).to(device)
# LMS scheduler with a scaled-linear beta schedule over 1000 training timesteps,
# then configured for `num_inference_steps` inference steps.
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps)
# NOTE(review): torch_dtype is also passed to the tokenizer; a tokenizer holds no
# weights, so this is presumably a no-op -- confirm before relying on it.
tokenizer = CLIPTokenizer.from_pretrained("pretrain_model/stable-diffusion-v1-4", subfolder="tokenizer", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("pretrain_model/stable-diffusion-v1-4", subfolder="text_encoder", torch_dtype=torch.float16).to(device)

def text_encode(prompts, maxlen=None):
    '''
    Convert textual prompts into text-encoder embeddings.

    Args:
        prompts: Prompt string or list of prompt strings handed to the tokenizer.
        maxlen: Length every prompt is padded/truncated to. Defaults to
            ``tokenizer.model_max_length`` when None.

    Returns:
        The first output of ``text_encoder`` for the tokenized prompts,
        cast to half precision.
    '''
    if maxlen is None:
        maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
    # Send the token ids to the configured `device` (the one the text encoder
    # was moved to) instead of a hard-coded "cuda" string.
    return text_encoder(inp.input_ids.to(device))[0].half()

def load_image(p, size=(512, 512)):
    '''
    Load an image from disk as an RGB PIL image resized to `size`.

    Args:
        p: Path of the image file, passed to ``Image.open``.
        size: Target ``(width, height)`` in pixels. Defaults to (512, 512).

    Returns:
        A new RGB PIL image of the requested size.
    '''
    # The context manager closes the underlying file once convert() has read
    # the pixel data, instead of leaving the handle open until garbage collection.
    with Image.open(p) as img:
        return img.convert('RGB').resize(size)

def pil_to_latents(image):
    '''
    Encode a PIL image into scaled VAE latents.

    Args:
        image: PIL image to encode.

    Returns:
        A sample drawn from the VAE's latent distribution for the image,
        multiplied by 0.18215.
    '''
    # Rescale the tensor from ToTensor() with x * 2 - 1 and add a leading batch dimension.
    pixels = transforms.ToTensor()(image).unsqueeze(0) * 2.0 - 1.0
    # Use the configured `device` rather than a hard-coded "cuda" string.
    pixels = pixels.to(device=device, dtype=torch.float16)
    # 0.18215 is presumably the latent scaling factor of the Stable Diffusion v1 VAE -- confirm.
    latents = vae.encode(pixels).latent_dist.sample() * 0.18215
    return latents

strength = 0.8  # Controls how much noise is added to the input image at the start

image = load_image('image/2.png')
prompt = ["Wolf howling at the moon, photorealistic 4K"]
# `prompt` was just reassigned, so re-derive the batch size from it; otherwise the
# unconditional embeddings below would be sized from the earlier prompt list.
batch_size = len(prompt)

text_embeddings = text_encode(prompt)
# Empty-string prompts padded to the same sequence length as the prompt embeddings
uncond_embeddings = text_encode([""] * batch_size, text_embeddings.shape[1])
# Unconditional embeddings first, prompt embeddings second
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

init_latents = pil_to_latents(image)

# Determine the initial timestep from `strength`

In [17]:

# Number of denoising steps to run, clamped to [0, num_inference_steps] so an
# out-of-range `strength` cannot index outside the schedule.
init_timestep = min(max(int(num_inference_steps * strength), 0), num_inference_steps)  # 80

print(scheduler.timesteps)

# Adding noise to the latents
# NOTE: this overwrites `init_latents` with its noised version, so re-running this
# cell without re-encoding the image first adds noise a second time.
noise = torch.randn(init_latents.shape, device=device, dtype=init_latents.dtype)
if init_timestep > 0:
    # Timestep the clean latents are noised up to.
    start_timestep = scheduler.timesteps[-init_timestep]  # tensor(797.1818)
    init_latents = scheduler.add_noise(
        init_latents, noise, torch.tensor([start_timestep], device=device)
    )  # noise x0 up to x797
# When zero steps are requested the latents are left un-noised: indexing with -0
# would select index 0 of the schedule (999 in the printed output) instead.
latents = init_latents

# Computing the timestep to start the diffusion loop
t_start = max(num_inference_steps - init_timestep, 0)  # 20
timesteps = scheduler.timesteps[t_start:].to(device)  # denoising starts from timestep 797


tensor([999.0000, 988.9091, 978.8182, 968.7273, 958.6364, 948.5455, 938.4545,
        928.3636, 918.2727, 908.1818, 898.0909, 888.0000, 877.9091, 867.8182,
        857.7273, 847.6364, 837.5455, 827.4545, 817.3636, 807.2727, 797.1818,
        787.0909, 777.0000, 766.9091, 756.8182, 746.7273, 736.6364, 726.5455,
        716.4545, 706.3636, 696.2727, 686.1818, 676.0909, 666.0000, 655.9091,
        645.8182, 635.7273, 625.6364, 615.5455, 605.4545, 595.3636, 585.2727,
        575.1818, 565.0909, 555.0000, 544.9091, 534.8182, 524.7273, 514.6364,
        504.5454, 494.4546, 484.3636, 474.2727, 464.1818, 454.0909, 444.0000,
        433.9091, 423.8182, 413.7273, 403.6364, 393.5454, 383.4546, 373.3636,
        363.2727, 353.1818, 343.0909, 333.0000, 322.9091, 312.8182, 302.7273,
        292.6364, 282.5454, 272.4546, 262.3636, 252.2727, 242.1818, 232.0909,
        222.0000, 211.9091, 201.8182, 191.7273, 181.6364, 171.5455, 161.4545,
        151.3636, 141.2727, 131.1818, 121.0909, 111.0000, 100.90