In [None]:
import torch
from transformers import CLIPTokenizer, CLIPTextModel
from diffusers import AutoencoderKL, DDPMScheduler
from PIL import Image
from tensorflow_addons.layers.normalizations import GroupNormalization
from ipynb.fs.full.model4 import UNetMidBlock2DCrossAttn
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Lambda

In [None]:
# CLIP tokenizer and text encoder, both taken from the Stable Diffusion v1-4 repository.
tokenizer = CLIPTokenizer.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="tokenizer",
)
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="text_encoder",
    use_safetensors=True,
)
# Move the text encoder to the CUDA device.
text_encoder.to("cuda")

In [None]:
# Generation parameters
prompt = ["A unicorn sitting on a rainbow"]  # text prompt, wrapped in a list

# Size of the initial noise image
height = 512
width = 512

# Smaller image size; not referenced by name in the cells visible in this section
img_height = 32
img_width = 32

num_inference_steps = 100  # number of denoising steps
guidance_scale = 3.5  # scale for classifier-free guidance
batch_size = 1  # leading dimension of the initial noise tensor

In [None]:
import tensorflow as tf

# Path to the saved Keras model (an .h5 file); replace with the path to your own model.
model_path = "model.h5"

# The custom layer classes are passed explicitly so Keras can rebuild them on load.
model = tf.keras.models.load_model(
    model_path,
    custom_objects={
        "GroupNormalization": GroupNormalization,
        "UNetMidBlock2DCrossAttn": UNetMidBlock2DCrossAttn,
    },
)

In [None]:
# Turn the prompt into token IDs padded/truncated to the tokenizer's maximum
# length, returned as PyTorch tensors.
text_input = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=tokenizer.model_max_length,
)

In [None]:
# Run the prompt tokens through the text encoder with gradient tracking switched off.
with torch.no_grad():
    prompt_token_ids = text_input.input_ids.to("cuda")
    # Keep only the first element of the encoder's output.
    text_embeddings = text_encoder(prompt_token_ids)[0]

In [None]:
# Draw the starting sample for the reverse diffusion process: Gaussian noise laid
# out as (batch, height, width, channels).
latents = torch.randn(
    (batch_size, height, width, 3),
    device="cuda",
)
# The noise is deliberately left at unit variance. DDPM sampling starts from
# x_T ~ N(0, I); dividing by 255 (a pixel-data convention) shrinks the starting
# sample to almost zero, which the scheduler's update rule does not expect.
# NOTE(review): assumes the model was trained with the standard DDPM forward
# process — confirm against the training code.

In [None]:
from tensorflow.image import resize

# Hand the noise over to TensorFlow: copy it to host memory as a NumPy array,
# resize it to (height, width), and make sure the result is a TensorFlow tensor.
latents = tf.convert_to_tensor(resize(latents.cpu().numpy(), (height, width)))

In [None]:
# Create the diffusion scheduler from the pretrained "google/ddpm-cat-256" configuration
scheduler = DDPMScheduler.from_pretrained(
    "google/ddpm-cat-256",
)

In [None]:
# Configure the scheduler to sample with `num_inference_steps` denoising steps
scheduler.set_timesteps(num_inference_steps=num_inference_steps)

In [None]:
def reshape_tensor(input_tensor, target_shape):
    """Resize a batch of images to the height and width given by ``target_shape``.

    Despite the name, this resamples the images with ``tf.image.resize`` rather
    than reshaping them.

    Args:
        input_tensor: Image tensor accepted by ``tf.image.resize``; it is cast
            to float32 before resizing.
        target_shape: Sequence laid out as (batch, height, width, channels).
            Only the height and width entries (indices 1 and 2) are used.

    Returns:
        A float32 tensor whose height and width equal ``target_shape[1]`` and
        ``target_shape[2]``.
    """
    target_height, target_width = target_shape[1], target_shape[2]
    images = tf.cast(input_tensor, dtype=tf.float32)
    # tf.image.resize already yields exactly (target_height, target_width), so no
    # follow-up crop-or-pad pass is needed.
    return tf.image.resize(images, (target_height, target_width))

In [None]:
original_shape = (1, 512, 512, 3)
# All-zeros placeholder. It is still defined because a later cell reads it, but the
# denoising loop below does not feed it to the model.
input_tensor = tf.Variable(tf.zeros(original_shape, dtype=tf.float32))

# Define the target shape
target_shape = (None, 32, 32, 3)

# Bring the current noisy sample to the 32x32 working resolution once, before the
# loop, instead of re-resizing it on every step.
latents = reshape_tensor(latents, target_shape)

# Walk the scheduler's own timestep schedule (filled in by set_timesteps, in
# descending order). Counting 0..num_inference_steps-1 would hand scheduler.step
# timesteps that are not part of that schedule, and in the wrong direction.
for t in scheduler.timesteps:
    # The model must see the sample that is being denoised; a constant zero input
    # would produce a prediction that ignores the sample entirely.
    latents_tensor = latents

    # Predict the noise residual using your model
    with tf.device("gpu:0"):  # Assuming you want to use GPU
        # NOTE(review): the timestep is passed as the second positional argument,
        # as before — confirm this matches how the saved model expects to receive it.
        noise_pred = model(latents_tensor, int(t))

    noise_pred = tf.cast(noise_pred, dtype=tf.float32)
    # NOTE(review): assumes the model output holds exactly 32*32 values — confirm.
    noise_pred = tf.reshape(noise_pred, (1, 32, 32, 1))

    # Scale, then clip, the prediction.
    # NOTE(review): this is a plain rescaling, not classifier-free guidance (which
    # combines a conditional and an unconditional prediction) — confirm intent.
    noise_pred = noise_pred * guidance_scale
    noise_pred = tf.clip_by_value(noise_pred, -1.0, 1.0)

    # Compute the previous noisy sample x_t -> x_t-1.
    # The diffusers scheduler operates on PyTorch tensors and returns an output
    # object, so convert on the way in and read `.prev_sample` on the way out.
    step_output = scheduler.step(
        torch.tensor(noise_pred.numpy()),
        t,
        torch.tensor(latents.numpy()),
    )
    latents = tf.convert_to_tensor(step_output.prev_sample.detach().cpu().numpy())

In [None]:
import torch

# NOTE(review): this cell runs a denoising pass over `latents` just like the loop in
# the preceding cell, but hands the sample to the scheduler as a PyTorch tensor.
# Confirm whether both cells are meant to run.

# `model` is a TensorFlow/Keras object, `scheduler` is a PyTorch-based diffusers
# scheduler. The running sample is therefore kept as a PyTorch tensor and copied
# into TensorFlow only for the model call.
# Resize the incoming sample to the 32x32 working resolution once, up front.
latents = torch.tensor(
    tf.image.resize(
        tf.convert_to_tensor(latents.numpy(), dtype=tf.float32), [32, 32]
    ).numpy()
)

# Iterate the scheduler's own timestep schedule rather than 0..N-1, so every `t`
# given to scheduler.step is a timestep the scheduler was configured with.
for t in scheduler.timesteps:
    # TensorFlow copy of the current sample for the Keras model.
    latents_tensor = tf.convert_to_tensor(latents.numpy())

    # Predict the noise residual using your model
    with tf.device("gpu:0"):  # Assuming you want to use GPU
        # NOTE(review): the timestep is passed as the second positional argument,
        # as before — confirm this matches how the saved model expects to receive it.
        noise_pred = model(latents_tensor, int(t))

    noise_pred = tf.cast(noise_pred, dtype=tf.float32)
    # NOTE(review): assumes the model output holds exactly 32*32 values — confirm.
    noise_pred = tf.reshape(noise_pred, (1, 32, 32, 1))

    # Scale, then clip, the prediction.
    # NOTE(review): plain rescaling, not classifier-free guidance — confirm intent.
    noise_pred = noise_pred * guidance_scale
    noise_pred = tf.clip_by_value(noise_pred, -1.0, 1.0)

    # Compute the previous noisy sample x_t -> x_t-1. scheduler.step returns an
    # output object; the updated sample is its `.prev_sample` attribute.
    latents = scheduler.step(torch.tensor(noise_pred.numpy()), t, latents).prev_sample


In [None]:
# Denormalize the image and convert it to a PIL image.
# The sample is turned into a NumPy array first so this works whether the
# denoising loop left `latents` as a PyTorch tensor or as a TensorFlow tensor.
if isinstance(latents, torch.Tensor):
    latents_array = latents.detach().cpu().numpy()
else:
    latents_array = np.asarray(latents)

# NOTE(review): scaling by 255 assumes the sample values lie in [0, 1]; confirm
# against the normalization used when the model was trained.
pixel_array = np.clip(latents_array * 255.0, 0, 255).astype(np.uint8)
image = Image.fromarray(pixel_array.squeeze())

# Display the generated image
image.show()