# AnimateDiffPipeline

In [None]:
!pip install --upgrade transformers accelerate diffusers imageio-ffmpeg
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
from diffusers.utils import export_to_gif
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# --- Runtime target and precision ------------------------------------------
device = "cuda"
dtype = torch.float16

# --- Checkpoint selection ---------------------------------------------------
# `step` picks which AnimateDiff-Lightning checkpoint is downloaded and is also
# used as the number of inference steps below. Options: [1,2,4,8]
step = 4
repo = "ByteDance/AnimateDiff-Lightning"
ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
base = "emilianJR/epiCRealism"  # Choose the base model.

# --- Motion adapter + pipeline ---------------------------------------------
# Download the checkpoint first, then load it into a freshly built adapter.
checkpoint_file = hf_hub_download(repo, ckpt)
adapter = MotionAdapter().to(device, dtype)
adapter.load_state_dict(load_file(checkpoint_file, device=device))

pipe = AnimateDiffPipeline.from_pretrained(
    base,
    motion_adapter=adapter,
    torch_dtype=dtype,
).to(device)
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config,
    timestep_spacing="trailing",
    beta_schedule="linear",
)

# --- Generate one clip and write it to disk as a GIF -----------------------
output = pipe(
    prompt="crack two eggs into a bowl",
    guidance_scale=1.0,
    num_inference_steps=step,
)
export_to_gif(output.frames[0], "animation.gif")

# Fine-tuning


In [None]:
from google.colab import drive

# Directory under which Google Drive is exposed to this runtime; the data and
# output paths used later in the notebook all live below it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm import tqdm
from diffusers import AnimateDiffPipeline, MotionAdapter
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Set up the device: use the GPU when one is visible, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only selected together with the GPU. The CPU fallback keeps
# float32, because float16 on CPU is slow and not every operator supports it.
dtype = torch.float16 if device == "cuda" else torch.float32

In [None]:
import os
import json
import glob
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Input locations on the mounted shared drive:
#   annotation_path - YouCook2 train/val annotation JSON
#   frame_dir       - directory holding one "<video>_sentence<i>_frames" folder per clip
annotation_path = "/content/drive/Shareddrives/DATA 298B Team 8/298A/youcook/annotations/youcookii_annotations_trainval.json"
frame_dir = "/content/drive/Shareddrives/DATA 298B Team 8/298A/processed_data/train"

# Creates the frame directory if it is missing (no-op when it already exists).
os.makedirs(frame_dir, exist_ok=True)

class YouCook2Dataset(Dataset):
    """Pairs YouCook2 sentence annotations with pre-extracted frame clips.

    The frame directory is expected to contain one folder per annotated
    sentence, named ``"<video_id>_sentence<idx>_frames"``, holding
    ``frame_*.jpg`` files. Each sample is ``(frames, sentence)`` where
    ``frames`` stacks ``num_frames`` transformed images along a new first
    dimension and ``sentence`` is the annotation text.

    Args:
        annotation_path: Path to the annotation JSON (must have a top-level
            ``"database"`` mapping of video id -> ``{"annotations": [...]}``).
        frame_path: Directory containing the per-sentence frame folders.
        transform: Callable applied to every RGB PIL image. Defaults to
            resize-to-256x256 followed by ``ToTensor``.
        num_frames: Number of frames per sample. Clips with more frames are
            truncated to the first ``num_frames``; shorter clips are padded by
            repeating their last frame so every sample has the same length and
            batches can be collated.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1.
    """

    def __init__(self, annotation_path, frame_path, transform=None, num_frames=8):
        if num_frames < 1:
            raise ValueError(f"num_frames must be >= 1, got {num_frames}")

        with open(annotation_path, "r") as f:
            self.annotations = json.load(f)
        self.frame_path = frame_path
        self.num_frames = num_frames
        self.transform = transform or transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor()
        ])

        # Video IDs that have at least one "*_frames" folder in `frame_path`.
        self.existing_video_ids = {
            entry.split("_sentence")[0]
            for entry in os.listdir(self.frame_path)
            if "_frames" in entry
        }

        # List of (frame_paths, sentence) tuples.
        self.data = self.prepare_data()

    def prepare_data(self):
        """Parse the annotations into ``(frame_paths, sentence)`` samples.

        Sentences whose frame folder is missing or contains no
        ``frame_*.jpg`` files are skipped. Every returned path list has
        exactly ``self.num_frames`` entries.
        """
        data = []

        for video_id, info in self.annotations['database'].items():
            if video_id not in self.existing_video_ids:
                continue

            for annotation_idx, annotation in enumerate(info['annotations']):
                sentence = annotation["sentence"]
                sentence_folder = f"{video_id}_sentence{annotation_idx}_frames"
                video_path = os.path.join(self.frame_path, sentence_folder)

                if not os.path.exists(video_path):
                    continue

                # Lexicographic order; correct as long as frame numbers are zero-padded.
                frames = sorted(glob.glob(os.path.join(video_path, "frame_*.jpg")))
                if not frames:
                    continue

                clip = frames[:self.num_frames]
                # Pad short clips with their last frame so all samples share one length;
                # otherwise the default DataLoader collate fails on mixed-length batches.
                clip = clip + [clip[-1]] * (self.num_frames - len(clip))
                data.append((clip, sentence))

        return data

    def __len__(self):
        """Number of (clip, sentence) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, transform and stack the frames of sample ``idx``."""
        frame_paths, sentence = self.data[idx]
        tensors = []
        for fp in frame_paths:
            # Context manager closes the file handle as soon as the pixels are decoded.
            with Image.open(fp) as img:
                tensors.append(self.transform(img.convert("RGB")))
        return torch.stack(tensors), sentence

# Build the training dataset from the configured paths and report how many
# (clip, sentence) samples were found.
dataset = YouCook2Dataset(annotation_path=annotation_path, frame_path=frame_dir)
num_samples = len(dataset)
print("Dataset loaded successfully, total samples:", num_samples)


Dataset loaded successfully, total samples: 8634


In [None]:
# !pip install --upgrade transformers accelerate diffusers imageio-ffmpeg
import torch
import gc
import numpy as np
from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
from diffusers.utils import export_to_gif
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from peft import LoraConfig, get_peft_model

# Execution target and numeric precision used throughout this cell.
device = "cuda"
dtype = torch.float16

# AnimateDiff-Lightning checkpoint selection. `step` is both part of the
# checkpoint file name and the number of inference steps. Options: [1,2,4,8]
step = 4
repo = "ByteDance/AnimateDiff-Lightning"
ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
base = "emilianJR/epiCRealism"  # Choose the base model.

# Download the checkpoint, load it into a new MotionAdapter, and build the
# pipeline around that adapter.
checkpoint_file = hf_hub_download(repo, ckpt)
adapter = MotionAdapter().to(device, dtype)
adapter.load_state_dict(load_file(checkpoint_file, device=device))

pipe = AnimateDiffPipeline.from_pretrained(
    base,
    motion_adapter=adapter,
    torch_dtype=dtype,
).to(device)
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config,
    timestep_spacing="trailing",
    beta_schedule="linear",
)


# ---------------------------------------------------------------------------
# Motion-adapter "fine-tuning"
# ---------------------------------------------------------------------------
# NOTE(review): with the current design this loop cannot change the adapter.
# `generated_frames` below is rebuilt from the pipeline's decoded output as a
# brand-new leaf tensor, so there is no autograd path from the loss back to
# `adapter.parameters()`. `loss.backward()` leaves their `.grad` as None and
# `optimizer.step()` skips them, so the weights saved afterwards are the
# pre-trained ones. A working version needs a differentiable objective (e.g. a
# latent noise-prediction loss on the pipeline's UNet motion modules). That is
# a design change and is deliberately not made here; the loop only warns.
import warnings

# Load pre-trained MotionAdapter. This is a second instance, independent of the
# one `pipe` was constructed with earlier in this cell; `repo`/`ckpt` are reused.
adapter = MotionAdapter().to(device, dtype)
adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
adapter.train()  # Set to training mode

# Enable training for the entire MotionAdapter
for param in adapter.parameters():
    param.requires_grad = True

# Training parameters
num_epochs = 3
learning_rate = 3e-5

# Create DataLoader, optimizer and LR schedule
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
optimizer = AdamW(adapter.parameters(), lr=learning_rate)
# Anneal once over the whole run and step once per epoch (see end of epoch loop).
# Previously T_max was num_epochs // 2 and the scheduler was never stepped, so
# the learning rate stayed constant.
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)

# Attach the adapter being trained to the pipeline object.
# NOTE(review): presumably this only sets an attribute and does not replace the
# motion modules already inside `pipe.unet` — confirm against diffusers.
pipe.motion_adapter = adapter

gradient_flow_checked = False

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        optimizer.zero_grad()

        # frames: [batch, frames, channels, height, width]
        frames, captions = batch
        frames = frames.to(device, dtype)

        # Captions arrive as a list/tuple of strings; they are merged into ONE
        # prompt, so a single clip is generated and compared against every
        # clip in the batch (broadcast over the batch dimension).
        if isinstance(captions, (list, tuple)):
            prompts = [str(c) for c in captions]
        else:
            prompts = [str(captions)]
        prompt = " ".join(prompts)

        # Generate animation
        output = pipe(prompt=prompt, guidance_scale=1.0, num_inference_steps=step)

        # (batch, frames, H, W, C) -> (frames, H, W, C) for the single generated clip
        frames_np = np.array(output.frames)
        if frames_np.ndim == 5:
            frames_np = frames_np[0]

        # Scale 0..255 pixel values to 0..1. The tensor is marked as requiring
        # grad only so that `loss.backward()` does not raise; it is a leaf with
        # no connection to the adapter (see the note at the top of this block).
        generated_frames = torch.as_tensor(frames_np, device=device, dtype=dtype) / 255.0
        generated_frames.requires_grad_()

        # (frames, H, W, C) -> (frames, C, H, W); interpolate treats frames as N
        generated_frames = generated_frames.permute(0, 3, 1, 2)
        generated_frames = F.interpolate(
            generated_frames, size=(256, 256), mode='bilinear', align_corners=False
        )
        # Add batch dimension: (1, frames, C, 256, 256)
        generated_frames = generated_frames.unsqueeze(0)

        # Compare only as many frames as both sides have
        min_frames = min(generated_frames.shape[1], frames.shape[1])
        generated_frames = generated_frames[:, :min_frames]
        frames = frames[:, :min_frames]

        loss = F.smooth_l1_loss(generated_frames, frames, beta=0.5)
        loss.backward()

        # One-time sanity check: make the silent no-op visible.
        if not gradient_flow_checked:
            gradient_flow_checked = True
            if all(p.grad is None for p in adapter.parameters()):
                warnings.warn(
                    "No MotionAdapter parameter received a gradient: the loss is "
                    "computed on detached pipeline output, so optimizer.step() "
                    "will not change any weights.",
                    RuntimeWarning,
                )

        optimizer.step()
        epoch_loss += loss.item()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Output locations (defined once so the save, load and log message cannot drift apart)
finetuned_adapter_path = "/content/drive/Shareddrives/DATA 298B Team 8/CODE/motion_adapter_finetuned_4.6.pth"
finetuned_gif_path = "/content/drive/Shareddrives/DATA 298B Team 8/CODE/AnimateDiff_finetuned_animation_4.6.gif"

# Save the MotionAdapter weights
torch.save(adapter.state_dict(), finetuned_adapter_path)
print(f"Training complete, saved {finetuned_adapter_path}")

# Reload the saved weights into a fresh MotionAdapter
adapter = MotionAdapter().to(device, dtype)

# The file holds a plain tensor state dict, so the restricted unpickler
# (`weights_only=True`) is sufficient and avoids executing arbitrary pickled code.
state_dict = torch.load(finetuned_adapter_path, map_location=device, weights_only=True)
adapter.load_state_dict(state_dict, strict=True)  # `strict=True` ensures weights match exactly

# Rebuild the pipeline around the reloaded adapter
pipe = AnimateDiffPipeline.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")

# Generate a sample clip and store it as a GIF next to the weights
output = pipe(prompt="crack two eggs into a bowl", guidance_scale=1.0, num_inference_steps=step)
export_to_gif(output.frames[0], finetuned_gif_path)


In [None]:
!pip install gradio diffusers huggingface_hub safetensors torch torchvision

import gradio as gr
import torch
import numpy as np
from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from PIL import Image

# Define Gradio processing function
def generate_animation(prompt):
    """Generate an animation for ``prompt`` and return the path of the saved GIF.

    Uses the notebook-level ``pipe`` and ``step``. The frames are written as a
    looping GIF (100 ms per frame) to a fixed location, so a later call
    overwrites the file produced by an earlier one.

    Args:
        prompt: Text description entered by the user.

    Returns:
        Path of the written GIF file.
    """
    output = pipe(prompt=prompt, guidance_scale=1.0, num_inference_steps=step)
    frames_pil = _frames_to_pil(output.frames)

    # Generate GIF
    gif_path = "/content/generated_animation.gif"
    frames_pil[0].save(gif_path, save_all=True, append_images=frames_pil[1:], duration=100, loop=0)

    return gif_path


def _frames_to_pil(frames):
    """Convert pipeline output frames for the first clip into RGB PIL images.

    Accepts either nested PIL images (``frames[0]`` is the list of images of
    the first clip) or an array shaped (batch, frames, H, W, C) /
    (frames, H, W, C). Channel order is kept as delivered: the previous
    unconditional ``[..., ::-1]`` flip swapped red and blue.
    """
    if isinstance(frames, np.ndarray):
        clip = frames[0] if frames.ndim == 5 else frames
    elif len(frames) > 0 and isinstance(frames[0], (list, tuple, np.ndarray)):
        clip = frames[0]  # drop the batch dimension
    else:
        clip = frames

    if len(clip) > 0 and isinstance(clip[0], Image.Image):
        return [frame.convert("RGB") for frame in clip]

    pixels = np.asarray(clip)
    if np.issubdtype(pixels.dtype, np.floating):
        # Float output is assumed to be in [0, 1] (TODO confirm for non-default
        # output types). Scale by the fixed range rather than the per-clip
        # min/max, which would distort contrast.
        pixels = np.clip(pixels, 0.0, 1.0) * 255.0
    pixels = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)
    return [Image.fromarray(frame).convert("RGB") for frame in pixels]

# Build the web UI: one text box in, one image (served from a file path) out.
prompt_input = gr.Textbox(label="Enter a text prompt")
animation_output = gr.Image(type="filepath", label="Generated Animation")

interface = gr.Interface(
    fn=generate_animation,
    inputs=prompt_input,
    outputs=animation_output,
    title="Text to Video Web App",
    description="Enter a text description and generate an animation.",
)

# Start the app; share=True requests a public share link.
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://08008f3358ae9b0d96.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install lpips

Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=0.4.0->lpips)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=0.4.0->lpips)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=0.4.0->lpips)
  Downloading nvidia_cufft

# Compute all evaluation metrics for each comparison

In [None]:
# !pip install piq

import os, json, glob, random, gc
from PIL import Image
import torch
from torchvision import transforms
import imageio
import numpy as np
import lpips
import piq
from transformers import CLIPProcessor, CLIPModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load annotation & sample ===
annotation_path = "/content/drive/Shareddrives/DATA 298B Team 8/298A/youcook/annotations/youcookii_annotations_trainval.json"
frame_dir = "/content/drive/Shareddrives/DATA 298B Team 8/298A/processed_data/val"

with open(annotation_path, "r") as f:
    annotations = json.load(f)

# Both images are resized to 256x256 and scaled to [0, 1] before comparison.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor()
])
resize_only = transforms.Resize((256, 256))

# Video IDs that have at least one "*_frames" folder in the validation directory.
existing_ids = {f.split("_sentence")[0] for f in os.listdir(frame_dir) if "_frames" in f}
valid_samples = []

for vid, info in annotations["database"].items():
    if vid not in existing_ids:
        continue
    for i, ann in enumerate(info["annotations"]):
        folder = f"{vid}_sentence{i}_frames"
        path = os.path.join(frame_dir, folder)
        frames = sorted(glob.glob(os.path.join(path, "frame_*.jpg")))
        if os.path.isdir(path) and len(frames) >= 1:
            valid_samples.append((frames[:1], ann["sentence"]))

if not valid_samples:
    raise RuntimeError(f"No validation clips with frames found under {frame_dir}")

# NOTE(review): the reference frame and prompt are drawn at random, while the
# GIF loaded below was generated earlier in this notebook for the fixed prompt
# "crack two eggs into a bowl". The metrics therefore compare unrelated content.
frame_paths, prompt = random.choice(valid_samples)
print("Prompt:", prompt)

# === Load 1 frame from GIF
gif_path = "/content/drive/Shareddrives/DATA 298B Team 8/CODE/AnimateDiff_finetuned_animation_4.6.gif"
gif = imageio.mimread(gif_path)
# Force 3 channels so the tensor shape matches the RGB reference frame.
gen_img = Image.fromarray(gif[0]).convert("RGB")  # only the first frame

# === LPIPS + SSIM + PSNR
loss_fn = lpips.LPIPS(net='alex').to(device)

with Image.open(frame_paths[0]) as real_pil:
    real_tensor = transform(real_pil.convert("RGB")).unsqueeze(0).to(device)
gen_tensor = transform(gen_img).unsqueeze(0).to(device)

with torch.no_grad():
    # The tensors are in [0, 1]; normalize=True lets LPIPS rescale them to the
    # [-1, 1] range its networks expect.
    lpips_score = loss_fn(real_tensor, gen_tensor, normalize=True).item()
    ssim_score = piq.ssim(gen_tensor, real_tensor, data_range=1.0).item()
    psnr_score = piq.psnr(gen_tensor, real_tensor, data_range=1.0).item()

# Release the image tensors before the CLIP model is loaded further down.
del real_tensor, gen_tensor
torch.cuda.empty_cache()
gc.collect()

# === FVD (over single frame)
def fvd_single(real_path, gen_pil):
    """Fréchet distance between 1-D Gaussians fitted to two images' pixel values.

    Both images are converted to RGB, resized with the notebook-level
    ``resize_only`` transform and flattened; a Gaussian (mean, std) is fitted
    to each pixel population. This is a raw pixel-statistics proxy, not the
    feature-based FVD used in the literature.

    Args:
        real_path: Path of the reference image file.
        gen_pil: Generated frame as a PIL image.

    Returns:
        ``(mu_r - mu_g)**2 + (sigma_r - sigma_g)**2`` as a NumPy float32 scalar.
    """
    with Image.open(real_path) as real_img:
        r = np.array(resize_only(real_img.convert("RGB"))).astype(np.float32).flatten()
    g = np.array(resize_only(gen_pil.convert("RGB"))).astype(np.float32).flatten()
    mu1, mu2 = r.mean(), g.mean()
    sigma1, sigma2 = r.std(), g.std()
    # For 1-D Gaussians the covariance term var1 + var2 - 2*sqrt(var1*var2)
    # equals (std1 - std2)**2. The earlier version used the standard deviations
    # where the variances belong.
    return (mu1 - mu2) ** 2 + (sigma1 - sigma2) ** 2

# Cast to a plain float so the report prints a number rather than `np.float32(...)`.
fvd_score = float(fvd_single(frame_paths[0], gen_img))

# === CLIP (1 frame only)
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
# truncation=True keeps over-long prompts within the text encoder's maximum length.
clip_inputs = clip_processor(
    text=prompt, images=gen_img, return_tensors="pt", padding=True, truncation=True
).to(device)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    clip_outputs = clip_model(**clip_inputs)
# NOTE(review): `logits_per_image` looks like CLIP's scaled similarity, not a raw
# cosine in [-1, 1] — confirm before comparing with cosine-based scores.
clip_score = clip_outputs.logits_per_image.mean().item()

# === Final output
print({
    "Prompt": prompt,
    "LPIPS": lpips_score,
    "SSIM": ssim_score,
    "PSNR": psnr_score,
    "FVD": fvd_score,
    "CLIP Score": clip_score
})


Prompt: use a tablespoon to make balls out of the mashed potatoes
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

{'Prompt': 'use a tablespoon to make balls out of the mashed potatoes', 'LPIPS': 0.7376506328582764, 'SSIM': 0.0040740687400102615, 'PSNR': 5.727221488952637, 'FVD': np.float32(13616.206), 'CLIP Score': 26.30241584777832}


In [None]:
import os, glob, json
from PIL import Image
import numpy as np
import imageio
from torchvision import transforms
import torch
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor, Grayscale

# ========== CONFIG ==========
# Validation annotations / frames and the GIF produced after fine-tuning.
annotation_path = "/content/drive/Shareddrives/DATA 298B Team 8/298A/youcook/annotations/youcookii_annotations_trainval.json"
frame_dir = "/content/drive/Shareddrives/DATA 298B Team 8/298A/processed_data/val"
gif_path = "/content/drive/Shareddrives/DATA 298B Team 8/CODE/AnimateDiff_finetuned_animation_4.6.gif"
resize_only = transforms.Resize((256, 256))
# ============================

# === Load annotations
with open(annotation_path, "r") as f:
    annotations = json.load(f)

# Video IDs for which at least one "*_frames" folder exists on disk.
existing_ids = {
    entry.split("_sentence")[0]
    for entry in os.listdir(frame_dir)
    if "_frames" in entry
}

# === Find valid sample
# A sample is valid when its folder exists and holds at least two frames;
# only the first two frame paths are kept.
valid_samples = []
for vid, info in annotations["database"].items():
    if vid not in existing_ids:
        continue
    for i, ann in enumerate(info["annotations"]):
        path = os.path.join(frame_dir, f"{vid}_sentence{i}_frames")
        frames = sorted(glob.glob(os.path.join(path, "frame_*.jpg")))
        if not os.path.isdir(path) or len(frames) < 2:
            continue
        valid_samples.append((frames[:2], ann["sentence"]))

# === Pick one sample (the first valid one) and read all frames of the GIF
frame_paths, prompt = valid_samples[0]
gif = imageio.mimread(gif_path)

# Per-frame feature extractor: 64x64, single channel, values as a tensor.
resize_64_gray = Compose([
    Resize((64, 64)),
    Grayscale(),
    ToTensor(),
])

def fvd_lightweight(real_paths, gen_frames, num_frames=2):
    """Fréchet-style distance between pixel statistics of real and generated frames.

    Each of the first ``num_frames`` frames on both sides is passed through the
    notebook-level ``resize_64_gray`` transform and flattened into a feature
    vector. The score is

        ||mu_r - mu_g||^2 + tr(S_r) + tr(S_g) - 2 * sqrt(tr(S_r) * tr(S_g))

    where ``tr(S)`` is the trace of the sample covariance across frames. The
    cross term is a trace-based stand-in, not the matrix square root of the
    exact Fréchet distance, and the features are raw pixels, so this is a
    lightweight proxy rather than standard FVD.

    Args:
        real_paths: Paths of the reference frame images.
        gen_frames: Generated frames as arrays accepted by ``Image.fromarray``.
        num_frames: Number of leading frames to use from each side (>= 2).

    Returns:
        The distance as a Python float.

    Raises:
        ValueError: If ``num_frames`` is below 2 (sample covariance is
            undefined for a single frame and previously produced NaN).
    """
    if num_frames < 2:
        raise ValueError(f"fvd_lightweight needs at least 2 frames, got {num_frames}")

    real_feats, gen_feats = [], []
    for i in range(num_frames):
        with Image.open(real_paths[i]) as real_img:
            real_feats.append(resize_64_gray(real_img).view(-1).numpy())
        gen_feats.append(resize_64_gray(Image.fromarray(gen_frames[i])).view(-1).numpy())
    real_feats = np.stack(real_feats).astype(np.float64)
    gen_feats = np.stack(gen_feats).astype(np.float64)

    mu_r, mu_g = real_feats.mean(0), gen_feats.mean(0)
    # Trace of the sample covariance == sum of per-feature sample variances.
    # Computing it this way avoids building the full feature-by-feature matrix.
    trace_r = real_feats.var(axis=0, ddof=1).sum()
    trace_g = gen_feats.var(axis=0, ddof=1).sum()

    diff = np.sum((mu_r - mu_g) ** 2)
    trace_cov = trace_r + trace_g - 2 * np.sqrt(trace_r * trace_g)
    return float(diff + trace_cov)

# === Compute FVD
# Use at most two frames, limited by what the reference clip and the GIF provide.
available_counts = (2, len(frame_paths), len(gif))
n_fvd = min(available_counts)
fvd_score = fvd_lightweight(frame_paths, gif, num_frames=n_fvd)

# Report the prompt of the reference clip together with the score.
report_lines = (
    f"Prompt: {prompt}",
    f"FVD (2-frame): {fvd_score:.4f}",
)
for report_line in report_lines:
    print(report_line)


Prompt: heat some oil in a pan with garlic and cumin seeds
FVD (2-frame): 436.5696


In [None]:
# !pip install piq lpips transformers imageio

import os, json, glob, random, gc
from PIL import Image
import torch
from torchvision import transforms
import imageio
import numpy as np
import piq
import lpips
from transformers import CLIPProcessor, CLIPModel
from torch.nn.functional import normalize
import pandas as pd

# ========== CONFIG ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
annotation_path = "/content/drive/Shareddrives/DATA 298B Team 8/298A/youcook/annotations/youcookii_annotations_trainval.json"
frame_dir = "/content/drive/Shareddrives/DATA 298B Team 8/298A/processed_data/val"
gif_path = "/content/drive/Shareddrives/DATA 298B Team 8/CODE/AnimateDiff_finetuned_animation_4.6.gif"

# Image preprocessing variants used by the metrics below.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])
resize_only = transforms.Resize((256, 256))
resize_64_gray = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.Grayscale(),
    transforms.ToTensor(),
])

# === Load annotation
with open(annotation_path, "r") as f:
    annotations = json.load(f)

# Video IDs for which at least one "*_frames" folder exists on disk.
existing_ids = {
    entry.split("_sentence")[0]
    for entry in os.listdir(frame_dir)
    if "_frames" in entry
}

# Keep every annotated sentence whose folder exists and holds >= 2 frames,
# together with ALL of its frame paths.
valid_samples = []
for vid, info in annotations["database"].items():
    if vid not in existing_ids:
        continue
    for i, ann in enumerate(info["annotations"]):
        path = os.path.join(frame_dir, f"{vid}_sentence{i}_frames")
        frames = sorted(glob.glob(os.path.join(path, "frame_*.jpg")))
        if not os.path.isdir(path) or len(frames) < 2:
            continue
        valid_samples.append((frames, ann["sentence"]))

print(f"Total valid samples: {len(valid_samples)}")

# === Load shared GIF and models
gif_frames = imageio.mimread(gif_path)
clip_checkpoint = "openai/clip-vit-base-patch16"
clip_model = CLIPModel.from_pretrained(clip_checkpoint).to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
lpips_fn = lpips.LPIPS(net='alex').to(device)
clip_model.eval()

def fvd_lightweight(real_paths, gen_frames, num_frames=2):
    """Fréchet-style distance between pixel statistics of real and generated frames.

    Each of the first ``num_frames`` frames on both sides is passed through the
    notebook-level ``resize_64_gray`` transform and flattened into a feature
    vector. The score is

        ||mu_r - mu_g||^2 + tr(S_r) + tr(S_g) - 2 * sqrt(tr(S_r) * tr(S_g))

    where ``tr(S)`` is the trace of the sample covariance across frames. The
    cross term is a trace-based stand-in, not the matrix square root of the
    exact Fréchet distance, and the features are raw pixels, so this is a
    lightweight proxy rather than standard FVD.

    Args:
        real_paths: Paths of the reference frame images.
        gen_frames: Generated frames as arrays accepted by ``Image.fromarray``.
        num_frames: Number of leading frames to use from each side (>= 2).

    Returns:
        The distance as a Python float.

    Raises:
        ValueError: If ``num_frames`` is below 2 (sample covariance is
            undefined for a single frame and previously produced NaN).
    """
    if num_frames < 2:
        raise ValueError(f"fvd_lightweight needs at least 2 frames, got {num_frames}")

    real_feats, gen_feats = [], []
    for i in range(num_frames):
        with Image.open(real_paths[i]) as real_img:
            real_feats.append(resize_64_gray(real_img).view(-1).numpy())
        gen_feats.append(resize_64_gray(Image.fromarray(gen_frames[i])).view(-1).numpy())
    real_feats = np.stack(real_feats).astype(np.float64)
    gen_feats = np.stack(gen_feats).astype(np.float64)

    mu_r, mu_g = real_feats.mean(0), gen_feats.mean(0)
    # Trace of the sample covariance == sum of per-feature sample variances.
    # Computing it this way avoids building the full feature-by-feature matrix.
    trace_r = real_feats.var(axis=0, ddof=1).sum()
    trace_g = gen_feats.var(axis=0, ddof=1).sum()

    diff = np.sum((mu_r - mu_g) ** 2)
    trace_cov = trace_r + trace_g - 2 * np.sqrt(trace_r * trace_g)
    return float(diff + trace_cov)

def compute_clip_score(prompt, gen_pil):
    """Cosine similarity between CLIP embeddings of a prompt and an image.

    Uses the notebook-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        prompt: Text to embed.
        gen_pil: Image to embed (PIL image).

    Returns:
        Mean cosine similarity between the L2-normalised image and text
        features, as a Python float.
    """
    # truncation=True keeps over-long prompts within the text encoder's maximum
    # length instead of failing inside the model.
    inputs = clip_processor(
        text=prompt, images=gen_pil, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        image_features = clip_model.get_image_features(pixel_values=inputs["pixel_values"])
        # Forward the attention mask so padded positions (if any) are ignored.
        text_features = clip_model.get_text_features(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        image_features = normalize(image_features, dim=-1)
        text_features = normalize(text_features, dim=-1)
        score = (image_features * text_features).sum(dim=-1).mean().item()
    return score

# === Main Loop
# NOTE(review): every sampled reference clip is compared against the SAME GIF,
# which was generated earlier in this notebook for the fixed prompt
# "crack two eggs into a bowl". The CLIP score pairs that GIF with each
# sample's own caption. Confirm this is the intended evaluation.
results = []

# Seeded, and capped at the number of available samples, so the table is
# reproducible and small validation sets do not crash `sample`.
num_eval_samples = min(10, len(valid_samples))
samples = random.Random(0).sample(valid_samples, k=num_eval_samples)

# The generated frame is identical for every sample, so prepare it once.
gen_pil = Image.fromarray(gif_frames[0]).convert("RGB")
gen_img = transform(gen_pil).unsqueeze(0).to(device)

for idx, (frame_paths, prompt) in enumerate(samples):
    with Image.open(frame_paths[0]) as real_pil:
        real_img = transform(real_pil.convert("RGB")).unsqueeze(0).to(device)

    with torch.no_grad():
        # Inputs are in [0, 1]; normalize=True lets LPIPS rescale them to the
        # [-1, 1] range its networks expect.
        lpips_score = lpips_fn(real_img, gen_img, normalize=True).item()
        ssim_score = piq.ssim(gen_img, real_img, data_range=1.0).item()
        psnr_score = piq.psnr(gen_img, real_img, data_range=1.0).item()

    # FVD proxy over the first two frames of each side
    fvd_score = fvd_lightweight(frame_paths, gif_frames, num_frames=2)

    # CLIP text-image similarity for the first generated frame
    clip_score = compute_clip_score(prompt, gen_pil)

    results.append({
        "Prompt": prompt,
        "LPIPS": round(lpips_score, 4),
        "SSIM": round(ssim_score, 4),
        "PSNR": round(psnr_score, 4),
        "FVD (2f)": round(fvd_score, 4),
        "CLIP Score": round(clip_score, 4)
    })

    print(f"[{idx+1}/{len(samples)}] Done | LPIPS: {lpips_score:.4f}, SSIM: {ssim_score:.4f}, "
          f"PSNR: {psnr_score:.2f}, FVD: {fvd_score:.2f}, CLIP: {clip_score:.4f}")

# === Results table
df = pd.DataFrame(results)
print(df)


Total valid samples: 1813


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth
[1/10] Done | LPIPS: 0.7640, SSIM: 0.0004, PSNR: 5.69, FVD: 1112.74, CLIP: 0.2042
[2/10] Done | LPIPS: 0.7044, SSIM: 0.2095, PSNR: 10.73, FVD: 457.70, CLIP: 0.1879
[3/10] Done | LPIPS: 0.7092, SSIM: 0.0031, PSNR: 5.76, FVD: 906.55, CLIP: 0.1812
[4/10] Done | LPIPS: 0.6062, SSIM: 0.2181, PSNR: 8.44, FVD: 596.12, CLIP: 0.1741
[5/10] Done | LPIPS: 0.7640, SSIM: 0.0004, PSNR: 5.69, FVD: 963.22, CLIP: 0.2537
[6/10] Done | LPIPS: 0.7626, SSIM: 0.0111, PSNR: 5.75, FVD: 1104.03, CLIP: 0.1982
[7/10] Done | LPIPS: 0.7640, SSIM: 0.0004, PSNR: 5.69, FVD: 939.83, CLIP: 0.2216
[8/10] Done | LPIPS: 0.7220, SSIM: 0.3831, PSNR: 11.27, FVD: 357.69, CLIP: 0.2287
[9/10] Done | LPIPS: 0.6656, SSIM: 0.1122, PSNR: 8.21, FVD: 548.61, CLIP: 0.2149
[10/10] Done | LPIPS: 0.6479, SSIM: 0.1428, PSNR: 9.30, FVD: 524.67, CLIP: 0.2193
                                              Prompt   LPIPS    SSIM     PSNR  \
0               