# Preparation - 3D model generation

In [None]:
# Assume that pwd is "/content"
%%bash
git clone https://github.com/ashawkey/stable-dreamfusion.git
cd /content/stable-dreamfusion

pip install -r requirements.txt

# No need for this reproduction.
# cd /content/stable-dreamfusion/pretrained/zero123
# wget https://zero123.cs.columbia.edu/assets/zero123-xl.ckpt

# cd /content/stable-dreamfusion
# mkdir pretrained/omnidata
# cd pretrained/omnidata
# assume gdown is installed
# gdown '1Jrh-bRnJEjyMCS7f-WsaFlccfPjJPPHI&confirm=t' # omnidata_dpt_depth_v2.ckpt
# gdown '1wNxVO4vVbDEMEpnAi_jwQObf2MFodcBR&confirm=t' # omnidata_dpt_normal_v2.ckpt

# Get videos
# Videos can be found under /content/stable-dreamfusion/workspace_name/results
python main.py --text "a hamburger" --workspace hamburger_trial -O
python main.py --workspace hamburger_trial -O --test

python main.py --text "matte painting of a castle made of cheesecake surrounded by a moat made of ice cream" --workspace castle_trial -O
python main.py --workspace castle_trial -O --test

python main.py --text "a vase with pink flowers" --workspace vase_trial -O
python main.py --workspace vase_trial -O --test

# Reproduction - CLIP R-Precision
**Preparation:**
1. Pick 1 text prompt and generate model for it.
2. Get the video description of the 3D model generated.
**CLIP R-Precision Evaluation:**
1. Extract 16 views from different azimuth angle
2. Generate distractors via DeepSeek and manually fliter out excessively similar/distinct distractors
3. Apply CLIP to perform zero-shot prediction on similarity between the rendering and 1 true text prompt + 99 distractors.
4. Output the average value of predicted similarity. Chunk the results to top 5.

In [26]:
# Get CLIP
%%bash
pip install torchaudio==0.9.0
pip install git+https://github.com/openai/CLIP.git
wget https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-l_b2ewwz
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->clip==1.0)
  Downloading nvi

ERROR: Ignored the following yanked versions: 2.0.0
ERROR: Could not find a version that satisfies the requirement torchaudio==0.9.0 (from versions: 2.0.1, 2.0.2, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0)
ERROR: No matching distribution found for torchaudio==0.9.0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-l_b2ewwz
--2025-05-03 15:22:47--  https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt
Resolving openaipublic.azureedge.net (openaipublic.azureedge.net)... 13.107.246.59, 2620:1ec:29:1::59
Connecting to openaipublic.azureedge.net (openaipublic.azureedge.net)|13.107.246.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 934088680 (891M) [application/octet-stream]
Saving to: ‘ViT-L-14-336px.pt’

     0K .......... .......... .......... .......... ..........  0% 2.0

In [1]:
import cv2
import os

In [19]:
## Prepare images
## Assume that the video has been uploaded in /content/"caption".mp4

def extract_frames(video_path: str, output_folder: str, frame_count: int=16):
  # Create the output folder if it does not exist
  if not os.path.exists(output_folder):
    os.makedirs(output_folder)
  # Empty the output folder if it exist
  else:
    !rm -rf $output_folder
  # Open the video
  cap = cv2.VideoCapture(video_path)
  if not cap.isOpened():
    print("Error: Failed to open the video!")
    return

  total_frame = cap.get(cv2.CAP_PROP_FRAME_COUNT)
  frame_interval = total_frame // frame_count
  print(f"total_frame: {total_frame}, frame_interval: {frame_interval}, extracted_frame: {frame_count}")
  count, saved_count = 1, 0

  while True:
    ret, frame = cap.read()
    if not ret:
      break
    if count % frame_interval == 0:
      file_name = os.path.join(output_folder, f"{saved_count+1}.jpg")
      # print(file_name)
      saved = cv2.imwrite(file_name, frame)
      if not saved:
        print(f"Error: Failed to save frame: {frame_count}.")
      else:
        saved_count += 1
    count += 1

  cap.release()
  print(f"Extracted & saved {saved_count} frames.")


**Assume that videos have been copied to /content**

In [20]:
extract_frames("/content/hamburger.mp4", "/content/a hamburger", 16)

total_frame: 100.0, frame_interval: 6.0, extracted_frame: 16
Extracted & saved 16 frames.


In [21]:
extract_frames("/content/castle.mp4", "/content/matte painting of a castle made of cheesecake surrounded by a moat made of ice cream", 16)

total_frame: 100.0, frame_interval: 6.0, extracted_frame: 16
Extracted & saved 16 frames.


In [22]:
extract_frames("/content/vase.mp4", "/content/a vase with pink flowers", 16)

total_frame: 100.0, frame_interval: 6.0, extracted_frame: 16
Extracted & saved 16 frames.


In [23]:
# 99 distractors + 1 true text prompt
# Deepseek generated
import random
hamburger_prompts = [
    "a red cushion shaped like a bun",  # Visual trickery
    "a sesame seed bun alone on a plate",  # Component isolation
    "a cheeseburger with lettuce and tomato",  # Close food competitor
    "a chicken sandwich on a sesame seed bun",
    "a veggie burger with avocado",
    "a bacon burger with fries",
    "a grilled cheese sandwich",
    "a hot dog with ketchup and mustard",
    "a pepperoni pizza on a wooden table",
    "a sushi roll on a plate",
    "a bowl of spaghetti with meatballs",
    "a salad with grilled chicken and dressing",
    "a breakfast burrito with eggs and cheese",
    "a turkey club sandwich with chips",
    "a plate of nachos with melted cheese",
    "a steak dinner with mashed potatoes",
    "a fruit bowl with assorted berries",
    "a chocolate cake with frosting",
    "a red apple on a white background",
    "a glass of orange juice",
    "a cup of coffee with foam art",
    "a bowl of cereal with milk",
    "a peanut butter and jelly sandwich",
    "a blueberry muffin on a plate",
    "a cinnamon roll with icing",
    "a basket of french fries",
    "a plate of pancakes with syrup",
    "a waffle with strawberries and whipped cream",
    "a bowl of oatmeal with bananas",
    "a slice of cheesecake with raspberry sauce",
    "a lobster roll on a New England-style bun",
    "a taco with ground beef and toppings",
    "a gyro with lamb and tzatziki sauce",
    "a falafel wrap with tahini",
    "a ramen bowl with egg and pork",
    "a bento box with sushi and tempura",
    "a pho bowl with beef and herbs",
    "a paella with seafood and saffron rice",
    "a croissant on a cutting board",
    "a bagel with cream cheese",
    "a charcuterie board with meats and cheeses",
    "a cheese platter with grapes and crackers",
    "a tomato soup in a bowl",
    "a grilled chicken Caesar salad",
    "a margherita pizza with fresh basil",
    "a donut with sprinkles",
    "a pretzel with salt crystals",
    "a bowl of popcorn",
    "a sushi platter with wasabi and ginger",
    "a fried egg sunny-side up",
    "a bowl of curry with rice",
    "a green smoothie in a glass",
    "a plate of dumplings with soy sauce",
    "a chocolate chip cookie",
    "a slice of apple pie",
    "a roasted chicken with vegetables",
    "a bowl of chili with beans",
    "a vanilla ice cream cone",
    "a strawberry milkshake with a cherry",
    "a cheese and crackers snack",
    "a plate of scrambled eggs",
    "a grilled salmon fillet",
    "a bowl of miso soup",
    "a plate of lasagna",
    "a serving of tiramisu",
    "a bowl of fried rice",
    "a slice of watermelon",
    "a banana split dessert",
    "a plate of shrimp scampi",
    "a bowl of oatmeal raisin cookies",
    "a glass of milk with cookies",
    "a plate of bacon and eggs",
    "a bowl of macaroni and cheese",
    "a slice of garlic bread",
    "a bowl of chicken noodle soup",
    "a plate of beef stir-fry",
    "a tray of sushi rolls",
    "a bowl of fruit salad",
    "a plate of enchiladas",
    "a cup of yogurt with granola",
    "a plate of fajitas",
    "a bowl of lentil soup",
    "a plate of spaghetti carbonara",
    "a slice of key lime pie",
    "a bowl of beef stew",
    "a plate of roasted vegetables",
    "a bowl of tomato pasta",
    "a plate of fish and chips",
    "a serving of mashed potatoes",
    "a plate of onion rings",
    "a bowl of coleslaw",
    "a plate of mozzarella sticks",
    "a cup of hot chocolate",
    "a jar of honey with a dipper",
    "a bowl of guacamole with chips",
    "a plate of bruschetta",
    "a serving of deviled eggs",
    "a plate of stuffed peppers",
    "a bowl of quinoa salad",
    "a plate of chicken wings",
]

random.shuffle(hamburger_prompts)
hamburger_prompts.insert(0, "a hamburger") # insert the true prompt at the position 0.

In [53]:
### Text-prompt for "a vase with pink flowers", generated by deepseek.
vase_prompts = [
    "a vase with green flowers",
    "a vase with red roses on a wooden table",
    "a ceramic jar containing yellow sunflowers",
    "a glass pitcher filled with white daisies",
    "a blue pot holding purple tulips",
    "a bronze urn with orange marigolds",
    "a modern vase containing dried lavender",
    "a rustic bucket with wildflowers in bloom",
    "a crystal decanter with pink peonies",
    "a vintage teapot holding fresh lilacs",
    "a metallic container with blue hydrangeas",
    "a clay pot filled with red poppies",
    "a porcelain vase with mixed autumn leaves",
    "a wooden box containing purple irises",
    "a silver chalice with yellow daffodils",
    "a terracotta pitcher holding pink carnations",
    "a glass bowl with floating gardenias",
    "a copper vase containing white magnolias",
    "a ceramic mug filled with forget-me-nots",
    "a stoneware jug with purple violets",
    "a bamboo holder with orchid blossoms",
    "a crystal vase with red chrysanthemums",
    "a pewter tankard holding daisy chains",
    "a marble urn with yellow begonias",
    "a tin can containing blue cornflowers",
    "a golden vase with purple asters",
    "a plastic container holding white lilies",
    "a glass jar with dried eucalyptus leaves",
    "a vintage bottle containing red geraniums",
    "a wicker basket with sunflowers and wheat",
    "a steel canister holding pink azaleas",
    "a ceramic planter with cacti and succulents",
    "a glass terrarium with ferns and moss",
    "a copper pot containing purple pansies",
    "a wooden barrel with red berries",
    "a crystal pitcher holding yellow roses",
    "a clay amphora with white jasmine",
    "a silver vase containing orange lilies",
    "a glass decanter with purple hyacinths",
    "a brass urn holding dried lavender",
    "a ceramic bowl with floating candles",
    "a vintage tin with dried wheat stalks",
    "a glass bottle containing red amaryllis",
    "a wooden crate with autumn foliage",
    "a copper bowl holding white camellias",
    "a crystal carafe with blue delphiniums",
    "a stone vase containing pink rhododendrons",
    "a metal bucket with yellow buttercups",
    "a glass vase holding purple freesias",
    "a ceramic jug with orange dahlias",
    "a wooden vase containing white garden roses",
    "a silver chalice holding red zinnias",
    "a glass jar with dried lavender sprigs",
    "a clay pot containing purple petunias",
    "a crystal bowl with floating rose petals",
    "a copper pitcher holding yellow tulips",
    "a ceramic urn with white calla lilies",
    "a vintage glass with purple lilacs",
    "a wooden box holding dried pampas grass",
    "a metal vase containing red anthuriums",
    "a glass bottle with baby's breath flowers",
    "a stone jar holding purple foxgloves",
    "a crystal vase with red ranunculus",
    "a ceramic mug containing white heather",
    "a copper pot with yellow daisies",
    "a wooden bowl holding purple statice",
    "a glass carafe with orange cosmos",
    "a silver urn containing red gladiolus",
    "a clay vase with white narcissus",
    "a crystal decanter holding purple lisianthus",
    "a ceramic pitcher with red hibiscus",
    "a metal container holding yellow coreopsis",
    "a glass jar containing purple scabiosa",
    "a wooden vase with red poinsettias",
    "a copper bowl holding white peonies",
    "a crystal urn with orange birds of paradise",
    "a ceramic pot containing purple alliums",
    "a glass bottle with red spider lilies",
    "a stoneware jug holding yellow primroses",
    "a silver vase with white stephanotis",
    "a clay amphora containing purple bougainvillea",
    "a glass terrarium with air plants",
    "a ceramic bowl holding red proteas",
    "a copper pitcher with yellow forsythia",
    "a wooden box containing white edelweiss",
    "a crystal carafe holding purple verbena",
    "a metal urn with red celosia flowers",
    "a glass vase containing yellow goldenrod",
    "a ceramic jar with white snowdrops",
    "a copper pot holding purple morning glories",
    "a stone container with red clover",
    "a crystal bowl containing yellow yarrow",
    "a wooden jug with white trillium",
    "a glass decanter holding purple wisteria",
    "a ceramic vase with red bleeding hearts",
    "a metal canister containing yellow tansy",
    "a clay pot with white cherry blossoms",
    "a crystal urn holding purple heliotrope",
    "a glass jar with red passion flowers"
]
random.shuffle(vase_prompts)
vase_prompts.insert(0, "a vase with pink flowers")

In [54]:
## Text prompts for the castle, generated by deepseek
castle_prompts = [
    "digital painting of a fortress made of chocolate cake encircled by lava flow of caramel",
    "watercolor of a gingerbread palace bordered by lemonade river",
    "3D render of cookie dough towers surrounded by milk chocolate lake",
    "concept art of an ice cream sundae castle with hot fenda moat",
    "oil painting of a waffle cone keep protected by whipped cream walls",
    "vector art of candy cane battlements above marshmallow fluff river",
    "fantasy illustration of a cupcake citadel beside strawberry syrup sea",
    "surreal photo of a pie crust fortress floating in coffee oceans",
    "storybook style cake pop towers rising from butterscotch wetlands",
    "steampunk-inspired biscuit castle over molten chocolate falls",
    "acrylic painting of a macaron fortress guarded by licorice walls",
    "medieval manuscript style pudding castle in honey valleys",
    "sci-fi concept of crystalline sugar palace above soda pop lakes",
    "collage art of pretzel bastions surrounded by chocolate milk moats",
    "neon-lit donut keep protected by rainbow sprinkles trench",
    "brutalist architecture of brownie blocks over cream cheese rivers",
    "fairy tale illustration of lollipop turrets above gelatin pools",
    "vangogh-style painting of tiramisu towers beside espresso lagoons",
    "minimalist line art of crepe paper castle over matcha pond",
    "cyberpunk render of futuristic cake structure with neon syrup defenses",
    "claymation style shortbread castle surrounded by raspberry sauce",
    "stained glass design of jellybean towers over caramel swamp",
    "art nouveau pastry palace encircled by champagne streams",
    "pop art print of candy floss ramparts above cola waterfalls",
    "gothic revival meringue cathedral beside chocolate fondue ditches",
    "isometric pixel art cheesecake tower in strawberry jam seas",
    "impressionist painting of profiterole fortress with custard canals",
    "tilt-shift photo of petit four battlements over honey moats",
    "woodcut style baklava citadel protected by pistachio paste rivers",
    "retro poster design of marshmallow castle with hot cocoa moat",
    "low poly render of biscuit fortress floating in tea lakes",
    "abstract expressionist cake structure with dripping icing defenses",
    "fantasy map style pie castle in vanilla cream bay",
    "diorama of wafer palace surrounded by melted white chocolate",
    "luminist painting of crystal sugar fortress above fruit syrup marsh",
    "blueprint schematic of truffle castle with ganache irrigation system",
    "kaleidoscopic view of marzipan towers over caramelized sugar moats",
    "children's book illustration of cookie castle with milk reservoir",
    "art deco chocolate palace surrounded by bubbling mocha moat",
    "uv reactive art of glow-in-dark candy fortress with neon drizzle",
    "topographic map style cake mountain with icing sugar rivers",
    "cross-section diagram of layered dessert castle with jam aquifers",
    "silhouette art of eclaire spires against caramel sunset",
    "food photography of creme brulee citadel with raspberry coulis",
    "origami style folded pastry castle beside condensed milk lake",
    "manga-inspired mochi fortress with red bean paste defenses",
    "roman mosaic style cheesecake villa overlooking yogurt seas",
    "fantasy concept art of croissant bastions guarding honeycomb moats",
    "dutch golden age painting of spiced cake fort beside brandy rivers",
    "surrealist sculpture of melting ice cream castle over soda springs",
    "tribal pattern design of chocolate palace with coconut milk moat",
    "infrared photo of warm pie fortress emitting custard steam",
    "pointillist painting of sprinkle towers above chocolate fountain",
    "papercut art of filo pastry castle beside rosewater stream",
    "glitch art visualization of digital dessert structure with code rivers",
    "dystopian concept of crumbling cake ruins over solidified syrup",
    "microscopic view of sugar crystal castle in coffee droplet",
    "fantasy battle scene with cake siege engines attacking frosting walls",
    "roman fresco style honeycomb palace overlooking mead moats",
    "bioluminescent concept of glowing dessert structure with liquid light",
    "aerial view of pancake stack citadel surrounded by maple syrup",
    "cubist interpretation of broken cookie fortress in milk pond",
    "vaporwave aesthetic cake ziggurat with blue raspberry canals",
    "arctic concept of snow cone castle surrounded by slushie moats",
    "desert landscape with dried fruit fortress and cactus sugar defenses",
    "underwater scene of kelp cake reef surrounded by sea salt caramel",
    "volcanic concept of molten chocolate castle with candy lava flows",
    "prehistoric theme of dinosaur-shaped cake in tar pit moat",
    "space opera concept of floating dessert asteroid with comet trails",
    "jungle ruins of overgrown cake temple with tropical fruit rivers",
    "noir style detective story set in crumbling cheesecake alleyways",
    "orientalist painting of baklava palace with rose syrup canals",
    "post-apocalyptic scene of irradiated dessert structures",
    "farm landscape with crop circle cake formation and milk streams",
    "psychedelic vision of swirling candyfloss towers over kool-aid seas",
    "alien architecture of unknown dessert materials with strange liquids",
    "time-lapse of melting ice cream castle reforming in reverse",
    "x-ray view of layered dessert structure with hidden filling channels",
    "film noir lighting on abandoned cake factory with leaking icing",
    "abstract geometry of dessert shapes forming optical illusions",
    "bird's eye view of wedding cake city with buttercream highways",
    "military schematic of cupcake bunkers and marshmallow trenches",
    "renaissance study sketches of dessert fortress proportions",
    "haunted house concept of rotting cake mansion with moldy moats",
    "future city concept with self-replicating dessert architecture",
    "mythological scene of dessert gods shaping cake landscapes",
    "industrial revolution era candy factories polluting syrup rivers",
    "samurai drama set in rice cake castle with soy sauce moats",
    "parallel universe where dessert geography follows baking rules",
    "steampunk dessert airships docking at cloud cake platforms",
    "religious iconography of sacred cake shrine with ceremonial syrups",
    "survival horror game set in sentient dessert structure",
    "optical illusion art where castle shifts between cake and stone",
    "blacklight poster of psychedelic dessert landscape",
    "cookbook illustration of deconstructed cake castle elements",
    "virtual reality tour through interactive dessert environment",
    "scientific diagram labeling dessert castle's nutritional components",
    "roman triumph arch made of layered cakes celebrating sugar wars",
    "film poster style dramatic dessert fortress at twilight",
    "instruction manual for assembling modular cake castle kits",
    "emergency response plan for melting dessert structure incidents",
    "matte painting of cheese tower surrounded by wine rivers",
    "children's playset of edible castle with candy construction pieces",
    "courtroom drama set in dessert architecture copyright lawsuit",
    "archeological dig site revealing ancient dessert civilization",
    "time capsule containing future visions of dessert architecture"
]
random.shuffle(castle_prompts)
castle_prompts.insert(0, "matte painting of a castle made of cheesecake surrounded by a moat made of ice cream")

### Codes:

In [48]:
### Apply CLIP to perform zero-shot prediction
import os
import clip
import torch
import torchvision
import matplotlib.pyplot as plt
from typing import List
from torch.cuda.amp import autocast
from PIL import Image


def single_CLIP_Prediction(img_path, true_prompt, text_prompts: List, device=None):
  # Fine tuning
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
  # Try to save storage
  # device = "cpu"
  # model, preprocess = clip.load('/content/ViT-L-14-336px.pt', device)
  model, preprocess = clip.load("ViT-B/32", device)

  # Prepare the inputs
  ## Try to lower the storage usage by lowering the resolution
  img = Image.open(img_path)
  if img is None:
    print(f"Error: Failed to read the image at {img_path}!")
    return
  img = img.resize((224, 224))
  # Checking the details
  # plt.figure()
  # plt.imshow(img)
  # plt.show()

  image_input = preprocess(img).unsqueeze(0).to(device)
  text_input = torch.cat([clip.tokenize(f"a photo of {prompt}") for prompt in text_prompts]).to(device)

  # Calculate features
  with autocast():
    with torch.no_grad():
      image_features = model.encode_image(image_input)
      text_features = model.encode_text(text_input)

  # Pick the top 5 most similar labels
  image_features /= image_features.norm(dim=-1, keepdim=True)
  text_features /= text_features.norm(dim=-1, keepdim=True)
  similarity = (100.0 * image_features@text_features.T).softmax(dim=-1).squeeze()
  return similarity

def average_similarity(img_folder_path, true_prompt, text_prompts: List):
  img_paths = []
  for folder, subfolders, files in os.walk(img_folder_path):
    for file in files:
      img_paths.append(os.path.join(img_folder_path, file))

  # print(img_paths)
  device = "cuda" if torch.cuda.is_available() else "cpu"
  average_similarity = torch.zeros(len(text_prompts)).to(device)
  total = len(img_paths)
  for img_path in img_paths:
    single_similarity = single_CLIP_Prediction(img_path, true_prompt, text_prompts, device).to(device)
    # print(single_similarity)
    average_similarity = torch.add(average_similarity, single_similarity,alpha=1/total)

  values, indices = average_similarity.topk(5)
  # Print the result
  print("Top predictions:")
  show_true = True
  for value, index in zip(values, indices):
    if text_prompts[index] == true_prompt:
      show_true = False
    print(f"{text_prompts[index]:>35s}: {100 * value.item():.2f}%")
  if show_true:
    found = False
    for id, value in enumerate(average_similarity):
      if id < 0 or id >= len(text_prompts):
        print(f"Error: id out of range, id = {id}")
        break
      if text_prompts[id] == true_prompt:
        print(f"{text_prompts[id]:>35s}: {100 * value.item():.2f}%")
        found = True

    print(f"Mean similarity: {average_similarity.mean()}")
    if not found:
      print("Warning! True text prompt not found in the prompt list!")

In [49]:
average_similarity("/content/a hamburger", "a hamburger", hamburger_prompts)

  with autocast():


Top predictions:
                        a hamburger: 81.80%
a cheeseburger with lettuce and tomato: 13.30%
a chicken sandwich on a sesame seed bun: 3.53%
       a veggie burger with avocado: 0.56%
 a sesame seed bun alone on a plate: 0.31%


Hamburger case: The true text prompt ranks 1st among all text prompts.

In [51]:
average_similarity("/content/a vase with pink flowers", "a vase with pink flowers",vase_prompts)

  with autocast():


Top predictions:
           a vase with pink flowers: 61.75%
a glass vase holding purple freesias: 7.53%
a glass vase containing yellow goldenrod: 4.08%
a crystal urn holding purple heliotrope: 3.41%
a terracotta pitcher holding pink carnations: 2.74%


Vase case: The true text prompt ranks ranks 1st among all text prompts.

In [55]:
average_similarity("/content/matte painting of a castle made of cheesecake surrounded by a moat made of ice cream",
                   "matte painting of a castle made of cheesecake surrounded by a moat made of ice cream",
                   castle_prompts)

  with autocast():


Top predictions:
tilt-shift photo of petit four battlements over honey moats: 41.44%
time-lapse of melting ice cream castle reforming in reverse: 11.71%
fantasy map style pie castle in vanilla cream bay: 9.78%
low poly render of biscuit fortress floating in tea lakes: 7.68%
concept art of an ice cream sundae castle with hot fenda moat: 5.25%
matte painting of a castle made of cheesecake surrounded by a moat made of ice cream: 4.13%
Mean similarity: 0.009345721453428268


Castle case: The true text prompt fails to rank 1st among all text prompts, probably due to lack of color fidelity in the moat part.