In [2]:

import torch
import clip
from PIL import Image
import torchvision.transforms as transforms
import numpy as np
import fiftyone as fo
import fiftyone.zoo as foz

# Pull a one-image slice of the COCO-2017 training split from the FiftyOne zoo
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="train",
    max_samples=1,
)


Downloading split 'train' to 'C:\Users\naska\fiftyone\coco-2017\train' if necessary
Found annotations at 'C:\Users\naska\fiftyone\coco-2017\raw\instances_train2017.json'
Sufficient images already downloaded
Existing download of split 'train' is sufficient
Loading existing dataset 'coco-2017-train-1'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


In [3]:
# Take the first sample of the dataset
sample = dataset.first()

# Read the on-disk location recorded on the sample, then open that file with PIL
image_path = sample["filepath"]
image = Image.open(image_path)

# Show the unmodified image
image.show()



In [4]:
# Load CLIP on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Run CLIP's preprocessing on the PIL image, add a leading batch dimension
# and move the result to the model's device
image_input = preprocess(image).unsqueeze(0).to(device)

# Get the image embeddings; this is a fixed reference, so no gradients are recorded
with torch.no_grad():
    image_features = model.encode_image(image_input)

# `sample` already is the record for this image, so read its ID directly
# instead of querying the dataset again by filepath
image_id = sample.id
print(image_id)

# Collect the object-detection class labels attached to the sample (these are
# detection labels, not captions). Fall back to an empty list when the sample
# carries no `ground_truth` label.
ground_truth = sample.ground_truth
list_of_detections = ground_truth.detections if ground_truth is not None else []

labels = [detection.label for detection in list_of_detections]
print(labels)

65f870077a3b3e2da43709d7
['bowl', 'bowl', 'broccoli', 'bowl', 'orange', 'orange', 'orange', 'orange']


In [8]:
# Start from the preprocessed CLIP input and mark the copy as the tensor to
# optimise. Without requires_grad_() backward() leaves its .grad as None and
# optimizer.step() skips it, so the image would never change.
stylized_image = image_input.clone().requires_grad_()

# Set optimization parameters
optimizer = torch.optim.Adam([stylized_image], lr=0.1)

# Demonstration target: the original image embedding perturbed by Gaussian
# noise. It is drawn once, before the loop, so every iteration pulls towards
# the same point rather than towards a freshly re-sampled one.
target_features = image_features + 0.1 * torch.randn_like(image_features)

# Optimization loop
for _ in range(100):
    optimizer.zero_grad()

    # Get CLIP features for the current stylized image
    stylized_image_features = model.encode_image(stylized_image)

    # Loss function: mean squared distance between the current embedding and the fixed target
    loss = torch.nn.functional.mse_loss(stylized_image_features, target_features)

    # Backpropagation
    loss.backward()
    optimizer.step()

# The optimised tensor is still in CLIP's normalised pixel space (the
# `preprocess` pipeline returned by clip.load ends with a per-channel
# Normalize). Undo that normalisation and clamp to [0, 1] before the uint8
# cast; otherwise negative / >1 values wrap around and the colours are wrong.
clip_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(3, 1, 1)
clip_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(3, 1, 1)
pixels = (stylized_image.detach()[0].float().cpu() * clip_std + clip_mean).clamp(0, 1)

# Convert the stylized image tensor (C, H, W) to an (H, W, C) numpy array in [0, 1]
stylized_image_np = pixels.permute(1, 2, 0).numpy()

# Display the stylized image
Image.fromarray((stylized_image_np * 255).astype(np.uint8)).show()

In [6]:
# Keep the detection labels available under the name `captions`
captions = labels

# Text prompt used to steer the image; it is a fixed, hand-written string
# and is not drawn from `captions`
random_caption = "green forest"

# Tokenise the prompt for CLIP and move the tokens to the model's device
text_input = (
    clip.tokenize([random_caption])
    .to(device)
)

In [9]:
# Get the text embeddings; they are a fixed reference, so no gradients are recorded
with torch.no_grad():
    text_features = model.encode_text(text_input)

# Optimisation target: the image embedding shifted by the text embedding.
# It does not change during optimisation, so build it once outside the loop.
target_features = image_features + text_features

# Initialize the stylized image as a trainable copy of the original CLIP input
stylized_image = image_input.clone().requires_grad_()

# Set optimization parameters
optimizer = torch.optim.Adam([stylized_image], lr=0.1)

# Optimization loop
for _ in range(100):
    optimizer.zero_grad()

    # Get CLIP features for the current stylized image
    stylized_image_features = model.encode_image(stylized_image)

    # Loss function: mean squared distance between the current embedding and the combined target
    loss = torch.nn.functional.mse_loss(stylized_image_features, target_features)

    # Backpropagation
    loss.backward()
    optimizer.step()

# The optimised tensor is still in CLIP's normalised pixel space (the
# `preprocess` pipeline returned by clip.load ends with a per-channel
# Normalize). Undo that normalisation and clamp to [0, 1] first; to_pil_image
# scales floats by 255 and casts to bytes, so out-of-range values would wrap
# around and produce wrong colours.
clip_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(3, 1, 1)
clip_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(3, 1, 1)
pixels = (stylized_image.detach().squeeze(0).float().cpu() * clip_std + clip_mean).clamp(0, 1)

# Convert the stylized image tensor to a PIL image
# (from here on `stylized_image` names the PIL image, not the tensor)
stylized_image = transforms.functional.to_pil_image(pixels)

# Display the stylized image
stylized_image.show()