In [1]:

#  pip install git+https://github.com/apple/ml-mobileclip
#  mkdir -p checkpoints
#  wget https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints
#  pip install --upgrade coremltools

In [17]:
import torch
import coremltools as ct
import mobileclip
import numpy as np
from PIL import Image

# 1. Export TextEncoder

In [21]:


#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
model, _, preprocess = mobileclip.create_model_and_transforms('mobileclip_s0', pretrained='./checkpoints/mobileclip_s0.pt')
tokenizer = mobileclip.get_tokenizer('mobileclip_s0')

model=model.to(device)
model = model.eval()

text_encoder = model.text_encoder
example_input = tokenizer("a photo of a cat", return_tensors="pt")
traced_model = torch.jit.trace(text_encoder, example_input)

  if seq_len != self.num_embeddings:


In [4]:
example_input.shape

torch.Size([1, 77])

In [18]:
# https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s0.json
max_seq_length = 77

In [6]:

text_encoder_model = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS16,
            inputs=[ct.TensorType(name="prompt",
                                 shape=[1,max_seq_length],
                                 dtype=np.int32)],
            outputs=[ct.TensorType(name="embOutput", dtype=np.float32)],
        )
text_encoder_model.save("TextEncoder_float32.mlpackage")

Converting PyTorch Frontend ==> MIL Ops:  91%|█████████ | 365/402 [00:00<00:00, 1316.20 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 401/402 [00:00<00:00, 1172.66 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 110.35 passes/s]
Running MIL default pipeline: 100%|██████████| 78/78 [00:03<00:00, 22.69 passes/s] 
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 145.91 passes/s]


## Validate export  precision

In [22]:
# Load the model
te_ml_model = ct.models.MLModel('TextEncoder_float32.mlpackage')

# Choose a tokenizer, here we use the clip tokenizer
text = tokenizer("a photo of a cat").to(torch.int32)
text = text[:,:max_seq_length]
print("Tokenized text: ", text[0, :10])

# # Or use CLIPTokenizerFast
# text = tokenizer("a photo of a cat", return_tensors="pt", padding="max_length", max_length=max_seq_length)
# text = text.data['input_ids'].to(torch.int32)

orig_features = text_encoder(text)
predictions = te_ml_model.predict({'prompt': text})
out = traced_model(text)

Tokenized text:  tensor([49406,   320,  1125,   539,   320,  2368, 49407,     0,     0,     0],
       dtype=torch.int32)


In [24]:
print("Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n>>>", orig_features[0, :10])
print("Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n>>>", out[0, :10])
print("\nCoreML TextEncoder ckpt out for \"a photo of a cat\":\n>>>", predictions['embOutput'][0, :10])

Original PyTorch TextEncoder ckpt out for "a photo of a cat":
>>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,
         0.4056, -0.0591], grad_fn=<SliceBackward0>)
Traced PyTorch TextEncoder ckpt out for "a photo of a cat":
>>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,
         0.4056, -0.0591], grad_fn=<SliceBackward0>)

CoreML TextEncoder ckpt out for "a photo of a cat":
>>> [ 0.10631     0.388583    0.24500522  0.29059237  0.3471204  -0.0872687
  0.024912   -0.10095407  0.4052309  -0.05918849]


You can see that there is some loss in precision, but it is still acceptable.

# 2. Export ImageEncoder

In [9]:
image_encoder = model.image_encoder

img = Image.open("./sample_images/IMG_4085.jpeg")
example_input = torch.tensor(preprocess(img))
#reshape to 1,3,256,256
example_input = example_input.unsqueeze(0)
print(example_input.shape)
traced_model = torch.jit.trace(image_encoder, example_input)

  example_input = torch.tensor(preprocess(img))


torch.Size([1, 3, 256, 256])


In [10]:
example_output = image_encoder(example_input)
print("Original PyTorch ImageEncoder ckpt out for jpg:\n>>>", example_output[0, :10])

Original PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([-0.0295, -0.0015,  0.0392, -0.0413,  0.0045, -0.0126,  0.0253, -0.0135,
         0.0118,  0.0866], grad_fn=<SliceBackward0>)


In [11]:
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
image_mean = IMAGENET_DEFAULT_MEAN
image_std = IMAGENET_DEFAULT_STD

In [12]:
import torchvision.transforms as transforms

class Wrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        _means = IMAGENET_DEFAULT_MEAN
        _stds = IMAGENET_DEFAULT_STD
        self.stds = torch.tensor(_stds).half()[:,None,None]
        self.means = torch.tensor(_means).half()[:,None,None]

    transform_model = torch.nn.Sequential(
        transforms.Normalize(mean=image_mean,
                             std=image_std)
                             )

    def forward(self, input):        
        input = input/255.0
        intput = self.transform_model(input)
        output = self.model(input)        
        return output

# Instantiate the Wrapper model passing the original PyTorch FCN model
wrapped_model = Wrapper(traced_model)

In [13]:
i = np.asarray(img.resize((256, 256)))
i = i.astype("float32")
i = np.transpose(i, (2, 0, 1))
i = np.expand_dims(i, 0)
i = torch.from_numpy(i)

with torch.no_grad():
    out = wrapped_model(i)

print("wrapped PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0, :10])

traced_model = torch.jit.trace(wrapped_model, i)

with torch.no_grad():
    out = traced_model(i)

print("Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0, :10])

wrapped PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([-0.0234, -0.0132,  0.0335, -0.0267,  0.0033, -0.0109,  0.0201, -0.0244,
         0.0172,  0.0927])
Traced wrapped PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([-0.0234, -0.0132,  0.0335, -0.0267,  0.0033, -0.0109,  0.0201, -0.0244,
         0.0172,  0.0927])


In [14]:
image_input = ct.ImageType(name="colorImage", shape=i.shape)
image_encoder_model = ct.converters.convert(
    traced_model,
    convert_to="mlprogram",
    inputs=[image_input],
    outputs=[ct.TensorType(name="embOutput", dtype=np.float32)],
    minimum_deployment_target=ct.target.iOS16,
)
image_encoder_model.save("ImageEncoder_float32.mlpackage")

Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 723/724 [00:00<00:00, 2459.07 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 45.32 passes/s]
Running MIL default pipeline: 100%|██████████| 78/78 [00:03<00:00, 21.20 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 52.74 passes/s]


## Validate export

In [15]:
import torchvision.transforms as transforms

ie_ml_model = ct.models.MLModel('ImageEncoder_float32.mlpackage')
imgPIL = Image.open("./sample_images/IMG_4085.jpeg")
imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)

img_np = np.asarray(imgPIL).astype(np.float32) # (256, 256, 3)
img_np = img_np[np.newaxis, :, :, :] # (1, 256, 256, 3)
img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 256, 256)
torch_tensor_input = torch.from_numpy(img_np)

predictions = ie_ml_model.predict({'colorImage': imgPIL})
out = wrapped_model(torch_tensor_input)
print("Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0, :10])
print("\nCoreML ImageEncoder ckpt out for jpg:\n>>>", predictions['embOutput'][0, :10])

  imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)


Traced wrapped PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([-0.0234, -0.0132,  0.0335, -0.0267,  0.0033, -0.0109,  0.0201, -0.0244,
         0.0172,  0.0927], grad_fn=<SliceBackward0>)

CoreML ImageEncoder ckpt out for jpg:
>>> [-0.02342224 -0.01332092  0.03356934 -0.02656555  0.00331879 -0.01082611
  0.01998901 -0.02452087  0.01733398  0.09289551]


In [16]:
import os
import pickle

path = r"./sample_images"
# this list holds all the image filename
images = []

def image_resize(image):
    image = image.resize((256, 256), Image.BICUBIC)
    return image

# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.jpeg'):
          # adds only the image files to the flowers list
            images.append(file.name)

def extract_features(path, images):
    num_images = len(images)
    images_features = []
    counter = 0
    for i in range(0, num_images):
        images_preprocess = image_resize(Image.open(os.path.join(path,images[i])).convert("RGB"))        
        print(i)
        cur_features = ie_ml_model.predict({'colorImage': images_preprocess})
        cur_features = torch.tensor(cur_features['embOutput']).float().to(device)
        cur_features /= cur_features.norm(dim=-1, keepdim=True)
        images_features.append(cur_features)

    images_features = torch.cat(images_features)
    print("Features shape {}".format(images_features.shape))
    return images_features.cpu().numpy()
   
data = {}
p = r"./ml_mobileclip_s0_features.pkl"

# check if the pickled file exists
if os.path.exists(p):
    with open(p,'rb') as file:
        data = pickle.load(file)
else:
    print("Extracting features")
    images_features = extract_features(path, images)
    for i in range(len(images_features)):
        data[images[i]] = images_features[i]

    with open(p,'wb') as file:
        pickle.dump(data,file)
          
 
# get a list of the filenames
filenames = np.array(list(data.keys()))

# get a list of just the features
feat = np.array(list(data.values()))
feat = torch.tensor(feat).float().to(device)

# reshape so that there are n samples of 512 vectors
#feat = feat.reshape(-1,512)

print(f"There are {len(filenames)} images in the dataset, each has a feature of shape {feat[0].shape}")

text_input = ["a photo of a dog", "a dog", "dogs"]
#text = tokenizer("a photo of a cat").to(torch.int32)
texts_input_tokenized = tokenizer(text_input).to(torch.int32)
texts_input_tokenized = texts_input_tokenized[:,:max_seq_length]

for i in range(len(text_input)):
    text_input_tokenized = [texts_input_tokenized[i]]
    text_features = te_ml_model.predict({'prompt': text_input_tokenized})
    text_features = torch.tensor(text_features['embOutput']).float().to(device)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # calculate the similarity between the text features and the image features
    similarity = (100.0 * text_features @ feat.T).softmax(dim=-1)
    print("\n")
    print(f"Text: {text_input[i]}")
    values, indices = similarity[0].topk(5)
    print("Most similar images:")
    for value, index in zip(values, indices):
        print(f"{filenames[index]:<40} {100 * value.item():.2f}%")    


There are 27 images in the dataset, each has a feature of shape torch.Size([512])


Text: a photo of a dog
Most similar images:
IMG_4061.jpeg                            50.45%
IMG_2134.jpeg                            45.32%
21-09-07_1153.jpeg                       3.20%
IMG_0519.jpeg                            1.01%
IMG_2732.jpeg                            0.01%


Text: a dog
Most similar images:
IMG_2134.jpeg                            85.73%
IMG_4061.jpeg                            12.42%
21-09-07_1153.jpeg                       1.19%
IMG_0519.jpeg                            0.65%
IMG_2732.jpeg                            0.00%


Text: dogs
Most similar images:
IMG_0519.jpeg                            79.85%
IMG_2134.jpeg                            16.58%
IMG_4061.jpeg                            3.17%
21-09-07_1153.jpeg                       0.20%
IMG_6172.jpeg                            0.12%
