### **Install Packages & Imports**

In [None]:
!pip install sentencepiece==0.2.0
!pip install quanto==0.0.11

In [2]:
from transformers import CLIPTokenizer, CLIPProcessor, CLIPModel
import torch
from PIL import Image
from memory_usage_helper import *

import warnings
# Ignore specific UserWarnings related to max_length in transformers
warnings.filterwarnings("ignore",
    message=".*Using the model-agnostic default `max_length`.*")

### **Model without Quantization**

In [3]:
# Hugging Face Hub id of the CLIP checkpoint used throughout this notebook.
model_name = "openai/clip-vit-large-patch14"
# Load the pretrained CLIP model for that checkpoint; this is the unquantized
# baseline that is measured first and quantized later.
model = CLIPModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [4]:
# Helper used below to measure module sizes. `MemoryUsageHelper` is presumably
# provided by the `memory_usage_helper` star import — verify against that module.
helper = MemoryUsageHelper()

In [5]:
# Baseline measurement taken before quantization. The '' entry of the result is
# later reported as the whole-model size (presumably in bytes — TODO confirm).
original_module_sizes = helper.compute_module_sizes(model)

In [6]:
# Load the preprocessor from the same checkpoint as the model. Reusing
# `model_name` (instead of repeating the literal id) keeps the model and its
# processor in sync if the checkpoint is ever changed.
processor = CLIPProcessor.from_pretrained(model_name)

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [7]:
text = "a cat sitting on the beach"
image_path = "catOnTheBeach.jpg"

# Open the image in a context manager so the underlying file handle is
# released as soon as the processor has turned it into tensors.
with Image.open(image_path) as image:
    inputs = processor(text=text, images=image, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built or kept
# alive by `output` (this notebook is about memory usage, not training).
with torch.no_grad():
    output = model(**inputs)

In [8]:
# Keep the unquantized model's embeddings so they can be compared with the
# quantized model's embeddings later on.
original_model_text_embeds, original_model_image_embeds = (
    output["text_embeds"],
    output["image_embeds"],
)

## **Quantize the model to int8**

In [9]:
from quanto import quantize, freeze

In [10]:
# Quantize the model's weights to int8 and leave activations unquantized
# (activations=None). The call is made for its effect on `model`; its return
# value is not used.
quantize(model, weights=torch.int8, activations=None)

In [11]:
# Freeze the quantized model. Presumably this is the step that replaces the
# float weights with their int8 versions — verify against the quanto docs.
# The size measurement that follows is taken after this call.
freeze(model)

In [12]:
# Measure module sizes again after quantize + freeze, for comparison with the
# baseline stored in `original_module_sizes`.
quantized_module_sizes = helper.compute_module_sizes(model)

In [13]:
text = "a cat sitting on the beach"
image_path = "catOnTheBeach.jpg"

# Same text/image pair as for the unquantized model, so the two sets of
# embeddings are directly comparable. The context manager releases the image
# file handle once the processor has produced the input tensors.
with Image.open(image_path) as image:
    inputs = processor(text=text, images=image, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built or kept
# alive by `output`.
with torch.no_grad():
    output = model(**inputs)

In [14]:
# Keep the quantized model's embeddings for the comparison against the
# embeddings of the original model.
quantize_model_text_embeds, quantize_model_image_embeds = (
    output["text_embeds"],
    output["image_embeds"],
)

In [15]:
# NOTE(review): this repeats the measurement already taken in the cell right
# after `freeze(model)`; only a forward pass happens in between, so this
# recomputation looks redundant — confirm before removing.
quantized_module_sizes = helper.compute_module_sizes(model)

## **Compare Results**

**Memory Usage:**

In [16]:
# The '' entry is read as the whole-model total (presumably in bytes — TODO
# confirm). Convert to decimal gigabytes and format to three decimals so the
# report is not cluttered by floating-point noise such as 0.5467907240000001.
original_size_gb = original_module_sizes[''] / 1e9
quantized_size_gb = quantized_module_sizes[''] / 1e9
print(f"The original model size is {original_size_gb:.3f} GB")
print(f"The quantized model size is {quantized_size_gb:.3f} GB")

The original model size is 1.710468724 GB
The quantized model size is 0.5467907240000001 GB


**Performance (cosine similarity between the original and quantized embeddings):**

In [19]:
import torch.nn.functional as F

def compare(t1, t2, embeds_kind):
  """Print and return the cosine similarity between two embedding tensors.

  Both tensors are flattened into a single row vector before the comparison,
  so they must contain the same number of elements.

  Args:
    t1: First tensor.
    t2: Second tensor.
    embeds_kind: Label inserted into the printed message (e.g. 'text').

  Returns:
    The cosine similarity as an unrounded Python float. The printed value is
    rounded to 6 decimal places.
  """
  # reshape (rather than view) also accepts non-contiguous tensors.
  flat1 = t1.reshape(1, -1)
  flat2 = t2.reshape(1, -1)
  similarity = F.cosine_similarity(flat1, flat2).item()
  print(f"Cosine Similarity for {embeds_kind}:", round(similarity, 6))
  # Return the value so callers that assign the result get a number, not None.
  return similarity

# Compare the embeddings produced before and after quantization: first the
# text embeddings, then the image embeddings. Each call prints its similarity.
text_embeds_similarity = compare(original_model_text_embeds, quantize_model_text_embeds, 'text')
image_embeds_similarity = compare(original_model_image_embeds, quantize_model_image_embeds, 'image')

Cosine Similarity for text: 0.999907
Cosine Similarity for image: 0.999899
