In [1]:
import numpy as np
import torch
from pkg_resources import packaging
import tome
import clip

In [2]:
# Load the pretrained CLIP ViT-B/16 checkpoint; `preprocess` is the matching image transform.
model, preprocess = clip.load("ViT-B/16")

# Inference mode. Move to the GPU only when one is actually present: an unconditional
# .cuda() raises on CPU-only machines, although the benchmark settings allow device="cpu".
model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# numel() plus the builtin sum counts parameters with plain Python ints,
# without building an intermediate list or going through NumPy.
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 149,620,737
Input resolution: 224
Context length: 77
Vocab size: 49408


In [3]:
# Display the image preprocessing pipeline that clip.load returned alongside the model.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x000001903DAD7430>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [4]:
# Benchmark settings.
# `device` is the device the benchmark runs on. Without a GPU, "cpu" works too,
# but you will probably want a smaller `runs` value in that case.
batch_size = 256  # reduce this if the device does not have enough memory
runs = 50
device = "cuda:0"

In [5]:
# Baseline measurement: benchmark model.visual before the ToMe patch is applied.
# The returned throughput is kept for the final comparison.
baseline_throughput = tome.utils.benchmark(
    model.visual, batch_size=batch_size, runs=runs, device=device, verbose=True
)

Benchmarking: 100%|██████████| 50/50 [00:28<00:00,  1.78it/s]


Throughput: 463.93 im/s


In [6]:
# Apply the ToMe patch to model.visual. The return value is not used and the next
# benchmark runs on the same model.visual object, so the patch takes effect in place.
# NOTE(review): re-running the baseline cell after this one would presumably measure the
# patched model; restart the kernel before re-measuring the baseline.
tome.patch.vittome(model.visual, reduction=16)

In [7]:
# Repeat the benchmark with the same settings as the baseline run, now that
# model.visual has been patched with reduction=16.
tome_throughput_r16 = tome.utils.benchmark(
    model.visual, batch_size=batch_size, runs=runs, device=device, verbose=True
)

  dst = dst.scatter_reduce(-2, dst_idx.expand(n, r, c), src, reduce=mode)
Benchmarking: 100%|██████████| 50/50 [00:15<00:00,  3.30it/s]

Throughput: 825.50 im/s





In [8]:
# Ratio of patched to baseline throughput; a value above 1 means the patched model is faster.
print(
    f"Throughput improvement: {tome_throughput_r16 / baseline_throughput:.2f}x"
)

Throughput improvement: 1.78x
