LiVeAction f16c48
---
---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np, time
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from livecodec.codec import AutoCodecND, latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor, resize

dataset = datasets.load_dataset("danjacobellis/kodak")
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/liveaction",
    filename="lsdir_f16c48.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
config = checkpoint['config']
codec = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = 4,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
)
codec.load_state_dict(checkpoint['state_dict'])
codec.eval();
print(sum( p.numel() for p in codec.encoder_blocks.parameters())/1e3)

493.056


In [2]:
def evaluate_throughput(sample, device='cpu', dtype=torch.float):
    codec.to(device).to(dtype)
    img = sample['image']
    img = img.resize((int(2.5*img.size[0]),int(2.5*img.size[1]))) # 1080p
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(dtype) / 127.5 - 1.0
    orig_size = tuple(x_orig.shape[-2:])
    orig_dim = x_orig.numel() 
    
    # analysis transform
    t0 = time.time()
    with torch.no_grad():
        z = codec.encode(x_orig)
        latent = codec.quantize.compand(z).round()
    analysis_time = time.time() - t0
    
    # entropy coding
    t0 = time.time()
    webp = latent_to_pil(latent.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    entropy_code_time = time.time() - t0
    
    # entropy decoding
    t0 = time.time()
    webp = [PIL.Image.open(buff)]
    latent_decoded = pil_to_latent(webp, N=config.latent_dim, n_bits=8, C=3).to(device).to(dtype)
    entropy_decode_time = time.time() - t0
    
    # synthesis transform
    t0 = time.time()
    with torch.no_grad():
        x_hat = codec.decode(latent_decoded).clamp(-1,1)
    synthesis_time = time.time() - t0

    return {
        'analysis_time': analysis_time,
        'entropy_code_time': entropy_code_time,
        'entropy_decode_time': entropy_decode_time,
        'synthesis_time': synthesis_time,
    }

In [3]:
for (device,dtype) in [('cuda',torch.bfloat16),('cpu',torch.float)]:
    results_dataset = dataset['validation'].map(lambda s: evaluate_throughput(s,device=device, dtype=dtype))
    print("mean\n---")
    for metric in [
        'analysis_time',
        'entropy_code_time',
        'entropy_decode_time',
        'synthesis_time',
    ]:
        μ = 1920*1080e-6/np.mean(results_dataset[metric])
        print(μ)
    print(1920*1080e-6/np.mean(np.array(results_dataset['analysis_time'])+np.array(results_dataset['entropy_code_time'])))
    print(1920*1080e-6/np.mean(np.array(results_dataset['entropy_decode_time'])+np.array(results_dataset['synthesis_time'])))

mean
---
166.79071482954996
43.731614014788285
754.5735253088093
205.6265428837407
34.647285170257454
161.59168333836527


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

mean
---
10.539825783846718
35.94067859807265
762.0611611359977
0.7378348391044605
8.149836065984434
0.7371211514191478


Cosmos di16
---
---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np, time
from huggingface_hub import snapshot_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor
from cosmos_tokenizer.image_lib import ImageTokenizer

dataset = datasets.load_dataset("danjacobellis/kodak")
model_path = snapshot_download(repo_id='nvidia/Cosmos-Tokenizer-DI16x16')
encoder = ImageTokenizer(checkpoint_enc=f'{model_path}/encoder.jit')
decoder = ImageTokenizer(checkpoint_dec=f'{model_path}/decoder.jit')

lpips_loss = LPIPS()
dists_loss = DISTS()
ssim_loss = SSIMLoss()
print(sum( p.numel() for p in encoder.parameters())/1e6)
print(sum( p.numel() for p in decoder.parameters())/1e6)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]



34.621062
49.102896


In [2]:
def evaluate_throughput(sample, device='cuda', dtype=torch.bfloat16):
    encoder.to(device)
    decoder.to(device)
    img = sample['image']
    img = img.resize((int(2.5*img.size[0]),int(2.5*img.size[1]))) # 1080p
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(dtype) / 127.5 - 1.0
    orig_size = tuple(x_orig.shape[-2:])
    orig_dim = x_orig.numel() 
    
    # analysis transform
    t0 = time.time()
    with torch.no_grad():
        z = encoder.encode(x_orig)[0]
    analysis_time = time.time() - t0
    
    # entropy coding
    t0 = time.time()
    torch.save(z,'tmp.pth')
    entropy_code_time = time.time() - t0
    
    # entropy decoding
    t0 = time.time()
    z = torch.load('tmp.pth')
    entropy_decode_time = time.time() - t0
    
    # synthesis transform
    t0 = time.time()
    with torch.no_grad():
        x_hat = decoder.decode(z).to(torch.float).clamp(-1,1)
    synthesis_time = time.time() - t0

    return {
        'analysis_time': analysis_time,
        'entropy_code_time': entropy_code_time,
        'entropy_decode_time': entropy_decode_time,
        'synthesis_time': synthesis_time,
    }

In [3]:
for (device,dtype) in [('cuda',torch.bfloat16)]:
    results_dataset = dataset['validation'].map(lambda s: evaluate_throughput(s,device=device, dtype=dtype))
    print("mean\n---")
    for metric in [
        'analysis_time',
        'entropy_code_time',
        'entropy_decode_time',
        'synthesis_time',
    ]:
        μ = 1920*1080e-6/np.mean(results_dataset[metric])
        print(μ)
    print(1920*1080e-6/np.mean(np.array(results_dataset['analysis_time'])+np.array(results_dataset['entropy_code_time'])))
    print(1920*1080e-6/np.mean(np.array(results_dataset['entropy_decode_time'])+np.array(results_dataset['synthesis_time'])))



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

mean
---
20.852846693501366
5862.201549852557
6836.163312556494
20.055054187077012
20.778932500989743
19.996391353291333


WaLLoC f8c48
---
---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np, json, time
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from walloc import walloc
from walloc.walloc import latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

dataset = datasets.load_dataset("danjacobellis/kodak")

config_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_4x.json"
)
codec_config = SimpleNamespace(**json.load(open(config_file)))
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_4x.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
codec = walloc.Codec2D(
    channels = codec_config.channels,
    J = codec_config.J,
    Ne = codec_config.Ne,
    Nd = codec_config.Nd,
    latent_dim = codec_config.latent_dim,
    latent_bits = codec_config.latent_bits,
    lightweight_encode = codec_config.lightweight_encode
)
codec.load_state_dict(checkpoint['model_state_dict'])
codec = codec
codec.eval();
print(sum(p.numel() for p in codec.encoder.parameters()))
print(sum(p.numel() for p in codec.decoder.parameters())/1e6)

  from pkg_resources import resource_stream


9264
57.13248


In [2]:
def evaluate_throughput(sample, device='cuda', dtype=torch.bfloat16):
    codec.to(device).to(dtype)
    img = sample['image']
    img = img.resize((int(2.5*img.size[0]),int(2.5*img.size[1]))) # 1080p
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(dtype) / 127.5 - 1.0
    orig_size = tuple(x_orig.shape[-2:])
    orig_dim = x_orig.numel() 
    
    # analysis transform
    t0 = time.time()
    with torch.no_grad():
        z = codec.encoder(codec.wavelet_analysis(x_orig,J=codec.J))
    analysis_time = time.time() - t0
    
    # entropy coding
    t0 = time.time()
    webp = latent_to_pil(z.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    entropy_code_time = time.time() - t0
    
    # entropy decoding
    t0 = time.time()
    latent_decoded = pil_to_latent(webp, N=codec_config.latent_dim, n_bits=8, C=3).to(device).to(dtype)
    entropy_decode_time = time.time() - t0
    
    # synthesis transform
    t0 = time.time()
    with torch.no_grad():
        x_hat = codec.wavelet_synthesis(codec.decoder(z),J=codec.J).clamp(-0.5,0.5)
    synthesis_time = time.time() - t0

    return {
        'analysis_time': analysis_time,
        'entropy_code_time': entropy_code_time,
        'entropy_decode_time': entropy_decode_time,
        'synthesis_time': synthesis_time,
    }

In [3]:
for (device,dtype) in [('cuda',torch.bfloat16),('cpu',torch.float)]:
    results_dataset = dataset['validation'].map(lambda s: evaluate_throughput(s,device=device, dtype=dtype))
    print("mean\n---")
    for metric in [
        'analysis_time',
        'entropy_code_time',
        'entropy_decode_time',
        'synthesis_time',
    ]:
        μ = 1920*1080e-6/np.mean(results_dataset[metric])
        print(μ)
    print(1920*1080e-6/np.mean(np.array(results_dataset['analysis_time'])+np.array(results_dataset['entropy_code_time'])))
    print(1920*1080e-6/np.mean(np.array(results_dataset['entropy_decode_time'])+np.array(results_dataset['synthesis_time'])))

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

mean
---
302.3485769946986
10.739115837889393
1241.7631031410622
226.31119024218887
10.370757031012644
191.42415825772122


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

mean
---
21.77318738825532
11.061159136061766
1876.8638275916019
0.30510071287792034
7.334901287662958
0.30505112413628127
