In [1]:
import os
import json
import time
from tqdm import tqdm
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import pandas as pd
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch.quantization
import torch.ao.quantization as tq
import torch.nn.functional as F
import inspect
import types
from types import FunctionType
from functools import lru_cache
import accelerate
import gc

import jiwer
from jiwer import (
    Compose,
    ToLowerCase,
    RemoveMultipleSpaces,
    Strip,
)

# Quantization
В этом ноутбуке тестируется post-training квантизация как один из простейших методов, не требующих пересборки архитектуры модели. Ожидается, что квантизация даст прирост по скорости в простых операциях матричного умножения и в аттеншн блоках модели.

Все значения профилировщика были сняты в прогоне на чистовую вне изолированной среды, поэтому значения могут отличаться, но статистически сводятся к итоговым метрикам.

In [2]:
def print_size_of_model(model):
    """ Prints the real size of the model """
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

In [3]:
def asr_metrics(hypothesis: str, reference: str):
    tr = Compose([ToLowerCase(), RemoveMultipleSpaces(), Strip()])

    ref_tr = tr(reference)
    hyp_tr = tr(hypothesis)

    out = jiwer.process_words(ref_tr, hyp_tr)
    wer = out.wer
    # S, D, I = out.substitutions, out.deletions, out.insertions

    cer = jiwer.cer(ref_tr, hyp_tr) # ?????

    return {
        "wer": wer,
        "cer": cer,
    }

In [4]:
def profile_sample(sample_idx=0, trace_path="whisper_perfetto_large-v3.json", sort_by="cpu_time_total", model=None):
    example = dataset[sample_idx]
    audio_array = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features

    with profile(
        activities=[ProfilerActivity.CPU],
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        with record_function("whisper.generate"):
            predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids)

    prof.export_chrome_trace(trace_path)
    print(f"Perfetto trace saved to {trace_path}")
    print(prof.key_averages().table(
        sort_by=sort_by,
        row_limit=10
    ))
    return processor.decode(predicted_ids[0])

In [5]:
dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split="validation") #, split="test")
# dataset = dataset.select(range(100))

In [6]:
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")
print_size_of_model(model)

Size (MB): 6174.372281


In [7]:
# Посмотрим на архитектуру модели
print(model)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

# CPU

## Бесплатное ускорение с torch.compile

In [8]:
model = torch.compile(model)

In [16]:
_ = profile_sample(116, trace_path="whisper_perfetto_large-v3_compiled.json", model=model)

Perfetto trace saved to whisper_perfetto_large-v3_compiled.json
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     whisper.generate         3.81%     326.723ms       100.00%        8.569s        8.569s         112 B      -6.85 GB             1  
                                         aten::linear         0.59%      50.416ms        72.62%        6.222s       1.514ms       2.55 GB           0 B          4111  
                                          aten::addmm        59.14%        5.068s        60.90% 

Видим ускорение. Однако, это может быть лаг неизолированной среды. Кроме того, эффективность torch.compile сильно зависит от warmup И длины последовательности. Сделаем тестовый прогон, чтобы увидеть динамику среднего времени выполнения.

In [45]:
def run_model(verbose=False, model=None, dataset=None):
    results = []
    i = 0
    for audio in tqdm(dataset):
        audio_array = audio["audio"]["array"]
        sampling_rate = audio["audio"]["sampling_rate"]
        reference = audio["transcription"]
    
        start_time = time.time()
        input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features 
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0] #Уточнить в зависимости от выбранной модели
        hypothesis = processor.decode(predicted_ids)
        run_time = time.time() - start_time
        metrics = asr_metrics(hypothesis, reference)
        metrics["run_time_sec"] = run_time
        if verbose:
            if i % 50 == 0:
                print("referenct:")
                print(reference)
                print("hypothesis:")
                print(hypothesis)
            i += 1
        results.append(metrics)

    df_results = pd.DataFrame(results)
    
    summary = {
        "total_samples": len(df_results),
        "avg_wer": df_results["wer"].mean(),
        "avg_cer": df_results["cer"].mean(),
        "avg_time_per_audio": df_results["run_time_sec"].mean(),
        "total_time": df_results["run_time_sec"].sum(),
    }
    
    print("large-v3")
    print(json.dumps(summary, ensure_ascii=True, indent=2))
    return summary

In [15]:
dataset_small = dataset.select(range(50))
_ = run_model(model=model, dataset=dataset_small)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [06:50<00:00,  8.21s/it]

large-v3
{
  "total_samples": 50,
  "avg_wer": 0.3984776334776335,
  "avg_cer": 0.14465731826243972,
  "avg_time_per_audio": 8.201239647865295,
  "total_time": 410.06198239326477
}





In [17]:
del model, dataset_small, _
gc.collect()

1302245

Это уже статистически значимый результат. Сбросим получившиеся ускорения и начнём квантизацию. Компиляцию применим после.

## PTQ Static
Это неудачный эксперимент со статической квантизацией. Пояснения по шагам.

In [19]:
qmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
print_size_of_model(qmodel)

Size (MB): 6174.372281


Было обнаружено, что подход "в лоб" не проходит из-за перенаправления вызова Conv1d внутри generate на CPU бэкэнд вместо QuantCPU совместимого, поэтому производится загрузка бэкэнда и работа только по Linear слоями

In [20]:
torch.backends.quantized.engine = "fbgemm"
qengine = torch.backends.quantized.engine
qmodel.eval()

def set_qconfig_for_linears(module):
    if isinstance(module, (torch.nn.Linear)):
        module.qconfig = tq.get_default_qconfig(qengine)

qmodel.apply(set_qconfig_for_linears)

print(qmodel.model.encoder.layers[0])

tq.prepare(qmodel, inplace=True)

WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
    (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): Linear(in_features=1280, out_features=5120, bias=True)
  (fc2): Linear(in_features=5120, out_features=1280, bias=True)
  (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.prepare(qmodel, inplace=True)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(
              in_features=1280, out_features=1280, bias=False
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (v_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (q_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (out_proj)

In [21]:
print("current engine:", torch.backends.quantized.engine)

current engine: fbgemm


Prepare -> Calibrate -> Convert

In [23]:
# Калибровка
qmodel.to("cpu")

dataset_smol = dataset.select(range(10))
with torch.no_grad():
    for i, audio in enumerate(tqdm(dataset_smol)):
        audio_array = audio["audio"]["array"]
        sampling_rate = audio["audio"]["sampling_rate"]
        inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
        _ = qmodel.generate(inputs.input_features)


  0%|                                                                                                                                                                                                                        | 0/10 [00:00<?, ?it/s]Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:56<00:00, 17.70s/it]


In [24]:
tq.convert(qmodel, inplace=True)
print(qmodel.model.encoder.layers[0])
print_size_of_model(qmodel)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.convert(qmodel, inplace=True)


WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.0764756053686142, zero_point=63, qscheme=torch.per_channel_affine)
    (v_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.04671403765678406, zero_point=69, qscheme=torch.per_channel_affine)
    (q_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.07780876010656357, zero_point=61, qscheme=torch.per_channel_affine)
    (out_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.032819852232933044, zero_point=77, qscheme=torch.per_channel_affine)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): QuantizedLinear(in_features=1280, out_features=5120, scale=0.10480756312608719, zero_point=49, qscheme=torch.per_channel_affine)
  (fc2): QuantizedLinear(in_features=5120, out_features=1280, scale=0.029906874522566795, zero_point=65, qscheme=

In [25]:
1852.556293/6174.372281

0.3000396167721783

### Замеры качества

In [26]:
results = run_model(model=qmodel, dataset=dataset)

  0%|                                                                                                                                                                                                                       | 0/793 [00:00<?, ?it/s]


NotImplementedError: Could not run 'quantized::linear' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::linear' is only available for these backends: [Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMTIA, AutogradMAIA, AutogradMeta, Tracer, AutocastCPU, AutocastMTIA, AutocastMAIA, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at /pytorch/aten/src/ATen/core/MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at /pytorch/aten/src/ATen/native/quantized/cpu/qlinear.cpp:1603 [kernel]
QuantizedCUDA: registered at /pytorch/aten/src/ATen/native/quantized/cudnn/Linear.cpp:359 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:479 [backend fallback]
Functionalize: registered at /pytorch/aten/src/ATen/FunctionalizeFallbackKernel.cpp:387 [backend fallback]
Named: registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /pytorch/aten/src/ATen/ZeroTensorFallback.cpp:115 [backend fallback]
ADInplaceOrView: fallthrough registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:104 [backend fallback]
AutogradOther: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:87 [backend fallback]
AutogradMPS: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:95 [backend fallback]
AutogradXPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:108 [backend fallback]
AutogradLazy: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:91 [backend fallback]
AutogradMTIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMAIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMeta: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:99 [backend fallback]
Tracer: registered at /pytorch/torch/csrc/autograd/TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
AutocastMTIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:468 [backend fallback]
AutocastMAIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:506 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:544 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:210 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:475 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


 ### Почему происходит ошибка
Во время вызова qmodel.generate(), когда Whisper создаёт временные тензоры в обычном CPU-бэкэнде, не переключённом в QuantizedCPU, создаётся обращение к слою LayerNorm.
То есть слои QuantizedLayerNorm инициализированы правильно, но во время инференса их вызов идёт через AutogradCPU или Functionalize, а не через QuantizedCPU. При этом, переключение бэкэндом происходит автоматически и поэтому инференс, заданный таким образом, требует полной переработки.

Костыль номер 1 - инъекция, заменяющая работу forward

In [27]:
import inspect
import types
from transformers.models.whisper.modeling_whisper import (
    WhisperAttention,
    WhisperEncoderLayer,
    WhisperDecoderLayer,
)

def patch_whisper_for_quant(model):
    """
    Патчит только WhisperAttention / WhisperEncoderLayer / WhisperDecoderLayer.
    Именно там происходят вызовы F.linear и F.layer_norm.
    """

    target_classes = (WhisperAttention, WhisperEncoderLayer, WhisperDecoderLayer)

    for name, module in model.named_modules():
        if not isinstance(module, target_classes):
            continue

        if not hasattr(module, "forward"):
            continue

        try:
            src = inspect.getsource(module.forward)
        except:
            # Например, если source недоступен
            continue

        if "F.linear" not in src and "F.layer_norm" not in src:
            continue

        print(f"[patching] {name}")

        patched = src
        patched = patched.replace("F.linear", "self._patched_linear")
        patched = patched.replace("F.layer_norm", "self._patched_layer_norm")

        wrapper = f"""
import torch
import torch.nn.functional as F

def _patched_linear(self, x, weight, bias=None):
    # input quantized?
    if hasattr(x, 'is_quantized') and x.is_quantized:
        try:
            return torch.ops.quantized.linear(x, weight, bias)
        except Exception:
            return F.linear(x.dequantize(), weight, bias)
    return F.linear(x, weight, bias)

def _patched_layer_norm(self, x, normalized_shape, weight=None, bias=None, eps=1e-5):
    if hasattr(x, 'is_quantized') and x.is_quantized:
        try:
            return torch.ops.quantized.layer_norm(x, normalized_shape, weight, bias, eps)
        except Exception:
            return F.layer_norm(x.dequantize(), normalized_shape, weight, bias, eps)
    return F.layer_norm(x, normalized_shape, weight, bias, eps)
"""

        full_code = wrapper + "\n" + patched

        ctx = {
            "F": F,
            "torch": torch,
        }

        exec(full_code, ctx)
        module.forward = types.MethodType(ctx["forward"], module)

    print("✅ Whisper F.linear / F.layer_norm patched.")


In [28]:
# полностью перегрузим модель

qmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
print_size_of_model(qmodel)

patch_whisper_for_quant(qmodel)

Size (MB): 6174.372281
✅ Whisper F.linear / F.layer_norm patched.


In [29]:
# Переквантуем

torch.backends.quantized.engine = "fbgemm"
qengine = torch.backends.quantized.engine
qmodel.eval()

def set_qconfig_for_linears(module):
    if isinstance(module, (torch.nn.Linear)):
        module.qconfig = tq.get_default_qconfig(qengine)

qmodel.apply(set_qconfig_for_linears)

print(qmodel.model.encoder.layers[0])

tq.prepare(qmodel, inplace=True)

WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
    (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): Linear(in_features=1280, out_features=5120, bias=True)
  (fc2): Linear(in_features=5120, out_features=1280, bias=True)
  (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.prepare(qmodel, inplace=True)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(
              in_features=1280, out_features=1280, bias=False
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (v_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (q_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (out_proj)

In [30]:
# Калибровка
qmodel.to("cpu")

dataset_smol = dataset.select(range(10))
with torch.no_grad():
    for i, audio in enumerate(tqdm(dataset_smol)):
        audio_array = audio["audio"]["array"]
        sampling_rate = audio["audio"]["sampling_rate"]
        inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
        _ = qmodel.generate(inputs.input_features)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:00<00:00, 18.01s/it]


In [31]:
tq.convert(qmodel, inplace=True)
print(qmodel.model.encoder.layers[0])
print_size_of_model(qmodel)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.convert(qmodel, inplace=True)


WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.0764756053686142, zero_point=63, qscheme=torch.per_channel_affine)
    (v_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.04671403765678406, zero_point=69, qscheme=torch.per_channel_affine)
    (q_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.07780876010656357, zero_point=61, qscheme=torch.per_channel_affine)
    (out_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.032819852232933044, zero_point=77, qscheme=torch.per_channel_affine)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): QuantizedLinear(in_features=1280, out_features=5120, scale=0.10480756312608719, zero_point=49, qscheme=torch.per_channel_affine)
  (fc2): QuantizedLinear(in_features=5120, out_features=1280, scale=0.029906874522566795, zero_point=65, qscheme=

In [32]:
_ = profile_sample(116, trace_path="whisper_perfetto_large-v3_quanted.json", model=qmodel)

NotImplementedError: Could not run 'quantized::linear' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::linear' is only available for these backends: [Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMTIA, AutogradMAIA, AutogradMeta, Tracer, AutocastCPU, AutocastMTIA, AutocastMAIA, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at /pytorch/aten/src/ATen/core/MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at /pytorch/aten/src/ATen/native/quantized/cpu/qlinear.cpp:1603 [kernel]
QuantizedCUDA: registered at /pytorch/aten/src/ATen/native/quantized/cudnn/Linear.cpp:359 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:479 [backend fallback]
Functionalize: registered at /pytorch/aten/src/ATen/FunctionalizeFallbackKernel.cpp:387 [backend fallback]
Named: registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /pytorch/aten/src/ATen/ZeroTensorFallback.cpp:115 [backend fallback]
ADInplaceOrView: fallthrough registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:104 [backend fallback]
AutogradOther: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:87 [backend fallback]
AutogradMPS: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:95 [backend fallback]
AutogradXPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:108 [backend fallback]
AutogradLazy: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:91 [backend fallback]
AutogradMTIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMAIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMeta: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:99 [backend fallback]
Tracer: registered at /pytorch/torch/csrc/autograd/TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
AutocastMTIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:468 [backend fallback]
AutocastMAIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:506 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:544 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:210 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:475 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


### Патч инъекция не сработала, попробуем другой костыль

In [33]:
qmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
print_size_of_model(qmodel)

Size (MB): 6174.372281


In [34]:
# Переквантуем

torch.backends.quantized.engine = "fbgemm"
qengine = torch.backends.quantized.engine
qmodel.eval()

def set_qconfig_for_linears(module):
    if isinstance(module, (torch.nn.Linear)):
        module.qconfig = tq.get_default_qconfig(qengine)

qmodel.apply(set_qconfig_for_linears)

print(qmodel.model.encoder.layers[0])

tq.prepare(qmodel, inplace=True)

WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
    (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
    (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): Linear(in_features=1280, out_features=5120, bias=True)
  (fc2): Linear(in_features=5120, out_features=1280, bias=True)
  (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.prepare(qmodel, inplace=True)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(
              in_features=1280, out_features=1280, bias=False
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (v_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (q_proj): Linear(
              in_features=1280, out_features=1280, bias=True
              (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
            )
            (out_proj)

In [35]:
# Калибровка
qmodel.to("cpu")

dataset_smol = dataset.select(range(10))
with torch.no_grad():
    for i, audio in enumerate(tqdm(dataset_smol)):
        audio_array = audio["audio"]["array"]
        sampling_rate = audio["audio"]["sampling_rate"]
        inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
        _ = qmodel.generate(inputs.input_features)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:58<00:00, 17.85s/it]


In [36]:
tq.convert(qmodel, inplace=True)
print(qmodel.model.encoder.layers[0])
print_size_of_model(qmodel)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  tq.convert(qmodel, inplace=True)


WhisperEncoderLayer(
  (self_attn): WhisperAttention(
    (k_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.0764756053686142, zero_point=63, qscheme=torch.per_channel_affine)
    (v_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.04671403765678406, zero_point=69, qscheme=torch.per_channel_affine)
    (q_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.07780876010656357, zero_point=61, qscheme=torch.per_channel_affine)
    (out_proj): QuantizedLinear(in_features=1280, out_features=1280, scale=0.032819852232933044, zero_point=77, qscheme=torch.per_channel_affine)
  )
  (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): QuantizedLinear(in_features=1280, out_features=5120, scale=0.10480756312608719, zero_point=49, qscheme=torch.per_channel_affine)
  (fc2): QuantizedLinear(in_features=5120, out_features=1280, scale=0.029906874522566795, zero_point=65, qscheme=

Сам костыль - мы хукаем F вызовы и заменяем на квантованные обработчики

In [37]:
# Сохраним оригиналы
_orig_F_linear = F.linear
_orig_F_layer_norm = F.layer_norm

# Вспомогательный контейнер для состояния
_safe_quant_state = {
    "installed": False,
    "model": None,
    "orig_linear": _orig_F_linear,
    "orig_layer_norm": _orig_F_layer_norm,
}

def _find_quantized_linear_for_weight(model, weight_tensor):
    """
    Ищем в model QuantizedLinear модуль, чей (распакованный/де-квантованный) вес
    совпадает по форме с weight_tensor (best-effort).
    """
    if model is None:
        return None

    wt_shape = tuple(weight_tensor.shape) if hasattr(weight_tensor, "shape") else None

    for name, m in model.named_modules():
        # Поддерживаем разные названия классов в разных версиях PyTorch
        cls_name = type(m).__name__.lower()
        if "quantizedlinear" in cls_name or "quantizedlinear" in cls_name.replace("_", "") or "quantizedlinear" in cls_name.replace(".", ""):
            # Попробуем получить вес в доступной форме
            try:
                w = getattr(m, "weight", None)
                if isinstance(w, torch.Tensor):
                    if tuple(w.shape) == wt_shape:
                        return m
                else:
                    # У quantized modules weight() часто метод
                    wcall = getattr(m, "weight", None)
                    if callable(wcall):
                        ww = wcall()
                        if isinstance(ww, torch.Tensor) and tuple(ww.shape) == wt_shape:
                            return m
            except Exception:
                # безопасно игнорируем ошибки доступа
                continue
    return None

def _find_quantized_layernorm_for_shape(model, normalized_shape):
    if model is None:
        return None
    target = tuple(normalized_shape) if hasattr(normalized_shape, "__iter__") else (int(normalized_shape),)
    for name, m in model.named_modules():
        cls_name = type(m).__name__.lower()
        if "quantizedlayernorm" in cls_name or "quantizedlayernorm" in cls_name.replace("_", ""):
            try:
                # пытаемся прочитать normalized_shape атрибут
                ns = getattr(m, "normalized_shape", None)
                if ns is None:
                    ns = getattr(m, "normalized_shape_", None)
                if ns is None and hasattr(m, "_packed_params"):
                    # некоторые реализации хранят параметры иначе — пытаемся weight().shape
                    w = getattr(m, "weight", None)
                    if callable(w):
                        w = w()
                    if isinstance(w, torch.Tensor):
                        ns = (w.shape[-1],)
                if ns is not None and tuple(ns) == tuple(target):
                    return m
            except Exception:
                continue
    return None

@lru_cache(maxsize=1024)
def _cached_find_quant_linear(model_id, wt_shape):
    # model_id — id(model) to avoid capturing model in cache key incorrectly
    # wt_shape — tuple
    m = _safe_quant_state.get("model", None)
    if m is None:
        return None
    # Проходим по named_modules — используем helper
    return _find_quantized_linear_for_weight(m, torch.empty(wt_shape))

def install_safe_quant_inference(model):
    """
    Устанавливает безопасные обёртки F.linear и F.layer_norm.
    Перед вызовом generate сделай: install_safe_quant_inference(qmodel)
    В конце можно вернуть оригинал: uninstall_safe_quant_inference()
    """
    if _safe_quant_state["installed"]:
        return
    _safe_quant_state["installed"] = True
    _safe_quant_state["model"] = model

    def _safe_linear(input, weight, bias=None):
        # если вход не квантован — обычное поведение
        if not (hasattr(input, "is_quantized") and input.is_quantized):
            return _orig_F_linear(input, weight, bias)

        # 1) Попытка вызвать нативный quantized op напрямую
        try:
            # Некоторые сборки ожидают определённый набор аргументов; пробуем наиболее общий вызов.
            return torch.ops.quantized.linear(input, weight, bias)
        except Exception:
            pass

        # 2) Поиск соответствующего QuantizedLinear в модели и вызов его forward
        try:
            # weight может быть Tensor; берем его shape
            wt_shape = tuple(weight.shape) if hasattr(weight, "shape") else None
            # Пытаться найти модуль
            qmod = _find_quantized_linear_for_weight(_safe_quant_state.get("model", None), weight)
            if qmod is not None:
                # При вызове модуля он сам разберётся с упаковкой весов
                return qmod(input)
        except Exception:
            pass

        # 3) Last resort: деквантовать input и вызвать оригинал
        try:
            deq = input.dequantize()
            return _orig_F_linear(deq, weight, bias)
        except Exception:
            # В крайнем случае — пробуем оригинал и пусть поднимет ошибку
            return _orig_F_linear(input, weight, bias)

    def _safe_layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5):
        if not (hasattr(input, "is_quantized") and input.is_quantized):
            return _orig_F_layer_norm(input, normalized_shape, weight, bias, eps)

        # 1) Попытка вызвать нативный quantized op
        try:
            return torch.ops.quantized.layer_norm(input, normalized_shape, weight, bias, eps)
        except Exception:
            pass

        # 2) Найти QuantizedLayerNorm модуль и вызвать его
        try:
            qln = _find_quantized_layernorm_for_shape(_safe_quant_state.get("model", None), normalized_shape)
            if qln is not None:
                return qln(input)
        except Exception:
            pass

        # 3) Fallback: деквантовать и запустить float path
        try:
            return _orig_F_layer_norm(input.dequantize(), normalized_shape, weight, bias, eps)
        except Exception:
            return _orig_F_layer_norm(input, normalized_shape, weight, bias, eps)

    # Установим патчи
    F.linear = _safe_linear
    F.layer_norm = _safe_layer_norm

def uninstall_safe_quant_inference():
    """Восстановить исходные F.linear и F.layer_norm."""
    if not _safe_quant_state["installed"]:
        return
    F.linear = _safe_quant_state["orig_linear"]
    F.layer_norm = _safe_quant_state["orig_layer_norm"]
    _safe_quant_state["installed"] = False
    _safe_quant_state["model"] = None


In [38]:
trace_path="whisper_perfetto_large-v3_quanted.json"
example = dataset[116]
audio_array = example["audio"]["array"]
sampling_rate = example["audio"]["sampling_rate"]

inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features

with profile(
        activities=[ProfilerActivity.CPU],
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
    with record_function("whisper.generate"):
        install_safe_quant_inference(qmodel)
        predicted_ids = qmodel.generate(inputs, forced_decoder_ids=forced_decoder_ids)

prof.export_chrome_trace(trace_path)
print(f"Perfetto trace saved to {trace_path}")
print(prof.key_averages().table(
        sort_by=sort_by,
        row_limit=10
    ))

NotImplementedError: Could not run 'quantized::linear' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::linear' is only available for these backends: [Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMTIA, AutogradMAIA, AutogradMeta, Tracer, AutocastCPU, AutocastMTIA, AutocastMAIA, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at /pytorch/aten/src/ATen/core/MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at /pytorch/aten/src/ATen/native/quantized/cpu/qlinear.cpp:1603 [kernel]
QuantizedCUDA: registered at /pytorch/aten/src/ATen/native/quantized/cudnn/Linear.cpp:359 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:479 [backend fallback]
Functionalize: registered at /pytorch/aten/src/ATen/FunctionalizeFallbackKernel.cpp:387 [backend fallback]
Named: registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /pytorch/aten/src/ATen/ZeroTensorFallback.cpp:115 [backend fallback]
ADInplaceOrView: fallthrough registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:104 [backend fallback]
AutogradOther: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:87 [backend fallback]
AutogradMPS: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:95 [backend fallback]
AutogradXPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:108 [backend fallback]
AutogradLazy: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:91 [backend fallback]
AutogradMTIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMAIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMeta: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:99 [backend fallback]
Tracer: registered at /pytorch/torch/csrc/autograd/TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
AutocastMTIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:468 [backend fallback]
AutocastMAIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:506 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:544 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:210 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:475 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


Тоже не сработала.

Вывод - текущими средствами без изменения архитектуры модели (т.е. замены внутри forward модели обращения к nn.F блокам на nn блоки) статическую квантизацию сделать не получается.

In [39]:
del example, audio_array, sampling_rate, inputs, prof, qmodel
gc.collect()

63116

# PTQ Dynamic
Простейшая восьмибитная квантизация в одну строчку. 8 бит были выбраны не случайно, т.к. по графикам с лекций после 8 бит драматически снижается качество модели, а ASR модели чувствительны к вычислительной точности.

In [40]:
qmodel = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
print_size_of_model(qmodel)

Size (MB): 6174.372281


In [41]:
modules_to_quantize = {torch.nn.Linear}
qmodel = tq.quantize_dynamic(
    qmodel, 
    modules_to_quantize, 
    dtype=torch.qint8
)
print_size_of_model(qmodel)


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qmodel = tq.quantize_dynamic(


Size (MB): 1837.108365


In [42]:
_ = profile_sample(116, trace_path="whisper_perfetto_large-v3_quanted.json", model=qmodel)

Perfetto trace saved to whisper_perfetto_large-v3_quanted.json
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     whisper.generate         7.86%     704.158ms       100.00%        8.964s        8.964s         112 B      -6.85 GB             1  
                            quantized::linear_dynamic        48.54%        4.351s        49.63%        4.449s       1.082ms       2.55 GB      -2.55 GB          4111  
                   aten::scaled_dot_product_attention         0.18%      16.361ms        16.09%  

Мы получили значительное ускорение инференса. Попробуем снять замеры качества с учётом torch.compile

In [43]:
qmodel = torch.compile(qmodel)
_ = profile_sample(116, trace_path="whisper_perfetto_large-v3_quanted_compiled.json", model=qmodel)

Perfetto trace saved to whisper_perfetto_large-v3_quanted_compiled.json
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     whisper.generate         8.22%     634.960ms       100.00%        7.726s        7.726s         112 B      -6.85 GB             1  
                            quantized::linear_dynamic        48.54%        3.750s        49.43%        3.819s     929.023us       2.55 GB      -2.55 GB          4111  
                   aten::scaled_dot_product_attention         0.10%       7.414ms       

In [46]:
summary = run_model(verbose=True, model=qmodel, dataset=dataset)

  0%|▎                                                                                                                                                                                                            | 1/793 [00:06<1:27:06,  6.60s/it]

referenct:
можешь включить сериал теория большого взрыва
hypothesis:
 Можешь включить сериал «Теория большого взрыва»?


  6%|█████████████                                                                                                                                                                                               | 51/793 [04:59<1:16:07,  6.16s/it]

referenct:
покажи на смотрешке канал бридж тв
hypothesis:
 Покажи на сматрёшке канал Бридж ТВ.


 13%|█████████████████████████▊                                                                                                                                                                                 | 101/793 [09:55<1:03:30,  5.51s/it]

referenct:
асият иванов
hypothesis:
 Асиат Иванов


 19%|██████████████████████████████████████▋                                                                                                                                                                    | 151/793 [14:45<1:04:01,  5.98s/it]

referenct:
заказать тольятти молоко три и два процента жирности один литр
hypothesis:
 Заказать в Тольятти молоко 3,2% жирности 1 литр.


 25%|███████████████████████████████████████████████████▉                                                                                                                                                         | 201/793 [19:34<57:09,  5.79s/it]

referenct:
фильм самый лучший день
hypothesis:
 Фильм «Самый лучший день»


 32%|████████████████████████████████████████████████████████████████▉                                                                                                                                            | 251/793 [24:21<49:43,  5.50s/it]

referenct:
лилль
hypothesis:
 Лиль


 38%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                               | 301/793 [29:07<46:57,  5.73s/it]

referenct:
брюс уиллис
hypothesis:
 Брюс Уиллис


 44%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 351/793 [34:07<42:57,  5.83s/it]

referenct:
ооо грузовой легковой шиномонтаж
hypothesis:
 О-о-о, грузовой легковой шиномонтаж.


 51%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 401/793 [39:07<39:35,  6.06s/it]

referenct:
покажи мне амирана сардарова на ютюбе
hypothesis:
 Покажи мне Амирана Сардарова на YouTube.


 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 451/793 [43:56<35:12,  6.18s/it]

referenct:
арсенал манчестер сити
hypothesis:
 Арсенал Манчестер Сити


 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 501/793 [48:49<31:09,  6.40s/it]

referenct:
у тебя в каталоге есть сериал охотники за бриллиантами первый сезон
hypothesis:
 У тебя в каталоге есть сериал «Охотники за бриллиантами. Первый сезон».


 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 551/793 [53:35<22:31,  5.58s/it]

referenct:
джой сколько страниц в собака баскервилей
hypothesis:
 Джой, сколько страниц в собак обоскервилий?


 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 601/793 [58:23<17:20,  5.42s/it]

referenct:
шант ньюс
hypothesis:
 Шант Ньюс


 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 651/793 [1:03:06<13:06,  5.54s/it]

referenct:
танго любви найди
hypothesis:
 Танго любви найди.


 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 701/793 [1:07:56<08:47,  5.74s/it]

referenct:
вячеслав владимирович месяцев
hypothesis:
 Вячеслав Владимирович Месяцев


 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 751/793 [1:12:49<03:59,  5.71s/it]

referenct:
футбольный матч тоттенхэм лестер
hypothesis:
 Футбольный матч Тоттенхэм-Лестер


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 793/793 [1:16:50<00:00,  5.81s/it]

large-v3
{
  "total_samples": 793,
  "avg_wer": 0.4739530218975364,
  "avg_cer": 0.16637995920858634,
  "avg_time_per_audio": 5.805704870861385,
  "total_time": 4603.923962593079
}





In [47]:
with open("whisper_metric.json", "r", encoding="utf-8") as f:
    data = json.load(f)

data["large-v3_cpu_quanted"] = summary

with open("whisper_metric.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=True, indent=2)

In [48]:
pd.DataFrame(data)

Unnamed: 0,tiny,small,iarge-v3,large-v3_cuda,large-v3_cpu_quanted,large-v3_cuda_quanted,large-v3_cuda_autocast_compile
total_samples,793.0,793.0,793.0,793.0,793.0,793.0,793.0
avg_wer,1.049771,0.523011,0.440303,0.440303,0.473953,0.447231,0.442587
avg_cer,0.486142,0.207796,0.158293,0.158293,0.16638,0.158552,0.158944
avg_time_per_audio,0.1689,0.682818,7.948655,0.838627,5.805705,2.843514,0.997778
total_time,133.937801,541.474906,6303.283459,665.031494,4603.923963,2254.906898,791.237885


In [50]:
# Сохраняем state_dict квантованной модели
torch.save(qmodel.state_dict(), "./whisper-large-v3-quantized-dynamic.pth")

# Также сохраните конфигурацию отдельно (она не меняется)
qmodel.config.save_pretrained("./whisper-large-v3-quantized-dynamic")

In [52]:
del qmodel, _
gc.collect()

NameError: name 'qmodel' is not defined

# CUDA
В CUDA нет наивной реализации квантизации и требуется самостоятельно писать необходимые операции.
Однако, при работе с трансформерами можно ожидать готового решения для каждой конкретной модели. В нашем случае существует минимум три готовых варианта:
1. TensorRT ускорение с помощью смешанной точности. Не завёлся, т.к. требует CUDA>= 12.9, а переставлять тулкиты = ломать текущие зависимости проекта, что весьма накладно. Основная идея - работать в half_precision с INT8/FP16 fallback
2. Квантизация через BitsAndBytes backend. Будем пробовать.
3. FP8/SmoothQuant через TransformerEngine. Требует видеокарты архитектуры Hopper/Ada, доступа к такой нет.

In [54]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [55]:
!nvidia-smi

Tue Nov  4 23:19:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 572.70         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 60%   30C    P8             11W /  285W |    1167MiB /  12282MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [59]:
qmodel_cuda = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3", 
                                                              load_in_8bit=True,   # или load_in_4bit=True
                                                              device_map="auto")
print_size_of_model(qmodel_cuda)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Size (MB): 1623.253273


In [60]:
qmodel_cuda.eval()
qmodel_cuda = torch.compile(qmodel_cuda)

def profile_sample_cuda(sample_idx=0, trace_path="whisper_perfetto_large-v3_cuda.json", model=None):
    example = dataset[sample_idx]
    audio_array = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
    inputs = inputs.half().to(device)  

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        with record_function("whisper.generate"):
            with torch.no_grad():
                predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids)

    prof.export_chrome_trace(trace_path)
    print(f"Perfetto trace saved to {trace_path}")
    print(prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10
    ))
    return processor.decode(predicted_ids[0].cpu())

_ = profile_sample_cuda(116, trace_path="whisper_perfetto_large-v3_cuda_quantized.json", model=qmodel_cuda)
torch.cuda.empty_cache()

Perfetto trace saved to whisper_perfetto_large-v3_cuda_quantized.json
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       whisper.generate         0.00%       0.000us         0.00%       0.000us       0.000us        4.442s       956.87%        4.442

In [62]:
def run_model_cuda_quant(verbose=False, model=None, dataset=None):
    results = []
    i = 0
    with torch.no_grad():
        for audio in tqdm(dataset):
            audio_array = audio["audio"]["array"]
            sampling_rate = audio["audio"]["sampling_rate"]
            reference = audio["transcription"]
        
            start_time = time.time()
            input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features 
            input_features = input_features.half().to(device)
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0] #Уточнить в зависимости от выбранной модели
            hypothesis = processor.decode(predicted_ids)
            run_time = time.time() - start_time
            metrics = asr_metrics(hypothesis, reference)
            metrics["run_time_sec"] = run_time
            if verbose:
                if i % 50 == 0:
                    print("referenct:")
                    print(reference)
                    print("hypothesis:")
                    print(hypothesis)
                i += 1
            results.append(metrics)
            
    torch.cuda.empty_cache()
    df_results = pd.DataFrame(results)
    
    summary = {
        "total_samples": len(df_results),
        "avg_wer": df_results["wer"].mean(),
        "avg_cer": df_results["cer"].mean(),
        "avg_time_per_audio": df_results["run_time_sec"].mean(),
        "total_time": df_results["run_time_sec"].sum(),
    }
    
    print("large-v3_cuda_q")
    print(json.dumps(summary, ensure_ascii=True, indent=2))
    return summary

In [63]:
summary = run_model_cuda_quant(model=qmodel_cuda, dataset=dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 793/793 [39:51<00:00,  3.02s/it]

large-v3_cuda_q
{
  "total_samples": 793,
  "avg_wer": 0.44723060042101653,
  "avg_cer": 0.1585522291109167,
  "avg_time_per_audio": 3.006850938327977,
  "total_time": 2384.4327940940857
}





In [64]:
data["large-v3_cuda_quanted"] = summary

with open("whisper_metric.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=True, indent=2)

In [65]:
pd.DataFrame(data)

Unnamed: 0,tiny,small,iarge-v3,large-v3_cuda,large-v3_cpu_quanted,large-v3_cuda_quanted,large-v3_cuda_autocast_compile
total_samples,793.0,793.0,793.0,793.0,793.0,793.0,793.0
avg_wer,1.049771,0.523011,0.440303,0.440303,0.473953,0.447231,0.442587
avg_cer,0.486142,0.207796,0.158293,0.158293,0.16638,0.158552,0.158944
avg_time_per_audio,0.1689,0.682818,7.948655,0.838627,5.805705,3.006851,0.997778
total_time,133.937801,541.474906,6303.283459,665.031494,4603.923963,2384.432794,791.237885


In [66]:
del qmodel_cuda, _
gc.collect()
torch.cuda.empty_cache()

# The old way. Quantization vs Autocast

Во времена, когда деревья были большими, а ML-инженеры - тупенькими, использовался проверенный бородатыми сеньорами способ torch.compile + torch.autocast.

Пришло время проверить эту гипотезу.

### CPU

In [67]:
model_a = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

In [68]:
model_a.eval()
model_a = torch.compile(model_a)

def profile_sample_autocast(sample_idx=0, trace_path="whisper_perfetto_large-v3_cpu_a.json", model=None, device='cpu'):
    example = dataset[sample_idx]
    audio_array = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
    inputs = inputs.to(device)  

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        with record_function("whisper.generate"):
            with torch.no_grad():
                with torch.autocast(device_type=device, dtype=torch.bfloat16):
                    predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids)

    prof.export_chrome_trace(trace_path)
    print(f"Perfetto trace saved to {trace_path}")
    print(prof.key_averages().table(
        sort_by="cpu_time_total",
        row_limit=10
    ))
    return processor.decode(predicted_ids[0].cpu())

_ = profile_sample_autocast(116, trace_path="whisper_perfetto_large-v3_cpu_autocasted.json", model=model_a)


Perfetto trace saved to whisper_perfetto_large-v3_cpu_autocasted.json
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                         aten::conv1d         0.00%     891.711us       105.30%       91.128s       22.782s      32.29 MB    -750.00 KB             4  
                                     whisper.generate         0.58%     503.662ms        99.99%       86.532s       86.532s         112 B      -6.78 GB             1  
                                         aten::linear         0.17%     143.439ms        7

### CUDA raw

In [69]:
model_a = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
model_a.to(device).eval()
model_a = torch.compile(model_a)

In [70]:
def profile_sample_autocast_cuda(sample_idx=0, trace_path="whisper_perfetto_large-v3_cuda.json", model=None, device='cpu'):
    example = dataset[sample_idx]
    audio_array = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
    inputs = inputs.half().to(device)  

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        with record_function("whisper.generate"):
            with torch.no_grad():
                with torch.autocast(device_type=device, dtype=torch.bfloat16):
                    predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids)

    prof.export_chrome_trace(trace_path)
    print(f"Perfetto trace saved to {trace_path}")
    print(prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10
    ))
    return processor.decode(predicted_ids[0].cpu())

_ = profile_sample_autocast_cuda(116, trace_path="whisper_perfetto_large-v3_CUDA_autocasted.json",  model=model_a, device=device)

Perfetto trace saved to whisper_perfetto_large-v3_CUDA_autocasted.json
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       whisper.generate         0.00%       0.000us         0.00%       0.000us       0.000us        3.019s      2040.76%        3.01

In [71]:
del _
gc.collect()
torch.cuda.empty_cache()

In [72]:
results = []
i = 0

with torch.no_grad():
    for audio in tqdm(dataset):
        audio_array = audio["audio"]["array"]
        sampling_rate = audio["audio"]["sampling_rate"]
        reference = audio["transcription"]

        start_time = time.time()
        
        
        input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
        input_features = input_features.half().to(device)
        
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            predicted_ids = model_a.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0]
        
        
        hypothesis = processor.decode(predicted_ids.cpu())
        
        run_time = time.time() - start_time
        metrics = asr_metrics(hypothesis, reference)
        metrics["run_time_sec"] = run_time
        
        # if i % 50 == 0:
        #     print("reference:")
        #     print(reference)
        #     print("hypothesis:")
        #     print(hypothesis)
        # i += 1
        
        results.append(metrics)

torch.cuda.empty_cache()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 793/793 [11:58<00:00,  1.10it/s]


In [73]:
df_results = pd.DataFrame(results)

summary = {
    "total_samples": len(df_results),
    "avg_wer": df_results["wer"].mean(),
    "avg_cer": df_results["cer"].mean(),
    "avg_time_per_audio": df_results["run_time_sec"].mean(),
    "total_time": df_results["run_time_sec"].sum(),
}

print("large-v3_cuda_q")
print(json.dumps(summary, ensure_ascii=True, indent=2))

large-v3_cuda_q
{
  "total_samples": 793,
  "avg_wer": 0.44258729290885657,
  "avg_cer": 0.15894431408815926,
  "avg_time_per_audio": 0.8967222247418359,
  "total_time": 711.1007242202759
}


In [74]:
data["large-v3_cuda_autocast_compile"] = summary

with open("whisper_metric.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=True, indent=2)

pd.DataFrame(data)

Unnamed: 0,tiny,small,iarge-v3,large-v3_cuda,large-v3_cpu_quanted,large-v3_cuda_quanted,large-v3_cuda_autocast_compile
total_samples,793.0,793.0,793.0,793.0,793.0,793.0,793.0
avg_wer,1.049771,0.523011,0.440303,0.440303,0.473953,0.447231,0.442587
avg_cer,0.486142,0.207796,0.158293,0.158293,0.16638,0.158552,0.158944
avg_time_per_audio,0.1689,0.682818,7.948655,0.838627,5.805705,3.006851,0.896722
total_time,133.937801,541.474906,6303.283459,665.031494,4603.923963,2384.432794,711.100724


In [75]:
del model_a, input_features, predicted_ids, hypothesis, audio, audio_array
gc.collect()
torch.cuda.empty_cache()

### CUDA quant

In [76]:
qmodel_cuda_a = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3", 
                                                              load_in_8bit=True,   # или load_in_4bit=True
                                                              device_map="auto")
qmodel_cuda_a.eval()
qmodel_cuda_a = torch.compile(qmodel_cuda_a)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [77]:
_ = profile_sample_autocast_cuda(116, trace_path="whisper_perfetto_large-v3_CUDA_quantized_autocasted.json", model=qmodel_cuda_a, device=device)



Perfetto trace saved to whisper_perfetto_large-v3_CUDA_quantized_autocasted.json
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       whisper.generate         0.00%       0.000us         0.00%       0.000us       0.000us        5.273s      1115.12%  

In [78]:
del qmodel_cuda_a, _
gc.collect()
torch.cuda.empty_cache()

Выводы:
На CPU- autocast накладывает огромное количество оверхэда на операции чистой и квантованной моделей и не должен использоваться в качестве ускорения инференса в данной работе.

На чистом GPU оверхэд от autocast превышает выигрыш в скорости (что странно, но ожидаемо, т.к. в случае батчевания ситуация перебалансируется в сторону скорости).

На квантованном GPU autocast не имеет смысла.