# 0 

In [1]:
import os.path
from timeit import timeit

from transformers import AutoTokenizer, AutoModel

In [2]:
model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
text = """
MiÅ‚oÅ›Ä‡ szczÄ™Å›liwa. Czy to jest normalne,
czy to powaÅ¼ne, czy to poÅ¼yteczne -
co Å›wiat ma z dwojga ludzi,
ktÃ³rzy nie widzÄ… Å›wiata?

WywyÅ¼szeni ku sobie bez Å¼adnej zasÅ‚ugi,
pierwsi lepsi z miliona, ale przekonani,
Å¼e tak staÄ‡ siÄ™ musiaÅ‚o - w nagrodÄ™ za co?
za nic;
Å›wiatÅ‚o pada znikÄ…d -
dlaczego wÅ‚aÅ›nie na tych, a nie na innych?
Czy to obraÅ¼a sprawiedliwoÅ›Ä‡? Tak.
Czy to narusza troskliwie piÄ™trzone zasady,
strÄ…cÄ… ze szczytu moraÅ‚? Narusza i strÄ…ca.

SpÃ³jrzcie na tych szczÄ™Å›liwych:
gdyby siÄ™ chociaÅ¼ maskowali trochÄ™,
udawali zgnÄ™bienie krzepiÄ…c tym przyjaciÃ³Å‚!
SÅ‚uchajcie, jak siÄ™ Å›miejÄ… - obraÅºliwie.
Jakim jÄ™zykiem mÃ³wiÄ… - zrozumiaÅ‚ym na pozÃ³r.
A te ich ceremonie, ceregiele,
wymyÅ›lne obowiÄ…zki wzglÄ™dem siebie -
wyglÄ…da to na zmowÄ™ za plecami ludzkoÅ›ci!

Trudno nawet przewidzieÄ‡, do czego by doszÅ‚o,
gdyby ich przykÅ‚ad daÅ‚ siÄ™ naÅ›ladowaÄ‡.
Na co liczyÄ‡ by mogÅ‚y religie, poezje,
o czym by pamiÄ™tano, czego zaniechano.
kto by chciaÅ‚ zostaÄ‡ w krÄ™gu.

MiÅ‚oÅ›Ä‡ szczÄ™Å›liwa. Czy to jest konieczne?
Takt i rozsÄ…dek kaÅ¼Ä… milczeÄ‡ o niej
jako skandalu z wysokich sfer Å»ycia.
Wspaniale dziatki rodzÄ… siÄ™ bez jej pomocy.
Przenigdy nie zdolaÅ‚aby zaludniÄ‡ ziemi,
zdarza siÄ™ przecieÅ¼ rzadko.

Niech ludzie nie znajÄ…cy miÅ‚oÅ›ci szczÄ™Å›liwej
twierdzÄ…, Å¼e nigdzie nie ma miÅ‚oÅ›ci szczÄ™Å›liwej.

Z tÄ… wiarÄ… lÅ¼ej im bÄ™dzie i Å¼yÄ‡, i umieraÄ‡."""

#  ÅºrÃ³dÅ‚o: https://poezja.org/wz/Wislawa_Szymborska/19/Milosc_szczesliwa

In [4]:
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [5]:
import timeit
import torch


# 1

In [6]:
def no_optimizations():
    model.train()
    _ = model(**encoded_input)


def only_eval():
    model.eval()
    _ = model(**encoded_input)


def eval_no_grad():
    model.eval()
    with torch.no_grad():
        _ = model(**encoded_input)

def eval_inference():
    model.eval()
    with torch.inference_mode():
        _ = model(**encoded_input)

In [7]:
n_runs = 100

no_opti_avg_time = timeit.timeit(no_optimizations, number=n_runs) / n_runs
only_eval_avg_time = timeit.timeit(only_eval, number=n_runs) / n_runs
eval_no_grad_avg_time = timeit.timeit(eval_no_grad, number=n_runs) / n_runs
eval_inference_avg_time = timeit.timeit(eval_inference, number=n_runs) / n_runs

In [8]:
print("Benchmark results:")
print(f"{no_opti_avg_time=:.6f}")
print(f"{only_eval_avg_time=:.6f}")
print(f"{eval_no_grad_avg_time=:.6f}")
print(f"{eval_inference_avg_time=:.6f}")

Benchmark results:
no_opti_avg_time=0.144187
only_eval_avg_time=0.126625
eval_no_grad_avg_time=0.125176
eval_inference_avg_time=0.118044


In [9]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / only_eval_avg_time:.2f}")
print(f"{no_opti_avg_time / eval_no_grad_avg_time:.2f}")
print(f"{no_opti_avg_time / eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
1.14
1.15
1.22


# 2

In [10]:
from time import time

In [11]:
model.eval()

start_time = time()

compiled_model = torch.compile(model)
_ = compiled_model(**encoded_input)

end_time = time() - start_time

print(f"Total time of compilation and warmup inference: {end_time:.6f}")

Total time of compilation and warmup inference: 6.669413


In [12]:
def compiled_eval_inference():
    with torch.inference_mode():
        _ = compiled_model(**encoded_input)

In [13]:
compiled_eval_inference_avg_time = timeit.timeit(compiled_eval_inference, number=n_runs) / n_runs

In [14]:
print(f"{compiled_eval_inference_avg_time=:.6f}")

compiled_eval_inference_avg_time=0.130274


In [15]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / compiled_eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
1.11


Yes. This is the best so far! ðŸ’…ðŸ’… (this is Kuba, no LLM here)

# 3

In [16]:
model = model.to("cpu")

In [17]:
model_quantized = torch.ao.quantization.quantize_dynamic(model, dtype=torch.qint8)

In [18]:
print(model_quantized)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout(p=0.1, inplace=F

In [19]:
torch.save(model.state_dict(), "model.pth")

In [20]:
torch.save(model_quantized.state_dict(), "model_quantized.pth")

In [21]:
print(f"Size of normal model: {os.path.getsize('model.pth') / 1024 / 1024:.5} MB")
print(f"Size of model quantized: {os.path.getsize('model_quantized.pth') / 1024 / 1024:.5} MB")

Size of normal model: 417.72 MB
Size of model quantized: 173.1 MB


In [22]:
def quantized_eval_inference():
    model_quantized.eval()
    with torch.inference_mode():
        _ = model_quantized(**encoded_input)

In [23]:
quantized_eval_inference_avg_time = timeit.timeit(quantized_eval_inference, number=n_runs) / n_runs

In [24]:
print("Benchmark results:")
print(f"{no_opti_avg_time=:.6f}")
print(f"{quantized_eval_inference_avg_time=:.6f}")

Benchmark results:
no_opti_avg_time=0.144187
quantized_eval_inference_avg_time=0.056384


In [25]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / quantized_eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
2.56


Quantization helps us speed up model 2 times and help us save 2.5 times disc space (250MB), but I dont have any metrics to compare quality so I cant tell

# 4

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [27]:
model_gpu = model.to(device)

In [28]:
encoded_input_gpu = {k: v.to(device) for k, v in encoded_input.items()}
with torch.inference_mode():
    outputs = model_gpu(**encoded_input_gpu)

In [29]:
compiled_model_gpu = torch.compile(model_gpu)
compiled_max_autotune_model_gpu = torch.compile(model_gpu, mode="max-autotune")
compiled_max_autotune_no_cudagraphs_model_gpu = torch.compile(model_gpu, mode="max-autotune-no-cudagraphs")

In [34]:
def compiled_model_gpu_eval_inference():
    compiled_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_model_gpu(**encoded_input_gpu)


def compiled_max_autotune_model_gpu_eval_inference():
    compiled_max_autotune_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_max_autotune_model_gpu(**encoded_input_gpu)


def compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference():
    compiled_max_autotune_no_cudagraphs_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_max_autotune_no_cudagraphs_model_gpu(**encoded_input_gpu)


def compiled_model_gpu_eval_inference_and_input_to_gpu():
    compiled_model_gpu.eval()
    encoded_input_gpu = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.inference_mode():
        _ = compiled_model_gpu(**encoded_input_gpu)


def compiled_max_autotune_model_gpu_eval_inference_and_input_to_gpu():
    compiled_max_autotune_model_gpu.eval()
    encoded_input_gpu = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.inference_mode():
        _ = compiled_max_autotune_model_gpu(**encoded_input_gpu)


def compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_and_input_to_gpu():
    compiled_max_autotune_no_cudagraphs_model_gpu.eval()
    encoded_input_gpu = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.inference_mode():
        _ = compiled_max_autotune_no_cudagraphs_model_gpu(**encoded_input_gpu)

In [42]:
n_runs = 1000

In [43]:
compiled_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_model_gpu_eval_inference, number=n_runs) / n_runs
compiled_max_autotune_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_max_autotune_model_gpu_eval_inference, number=n_runs) / n_runs
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference, number=n_runs) / n_runs

# calculating input to gpu too
compiled_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs
compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_max_autotune_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs

In [44]:
print("Benchmark results:")
print(f"{compiled_model_gpu_eval_inference_avg_time=:.6f}")
print(f"{compiled_max_autotune_model_gpu_eval_inference_avg_time=:.6f}")
print(f"{compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time=:.6f}")

print("\nWith inputs to gpu time included:")
print(f"{compiled_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")
print(f"{compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")
print(f"{compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")

Benchmark results:
compiled_model_gpu_eval_inference_avg_time=0.015556
compiled_max_autotune_model_gpu_eval_inference_avg_time=0.015493
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time=0.015649

With inputs to gpu time included:
compiled_model_gpu_eval_inference_input_to_gpu_avg_time=0.016106
compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time=0.017008
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time=0.016561


In [46]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / compiled_model_gpu_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_model_gpu_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time:.2f}")

print("\nWith inputs to gpu time included:")
print(f"{no_opti_avg_time / compiled_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")

Speedup comparing to no optimization method
9.27
9.31
9.21

With inputs to gpu time included:
8.95
8.48
8.71


In [57]:
device

'cuda'

In [52]:
text2 = "Morgulium"
encoded_input2 = tokenizer(text2, padding=True, truncation=True, return_tensors="pt")
encoded_input2_gpu = {k: v.to(device) for k, v in encoded_input2.items()}

In [55]:
def compiled_model_gpu_eval_inference():
    compiled_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_model_gpu(**encoded_input2_gpu)


def compiled_max_autotune_model_gpu_eval_inference():
    compiled_max_autotune_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_max_autotune_model_gpu(**encoded_input2_gpu)


def compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference():
    compiled_max_autotune_no_cudagraphs_model_gpu.eval()
    with torch.inference_mode():
        _ = compiled_max_autotune_no_cudagraphs_model_gpu(**encoded_input2_gpu)


def compiled_model_gpu_eval_inference_and_input_to_gpu():
    compiled_model_gpu.eval()
    encoded_input_gpu2 = {k: v.to(device) for k, v in encoded_input2.items()}
    with torch.inference_mode():
        _ = compiled_model_gpu(**encoded_input_gpu2)


def compiled_max_autotune_model_gpu_eval_inference_and_input_to_gpu():
    compiled_max_autotune_model_gpu.eval()
    encoded_input_gpu2 = {k: v.to(device) for k, v in encoded_input2.items()}
    with torch.inference_mode():
        _ = compiled_max_autotune_model_gpu(**encoded_input_gpu2)


def compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_and_input_to_gpu():
    compiled_max_autotune_no_cudagraphs_model_gpu.eval()
    encoded_input_gpu2 = {k: v.to(device) for k, v in encoded_input2.items()}
    with torch.inference_mode():
        _ = compiled_max_autotune_no_cudagraphs_model_gpu(**encoded_input_gpu2)

In [56]:
compiled_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_model_gpu_eval_inference, number=n_runs) / n_runs
compiled_max_autotune_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_max_autotune_model_gpu_eval_inference, number=n_runs) / n_runs
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time = timeit.timeit(compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference, number=n_runs) / n_runs

# calculating input to gpu too
compiled_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs
compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_max_autotune_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time = timeit.timeit(compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_and_input_to_gpu, number=n_runs) / n_runs

In [50]:
print("Benchmark results (small input):")
print(f"{compiled_model_gpu_eval_inference_avg_time=:.6f}")
print(f"{compiled_max_autotune_model_gpu_eval_inference_avg_time=:.6f}")
print(f"{compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time=:.6f}")

print("\nWith inputs to gpu time included:")
print(f"{compiled_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")
print(f"{compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")
print(f"{compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time=:.6f}")

Benchmark results (small input):
compiled_model_gpu_eval_inference_avg_time=0.010158
compiled_max_autotune_model_gpu_eval_inference_avg_time=0.010410
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time=0.009178

With inputs to gpu time included:
compiled_model_gpu_eval_inference_input_to_gpu_avg_time=0.002147
compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time=0.002019
compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time=0.002232


In [51]:
print("Speedup comparing to no optimization method (small input)")
print(f"{no_opti_avg_time / compiled_model_gpu_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_model_gpu_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_avg_time:.2f}")

print("\nWith inputs to gpu time included:")
print(f"{no_opti_avg_time / compiled_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")
print(f"{no_opti_avg_time / compiled_max_autotune_no_cudagraphs_model_gpu_eval_inference_input_to_gpu_avg_time:.2f}")

Speedup comparing to no optimization method (small input)
14.19
13.85
15.71

With inputs to gpu time included:
67.15
71.40
64.59


I dont understand. Why when we perform more operations (we count time when we convert input to gpu) we got so much faster times. I was looking at it and I do not understand. Also I did not notice any difference between mode. Time is sooo small so that different run gave me different results

# 5

In [140]:
# I will reset my model because I broke something XD
model = AutoModel.from_pretrained(model_name).to(device)
model_fresh = AutoModel.from_pretrained(model_name).to(device)

In [141]:
capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

# Tensor Cores are available on NVidia GPUs with CUDA >= 7 (e.g. Volta, Turing, Ampere, Hopper)
if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

CUDA device capability: (7, 5)
Tensor Cores available: fast float16 supported.


In [142]:
model_half = model.half().to(device)

In [143]:
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]

In [144]:
_ = model_half(**encoded_input_gpu)

In [145]:
print(next(model_fresh.parameters()).dtype)
print(next(model.parameters()).dtype)
print(next(model_half.parameters()).dtype)

torch.float32
torch.float16
torch.float16


### model_half = model.half().to(device) is making model half precision too!!!!!

In [148]:
from torch.amp import autocast

def f32_eval_inference():
    model_fresh.eval()
    with torch.inference_mode():
        _ = model_fresh(**encoded_input_gpu)


def f16_eval_inference():
    model_half.eval()
    with torch.inference_mode():
        _ = model_half(**encoded_input_gpu)


def autocast_eval_inference():
    model_fresh.eval()
    with torch.inference_mode():
        with autocast(device_type=device, dtype=torch.float16):
            _ = model_fresh(**encoded_input_gpu)

In [149]:
n_runs = 100

f32_eval_inference_avg_time = timeit.timeit(f32_eval_inference, number=n_runs) / n_runs
f16_eval_inference_avg_time = timeit.timeit(f16_eval_inference, number=n_runs) / n_runs
autocast_eval_inference_avg_time = timeit.timeit(autocast_eval_inference, number=n_runs) / n_runs

In [150]:
print("Benchmark results:")
print(f"{f32_eval_inference_avg_time=:.6f}")
print(f"{f16_eval_inference_avg_time=:.6f}")
print(f"{autocast_eval_inference_avg_time=:.6f}")

Benchmark results:
f32_eval_inference_avg_time=0.019158
f16_eval_inference_avg_time=0.007637
autocast_eval_inference_avg_time=0.011719


In [151]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / f32_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / f16_eval_inference_avg_time:.2f}")
print(f"{no_opti_avg_time / autocast_eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
7.53
18.88
12.30


I will check metrics to choose correct model for my case. Autocast seems cool

# 6

In [None]:
So I have chosen sleep (yet again)