# 0 

In [2]:
import os.path
from timeit import timeit

from transformers import AutoTokenizer, AutoModel

In [15]:
model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [35]:
text = """
MiÅ‚oÅ›Ä‡ szczÄ™Å›liwa. Czy to jest normalne,
czy to powaÅ¼ne, czy to poÅ¼yteczne -
co Å›wiat ma z dwojga ludzi,
ktÃ³rzy nie widzÄ… Å›wiata?

WywyÅ¼szeni ku sobie bez Å¼adnej zasÅ‚ugi,
pierwsi lepsi z miliona, ale przekonani,
Å¼e tak staÄ‡ siÄ™ musiaÅ‚o - w nagrodÄ™ za co?
za nic;
Å›wiatÅ‚o pada znikÄ…d -
dlaczego wÅ‚aÅ›nie na tych, a nie na innych?
Czy to obraÅ¼a sprawiedliwoÅ›Ä‡? Tak.
Czy to narusza troskliwie piÄ™trzone zasady,
strÄ…cÄ… ze szczytu moraÅ‚? Narusza i strÄ…ca.

SpÃ³jrzcie na tych szczÄ™Å›liwych:
gdyby siÄ™ chociaÅ¼ maskowali trochÄ™,
udawali zgnÄ™bienie krzepiÄ…c tym przyjaciÃ³Å‚!
SÅ‚uchajcie, jak siÄ™ Å›miejÄ… - obraÅºliwie.
Jakim jÄ™zykiem mÃ³wiÄ… - zrozumiaÅ‚ym na pozÃ³r.
A te ich ceremonie, ceregiele,
wymyÅ›lne obowiÄ…zki wzglÄ™dem siebie -
wyglÄ…da to na zmowÄ™ za plecami ludzkoÅ›ci!

Trudno nawet przewidzieÄ‡, do czego by doszÅ‚o,
gdyby ich przykÅ‚ad daÅ‚ siÄ™ naÅ›ladowaÄ‡.
Na co liczyÄ‡ by mogÅ‚y religie, poezje,
o czym by pamiÄ™tano, czego zaniechano.
kto by chciaÅ‚ zostaÄ‡ w krÄ™gu.

MiÅ‚oÅ›Ä‡ szczÄ™Å›liwa. Czy to jest konieczne?
Takt i rozsÄ…dek kaÅ¼Ä… milczeÄ‡ o niej
jako skandalu z wysokich sfer Å»ycia.
Wspaniale dziatki rodzÄ… siÄ™ bez jej pomocy.
Przenigdy nie zdolaÅ‚aby zaludniÄ‡ ziemi,
zdarza siÄ™ przecieÅ¼ rzadko.

Niech ludzie nie znajÄ…cy miÅ‚oÅ›ci szczÄ™Å›liwej
twierdzÄ…, Å¼e nigdzie nie ma miÅ‚oÅ›ci szczÄ™Å›liwej.

Z tÄ… wiarÄ… lÅ¼ej im bÄ™dzie i Å¼yÄ‡, i umieraÄ‡."""

#  ÅºrÃ³dÅ‚o: https://poezja.org/wz/Wislawa_Szymborska/19/Milosc_szczesliwa

In [44]:
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [52]:
import timeit
import torch


# 1

In [53]:
def no_optimizations():
    model.train()
    _ = model(**encoded_input)


def only_eval():
    model.eval()
    _ = model(**encoded_input)


def eval_no_grad():
    model.eval()
    with torch.no_grad():
        _ = model(**encoded_input)

def eval_inference():
    model.eval()
    with torch.inference_mode():
        _ = model(**encoded_input)

In [54]:
n_runs = 100

no_opti_avg_time = timeit.timeit(no_optimizations, number=n_runs) / n_runs
only_eval_avg_time = timeit.timeit(only_eval, number=n_runs) / n_runs
eval_no_grad_avg_time = timeit.timeit(eval_no_grad, number=n_runs) / n_runs
eval_inference_avg_time = timeit.timeit(eval_inference, number=n_runs) / n_runs

In [55]:
print("Benchmark results:")
print(f"{no_opti_avg_time=:.6f}")
print(f"{only_eval_avg_time=:.6f}")
print(f"{eval_no_grad_avg_time=:.6f}")
print(f"{eval_inference_avg_time=:.6f}")

Benchmark results:
no_opti_avg_time=0.142580
only_eval_avg_time=0.125754
eval_no_grad_avg_time=0.123940
eval_inference_avg_time=0.121378


In [57]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / only_eval_avg_time:.2f}")
print(f"{no_opti_avg_time / eval_no_grad_avg_time:.2f}")
print(f"{no_opti_avg_time / eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
1.13
1.15
1.17


# 2

In [59]:
from time import time

In [60]:
model.eval()

start_time = time()

compiled_model = torch.compile(model)
_ = compiled_model(**encoded_input)

end_time = time() - start_time

print(f"Total time of compilation and warmup inference: {end_time:.6f}")

Total time of compilation and warmup inference: 12.368721


In [64]:
def compiled_eval_inference():
    with torch.inference_mode():
        _ = compiled_model(**encoded_input)

In [66]:
compiled_eval_inference_avg_time = timeit.timeit(compiled_eval_inference, number=n_runs) / n_runs

In [67]:
print(f"{compiled_eval_inference_avg_time=:.6f}")

compiled_eval_inference_avg_time=0.120110


In [68]:
print("Speedup comparing to no optimization method")
print(f"{no_opti_avg_time / compiled_eval_inference_avg_time:.2f}")

Speedup comparing to no optimization method
1.19


Yes. This is the best so far! ðŸ’…ðŸ’… (this is Kuba, no LLM here)

# 3

In [69]:
model = model.to("cpu")

In [72]:
model_quantized = torch.ao.quantization.quantize_dynamic(model, dtype=torch.qint8)

In [73]:
print(model_quantized)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout(p=0.1, inplace=F

In [74]:
torch.save(model.state_dict(), "model.pth")

In [75]:
torch.save(model_quantized.state_dict(), "model_quantized.pth")

In [83]:
print(f"Size of normal model: {os.path.getsize('model.pth') / 1024 / 1024:.5} MB")
print(f"Size of model quantized: {os.path.getsize('model_quantized.pth') / 1024 / 1024:.5} MB")

Size of normal model: 417.72 MB
Size of model quantized: 173.1 MB
