### **Install Packages & Imports**

In [None]:
!pip install sentencepiece==0.2.0
!pip install quanto==0.0.11

In [3]:
import transformers
import torch
import sentencepiece as spm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from memory_usage_helper import *

In [4]:
import warnings
# Ignore specific UserWarnings related to max_length in transformers
warnings.filterwarnings("ignore",
    message=".*Using the model-agnostic default `max_length`.*")

In [5]:
# Create the memory-measurement helper (MemoryUsageHelper is presumably
# provided by the `memory_usage_helper` star import above — confirm). Its
# compute_module_sizes() method is called on the model both before and after
# quantization.
helper = MemoryUsageHelper()

### **Model without Quantization**

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "google/flan-t5-small"
# Load tokenizer and model from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
# NOTE: this `model` object is later passed to quantize()/freeze() and reused
# as the quantized model, so re-run this cell to get the unquantized baseline
# back before re-running the baseline cells.
model = T5ForConditionalGeneration.from_pretrained(model_name)

**Generation:**

In [7]:
# Prompt used for the baseline (unquantized) generation.
input_text = "Hello, my name is "
# Tokenize to PyTorch tensors and keep only the token-id tensor.
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

# Generate with the model's default generation settings, then decode the
# first returned sequence back to text with special tokens stripped.
outputs = model.generate(input_ids)
original_generation_output = tokenizer.decode(
    outputs[0],
    skip_special_tokens=True,
)

**Model Size:**

In [8]:
# Measure the model before quantization. The result is indexed with the ''
# key further down, where it is treated as the whole-model size (presumably
# in bytes, given the 1e-9 scaling to GB — confirm in memory_usage_helper).
original_module_sizes = helper.compute_module_sizes(model)

## **Quantize the model to int8**

In [9]:
from quanto import quantize, freeze

In [10]:
# Request int8 quantization of the weights; activations=None asks for no
# activation quantization. The return value is not used: the following cells
# keep working with this same `model` object as the quantized model.
quantize(model, weights=torch.int8, activations=None)

In [11]:
# Finalize the quantization on the same `model` object.
# NOTE(review): per quanto's documentation, freeze() replaces the float
# weights with their quantized values — confirm for the pinned quanto version.
freeze(model)

**Generation with the quantized model:**

In [12]:
# Same prompt as the baseline run, so the two outputs are directly comparable.
input_text = "Hello, my name is "
# Tokenize to PyTorch tensors and keep only the token-id tensor.
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

# `model` has been through quantize() and freeze() at this point. Decode the
# first returned sequence back to text with special tokens stripped.
outputs = model.generate(input_ids)
quantized_generation_output = tokenizer.decode(
    outputs[0],
    skip_special_tokens=True,
)

**Quantized Model Size:**

In [13]:
# Repeat the size measurement on the same `model` object, now after
# quantize() and freeze(), for comparison with original_module_sizes.
quantized_module_sizes = helper.compute_module_sizes(model)

## **Compare Results:**

**Memory Usage:**

In [14]:
# The '' entry of each size mapping is used as the whole-model figure;
# scaling by 1e-9 reports it in GB.
original_size_gb = original_module_sizes[''] * 1e-9
quantized_size_gb = quantized_module_sizes[''] * 1e-9

print(f"The original model size is {original_size_gb} GB")
print(f"The quantized model size is {quantized_size_gb} GB")

The original model size is 0.307844608 GB
The quantized model size is 0.12682868 GB


**Performance:**

In [15]:
# Show both generations one after the other so any difference introduced by
# quantization is visible at a glance.
print("original model output: {}".format(original_generation_output))
print("quantized model output: {}".format(quantized_generation_output))

original model output: annie scott
quantized model output: annie scott
