In [1]:
import torch

In [2]:
device = torch.device('cuda')

In [3]:
model_path = "Prathyusha101/led-large-16384-arxiv"

In [5]:
from transformers import BitsAndBytesConfig

In [6]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_compute_dtype=torch.float16)

In [4]:
!pip install bitsandbytes



Here's what each part means:

nf4_config = BitsAndBytesConfig(...): This line creates a configuration object called nf4_config using the BitsAndBytesConfig class. This object will store the settings for quantization.

load_in_4bit=True: This argument tells the config to load the model in 4-bit precision. This means that the model's weights will be stored using only 4 bits instead of the usual 16 or 32 bits, reducing memory usage.

bnb_4bit_use_double_quant=True: This enables double quantization, a technique that can further improve the accuracy of the quantized model.

bnb_4bit_quant_type="nf4": This specifies the quantization type as NF4, which is a specific algorithm for 4-bit quantization.

bnb_4bit_compute_dtype=torch.bfloat16: This sets the data type used for computations to torch.bfloat16 (Brain Floating Point 16-bit). Bfloat16 is a numerical format that offers a good balance between precision and performance. It is often used in deep learning to speed up training and inference.

In [7]:
nf4_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.bfloat16)

In [9]:
from transformers import AutoModelForSeq2SeqLM

In [10]:
model_nf4 = AutoModelForSeq2SeqLM.from_pretrained(model_path, quantization_config=nf4_config, device_map=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# save quantized_model

model_nf4.save_pretrained("quantized_model")

In [17]:
# check size of new model
import os
from transformers import AutoModelForSeq2SeqLM

def get_model_size(model_path):
  """Calculates the size of a model on disk.

  Args:
    model_path: The path to the model directory.

  Returns:
    The size of the model in MB.
  """
  total_size = 0
  # Check if the path exists before walking through it
  if os.path.exists(model_path):
    for dirpath, dirnames, filenames in os.walk(model_path):
      for f in filenames:
        fp = os.path.join(dirpath, f)
        total_size += os.path.getsize(fp)
    return total_size / (1024 * 1024) # Convert bytes to MB
  else:
    print(f"Warning: Model path '{model_path}' does not exist.")  # Print a warning
    return 0  # Return 0 to avoid division by zero

# Download the original model
original_model_path = "original_model"  # Local directory to save the model
AutoModelForSeq2SeqLM.from_pretrained("Prathyusha101/led-large-16384-arxiv", cache_dir=original_model_path)

# Get the size of the original model
original_size = get_model_size(original_model_path)

# Get the size of the quantized model
quantized_model_path = "quantized_model"  # Assuming you saved it here
quantized_size = get_model_size(quantized_model_path)

print(f"Original model size: {original_size:.2f} MB")
print(f"Quantized model size: {quantized_size:.2f} MB")
# Avoid division by zero if original_size is 0
if original_size != 0:
  print(f"Size reduction: {(original_size - quantized_size) / original_size * 100:.2f}%")
else:
  print("Size reduction cannot be calculated as original model size is 0.")

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

Original model size: 3508.53 MB
Quantized model size: 325.48 MB
Size reduction: 90.72%
