In [None]:
import torch

In [None]:
# Target device for the quantized model; passed as `device_map` when the model is loaded below.
device = torch.device('cuda')

In [None]:
# Hub repo id of the checkpoint that gets quantized in this notebook.
model_path = "Prathyusha101/led-large-16384-arxiv"

In [None]:
from transformers import BitsAndBytesConfig

In [None]:
# Plain 4-bit config with float16 compute.
# NOTE(review): no later cell visible here references this object; the model is
# loaded with `nf4_config` instead.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
!pip install bitsandbytes



Here's what each part means:

nf4_config = BitsAndBytesConfig(...): This line creates a configuration object called nf4_config using the BitsAndBytesConfig class. This object will store the settings for quantization.

load_in_4bit=True: This argument tells the config to load the model in 4-bit precision. This means that the model's weights will be stored using only 4 bits instead of the usual 16 or 32 bits, reducing memory usage.

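bnb_4bit_use_double_quant=True: This enables double (nested) quantization: the quantization constants produced by the first quantization step are themselves quantized. Its purpose is to save additional memory (roughly 0.4 bits per parameter), not to improve accuracy.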

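bnb_4bit_quant_type="nf4": This selects NF4 (4-bit NormalFloat) as the 4-bit data type. NF4 was introduced in the QLoRA paper and is designed for weights that are approximately normally distributed; the alternative value is "fp4".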

bnb_4bit_compute_dtype=torch.bfloat16: This sets the data type used for computations to torch.bfloat16 (Brain Floating Point 16-bit). Bfloat16 is a numerical format that offers a good balance between precision and performance. It is often used in deep learning to speed up training and inference.

In [None]:
# Quantization settings used to load the model below:
# 4-bit NF4 weights, double quantization enabled, bfloat16 compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Load the checkpoint with the NF4 settings applied at load time and place it on `device`.
model_nf4 = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    device_map=device,
    quantization_config=nf4_config,
)

In [None]:
# Tokenizer loaded from the same Hub checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Log in to the Hugging Face Hub (interactive widget) so that the push_to_hub
# cells further down can write to the account.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Destination repo on the Hugging Face Hub for the quantized model and its tokenizer.
repo_id = "Prathyusha101/quantized_model_2"

In [None]:
# Upload the quantized weights to the Hub repo named by `repo_id`.
model_nf4.push_to_hub(
    repo_id=repo_id,
    token=True,  # use the token stored by notebook_login(); replaces the deprecated `use_auth_token`
    safe_serialization=True  # Use safetensors format
)



model.safetensors:   0%|          | 0.00/341M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Prathyusha101/quantized_model_2/commit/eeba48a9d37846399ab23a5e5a285b30475e1ec3', commit_message='Upload LEDForConditionalGeneration', commit_description='', oid='eeba48a9d37846399ab23a5e5a285b30475e1ec3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Prathyusha101/quantized_model_2', endpoint='https://huggingface.co', repo_type='model', repo_id='Prathyusha101/quantized_model_2'), pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer files to the same Hub repo that received the quantized weights.
tokenizer.push_to_hub(repo_id)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Prathyusha101/quantized_model_2/commit/32b8b03f525d322cd5149af6fd52e1bf2e162df2', commit_message='Upload tokenizer', commit_description='', oid='32b8b03f525d322cd5149af6fd52e1bf2e162df2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Prathyusha101/quantized_model_2', endpoint='https://huggingface.co', repo_type='model', repo_id='Prathyusha101/quantized_model_2'), pr_revision=None, pr_num=None)

In [None]:

# Save the quantized model to the local directory "quantized_model";
# the size-comparison cell below measures this directory.

model_nf4.save_pretrained("quantized_model")

In [None]:
# check size of new model
import os
from transformers import AutoModelForSeq2SeqLM

def get_model_size(model_path):
  """Calculates the size of a model on disk.

  Every underlying file is counted exactly once. This matters for the
  Hugging Face cache layout (what ``from_pretrained(..., cache_dir=...)``
  writes): each file is stored as a blob plus a symlink under ``snapshots/``,
  and ``os.path.getsize`` follows symlinks, so a naive walk counts the weights
  twice and inflates the reported size.

  Args:
    model_path: The path to the model directory (a single file is also accepted).

  Returns:
    The size of the model in MB (bytes / 1024**2). Returns 0 and prints a
    warning if the path does not exist.
  """
  if not os.path.exists(model_path):
    print(f"Warning: Model path '{model_path}' does not exist.")  # Print a warning
    return 0  # Callers check for 0 before dividing by the size

  if os.path.isfile(model_path):
    return os.path.getsize(model_path) / (1024 * 1024)

  total_size = 0
  seen = set()  # Resolved paths that have already been counted
  for dirpath, _dirnames, filenames in os.walk(model_path):
    for f in filenames:
      real = os.path.realpath(os.path.join(dirpath, f))
      # Skip duplicates (symlink + target) and dangling symlinks.
      if real in seen or not os.path.isfile(real):
        continue
      seen.add(real)
      total_size += os.path.getsize(real)
  return total_size / (1024 * 1024)  # Convert bytes to MB

# Fetch the original checkpoint into a local cache directory so its on-disk
# footprint can be measured.
original_model_path = "original_model"
AutoModelForSeq2SeqLM.from_pretrained(
    "Prathyusha101/led-large-16384-arxiv",
    cache_dir=original_model_path,
)

# Directory the quantized checkpoint is expected to have been saved to.
quantized_model_path = "quantized_model"

# Measure both directories (values are in MB).
original_size = get_model_size(original_model_path)
quantized_size = get_model_size(quantized_model_path)

print(f"Original model size: {original_size:.2f} MB")
print(f"Quantized model size: {quantized_size:.2f} MB")

# The percentage is only defined when the original size is non-zero.
if original_size == 0:
  print("Size reduction cannot be calculated as original model size is 0.")
else:
  print(f"Size reduction: {(original_size - quantized_size) / original_size * 100:.2f}%")

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

Original model size: 3508.53 MB
Quantized model size: 325.48 MB
Size reduction: 90.72%


In [None]:
!pip install --upgrade huggingface_hub
from huggingface_hub import hf_api

Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.30.1
    Uninstalling huggingface-hub-0.30.1:
      Successfully uninstalled huggingface-hub-0.30.1
Successfully installed huggingface_hub-0.30.2


In [None]:
# NOTE(review): the model weights were saved to "quantized_model" earlier, while the
# tokenizer goes to "model" here; confirm that two separate directories are intended.
tokenizer.save_pretrained("model")  # Save the tokenizer files locally as well

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [None]:
### save quantized model to hf

from huggingface_hub import HfApi

# Upload the locally saved quantized checkpoint over HTTP.
# This replaces the git-based `Repository` helper, which is deprecated and was
# called here with a placeholder repo id and `local_dir` passed twice.
api = HfApi()
api.create_repo(repo_id, repo_type="model", exist_ok=True)  # no-op if the repo already exists
api.upload_folder(
    folder_path="quantized_model",  # directory written by model_nf4.save_pretrained(...)
    repo_id=repo_id,
    repo_type="model",
    commit_message="Initial commit of my quantized model",
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

In [None]:
!pwd

/content


In [None]:
ls

articl_2.csv  [0m[01;34moriginal_model[0m/  [01;34mquantized_model[0m/  [01;34msample_data[0m/


In [None]:
# prompt: load the quantized model to check if its working

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# The quantized weights were saved to "quantized_model", whereas "model" only
# received the tokenizer files, so each artifact is loaded from its own directory.
quantized_model_path = "quantized_model"
tokenizer_path = "model"
model = AutoModelForSeq2SeqLM.from_pretrained(quantized_model_path, device_map="auto", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)

# Smoke test: run a short input through generate() and decode the result.
text = "This is a test input."
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

decoded_output


Input ids are automatically padded from 8 to 1024 to be a multiple of `config.attention_window`: 1024


' this is a test input for a test input for a test input .                                                                                                                                                                                                                                                                                                                                                                                                                                             '

In [None]:
import pandas as pd

# Read the articles table and take the text in the first row as the sample input.
df = pd.read_csv("articl_2.csv")
article = df.loc[0, "article"]

In [None]:
# Summarize the sample article with the reloaded quantized model.
# truncation=True keeps over-long articles within the tokenizer's maximum length.
inputs = tokenizer(article, return_tensors="pt", truncation=True).to(model.device)

# LED uses local windowed attention; its documentation recommends global
# attention on (at least) the first token for summarization.
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1

# Without constraints the recorded output collapsed into one phrase repeated
# over and over. Blocking repeated 3-grams and capping the length prevents that.
# The concrete values are starting points; tune them for the task.
outputs = model.generate(
    **inputs,
    global_attention_mask=global_attention_mask,
    max_new_tokens=256,
    no_repeat_ngram_size=3,
)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

decoded_output

Input ids are automatically padded from 3593 to 4096 to be a multiple of `config.attention_window`: 1024


' we evaluate the state-of-the-art close-sourced close-sourced language model over a large dataset of close-sourced language models faced with knowledge conflicts . \n we observe that the three close-sourced language models tend to exhibit uncertainty when faced withknowledge conflicts .    \n * keywords : * close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-sourced language model , close-so

### Note: unfortunately, bitsandbytes is not supported on macOS, so a different quantization technique is needed to run a quantized version of this model on a Mac.


In [None]:
!pip install optimum[exporters]
!pip install torch



In [None]:
!pip install torch transformers



In [None]:
import torch

AttributeError: partially initialized module 'torch' has no attribute '_ops' (most likely due to a circular import)

In [None]:

# Load tokenizer and PyTorch weights from the Hub
tokenizer = AutoTokenizer.from_pretrained("Prathyusha101/led-large-16384-arxiv")
# Use AutoModelForSeq2SeqLM, which this notebook already imports: plain `AutoModel`
# is never imported here and would load the model without its generation head.
pt_model = AutoModelForSeq2SeqLM.from_pretrained("Prathyusha101/led-large-16384-arxiv")


AttributeError: partially initialized module 'torch' has no attribute '_ops' (most likely due to a circular import)

In [None]:
# Write the tokenizer and the model into the same local directory so that
# "local-pt-checkpoint" holds both.
tokenizer.save_pretrained("local-pt-checkpoint")
pt_model.save_pretrained("local-pt-checkpoint")