In [None]:
!nvidia-smi # NVIDIA System Management Interface: prints GPU details (system management information) so we can confirm a GPU is attached to this runtime




## *The code below is specific to pushing Colab notebooks to Git; skip to the "Implementation" text cell below for the actual workflow.*

In [None]:
# Clear the notebook's widget metadata and cell outputs before pushing it to Git,
# so that the notebook stays readable there.

# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install nbformat

import nbformat

# Single source of truth for the notebook file that is cleaned in place.
NOTEBOOK_PATH = 'AWQ_Implementation.ipynb'

# Read notebook. The encoding is given explicitly because .ipynb files are UTF-8 JSON;
# relying on the platform default encoding can fail on non-ASCII cell content.
with open(NOTEBOOK_PATH, 'r', encoding='utf-8') as f:
    nb = nbformat.read(f, as_version=4)

# Remove problematic metadata (no-op when the 'widgets' key is absent).
nb.metadata.pop('widgets', None)

# Clear outputs and execution counters of every code cell.
for cell in nb.cells:
    if cell.cell_type == 'code':
        cell.outputs = []
        cell.execution_count = None

# Write the cleaned notebook back over the original file.
with open(NOTEBOOK_PATH, 'w', encoding='utf-8') as f:
    nbformat.write(nb, f)

print("✅ Cleaned!")

In [None]:
# Stage, commit and push the notebook file to the remote repository.
# NOTE(review): the commit message is hard-coded; edit it to describe the change being pushed.
# NOTE(review): presumably run from inside the cloned repository directory - confirm before running.

!git add AWQ_Implementation.ipynb
!git commit -m "Compression for opt-1.3b complete, 2.5x smaller size"
!git push


# **Implementation**

In [None]:
# Mount Google Drive at /content/drive; the later cells keep the repository and
# the model output under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive
# Switch the working directory to My Drive so that all model-related data is kept there.

In [None]:
# Clone the Git repository into the current directory.
# The current directory is on Google Drive, so the clone survives between sessions;
# the `test -d` guard skips the clone when the folder already exists, which keeps
# this cell re-runnable instead of failing with "destination path already exists".
!test -d Transformers || git clone https://github.com/Tanishk-Singh/Transformers.git

In [None]:
%cd Transformers
# Switch the working directory to the cloned Transformers repository.

In [None]:
# Set the commit author name and e-mail used by git in this repository
# (no --global flag is passed, so this is not a user-wide setting).
!git config user.name "Tanishk-Singh"
!git config user.email "tanwartanishk5@gmail.com"

In [None]:
# Show the state of the repository working tree before making changes.
!git status

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install llmcompressor transformers accelerate

# llmcompressor - the original AutoAWQ package is deprecated; llmcompressor (from the vLLM project) now maintains it
# transformers  - Hugging Face library used to download the latest LLM models
# accelerate    - Hugging Face library used to load large models efficiently on CPU/GPU

In [None]:
import torch

In [None]:
# Report the PyTorch build and whether a CUDA GPU is usable in this runtime.
print(f"PyTorch: {torch.__version__}")

cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# Only query the device name when CUDA is present: get_device_name(0) raises on a
# CPU-only runtime, which would hide the "CUDA available: False" diagnosis above.
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: none detected")

# Only torch has been imported at this point, so report exactly that.
print("PyTorch imported successfully!")

In [None]:
!mkdir -p models/awq_quantized
# Creates the folder awq_quantized inside the models directory. The -p (parents) flag also creates
# the parent directory when it does not exist yet.
# NOTE(review): the later cells shown here save to models/opt-1.3b-awq, not to this folder - confirm it is still needed.

In [None]:
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM, AutoTokenizer



In [None]:
# Hugging Face model id: looked up in the local cache first and downloaded
# from the Hugging Face Hub into the cache when it is not found there.
model_path = "facebook/opt-1.3b"

# Destination folder for the quantized (compressed) model.
output_dir = "/content/drive/MyDrive/Transformers/models/opt-1.3b-awq"

print("Loading FP16 model...")

# Loading options: automatic device placement, half-precision weights.
fp16_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}
model = AutoModelForCausalLM.from_pretrained(model_path, **fp16_load_options)

In [None]:
# Load the tokenizer that belongs to the model: it converts text into the
# numeric token ids that the model takes as input.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Walk over every parameter tensor (weight matrix) of the model and add up
# element count * bytes per element to get the total size in bytes.
fp16_bytes = 0
for param in model.parameters():
    fp16_bytes += param.numel() * param.element_size()

# Convert bytes to gigabytes (1 GB = 1e9 bytes here).
fp16_size = fp16_bytes / 1e9
print(f"Size of FP16 Parameters loaded: {fp16_size:.2f} GB")


In [None]:
# Destination folder for the quantized model (repeated here so this cell can be
# re-run without re-running the model-loading cell's assignment).
output_dir = "/content/drive/MyDrive/Transformers/models/opt-1.3b-awq"

print("\nQuantizing with AWQ (W4A16)...")

# Quantization recipe: every Linear layer except lm_head gets 4-bit symmetric
# integer weights, quantized in groups of 128. No activation settings are given.
# NOTE(review): the recipe uses QuantizationModifier rather than an AWQ-specific
# modifier - confirm against the llmcompressor docs that this applies AWQ as intended.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    targets: ["Linear"]
                    weights:
                        num_bits: 4
                        type: "int"
                        symmetric: True
                        strategy: "group"
                        group_size: 128
"""

# Run the one-shot compression with 512 calibration samples from "open_platypus".
# The result is deliberately NOT assigned back to the name `oneshot`: doing so would
# replace the imported function with its return value and break any re-run of this cell.
oneshot(
    model=model,
    dataset="open_platypus",
    num_calibration_samples=512,
    recipe=recipe,
    output_dir=output_dir
)

print(f"✅ Quantization complete! Saved to {output_dir}")

In [None]:
# Reload the quantized model from the folder it was saved to.
quant_model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/Transformers/models/opt-1.3b-awq",
    device_map="auto",
)

# Size of the quantized model: bytes summed over every parameter tensor,
# then converted to gigabytes (1 GB = 1e9 bytes here).
quant_bytes = 0
for param in quant_model.parameters():
    quant_bytes += param.numel() * param.element_size()
quant_size = quant_bytes / 1e9

# Side-by-side report of the FP16 size measured earlier and the quantized size.
print("Model Size Comparison:")
print(f"FP16:  {fp16_size:.2f} GB")
print(f"AWQ:   {quant_size:.2f} GB")
size_ratio = fp16_size / quant_size
print(f"Ratio: {size_ratio:.1f}x smaller")

In [None]:
# Show the state of the repository working tree after the quantization run.
!git status