In [1]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import os
import numpy as np

# os.environ["CUDA_VISIBLE_DEVICES"]="9"

# Hugging Face access token. Read it from the HF_TOKEN environment variable
# instead of hardcoding a secret in the notebook; falls back to the previous
# empty-string value when the variable is not set.
token = os.environ.get("HF_TOKEN", "")

# Force garbage collection first so tensors that are only kept alive by
# unreachable Python objects are released before the CUDA cache is flushed.
import gc
gc.collect()

# Clear GPU Cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Select the device to run on (this only picks the device string; nothing is
# moved here).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configure bitsandbytes quantization.
# NOTE: despite the "4bit" in the variable name, this config requests 8-bit
# loading (load_in_8bit=True). The name is kept so existing references to
# `quantization_config_4bit` keep working.
quantization_config_4bit = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_4bit_compute_dtype=torch.float16
)

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn.utils import prune

# Load the model and its matching tokenizer from Hugging Face
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, token=token)
# Load the tokenizer from the same checkpoint, with the same credentials, so
# the tokenizer saved next to the pruned weights is the one that belongs to
# this model rather than to a different (8B) checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Function to apply structured pruning to a given layer
def structured_prune_layer(layer, amount=0.3, n=1):
    """Apply L_n structured pruning to every ``torch.nn.Linear`` inside ``layer``.

    The module tree rooted at ``layer`` (``layer`` itself included) is
    searched for ``torch.nn.Linear`` instances, and each one has its
    ``weight`` pruned along ``dim=0`` with ``prune.ln_structured``.

    Args:
        layer: Module whose Linear sub-modules should be pruned.
        amount: Passed through as the ``amount`` argument of
            ``prune.ln_structured``.
        n: Passed through as the norm order ``n`` of ``prune.ln_structured``.

    Returns:
        None. The modules are modified in place.
    """
    linear_modules = [
        submodule
        for submodule in layer.modules()
        if isinstance(submodule, torch.nn.Linear)
    ]
    for linear in linear_modules:
        prune.ln_structured(linear, name="weight", amount=amount, dim=0, n=n)

# Apply structured pruning to each Linear layer of the model exactly once.
# model.named_modules() yields every module together with all of its
# ancestors, and structured_prune_layer() itself descends into sub-modules, so
# calling it on every yielded module would prune the same Linear once per
# nesting level and compound the pruning far beyond the requested amount.
# Restricting the outer loop to the Linear modules themselves avoids that.
# NOTE(review): this still includes every Linear in the model (e.g. an output
# head, if present) -- confirm that is intended.
linear_layers = [
    (name, module)
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
]
for name, layer in linear_layers:
    # Attention projections get the lighter pruning ratio. "self_attn" is
    # matched in addition to "mha" because the former is presumably how the
    # attention blocks of this model are named -- verify against the printed
    # module names.
    lowered_name = name.lower()
    if "mha" in lowered_name or "self_attn" in lowered_name:
        structured_prune_layer(layer, amount=0.2, n=2)
    else:
        structured_prune_layer(layer, amount=0.4, n=2)

# Remove pruning masks to make the pruning permanent
for name, module in linear_layers:
    prune.remove(module, "weight")

# Save the pruned model and tokenizer
output_dir = "structured_prune"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model and tokenizer saved to {output_dir}")


Pruned model and tokenizer saved to structured_prune
