In [1]:
import os

import torch
from torch.nn.utils import prune
from torch.quantization import quantize_dynamic
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read the Hugging Face access token from the environment rather than embedding
# it in the notebook. If HF_TOKEN is unset this is None, and transformers falls
# back to credentials cached by `huggingface-cli login`.
# SECURITY: a token was previously committed in this notebook; treat it as
# compromised and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# Load model and tokenizer from Hugging Face.
# Both are taken from the same checkpoint so the tokenizer configuration and
# chat template match the model that is actually being compressed.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Magnitude pruning and dynamic quantization are post-training transforms and
# do not depend on train/eval mode; keep the model in inference mode.
model.eval()

# Define pruning on Linear layers
def apply_pruning(model, amount=0.3):
    """Apply L1 unstructured (magnitude) pruning to every ``nn.Linear`` in ``model``.

    Each Linear layer's ``weight`` is re-parametrized in place by
    ``torch.nn.utils.prune.l1_unstructured`` (adding ``weight_orig`` and
    ``weight_mask``); call ``prune.remove(module, 'weight')`` afterwards to
    make the pruning permanent.

    Args:
        model: Module whose Linear sub-modules are pruned in place.
        amount: Quantity to prune per layer. A float in [0.0, 1.0] is the
            fraction of weights to zero; an int is the absolute number of
            weights to zero (same semantics as ``prune.l1_unstructured``).

    Returns:
        None. One progress line is printed per pruned layer.
    """
    # An int amount is an absolute count, so reporting it as a percentage
    # would be wrong (e.g. amount=3 is not "300%").
    amount_is_count = isinstance(amount, int)
    for name, module in model.named_modules():
        if not isinstance(module, torch.nn.Linear):
            continue
        prune.l1_unstructured(module, name='weight', amount=amount)
        if amount_is_count:
            print(f"Pruned {name}: {amount} weights")
        else:
            print(f"Pruned {name}: {amount*100}% of weights")


In [2]:
# Prune 30% of the weights in every Linear layer of the model
pruning_amount = 0.3
apply_pruning(model, pruning_amount)

# Make the pruning permanent: drop the pruning re-parametrization from each
# Linear layer so that only the already-pruned `weight` remains
for layer in model.modules():
    if not isinstance(layer, torch.nn.Linear):
        continue
    prune.remove(layer, 'weight')

# Dynamically quantize the Linear layers to int8 (returns a converted copy)
quantized_model = quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)

Pruned model.layers.0.self_attn.q_proj: 30.0% of weights
Pruned model.layers.0.self_attn.k_proj: 30.0% of weights
Pruned model.layers.0.self_attn.v_proj: 30.0% of weights
Pruned model.layers.0.self_attn.o_proj: 30.0% of weights
Pruned model.layers.0.mlp.gate_proj: 30.0% of weights
Pruned model.layers.0.mlp.up_proj: 30.0% of weights
Pruned model.layers.0.mlp.down_proj: 30.0% of weights
Pruned model.layers.1.self_attn.q_proj: 30.0% of weights
Pruned model.layers.1.self_attn.k_proj: 30.0% of weights
Pruned model.layers.1.self_attn.v_proj: 30.0% of weights
Pruned model.layers.1.self_attn.o_proj: 30.0% of weights
Pruned model.layers.1.mlp.gate_proj: 30.0% of weights
Pruned model.layers.1.mlp.up_proj: 30.0% of weights
Pruned model.layers.1.mlp.down_proj: 30.0% of weights
Pruned model.layers.2.self_attn.q_proj: 30.0% of weights
Pruned model.layers.2.self_attn.k_proj: 30.0% of weights
Pruned model.layers.2.self_attn.v_proj: 30.0% of weights
Pruned model.layers.2.self_attn.o_proj: 30.0% of weig

In [4]:
# Save the state dictionary.
# The target directory must exist before torch.save writes into it; create it
# explicitly so this cell works on a fresh kernel / fresh checkout.
save_path = "llama-quantized-pruned"
os.makedirs(save_path, exist_ok=True)
# NOTE(review): this is the state dict of the dynamically quantized model; it
# likely cannot be loaded into the unquantized architecture as-is — confirm the
# reload path (rebuild the model, re-apply quantize_dynamic, then load_state_dict).
torch.save(quantized_model.state_dict(), f"{save_path}/pytorch_model.bin")

# Save the tokenizer next to the weights
tokenizer.save_pretrained(save_path)

print("Quantization and pruning complete. Model and tokenizer saved.")

Quantization and pruning complete. Model and tokenizer saved.
