In [1]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import os
import numpy as np

# To pin the notebook to a single GPU, set CUDA_VISIBLE_DEVICES here, before
# the first CUDA call in this cell, e.g.:
# os.environ["CUDA_VISIBLE_DEVICES"] = "9"

# Hugging Face access token, read from the environment so that no credential
# is stored in the notebook. The value is None when HF_TOKEN is unset.
# NOTE(review): passing token=None to from_pretrained is expected to fall back
# to credentials cached by `huggingface-cli login` - confirm for your setup.
# NOTE(review): a literal token used to be hard-coded on this line; treat it
# as leaked and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# Collect unreachable Python objects first, so that any CUDA tensors they own
# are released before the allocator cache is emptied below.
import gc
gc.collect()

# Return cached, unused GPU memory to the driver.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Device string for later cells; nothing is moved to the device here.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit bitsandbytes quantization config with float16 compute dtype.
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)


In [2]:
def prune_model(model, pruning_percentage=0.2):
    """
    Zero out the smallest-magnitude entries of each trainable weight tensor.

    Pruning is unstructured and applied per tensor: for every parameter whose
    name contains 'weight' and that has requires_grad set, the
    `pruning_percentage` percentile of the absolute values is used as a
    threshold, and every entry whose magnitude is at or below it is set to
    zero. Entries tied with the threshold are pruned too, so slightly more
    than the requested fraction can be removed.

    Args:
        model (torch.nn.Module): The model to prune. It is modified in place.
        pruning_percentage (float): Fraction of each tensor to prune, in the
            range [0, 1] (0.2 means 20%). A value of 0 leaves the model
            untouched.

    Returns:
        torch.nn.Module: The same model object, with pruned weights.

    Raises:
        ValueError: If pruning_percentage is outside [0, 1].
    """
    if not 0.0 <= pruning_percentage <= 1.0:
        raise ValueError(
            f"pruning_percentage must be within [0, 1], got {pruning_percentage!r}"
        )

    # Nothing to prune. Without this guard the threshold would be the minimum
    # magnitude, and the "<= threshold" test would still zero that entry.
    if pruning_percentage == 0:
        return model

    with torch.no_grad():
        for name, param in model.named_parameters():
            # NOTE(review): the name filter also matches normalization and
            # embedding weights; confirm that pruning those is intended.
            if 'weight' not in name or not param.requires_grad:
                continue

            # A percentile of an empty tensor is undefined.
            if param.numel() == 0:
                continue

            # Magnitudes are computed in float32 so that half-precision
            # parameters (bfloat16 in particular, which NumPy cannot
            # represent) can be handed to np.percentile.
            magnitudes = param.detach().float().abs()
            threshold = float(
                np.percentile(magnitudes.cpu().numpy(), pruning_percentage * 100)
            )

            # Zero the pruned entries in place, on the parameter's own device
            # and in its own dtype.
            param.masked_fill_(magnitudes <= threshold, 0.0)

    return model

# Load the model and its tokenizer. The repository is gated, so the same
# access token is passed to both calls.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Report the size of every parameter tensor before pruning.
print("Model parameters before pruning:")
for name, param in model.named_parameters():
    print(f"{name}: {param.numel()} parameters")

# Magnitude-prune 30% of each weight tensor; prune_model edits the model in place.
pruning_percentage = 0.3
model = prune_model(model, pruning_percentage)

# Report how many entries of each tensor are still non-zero.
# count_nonzero avoids building the full index tensor that nonzero() returns,
# which is very large for the embedding matrix.
print("\nModel parameters after pruning:")
for name, param in model.named_parameters():
    print(f"{name}: {torch.count_nonzero(param).item()} active parameters")

# Save the pruned model together with its tokenizer.
# NOTE(review): the checkpoint is presumably stored dense, so zeroed weights
# would not reduce its size on disk - confirm if file size matters.
pruned_model_path = "magnitude_pruned_model"
model.save_pretrained(pruned_model_path)
tokenizer.save_pretrained(pruned_model_path)

print(f"Pruned model saved to {pruned_model_path}")


Model parameters before pruning:
model.embed_tokens.weight: 262668288 parameters
model.layers.0.self_attn.q_proj.weight: 4194304 parameters
model.layers.0.self_attn.k_proj.weight: 1048576 parameters
model.layers.0.self_attn.v_proj.weight: 1048576 parameters
model.layers.0.self_attn.o_proj.weight: 4194304 parameters
model.layers.0.mlp.gate_proj.weight: 16777216 parameters
model.layers.0.mlp.up_proj.weight: 16777216 parameters
model.layers.0.mlp.down_proj.weight: 16777216 parameters
model.layers.0.input_layernorm.weight: 2048 parameters
model.layers.0.post_attention_layernorm.weight: 2048 parameters
model.layers.1.self_attn.q_proj.weight: 4194304 parameters
model.layers.1.self_attn.k_proj.weight: 1048576 parameters
model.layers.1.self_attn.v_proj.weight: 1048576 parameters
model.layers.1.self_attn.o_proj.weight: 4194304 parameters
model.layers.1.mlp.gate_proj.weight: 16777216 parameters
model.layers.1.mlp.up_proj.weight: 16777216 parameters
model.layers.1.mlp.down_proj.weight: 16777216 p