# Libraries

In [1]:
# required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn as nn
import os
from transformers.models.gpt2.modeling_gpt2 import Conv1D

In [2]:
# Select the compute device: "cuda" when a CUDA GPU is available, otherwise "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


# Model Loading

In [3]:
# Load the pretrained checkpoint and its tokenizer by name
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# load the weights and place the model on the selected device
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Support functions

In [4]:
# Helper to measure the size of a model
def count_params(model):
    """Return the total number of elements over all tensors in ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
# Report the checkpoint name together with the parameter count of the loaded model
print(f"Model name: {model_name}, Parameters count: {count_params(model)} ")

Model name: distilgpt2, Parameters count: 81912576 


In [5]:
def get_output(prompt, model, tokenizer, max_length=10):
    """Generate a continuation of ``prompt`` and return it as decoded tokens.

    Args:
        prompt: Text to tokenize and feed to the model.
        model: Causal language model exposing ``generate`` and a ``device``
            attribute.
        tokenizer: Tokenizer matching ``model``.
        max_length: Value forwarded to ``generate`` as ``max_length``.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The result of ``tokenizer.batch_decode`` applied to the first
        generated sequence. That sequence is a single row of token ids, so
        each id is decoded on its own and the result is a list with one
        string per token (prompt tokens included).
    """
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level ``device`` variable being defined and in sync.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # Passing the pad id explicitly avoids the
        # "Setting `pad_token_id` to `eos_token_id`" notice on every call.
        pad_token_id=tokenizer.pad_token_id,
    )
    generated = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
    return generated

# Model summary

In [6]:
# Print the module tree of the loaded model (layer types and their sizes)
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [7]:
# Print the configuration object attached to the loaded model
print(model.config)

GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.48.2",
  "use_cache

In [8]:
# Baseline before pruning: record the parameter count and a sample generation
prompt = "Paris is the capital of"
base_params_count = count_params(model)
generated = get_output(prompt, model, tokenizer)

print(f"Base Models generated response: {generated}")
print(f"Base Models parameters: {base_params_count}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Base Models generated response: ['Paris', ' is', ' the', ' capital', ' of', ' the', ' United', ' States', '.', '\n']
Base Models parameters: 81912576


# Pruning the model

In [9]:
# Module-level placeholder for the pruned intermediate size.
# NOTE(review): no cell shown in this notebook reads this variable;
# update_model keeps its own local variable with the same name.
new_intermediate_size = None

## Pruning support functions

In [10]:
# Importance score per neuron, based on the L1 norm of its weights
def compute_imp_score(c_fc_weight):
    """Sum the absolute weight values along dim 0, giving one score per column."""
    return c_fc_weight.abs().sum(dim=0)  # Shape: [intermediate_size]

In [11]:
# Function to prune neurons and create new Conv1D layers
def prune_neurons(mlp, prune_percent, device):
    """Choose which MLP neurons to keep and build smaller replacement layers.

    Neurons are ranked by ``compute_imp_score`` applied to the ``c_fc``
    weight; the lowest-scoring ``prune_percent`` fraction is dropped.
    The returned layers are freshly constructed: their weights are NOT
    copied from ``mlp`` here.

    Args:
        mlp: Module with ``c_fc`` and ``c_proj`` layers whose ``weight``
            tensors are 2-D, with ``c_fc.weight`` laid out as
            (hidden_size, intermediate_size).
        prune_percent: Fraction of intermediate neurons to remove; must
            satisfy ``0 <= prune_percent < 1`` so at least one neuron is kept.
        device: Device the new layers are moved to.

    Returns:
        Tuple ``(new_c_fc, new_c_proj, new_intermediate_size, indices_to_keep)``
        where ``indices_to_keep`` is sorted in ascending order.

    Raises:
        ValueError: If ``prune_percent`` is outside ``[0, 1)``.
    """
    # Fail early with a clear message: a negative value would ask topk for
    # more elements than exist, and a value >= 1 would leave no neurons.
    if not 0 <= prune_percent < 1:
        raise ValueError(
            f"prune_percent must be in the range [0, 1), got {prune_percent!r}"
        )

    # Get the weights of the c_fc layer (input projection)
    c_fc_weight = mlp.c_fc.weight.data

    # Compute importance scores for each neuron
    importance_scores = compute_imp_score(c_fc_weight)

    # Determine how many neurons survive
    original_intermediate_size = c_fc_weight.size(1)  # This is intermediate_size
    num_neurons_to_prune = int(prune_percent * original_intermediate_size)
    num_neurons_to_keep = original_intermediate_size - num_neurons_to_prune

    # Get indices of neurons to keep (those with highest importance)
    _, indices_to_keep = torch.topk(importance_scores, num_neurons_to_keep)

    # Sort indices so the kept neurons stay in their original relative order
    indices_to_keep, _ = torch.sort(indices_to_keep)

    # Create new Conv1D layers with reduced size
    new_c_fc = Conv1D(len(indices_to_keep), mlp.c_fc.weight.size(0)).to(device)  # Conv1D(new_intermediate_size, hidden_size)
    new_c_proj = Conv1D(mlp.c_proj.weight.size(1), len(indices_to_keep)).to(device)  # Conv1D(hidden_size, new_intermediate_size)

    return new_c_fc, new_c_proj, len(indices_to_keep), indices_to_keep

In [12]:
# Function to copy weights and biases to new pruned layers
def copy_weights_and_biases(mlp, new_c_fc, new_c_proj, indices_to_keep):
    """Fill the pruned layers with the kept slices of the original MLP weights.

    Args:
        mlp: Module holding the original ``c_fc`` and ``c_proj`` layers.
        new_c_fc: Replacement input projection; receives the kept columns of
            ``mlp.c_fc.weight`` and the kept entries of ``mlp.c_fc.bias``.
        new_c_proj: Replacement output projection; receives the kept rows of
            ``mlp.c_proj.weight`` and a copy of the full ``mlp.c_proj.bias``.
        indices_to_keep: Index tensor of the intermediate neurons to keep.

    Returns:
        None. ``new_c_fc`` and ``new_c_proj`` are modified in place.
    """
    # Target the devices the new layers already live on, so this function
    # does not depend on a notebook-level ``device`` variable.
    fc_device = new_c_fc.weight.device
    proj_device = new_c_proj.weight.device

    new_c_fc.weight.data = mlp.c_fc.weight.data[:, indices_to_keep].to(fc_device)
    new_c_fc.bias.data = mlp.c_fc.bias.data[indices_to_keep].to(fc_device)

    new_c_proj.weight.data = mlp.c_proj.weight.data[indices_to_keep, :].to(proj_device)
    # clone() first: .to() hands back the very same tensor when no move is
    # needed, which would leave the new bias sharing storage with the old one.
    new_c_proj.bias.data = mlp.c_proj.bias.data.clone().to(proj_device)

# Pruning loop

In [13]:
def update_model(model, prune_percent, device):
    """Prune the MLP of every block in ``model.transformer.h`` in place.

    For each block, smaller ``c_fc``/``c_proj`` layers are built with
    ``prune_neurons``, filled with ``copy_weights_and_biases`` and swapped
    into the block's MLP. ``model.config.n_inner`` is then set to the size
    kept in the first block (``None`` when there are no blocks).

    Returns:
        The same ``model`` object that was passed in, now modified.
    """
    kept_sizes = []

    for block in model.transformer.h:
        mlp = block.mlp

        # Build the reduced layers and learn which neurons survive
        pruned_fc, pruned_proj, kept, keep_idx = prune_neurons(mlp, prune_percent, device)

        # Transfer the surviving weights and biases from the old layers
        copy_weights_and_biases(mlp, pruned_fc, pruned_proj, keep_idx)

        # Swap the original layers for the pruned ones
        mlp.c_fc = pruned_fc
        mlp.c_proj = pruned_proj

        kept_sizes.append(kept)

    # The configuration records the size taken from the first block
    model.config.n_inner = kept_sizes[0] if kept_sizes else None

    return model

# Checking the pruned model

In [14]:
# Fraction of each MLP's intermediate neurons to remove (0.3 removes 30%)
prune_percent = 0.3

In [15]:
# Prune every block's MLP. update_model swaps the layers on the passed-in
# model itself, so re-running this cell prunes the already pruned model again;
# reload the model first to start from the original weights.
model = update_model(model,prune_percent,device)

In [16]:
# Count parameters again and compare with the baseline taken before pruning
pruned_params_count = count_params(model)
print(
    f"Base Models parameters count: {base_params_count}",
    f"Prune Models parameters count: {pruned_params_count}",
    f"Reduction in parameters: {base_params_count - pruned_params_count}",
    sep="\n",
)

Base Models parameters count: 81912576
Prune Models parameters count: 73419114
Reduction in parameters: 8493462


In [17]:
# Print the module tree again to inspect the MLP layer sizes after pruning
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=2151, nx=768)
          (c_proj): Conv1D(nf=768, nx=2151)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [18]:
# Configuration of the pruned model; update_model has set n_inner on it
print(model.config)

GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 2151,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.48.2",
  "use_cache

In [19]:
# Generate from the same prompt with the pruned model, for comparison with the
# response printed before pruning
generated = get_output(prompt,model,tokenizer)
print(f"Generated text after pruning: {generated}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text after pruning: ['Paris', ' is', ' the', ' capital', ' of', ' the', ' United', ' States', '.', '\n']


# Saving the model

In [20]:
# Save the pruned model and its tokenizer side by side in one directory
output_dir = './pruned_distilgpt2'
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check and the gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

Pruned model saved to ./pruned_distilgpt2
