<a href="https://colab.research.google.com/github/Rohit-Singh12/Deep-LEARGNINGS/blob/main/Misc/Model%20Compression/Pruning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pruning
Pruning involoves removing neurons from the model which don't contibute much to model performance. In this way one can reduce the model size while still maintaining good performance.

### Pruning is done primarliy on two ways-


*   Neuron Pruning: Here the entire nodes are removed from the network resulting in small layers and hence faster inference
*   Weight Pruning: Here some of the weights in the matrices are set to 0 which don't contribute to model performance and the weights are then stored in CSR(Compressed Sparse Row) for efficient resource utilization. It doesn't improve interence time but model size is compressed



## Zero Pruning

It involves removing the weights from the matrices whose weight is close to zero. The idea is that the weight which is close to zero doesn't contribute much to model output. So, set the weights close to 0 to 0 and use CSR to store the sparse matrix efficiently.

In [1]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloadi

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.utilities.types import OptimizerLRScheduler
import torchmetrics

### Loading MNIST dataset from torcvision.datasets

In [3]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

trainset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

train_loader = DataLoader(trainset,
                          shuffle=True,
                          batch_size=64)

testset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

test_loader = DataLoader(testset,
                         shuffle=True,
                         batch_size=64)

100%|██████████| 9.91M/9.91M [00:00<00:00, 12.7MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 342kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 2.73MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 6.98MB/s]


### Create the Network for Training the model

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
class CNN(pl.LightningModule):
  def __init__(self):
    super(CNN, self).__init__()
    self.Accuracy = torchmetrics.Accuracy(
        task='multiclass',
        num_classes=10
    )
    self.conv = nn.Conv2d(in_channels=1,
                          out_channels=32,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          device=device)
    self.pool = nn.MaxPool2d(kernel_size=3,
                             stride=1)  #[batch_size, 32, 26, 26]

    self.fc1 = nn.Linear(32*26*26, 512)
    self.fc2 = nn.Linear(512, 256)
    self.fc3 = nn.Linear(256, 128)
    self.fc4 = nn.Linear(128, 10)

  def forward(self, input):
    X = F.relu(self.conv(input))
    X = self.pool(X)
    X = X.view(X.size(0), -1)
    X = F.relu(self.fc1(X))
    X = F.relu(self.fc2(X))
    X = F.relu(self.fc3(X))
    return self.fc4(X)

  def configure_optimizers(self) -> OptimizerLRScheduler:
    optimizer = torch.optim.Adam(self.parameters(),
                                 lr=0.001)
    schedular = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    return {
        "optimizer": optimizer,
        "lr_scheduler": {
            "scheduler": schedular,
            "interval": "epoch",
            "frequency": 1
        }
    }

  def training_step(self, batch, batch_idx):
    X,y = batch
    logits = self.forward(X)
    loss = F.cross_entropy(logits, y)
    accuracy = self.Accuracy(logits, y)

    self.log_dict({'train_loss': loss, 'accuracy': accuracy},
                  prog_bar=True,
                  on_epoch=True,
                  on_step=False)
    return loss

  def test_step(self, batch, batch_idx):
    X, y = batch
    logits = self.forward(X)
    loss = F.cross_entropy(logits, y)
    accuracy = self.Accuracy(logits, y)

    self.log_dict({'train_loss':loss, 'accuracy':accuracy},
                  prog_bar=True,
                  on_epoch=True,
                  on_step=False)
    return loss




In [6]:
model = CNN()
trainer = pl.Trainer(
    max_epochs=5,
    accelerator='gpu'
)

INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [7]:
trainer.fit(model, train_dataloaders=train_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type               | Params | Mode 
--------------------------------------------------------
0 | Accuracy | MulticlassAccuracy | 0      | train
1 | conv     | Conv2d             | 320    | train
2 | pool     | MaxPool2d          | 0      | train
3 | fc1      | Linear             | 11.1 M | train
4 | fc2      | Linear             | 131 K  | train
5 | fc3      | Linear             | 32.9 K | train
6 | fc4      | Linear             | 1.3 K  | train
--------------------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.968    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


#### Define threshold for to zero out all the weights below it

In [8]:
trainer.test(model, test_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'train_loss': 0.04744436591863632, 'accuracy': 0.986299991607666}]

In [9]:
threshold = 0.06
for name, param in model.named_parameters():
  if 'weight' in name:
    param.data[param.data.abs() < threshold]=0
trainer.test(model, test_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'train_loss': 0.2915991246700287, 'accuracy': 0.9660999774932861}]

In [10]:
total = 0
zeros = 0
for name, param in model.named_parameters():
    if 'weight' in name:
        total += param.numel()
        zeros += torch.sum(param == 0).item()

print(f"Sparsity: {100 * zeros / total:.2f}%")

Sparsity: 94.38%


As can be seen from above the matrix is 93% sparse. We can now use CSR to store these weights efficiently

In [11]:
import scipy.sparse as sp

sparse_weights = []
shapes = []

for name, param in model.named_parameters():
    if 'weight' in name:
        np_weight = param.data.cpu().numpy()
        original_shape = np_weight.shape
        shapes.append(original_shape)

        np_weight_2d = np_weight.reshape(np_weight.shape[0], -1)
        sparse_weights.append(sp.csr_matrix(np_weight_2d))


## Activation Pruning
It involves removing the weights from the hidden layer which don't contribute to the model performance.
The idea is that if the activation is very close to zero it will not have significant contribution to the output of the model as WX = 0 only

### Algorithm


*   Train the model
*   After training run the training data through the network again without calculating gradients
*   Calculate average activation of each neuron in the hiddent layer
*   Define threshold of pruning
*   Remove the weight matrices below threshold







In [12]:
model = CNN()
trainer = pl.Trainer(
    accelerator='gpu',
    max_epochs=4
)

INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [13]:
trainer.fit(model, test_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type               | Params | Mode 
--------------------------------------------------------
0 | Accuracy | MulticlassAccuracy | 0      | train
1 | conv     | Conv2d             | 320    | train
2 | pool     | MaxPool2d          | 0      | train
3 | fc1      | Linear             | 11.1 M | train
4 | fc2      | Linear             | 131 K  | train
5 | fc3      | Linear             | 32.9 K | train
6 | fc4      | Linear             | 1.3 K  | train
--------------------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.968    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


Now apply pruning on the Fully Connected Hidden Layer fc1, fc2 and fc3

In [14]:
all_activations = [
    torch.zeros(512), #fc1
    torch.zeros(256), #fc2
    torch.zeros(128) #fc3
]

In [15]:
data_size = 0
for images, _ in train_loader:
  data_size += images.size(0)

  X = F.relu(model.conv(images))
  X = model.pool(X)
  X = X.view(X.size(0), -1)
  activation_fc1 = F.relu(model.fc1(X))
  all_activations[0] += activation_fc1.sum(dim=0)

  activation_fc2 = F.relu(model.fc2(activation_fc1))
  all_activations[1] += activation_fc2.sum(dim=0)

  activation_fc3 = F.relu(model.fc3(activation_fc2))
  all_activations[2] += activation_fc3.sum(dim=0)

  if (images.size(0)*10 == data_size):
    break

  # print([i.shape for i in all_activations])


# Calculating the average activation
for idx, activation in enumerate(all_activations):
  all_activations[idx] = activation/data_size

Pruning neurons of fc1

In [19]:
threshold = 0.01
new_net = CNN()
new_net.fc1.weight = model.fc1.weight
new_net.fc2.weight = model.fc2.weight
new_net.fc3.weight = model.fc3.weight
new_net.fc4.weight = model.fc4.weight

new_net.fc1.bias = model.fc1.bias
new_net.fc2.bias = model.fc2.bias
new_net.fc3.bias = model.fc3.bias
new_net.fc4.bias = model.fc4.bias

new_net.fc1.weight = nn.Parameter(new_net.fc1.weight[all_activations[0]>=threshold])

new_net.fc2.weight = nn.Parameter(new_net.fc2.weight[:, all_activations[0]>=threshold])
new_net.fc2.weight = nn.Parameter(new_net.fc2.weight[all_activations[1]>=threshold])

new_net.fc3.weight = nn.Parameter(new_net.fc3.weight[:, all_activations[1]>=threshold])
new_net.fc3.weight = nn.Parameter(new_net.fc3.weight[all_activations[2]>=threshold])

new_net.fc4.weight = nn.Parameter(new_net.fc4.weight[:, all_activations[2]>=threshold])


new_net.fc1.bias = nn.Parameter(new_net.fc1.bias[all_activations[0]>=threshold])
new_net.fc2.bias = nn.Parameter(new_net.fc2.bias[all_activations[1]>=threshold])
new_net.fc3.bias = nn.Parameter(new_net.fc3.bias[all_activations[2]>=threshold])

Checking the performance

In [20]:
trainer.test(model, test_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'train_loss': 0.08634582161903381, 'accuracy': 0.9688000082969666}]

In [21]:
trainer.test(new_net, test_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'train_loss': 5.35197639465332, 'accuracy': 0.16349999606609344}]