In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
from typing import Dict, Any, Tuple

import pytorch_lightning as pl
from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK

from src.lab1.shakespeare_trainer import ShakespeareModule

In [93]:
module = ShakespeareModule.load_from_checkpoint("src/lab1/checkpoints/float-best.ckpt")

In [105]:
class QLinear(nn.Module):
    """
    A fully-connected layer with symmetric uniform quantization for weights and activations.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        weight_bitwidth: int = 8,
        act_bitwidth: int = 8,
    ) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight_bitwidth = weight_bitwidth
        self.act_bitwidth = act_bitwidth

        # Buffers to hold quantized weight and scale
        self.register_buffer(
            "qweight",
            torch.zeros(out_features, in_features, dtype=torch.float32),
        )
        self.register_buffer("weight_scale", torch.ones(1))

        # Optional bias stored in float32
        if bias:
            self.register_buffer("bias", torch.zeros(out_features, dtype=torch.float32))
        else:
            self.bias = None

    @staticmethod
    def _quantize_tensor(
        x: torch.Tensor, bitwidth: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Quantize a tensor to signed integers in [-2^(b-1), 2^(b-1)-1].
        Returns (quantized_tensor, scale).
        """
        qmax = 2 ** (bitwidth - 1) - 1
        rmax = x.abs().max()
        scale = rmax / qmax if rmax > 0 else torch.tensor(1.0, device=x.device)
        q = torch.clamp(torch.round(x / scale), -qmax, qmax)
        return q, scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # 1. Quantize activations
        qx, act_scale = self._quantize_tensor(x, self.act_bitwidth)

        # 2. Integer GEMM
        qx = qx.to(self.qweight.dtype)
        acc = qx.matmul(self.qweight.t())

        # 3. Dequantize
        y = acc * act_scale * self.weight_scale

        # 4. Add bias if present
        if self.bias is not None:
            y = y + self.bias

        return y

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"in={self.in_features}, out={self.out_features}, "
            f"w_bits={self.weight_bitwidth}, a_bits={self.act_bitwidth})"
        )

In [106]:
print(module)
    

ShakespeareModule(
  (model): AutoRegressiveTransformer(
    (embedding): Embedding(1024, 256)
    (pos_encoder): Embedding(1024, 256)
    (transformer_blocks): ModuleList(
      (0-7): 8 x TransformerBlock(
        (self_attn): MultiHeadAttention(
          (qkv_proj): Linear(in_features=256, out_features=768, bias=True)
          (out_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForward(
          (linear1): Linear(in_features=256, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=256, bias=True)
          (activation): GELU(approximate='none')
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
 

In [107]:
def get_init_qconfig(module):
    blocks = module.model.transformer_blocks
    qconfig = {}
    for name, module in blocks.named_modules(): 
        if isinstance(module, nn.Linear): 
            qconfig[f"model.transformer_blocks.{name}"] = 8
    return qconfig

In [108]:
module = ShakespeareModule.load_from_checkpoint("src/lab1/checkpoints/float-best.ckpt")
qconfig = get_init_qconfig(module)

In [109]:
qmodule = quantize_model(module, qconfig)

In [112]:
def obj_function(params): 
    trainer = pl.Trainer(
        max_epochs=1,                                      # the number of epochs to train fotraining
        accelerator="auto",                                 # the accelerator to use during training
        devices="auto",                                     # the devices to use during training
        precision="16-mixed",                                        # the precision to use during training 
    )
    module = ShakespeareModule.load_from_checkpoint("src/lab1/checkpoints/float-best.ckpt")
    module = quantize_model(module, params)
    result = trainer.test(module, verbose=False)
    print(result)
    return result[0]['test_loss']
    

In [113]:
module = ShakespeareModule.load_from_checkpoint("src/lab1/checkpoints/float-best.ckpt")
qconfig = get_init_qconfig(module)

obj_function(qconfig)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

[{'test_loss': nan, 'test_perplexity': nan}]


nan

In [80]:
space = {key: hp.choice(key, [2, 4, 8]) for key in qconfig.keys()}



In [81]:
from hyperopt import fmin, tpe, hp, Trials


space = {key: hp.choice(key, [2, 4, 8]) for key in qconfig.keys()}

trials = Trials()

best = fmin(
    fn=obj_function,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

print(best)


  0%|                                                                                                     | 0/10 [00:00<?, ?trial/s, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 10%|█████████▎                                                                                   | 1/10 [00:03<00:35,  3.89s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 20%|██████████████████▌                                                                          | 2/10 [00:07<00:30,  3.87s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 30%|███████████████████████████▉                                                                 | 3/10 [00:11<00:27,  3.86s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 40%|█████████████████████████████████████▏                                                       | 4/10 [00:15<00:23,  3.85s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 50%|██████████████████████████████████████████████▌                                              | 5/10 [00:19<00:19,  3.85s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 60%|███████████████████████████████████████████████████████▊                                     | 6/10 [00:23<00:15,  3.86s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 70%|█████████████████████████████████████████████████████████████████                            | 7/10 [00:27<00:11,  3.86s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 80%|██████████████████████████████████████████████████████████████████████████▍                  | 8/10 [00:30<00:07,  3.86s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


 90%|███████████████████████████████████████████████████████████████████████████████████▋         | 9/10 [00:34<00:03,  3.86s/trial, best loss=?][A

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


100%|████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:38<00:00,  3.86s/trial, best loss=?][A


AllTrialsFailed: 