In [1]:
import torch

### Quantization with Random Scale and Zero point

In [2]:
def linear_q_with_scale_and_zero_point(
    tensor, scale, zero_point, dtype = torch.int8):

    scaled_and_shifted_tensor = tensor / scale + zero_point ## linear quantion r = s(q-z) => q = (r/s)+z

    rounded_tensor = torch.round(scaled_and_shifted_tensor)

    q_min = torch.iinfo(dtype).min
    q_max = torch.iinfo(dtype).max

    q_tensor = rounded_tensor.clamp(q_min,q_max).to(dtype)

    return q_tensor

In [3]:
## example
test_tensor=torch.tensor(
    [[191.6, -13.5, 728.6],
     [92.14, 295.5,  -184],
     [0,     684.6, 245.5]]
)

In [4]:
## randomly selected scale and zero point
scale = 3.5
zero_point = -70

In [5]:
quantized_tensor = linear_q_with_scale_and_zero_point(
    test_tensor, scale, zero_point)

In [6]:
quantized_tensor

tensor([[ -15,  -74,  127],
        [ -44,   14, -123],
        [ -70,  126,    0]], dtype=torch.int8)

### Dequantization with Random scale and Zero point

In [9]:
## r = s(q-z)
def linear_dequantization(quantized_tensor, scale, zero_point):
    return scale * (quantized_tensor.float() - zero_point)

In [10]:
dequantized_tensor = linear_dequantization(
    quantized_tensor, scale, zero_point)

In [11]:
dequantized_tensor

tensor([[ 192.5000,  -14.0000,  689.5000],
        [  91.0000,  294.0000, -185.5000],
        [   0.0000,  686.0000,  245.0000]])

### Quantization error

In [13]:
dequantized_tensor - test_tensor

tensor([[  0.9000,  -0.5000, -39.1000],
        [ -1.1400,  -1.5000,  -1.5000],
        [  0.0000,   1.4000,  -0.5000]])

In [14]:
(dequantized_tensor - test_tensor).square()

tensor([[8.0999e-01, 2.5000e-01, 1.5288e+03],
        [1.2996e+00, 2.2500e+00, 2.2500e+00],
        [0.0000e+00, 1.9601e+00, 2.5000e-01]])

In [15]:
(dequantized_tensor - test_tensor).square().mean()

tensor(170.8753)

### Getting Scale and Zero point

In [16]:
q_min = torch.iinfo(torch.int8).min
q_max = torch.iinfo(torch.int8).max

In [17]:
q_min

-128

In [18]:
q_max

127

In [21]:
r_min = test_tensor.min().item()
r_max = test_tensor.max().item()

In [22]:
r_min

-184.0

In [23]:
r_max

728.5999755859375

In [24]:
scale = (r_max - r_min) / (q_max - q_min)

In [25]:
zero_point = q_min - (r_min / scale)

In [26]:
scale

3.578823433670343

In [27]:
zero_point

-76.58645490333825

In [28]:
zero_point = int(round(zero_point))

In [29]:
zero_point

-77

In [30]:
def get_q_scale_and_zero_point(tensor, dtype=torch.int8):

    q_min, q_max = torch.iinfo(dtype).min, torch.iinfo(dtype).max
    r_min, r_max = tensor.min().item(), tensor.max().item()

    scale = (r_max - r_min) / (q_max - q_min)

    zero_point = q_min - (r_min / scale)

    ## for the zero point out of range
    if zero_point < q_min:
        zero_point = q_min
    elif zero_point > q_max:
        zero_point = q_max
    else:
        zero_point = int(round(zero_point))

    return scale, zero_point

In [31]:
new_scale, new_zero_point = get_q_scale_and_zero_point(
    test_tensor)

In [32]:
new_scale

3.578823433670343

In [33]:
new_zero_point

-77

In [34]:
quantized_tensor = linear_q_with_scale_and_zero_point(
    test_tensor, new_scale, new_zero_point)

In [35]:
dequantized_tensor = linear_dequantization(quantized_tensor,
                                           new_scale, new_zero_point)

In [36]:
(dequantized_tensor-test_tensor).square().mean()

tensor(1.5730)

In [37]:
def linear_quantization(tensor, dtype=torch.int8):
    scale, zero_point = get_q_scale_and_zero_point(tensor,
                                                   dtype=dtype)

    quantized_tensor = linear_q_with_scale_and_zero_point(tensor,
                                                          scale,
                                                          zero_point,
                                                          dtype=dtype)

    return quantized_tensor, scale , zero_point

In [38]:
r_tensor = torch.randn((4, 4))

In [39]:
r_tensor

tensor([[ 0.2098,  1.2369, -1.2385, -1.7775],
        [-1.5253,  0.5834,  0.4669, -0.3304],
        [-0.9849,  0.5451, -0.2857, -0.9546],
        [ 0.8311, -1.2696,  0.6569,  1.7952]])

In [41]:
quantized_tensor, scale, zero_point = linear_quantization(r_tensor)

In [42]:
scale

0.014010466313829609

In [43]:
zero_point

-1

In [44]:
quantized_tensor

tensor([[  14,   87,  -89, -128],
        [-110,   41,   32,  -25],
        [ -71,   38,  -21,  -69],
        [  58,  -92,   46,  127]], dtype=torch.int8)

In [45]:
dequantized_tensor = linear_dequantization(quantized_tensor,
                                           scale, zero_point)

In [46]:
(dequantized_tensor-r_tensor).square().mean()

tensor(1.5046e-05)

### Linear Quantization : Symmetric Mode

In [47]:
## [-r_max,r_max]--->[-q_max,q_max]
def get_q_scale_symmetric(tensor, dtype=torch.int8):
    r_max = tensor.abs().max().item()
    q_max = torch.iinfo(dtype).max

    return r_max/q_max ## s = r_max/q_max

In [48]:
test_tensor = torch.randn((4, 4))

In [49]:
test_tensor

tensor([[-2.2265,  0.2409,  1.0324, -0.5506],
        [-0.4014, -0.9915,  0.7110, -0.2541],
        [-0.9055, -0.2417, -1.0889, -1.0305],
        [ 0.0103,  0.4765,  1.8184,  1.2104]])

In [50]:
get_q_scale_symmetric(test_tensor)

0.01753173287459246

In [51]:
def linear_q_symmetric(tensor, dtype=torch.int8):
    scale = get_q_scale_symmetric(tensor)

    ## in symmetric quantization zero point is = 0
    quantized_tensor = linear_q_with_scale_and_zero_point(tensor,
                                                     scale=scale,
                                                    zero_point=0,
                                                      dtype=dtype)

    return quantized_tensor, scale

In [52]:
quantized_tensor, scale = linear_q_symmetric(test_tensor)

In [53]:
quantized_tensor

tensor([[-127,   14,   59,  -31],
        [ -23,  -57,   41,  -14],
        [ -52,  -14,  -62,  -59],
        [   1,   27,  104,   69]], dtype=torch.int8)

In [54]:
scale

0.01753173287459246

In [55]:
dequantized_tensor = linear_dequantization(quantized_tensor,scale,0)

In [57]:
def quantization_error(tensor, quantized_tensor):
    return (tensor - quantized_tensor).square().mean()

In [58]:
quantization_error(test_tensor, dequantized_tensor)

tensor(2.7008e-05)

### Finer Granularities for more precision : Per Tensor , Per Channel , Per Group

In [59]:
## per tensor
test_tensor=torch.tensor(
    [[191.6, -13.5, 728.6],
     [92.14, 295.5,  -184],
     [0,     684.6, 245.5]]
)

In [60]:
quantized_tensor, scale = linear_q_symmetric(test_tensor)

In [61]:
dequantized_tensor = linear_dequantization(quantized_tensor, scale, 0)

In [62]:
print(f"""Quantization Error : \
{quantization_error(test_tensor, dequantized_tensor)}""")

Quantization Error : 2.5091912746429443


In [63]:
## per channel
## dim = 0 along the rows
## dim = 1 along the cols
dim=0
output_dim = test_tensor.shape[dim]

In [64]:
output_dim

3

In [65]:
scale = torch.zeros(output_dim)
scale

tensor([0., 0., 0.])

In [66]:
for index in range(output_dim):
    sub_tensor = test_tensor.select(dim,index)
    print(sub_tensor)
    scale[index] = get_q_scale_symmetric(sub_tensor)

tensor([191.6000, -13.5000, 728.6000])
tensor([  92.1400,  295.5000, -184.0000])
tensor([  0.0000, 684.6000, 245.5000])


In [67]:
scale

tensor([5.7370, 2.3268, 5.3906])

In [68]:
scale_shape = [1] * test_tensor.dim()
scale_shape

[1, 1]

In [69]:
scale_shape[dim] = -1

In [70]:
scale_shape

[-1, 1]

In [71]:
scale = scale.view(scale_shape)

In [72]:
copy_scale = scale
scale

tensor([[5.7370],
        [2.3268],
        [5.3906]])

In [73]:
## tensor division ---> row division,col division
m = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
s = torch.tensor([1,5,10])
scale1 = torch.tensor([[1], [5], [10]]) ## row division
out1 = m / scale1
scale2 = torch.tensor([[1, 5, 10]]) ## col division
out2 = m / scale2
print(m)
print(out1)
print(out2)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
tensor([[1.0000, 2.0000, 3.0000],
        [0.8000, 1.0000, 1.2000],
        [0.7000, 0.8000, 0.9000]])
tensor([[1.0000, 0.4000, 0.3000],
        [4.0000, 1.0000, 0.6000],
        [7.0000, 1.6000, 0.9000]])


In [74]:
quantized_tensor = linear_q_with_scale_and_zero_point(
    test_tensor, scale=scale, zero_point=0)

In [75]:
quantized_tensor

tensor([[ 33,  -2, 127],
        [ 40, 127, -79],
        [  0, 127,  46]], dtype=torch.int8)

In [76]:
def linear_q_symmetric_per_channel(r_tensor, dim, dtype=torch.int8):

    output_dim = r_tensor.shape[dim]
    ## store the scales
    scale = torch.zeros(output_dim)

    for index in range(output_dim):
        sub_tensor = r_tensor.select(dim, index)
        scale[index] = get_q_scale_symmetric(sub_tensor, dtype=dtype)

    ## reshape the scale
    scale_shape = [1] * r_tensor.dim()
    scale_shape[dim] = -1
    scale = scale.view(scale_shape)
    quantized_tensor = linear_q_with_scale_and_zero_point(
        r_tensor, scale=scale, zero_point=0, dtype=dtype)

    return quantized_tensor, scale

In [77]:
test_tensor=torch.tensor(
    [[191.6, -13.5, 728.6],
     [92.14, 295.5,  -184],
     [0,     684.6, 245.5]]
)

In [81]:
## along the rows (dim = 0)
quantized_tensor_0, scale_0 = linear_q_symmetric_per_channel(
    test_tensor, dim=0)

dequantized_tensor_0 = linear_dequantization(
    quantized_tensor_0, scale_0, 0)

## along the columns (dim = 1)
quantized_tensor_1, scale_1 = linear_q_symmetric_per_channel(
    test_tensor, dim=1)

dequantized_tensor_1 = linear_dequantization(
    quantized_tensor_1, scale_1, 0)

In [82]:
print(f"""Quantization Error : \
{quantization_error(test_tensor, dequantized_tensor_0)}""")

Quantization Error : 1.8084441423416138


In [83]:
print(f"""Quantization Error : \
{quantization_error(test_tensor, dequantized_tensor_1)}""")

Quantization Error : 1.0781488418579102


In [84]:
## per group
def linear_q_symmetric_per_group(tensor, group_size,
                                 dtype=torch.int8):

    t_shape = tensor.shape
    assert t_shape[1] % group_size == 0
    assert tensor.dim() == 2

    tensor = tensor.view(-1, group_size)

    quantized_tensor, scale = linear_q_symmetric_per_channel(
                                tensor, dim=0, dtype=dtype)

    quantized_tensor = quantized_tensor.view(t_shape)

    return quantized_tensor, scale

In [85]:
def linear_dequantization_per_group(quantized_tensor, scale,
                                    group_size):

    q_shape = quantized_tensor.shape
    quantized_tensor = quantized_tensor.view(-1, group_size)

    dequantized_tensor = linear_dequantization(quantized_tensor,
                                               scale, 0)

    dequantized_tensor = dequantized_tensor.view(q_shape)

    return dequantized_tensor

In [90]:
group_size = 3

In [91]:
quantized_tensor, scale = linear_q_symmetric_per_group(
    test_tensor, group_size=group_size)

dequantized_tensor = linear_dequantization_per_group(
    quantized_tensor, scale, group_size=group_size)

In [92]:
print(f"""Quantization Error : \
{quantization_error(test_tensor, dequantized_tensor)}""")

Quantization Error : 1.8084441423416138


### Building 8 bit Quantizer

In [96]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [97]:
random_int8 = torch.randint(-128,127,(32,16)).to(torch.int8) ## weights
random_hs = torch.randn((1, 16), dtype=torch.bfloat16) ## inputs
scales = torch.randn((1, 32), dtype=torch.bfloat16)
bias = torch.randn((1, 32), dtype=torch.bfloat16)

In [98]:
F.linear(random_hs, random_int8.to(random_hs.dtype))

tensor([[ -432.0000,   161.0000,   240.0000,  -456.0000,   536.0000,    91.5000,
            -7.4375,   -37.2500,  -175.0000,   105.5000,  -490.0000,   376.0000,
          -584.0000,   430.0000,   644.0000,   -79.0000,   191.0000,    92.5000,
          -154.0000,   146.0000,   237.0000,   326.0000,    94.5000,   338.0000,
         -1072.0000,   508.0000,   113.0000,  -648.0000,    19.7500,   -15.0000,
          -308.0000,   -48.7500]], dtype=torch.bfloat16)

In [99]:
F.linear(random_hs, random_int8.to(random_hs.dtype)) * scales

tensor([[ 251.0000,   19.0000, -382.0000,  128.0000, -852.0000,   90.0000,
           -2.9062,   42.0000,  322.0000,   71.5000,  -99.5000, -159.0000,
         -840.0000, -510.0000, 1136.0000,  -41.7500,  408.0000,  -16.5000,
           63.2500,  106.5000,   19.8750, -332.0000, -104.0000,  206.0000,
          366.0000,    2.9219,  140.0000,  105.5000,  -23.8750,  -21.0000,
          206.0000,   74.0000]], dtype=torch.bfloat16)

In [100]:
(F.linear(random_hs, random_int8.to(random_hs.dtype)) * scales) + bias ## ((input.casted_weight)*scale)+bias

tensor([[ 251.0000,   19.0000, -382.0000,  130.0000, -856.0000,   90.0000,
           -4.1250,   41.0000,  324.0000,   70.5000,  -99.0000, -160.0000,
         -840.0000, -510.0000, 1136.0000,  -40.2500,  408.0000,  -17.5000,
           61.5000,  106.0000,   20.6250, -330.0000, -105.0000,  206.0000,
          366.0000,    1.4766,  141.0000,  105.0000,  -23.2500,  -20.1250,
          205.0000,   76.0000]], dtype=torch.bfloat16)

In [101]:
def w8_a16_forward(weight, input, scales, bias=None):

    casted_weights = weight.to(input.dtype)
    output = F.linear(input, casted_weights) * scales

    if bias is not None:
        output = output + bias

    return output

In [102]:
print("With bias:\n\n",
      w8_a16_forward(random_int8, random_hs, scales, bias))

print("\nWithout bias:\n\n",
      w8_a16_forward(random_int8, random_hs, scales))

With bias:

 tensor([[ 251.0000,   19.0000, -382.0000,  130.0000, -856.0000,   90.0000,
           -4.1250,   41.0000,  324.0000,   70.5000,  -99.0000, -160.0000,
         -840.0000, -510.0000, 1136.0000,  -40.2500,  408.0000,  -17.5000,
           61.5000,  106.0000,   20.6250, -330.0000, -105.0000,  206.0000,
          366.0000,    1.4766,  141.0000,  105.0000,  -23.2500,  -20.1250,
          205.0000,   76.0000]], dtype=torch.bfloat16)

Without bias:

 tensor([[ 251.0000,   19.0000, -382.0000,  128.0000, -852.0000,   90.0000,
           -2.9062,   42.0000,  322.0000,   71.5000,  -99.5000, -159.0000,
         -840.0000, -510.0000, 1136.0000,  -41.7500,  408.0000,  -16.5000,
           63.2500,  106.5000,   19.8750, -332.0000, -104.0000,  206.0000,
          366.0000,    2.9219,  140.0000,  105.5000,  -23.8750,  -21.0000,
          206.0000,   74.0000]], dtype=torch.bfloat16)


In [115]:
class W8A16LinearLayer(nn.Module):
    def __init__(self, in_features, out_features,
                 bias=True, dtype=torch.float32):
        super().__init__()


        self.register_buffer(
            "int8_weights",
            torch.randint(
                -128, 127, (out_features, in_features), dtype=torch.int8
            )
        )

        self.register_buffer("scales",
                             torch.randn((out_features), dtype=dtype))

        if bias:
            self.register_buffer("bias",
                                 torch.randn((1, out_features),
                                             dtype=dtype))

        else:
            self.bias = None

    def quantize(self, weights):
        w_fp32 = weights.clone().to(torch.float32)

        scales = w_fp32.abs().max(dim=-1).values / 127
        scales = scales.to(weights.dtype)

        int8_weights = torch.round(weights
                        /scales.unsqueeze(1)).to(torch.int8)

        self.int8_weights = int8_weights
        self.scales = scales

    def forward(self, input):
        return w8_a16_forward(self.int8_weights,
                              input, self.scales, self.bias)

In [116]:
module = W8A16LinearLayer(4, 8)

In [117]:
print("Weights before:\n" , module.int8_weights)

Weights before:
 tensor([[ -23,    5,    7,  -11],
        [ 114,   51,   12,   78],
        [ -41,  -40,  -38, -125],
        [ -96, -104,  -13,   16],
        [-126,  -31,  -53,  -73],
        [  52,   34,  -16, -103],
        [  -4,   17,   78,   63],
        [ -20,   67,  -13,   91]], dtype=torch.int8)


In [118]:
random_matrix = torch.randn((4, 8), dtype=torch.bfloat16)

In [119]:
module.quantize(random_matrix)

In [120]:
module.scales

tensor([0.0150, 0.0156, 0.0162, 0.0128], dtype=torch.bfloat16)

In [121]:
module.scales.shape

torch.Size([4])

In [122]:
module.int8_weights.shape

torch.Size([4, 8])

In [123]:
## dequantized weights
module.int8_weights * module.scales.unsqueeze(1)

tensor([[ 0.7070, -0.4805, -0.7188, -1.2031, -1.3203,  1.9062, -0.4961, -0.2695],
        [-1.7734,  1.9766, -0.6523, -1.5703, -1.1484,  1.8359,  0.4043,  0.8398],
        [ 2.0625, -0.0488,  0.7812,  0.5859, -1.1016,  0.8750, -0.5039,  1.4297],
        [-0.8984,  0.1543, -0.4609,  1.6250,  0.3965, -1.0000, -0.3711,  1.3594]],
       dtype=torch.bfloat16)

In [124]:
## original weights
random_matrix

tensor([[ 0.7070, -0.4863, -0.7266, -1.1953, -1.3281,  1.9062, -0.4980, -0.2773],
        [-1.7656,  1.9766, -0.6562, -1.5703, -1.1562,  1.8281,  0.4102,  0.8359],
        [ 2.0625, -0.0413,  0.7773,  0.5820, -1.1094,  0.8828, -0.4980,  1.4297],
        [-0.8906,  0.1475, -0.4609,  1.6250,  0.3984, -0.9961, -0.3711,  1.3672]],
       dtype=torch.bfloat16)

In [125]:
(random_matrix - module.int8_weights
 * module.scales.unsqueeze(1)).abs().mean()

tensor(0.0044, dtype=torch.bfloat16)

### Replace Pytorch layer with Quantized layer

In [127]:
def replace_linear_with_target(module,target_class, module_name_to_exclude):

    for name, child in module.named_children():
        if isinstance(child, nn.Linear) and not \
          any([x == name for x in module_name_to_exclude]):
            old_bias = child.bias

            new_module = target_class(child.in_features,
                                      child.out_features,
                                      old_bias is not None,
                                      child.weight.dtype)
            setattr(module, name, new_module)
            if old_bias is not None:
              getattr(module, name).bias = old_bias
        else:
            # Recursively call the function for nested modules
            replace_linear_with_target(
                child, target_class, module_name_to_exclude)

In [128]:
class DummyModel(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.emb = torch.nn.Embedding(1, 1)
    # Try with bias
    self.linear_1 = nn.Linear(1, 1)
    # Try without bias
    self.linear_2 = nn.Linear(1, 1, bias=False)
    # Lm prediction head
    self.lm_head = nn.Linear(1, 1, bias=False)

In [129]:
model_1 = DummyModel()
model_2 = DummyModel()

In [130]:
print(model_1)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): Linear(in_features=1, out_features=1, bias=True)
  (linear_2): Linear(in_features=1, out_features=1, bias=False)
  (lm_head): Linear(in_features=1, out_features=1, bias=False)
)


In [131]:
replace_linear_with_target(model_1, W8A16LinearLayer, ["lm_head"])
print(model_1)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): Linear(in_features=1, out_features=1, bias=False)
)


In [132]:
replace_linear_with_target(model_2, W8A16LinearLayer, [])
print(model_2)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): W8A16LinearLayer()
)


In [133]:
def replace_linear_with_target_and_quantize(module, target_class, module_name_to_exclude):

    for name, child in module.named_children():
        if isinstance(child, nn.Linear) and not \
        any([x == name for x in module_name_to_exclude]):
            old_bias = child.bias
            old_weight = child.weight

            new_module = target_class(child.in_features,
                                      child.out_features,
                                      old_bias is not None,
                                      child.weight.dtype)
            setattr(module, name, new_module)

            getattr(module, name).quantize(old_weight)

            if old_bias is not None:
              getattr(module, name).bias = old_bias
        else:
            # Recursively call the function for nested modules
            replace_linear_with_target_and_quantize(child,
                     target_class, module_name_to_exclude)

In [134]:
model_3 = DummyModel()

In [135]:
replace_linear_with_target_and_quantize(model_3, W8A16LinearLayer, ["lm_head"])
print(model_3)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): Linear(in_features=1, out_features=1, bias=False)
)


### Quantize any Open Source Pytorch Model

### [Salesforce/codegen-350M-mono](https://huggingface.co/Salesforce/codegen-350M-mono) Model Space Optimization

In [178]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

In [179]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [180]:
print(pipe("def hello_world():", max_new_tokens=20, do_sample=False))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'def hello_world():\n    print("Hello World")\n\nhello_world()\n\n# 파'}]


In [181]:
previous_memory_footprint = model.get_memory_footprint()

In [182]:
previous_memory_footprint

1510735872

In [183]:
print("Model before:\n\n", model)

Model before:

 CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bi

In [184]:
replace_linear_with_target_and_quantize(model,
                                        W8A16LinearLayer, ["lm_head"])

In [185]:
pipe.model

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): W8A16LinearLayer()
          (out_proj): W8A16LinearLayer()
        )
        (mlp): CodeGenMLP(
          (fc_in): W8A16LinearLayer()
          (fc_out): W8A16LinearLayer()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)

In [186]:
print(pipe("def hello_world():", max_new_tokens=20,
           do_sample=False)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def hello_world():
    print("Hello World")

hello_world()

# 파


In [187]:
new_footprint = model.get_memory_footprint()

In [188]:
new_footprint

756498432

In [189]:
print("Memory saved in MBs: ",
      (previous_memory_footprint - new_footprint)/1e+6)

Memory saved in MBs:  754.23744


In [191]:
print("Percentage of memory optimized: ",
      (previous_memory_footprint - new_footprint) / previous_memory_footprint * 100, "%")


Percentage of memory optimized:  49.92516918271733 %


### [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) Model Space Optimization

In [192]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

In [193]:
previous_memory_footprint = model.get_memory_footprint()

In [194]:
print("Footprint of the model in MBs: ",
      previous_memory_footprint/1e+6)

Footprint of the model in MBs:  166.524032


In [195]:
img_path = "TTtournamentChampion.10.17 PM.jpeg"
image = Image.open(img_path).convert("RGB")

In [196]:
model

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): ResNetBackbone(
          (embedder): ResNetEmbeddings(
            (embedder): ResNetConvLayer(
              (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
              (normalization): DetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (encoder): ResNetEncoder(
            (stages): ModuleList(
              (0): ResNetStage(
                (layers): Sequential(
                  (0): ResNetBottleNeckLayer(
                    (shortcut): ResNetShortCut(
                      (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                      (normalization): DetrFrozenBatchNorm2d()
                    )
                    (layer): Seq

In [197]:
replace_linear_with_target_and_quantize(model,
                                        W8A16LinearLayer,
               ["0", "1", "2", "class_labels_classifier"])

In [198]:
## Model after quantization
model

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): ResNetBackbone(
          (embedder): ResNetEmbeddings(
            (embedder): ResNetConvLayer(
              (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
              (normalization): DetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (encoder): ResNetEncoder(
            (stages): ModuleList(
              (0): ResNetStage(
                (layers): Sequential(
                  (0): ResNetBottleNeckLayer(
                    (shortcut): ResNetShortCut(
                      (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                      (normalization): DetrFrozenBatchNorm2d()
                    )
                    (layer): Seq

In [199]:
new_footprint = model.get_memory_footprint()

In [200]:
print("Footprint of the model in MBs: ",
      new_footprint/1e+6)

Footprint of the model in MBs:  114.80384


In [201]:
## Memory saved
print("Memory saved in MBs: ",
      (previous_memory_footprint - new_footprint)/1e+6)

Memory saved in MBs:  51.720192


In [202]:
print("Percentage of memory optimized: ",
      (previous_memory_footprint - new_footprint) / previous_memory_footprint * 100, "%")

Percentage of memory optimized:  31.0586954800614 %
