# EfficientNet in PyTorch

In [1]:
import math
import os
import tempfile

import torch
from torch import nn, optim
from torchinfo import summary

In [2]:
def conv_block(in_channels, out_channels, kernel_size=3,
               stride=1, padding=0, groups=1,
               bias=False, bn=True, act=True):
    """
    Build a Conv2d -> BatchNorm2d -> SiLU stack as an ``nn.Sequential``.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by the convolution.
        kernel_size, stride, padding, groups, bias: forwarded to ``nn.Conv2d``.
        bn: when False the batch-norm slot is filled with ``nn.Identity``.
        act: when False the activation slot is filled with ``nn.Identity``.

    Returns:
        A three-element ``nn.Sequential``; disabled slots are kept as
        ``nn.Identity`` so the child indices never shift.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=bias,
    )

    if bn:
        norm = nn.BatchNorm2d(out_channels)
    else:
        norm = nn.Identity()

    if act:
        activation = nn.SiLU()
    else:
        activation = nn.Identity()

    return nn.Sequential(conv, norm, activation)

In [3]:
class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation channel attention.

    The input is squeezed to one value per channel by global *average*
    pooling, passed through a 1x1-conv bottleneck (``c -> c // r -> c``) and a
    sigmoid, and the resulting per-channel gates rescale the input.

    Args:
        c: number of channels of the incoming feature map.
        r: reduction ratio of the bottleneck. The bottleneck width is clamped
            to at least one channel so that ``c < r`` still yields a valid
            layer.
    """
    def __init__(self, c, r=24):
        super(SEBlock, self).__init__()
        # Never let the bottleneck collapse to zero channels when c < r.
        squeezed = max(1, c // r)
        # Global average pooling is the squeeze operation of SE blocks;
        # max pooling would gate on a single outlier activation per channel.
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excitation = nn.Sequential(
            nn.Conv2d(c, squeezed, kernel_size=1),
            nn.SiLU(),
            nn.Conv2d(squeezed, c, kernel_size=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` rescaled channel-wise by the learned gates."""
        s = self.squeeze(x)      # one descriptor per channel, 1x1 spatial
        e = self.excitation(s)   # gates in (0, 1), broadcast over H and W
        return x * e

In [4]:
class MBConv(nn.Module):
    """
    Inverted-residual block: 1x1 expansion, depthwise conv, squeeze-and-
    excitation, then a linear 1x1 projection.

    Args:
        n_in: input channels.
        n_out: output channels.
        expansion: multiplier applied to ``n_in`` for the hidden width; with
            ``expansion == 1`` the expansion conv is replaced by an identity.
        kernel_size: depthwise kernel size (padding is ``(kernel_size-1)//2``).
        stride: depthwise stride.
        r: reduction ratio handed to the ``SEBlock``.
        dropout: probability of the ``nn.Dropout`` applied to the block output
            before it is added to the shortcut.

    The shortcut (and therefore the dropout) is only used when the block keeps
    both the channel count and the resolution, i.e. ``n_in == n_out`` and
    ``stride == 1``.
    """
    def __init__(self, n_in, n_out, expansion, kernel_size=3, stride=1, r=24, dropout=0.1):
        super().__init__()
        hidden = expansion * n_in
        same_pad = (kernel_size - 1) // 2
        self.skip_connection = (stride == 1) and (n_in == n_out)

        # Pointwise expansion (skipped entirely when there is nothing to expand).
        if expansion == 1:
            self.expand_pw = nn.Identity()
        else:
            self.expand_pw = conv_block(n_in, hidden, kernel_size=1)

        # Depthwise conv: one group per channel.
        self.depthwise = conv_block(
            hidden, hidden,
            kernel_size=kernel_size,
            stride=stride,
            padding=same_pad,
            groups=hidden,
        )
        self.se = SEBlock(hidden, r=r)
        # Linear projection back down: batch norm but no activation.
        self.reduce_pw = conv_block(hidden, n_out, kernel_size=1, act=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = self.expand_pw(x)
        out = self.depthwise(out)
        out = self.se(out)
        out = self.reduce_pw(out)
        if not self.skip_connection:
            return out
        return self.dropout(out) + x

In [5]:
def mbconv1(n_in, n_out, kernel_size=3, stride=1, r=24, dropout=0.1):
    """Create an ``MBConv`` block with expansion factor 1 (no expansion conv)."""
    expansion = 1
    return MBConv(
        n_in,
        n_out,
        expansion,
        kernel_size=kernel_size,
        stride=stride,
        r=r,
        dropout=dropout,
    )

In [6]:
def mbconv6(n_in, n_out, kernel_size=3, stride=1, r=24, dropout=0.1):
    """Create an ``MBConv`` block with expansion factor 6."""
    expansion = 6
    return MBConv(
        n_in,
        n_out,
        expansion,
        kernel_size=kernel_size,
        stride=stride,
        r=r,
        dropout=dropout,
    )

In [7]:
def create_stage(n_in, n_out, num_layers, layer=mbconv6,
                 kernel_size=3, stride=1, r=24, ps=0):
    """
    Assemble one EfficientNet stage as an ``nn.Sequential`` of blocks.

    The first block maps ``n_in -> n_out`` and carries the stage's ``stride``.
    The remaining ``num_layers - 1`` blocks map ``n_out -> n_out`` and use the
    block factory's default stride. At least one block is always created.

    Args:
        n_in: channels entering the stage.
        n_out: channels leaving the stage.
        num_layers: total number of blocks in the stage.
        layer: block factory, called as
            ``layer(n_in, n_out, kernel_size=..., stride=..., r=..., dropout=...)``.
        kernel_size: kernel size forwarded to every block.
        stride: stride of the first block only.
        r: squeeze-and-excitation reduction ratio forwarded to every block.
        ps: dropout probability forwarded to every block.
    """
    blocks = [
        layer(n_in, n_out, kernel_size=kernel_size, stride=stride, r=r, dropout=ps)
    ]
    for _ in range(num_layers - 1):
        blocks.append(
            layer(n_out, n_out, kernel_size=kernel_size, r=r, dropout=ps)
        )
    return nn.Sequential(*blocks)

EfficientNet Base structure

| Stage (i) | Layer                   | Resolution | Channels | Layers |
|-----------|-------------------------|------------|----------|--------|
| 1         | `conv3x3` (stem)        | 224 x 224  | 32       | 1      |
| 2         | `mbconv1`               | 112 x 112  | 16       | 1      |
| 3         | `mbconv6`               | 112 x 112  | 24       | 2      |
| 4         | `mbconv6`               | 56 x 56    | 40       | 2      |
| 5         | `mbconv6`               | 28 x 28    | 80       | 3      |
| 6         | `mbconv6`               | 14 x 14    | 112      | 3      |
| 7         | `mbconv6`               | 14 x 14    | 192      | 4      |
| 8         | `mbconv6`               | 7 x 7      | 320      | 1      |
| 9         | `conv1x1` + pool + FC   | 7 x 7      | 1280     | 1      |

In [8]:
### Base (B0) configuration, obtained from the paper ###

# Output channels of: the stem conv, the seven MBConv stages, the final 1x1 conv.
widths = [
    32,
    16, 24, 40, 80, 112, 192, 320,
    1280,
]

# Per-stage settings; index i describes the stage mapping widths[i] -> widths[i+1].
depths = [1, 2, 2, 3, 3, 4, 1]                          # number of blocks
kernel_sizes = [3, 3, 5, 3, 5, 5, 3]                    # depthwise kernel size
strides = [1, 2, 2, 2, 1, 2, 1]                         # stride of the first block
ps = [0, 0.029, 0.057, 0.086, 0.114, 0.143, 0.171]      # dropout probability per block

In [9]:
def scale_width(w, w_factor, divisor=8):
    """
    Scale a channel count and snap it to a multiple of ``divisor``.

    Args:
        w: base number of channels.
        w_factor: width multiplier applied to ``w``.
        divisor: the result is a multiple of this value and never smaller
            than it. The default of 8 reproduces the previously hard-coded
            behaviour. Expected to be a positive integer.

    Returns:
        The scaled channel count as an ``int``.

    Raises:
        ValueError: if ``divisor`` is smaller than 1.
    """
    if divisor < 1:
        raise ValueError("divisor must be a positive integer, got {!r}".format(divisor))

    scaled = w * w_factor
    # Round to the nearest multiple of `divisor` (ties round up), but never
    # go below a single `divisor`.
    rounded = max(divisor, int(scaled + divisor / 2) // divisor * divisor)
    # Rounding down must not drop more than 10% of the requested width.
    if rounded < 0.9 * scaled:
        rounded += divisor
    return int(rounded)

In [10]:
def efficientnet_scaler(w_factor=1, d_factor=1):
    """
    Apply the width and depth multipliers to the base configuration.

    Every entry of the module-level ``widths`` goes through ``scale_width``;
    every entry of ``depths`` is multiplied by ``d_factor`` and rounded up.

    Returns:
        ``(scaled_widths, scaled_depths)`` as two lists of ints.
    """
    scaled_widths = []
    for base_width in widths:
        scaled_widths.append(scale_width(base_width, w_factor))

    scaled_depths = []
    for base_depth in depths:
        scaled_depths.append(math.ceil(d_factor * base_depth))

    return scaled_widths, scaled_depths

In [11]:
class EfficientNet(nn.Module):
    """
    EfficientNet as an ``nn.Module`` with named sub-modules.

    The network is split into ``stem``, ``stages``, ``pre``, ``pool_flatten``
    and ``head``, so individual parts (for example the classification head)
    can be replaced after construction.

    Args:
        w_factor: width multiplier passed to ``efficientnet_scaler``.
        d_factor: depth multiplier passed to ``efficientnet_scaler``.
        n_classes: output size of the final linear layer.
    """
    def __init__(self, w_factor=1, d_factor=1, n_classes=1000):
        super().__init__()
        channels, repeats = efficientnet_scaler(w_factor=w_factor, d_factor=d_factor)

        # 3x3 stride-2 conv on the 3-channel input.
        self.stem = conv_block(3, channels[0], stride=2, padding=1)

        # Seven stages; only the first uses mbconv1 (with SE ratio 4).
        stage_modules = []
        for i in range(7):
            is_first = i == 0
            stage_modules.append(
                create_stage(
                    channels[i],
                    channels[i + 1],
                    repeats[i],
                    layer=mbconv1 if is_first else mbconv6,
                    kernel_size=kernel_sizes[i],
                    stride=strides[i],
                    r=4 if is_first else 24,
                    ps=ps[i],
                )
            )
        self.stages = nn.Sequential(*stage_modules)

        # 1x1 conv widening to the final feature size, then pool + classify.
        self.pre = conv_block(channels[-2], channels[-1], kernel_size=1)
        self.pool_flatten = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.head = nn.Sequential(
            nn.Linear(channels[-1], n_classes)
        )

    def forward(self, x):
        features = self.pre(self.stages(self.stem(x)))
        pooled = self.pool_flatten(features)
        return self.head(pooled)

In [12]:
def EfficientNetSequential(w_factor=1, d_factor=1, n_classes=1000):
    """
    Build the same network as the ``EfficientNet`` class, but as one flat
    ``nn.Sequential``.

    The children are, in order: the stem conv block, the seven stages (one
    child each), the final 1x1 conv block, a pool + flatten ``nn.Sequential``
    and an ``nn.Sequential`` holding the linear classifier. Parts are only
    addressable by index, which makes this variant harder to customize than
    the class.

    Args:
        w_factor: width multiplier passed to ``efficientnet_scaler``.
        d_factor: depth multiplier passed to ``efficientnet_scaler``.
        n_classes: output size of the final linear layer.
    """
    channels, repeats = efficientnet_scaler(w_factor=w_factor, d_factor=d_factor)

    modules = [conv_block(3, channels[0], stride=2, padding=1)]

    for i in range(7):
        is_first = i == 0
        modules.append(
            create_stage(
                channels[i],
                channels[i + 1],
                repeats[i],
                layer=mbconv1 if is_first else mbconv6,
                kernel_size=kernel_sizes[i],
                stride=strides[i],
                r=4 if is_first else 24,
                ps=ps[i],
            )
        )

    modules.extend([
        conv_block(channels[-2], channels[-1], kernel_size=1),
        nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten()),
        nn.Sequential(nn.Linear(channels[-1], n_classes)),
    ])
    return nn.Sequential(*modules)

In [13]:
def efficientnet_b0(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B0: the builder's default width and depth factors."""
    model = builder(n_classes=n_classes)
    return model

In [14]:
def efficientnet_b1(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B1: width factor 1, depth factor 1.1."""
    width_factor, depth_factor = 1, 1.1
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [15]:
def efficientnet_b2(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B2: width factor 1.1, depth factor 1.2."""
    width_factor, depth_factor = 1.1, 1.2
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [16]:
def efficientnet_b3(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B3: width factor 1.2, depth factor 1.4."""
    width_factor, depth_factor = 1.2, 1.4
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [17]:
def efficientnet_b4(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B4: width factor 1.4, depth factor 1.8."""
    width_factor, depth_factor = 1.4, 1.8
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [18]:
def efficientnet_b5(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B5: width factor 1.6, depth factor 2.2."""
    width_factor, depth_factor = 1.6, 2.2
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [19]:
def efficientnet_b6(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B6: width factor 1.8, depth factor 2.6."""
    width_factor, depth_factor = 1.8, 2.6
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [20]:
def efficientnet_b7(n_classes=1000, builder=EfficientNet):
    """Build EfficientNet-B7: width factor 2, depth factor 3.1."""
    width_factor, depth_factor = 2, 3.1
    return builder(width_factor, depth_factor, n_classes=n_classes)

In [21]:
# Instantiate every variant once. b1, b3 and b6 are built with the flat
# EfficientNetSequential builder; the others use the EfficientNet class.
b0 = efficientnet_b0()
b1 = efficientnet_b1(builder=EfficientNetSequential)
b2 = efficientnet_b2()
b3 = efficientnet_b3(builder=EfficientNetSequential)
b4 = efficientnet_b4()
b5 = efficientnet_b5()
b6 = efficientnet_b6(builder=EfficientNetSequential)
b7 = efficientnet_b7()

In [22]:
# Smoke test: push one random tensor of shape (1, 3, 224, 224) through b0 and b1.
# NOTE(review): neither model is switched to .eval() and no torch.no_grad() is
# used here, so presumably this runs in training mode -- confirm that is intended.
inp = torch.randn(1, 3, 224, 224)
# Show both output shapes as the cell result.
b0(inp).shape, b1(inp).shape

(torch.Size([1, 1000]), torch.Size([1, 1000]))

In [23]:
def print_size_of_model(model):
    """
    Print the on-disk size, in megabytes, of ``model``'s serialized state dict.

    The state dict is saved into a private temporary directory that is removed
    when the function returns, even if saving or measuring fails. Nothing is
    written to the current working directory, so an existing ``temp.p`` there
    is neither overwritten nor deleted.

    Args:
        model: an ``nn.Module`` (anything exposing ``state_dict()``).
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # The file keeps its original name, "temp.p", inside the scratch directory.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

In [24]:
# Report the serialized size of every variant, from B0 up to B7.
for model in (b0, b1, b2, b3, b4, b5, b6, b7):
    print_size_of_model(model)

Size (MB): 21.446577
Size (MB): 31.593865
Size (MB): 36.885449
Size (MB): 49.471749
Size (MB): 78.111933
Size (MB): 122.546261
Size (MB): 173.387021
Size (MB): 267.054441


In [25]:
def fmat(n):
    """Format a count in millions with two decimals, e.g. 5288548 -> '5.29M'."""
    millions = n / 1_000_000
    return f"{millions:.2f}M"

In [26]:
def params(model, f=True):
    """
    Count the trainable parameters of ``model``.

    Only parameters with ``requires_grad`` set are counted.

    Args:
        model: object exposing ``parameters()``.
        f: when truthy the count is returned formatted by ``fmat``;
            otherwise the raw integer is returned.
    """
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()

    if f:
        return fmat(total)
    return total

In [27]:
# Trainable-parameter counts for B0..B7.
# Roughly equivalent to the sizes reported in the paper:
# (5.3M, 7.8M, 9.2M, 12M, 19M, 30M, 43M, 66M)
tuple(params(model) for model in (b0, b1, b2, b3, b4, b5, b6, b7))

('5.29M', '7.79M', '9.11M', '12.23M', '19.34M', '30.39M', '43.04M', '66.35M')

In [28]:
# Summarise b0 for a (1, 3, 224, 224) input; depth=1 lists only the top-level children.
summary(b0, (1, 3, 224, 224), depth=1) # any of b0..b7 can be passed here instead.

Layer (type:depth-idx)                             Output Shape              Param #
EfficientNet                                       --                        --
├─Sequential: 1-1                                  [1, 32, 112, 112]         928
├─Sequential: 1-2                                  [1, 320, 7, 7]            3,594,460
├─Sequential: 1-3                                  [1, 1280, 7, 7]           412,160
├─Sequential: 1-4                                  [1, 1280]                 --
├─Sequential: 1-5                                  [1, 1000]                 1,281,000
Total params: 5,288,548
Trainable params: 5,288,548
Non-trainable params: 0
Total mult-adds (M): 385.87
Input size (MB): 0.60
Forward/backward pass size (MB): 107.89
Params size (MB): 21.15
Estimated Total Size (MB): 129.64

**Only B0 and B1 were tested with a forward pass (cell 22); B2–B7 were only instantiated and measured.**