<a href="https://colab.research.google.com/github/SsrCode/ELNet/blob/master/ELNetV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show the GPU attached to this runtime (driver / CUDA version, memory use, running processes).
!nvidia-smi

Tue Apr  7 04:15:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     7W /  75W |      0MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
# Mount Google Drive into the Colab filesystem; the checkpoint cell at the end
# of the notebook writes under './drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_epochs = 200

# Dataset location and the per-channel normalization constants shared by the
# train and test pipelines (kept in one place so the two cannot drift apart).
DATA_ROOT = '/.data'
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2023, 0.1994, 0.2010)

# Image preprocessing modules
# Training: pad by 4 pixels, random horizontal flip, random 32x32 crop, then normalize.
transform_train = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    #transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD)
    ])

# Evaluation: no augmentation, only tensor conversion and normalization.
transform_test = transforms.Compose([
    #transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root=DATA_ROOT,
                                             train=True,
                                             transform=transform_train,
                                             download=True)

# train=False selects the held-out test split. The previous train=True meant
# the "test" accuracy was computed on the (un-augmented) training images.
test_dataset = torchvision.datasets.CIFAR10(root=DATA_ROOT,
                                            train=False,
                                            transform=transform_test)

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=128,
                                           shuffle=True,
                                           num_workers=2)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100,
                                          shuffle=False,
                                          num_workers=2)

Files already downloaded and verified


In [3]:
！pwd

SyntaxError: ignored

In [0]:
def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

class Hswish(nn.Module):
    """Hard-swish activation: ``x * relu6(x + 3) / 6``.

    Args:
        inplace: forwarded to ``F.relu6``.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        # ``x + 3.`` is a new tensor, so the in-place relu6 never overwrites ``x``.
        gate = F.relu6(x + 3., inplace=self.inplace)
        return x * gate / 6.


class Hsigmoid(nn.Module):
    """Hard-sigmoid activation: ``relu6(x + 3) / 6``.

    Args:
        inplace: forwarded to ``F.relu6``.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        # ``x + 3.`` is a new tensor, so the in-place relu6 never overwrites ``x``.
        shifted = x + 3.
        return F.relu6(shifted, inplace=self.inplace) / 6.


class SEModule(nn.Module):
    """Squeeze-and-excitation: rescale each channel by a learned gate.

    The input is average-pooled to one value per channel, passed through a
    two-layer bottleneck (``channel -> channel // reduction -> channel``)
    ending in ``Hsigmoid``, and the result multiplies the input channel-wise.

    Args:
        channel: number of channels of the input feature map.
        reduction: bottleneck reduction factor.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel, bias=False),
            Hsigmoid(),
        )

    def forward(self, x):
        # Unpacking four dimensions requires a 4-D input.
        batch, channels, _, _ = x.shape
        squeezed = self.avg_pool(x).view(batch, channels)
        gate = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gate.expand_as(x)

def splitChannels(channels, num_groups):
    """Split ``channels`` into ``num_groups`` integer sizes that sum to ``channels``.

    Every group gets ``channels // num_groups``; the remainder is added to the
    first group.
    """
    sizes = []
    for _ in range(num_groups):
        sizes.append(channels // num_groups)
    leftover = channels - sum(sizes)
    sizes[0] = sizes[0] + leftover
    return sizes


class Block(nn.Module):
    """Residual block: grouped 1x1 conv -> depthwise conv -> grouped 1x1 conv.

    Before each grouped 1x1 convolution the channels are split into ``group``
    equal chunks and replaced by running sums of those chunks (front-to-back
    before ``conv1``, back-to-front before ``conv2``), so that every conv
    group also sees information from the other chunks.

    Args:
        in_channels: channels of the input; must be divisible by ``group``.
        hidden: channels of the expanded representation; must be divisible
            by ``group``.
        out_channels: channels of the output; must be divisible by ``group``.
        kernel_size: kernel size of the depthwise convolution.
        stride: stride of the depthwise convolution and of the shortcut.
        se: if true, insert an ``SEModule`` after the depthwise conv + BN.
            This flag used to be ignored (SE was always inserted); blocks
            built with ``se=False`` therefore no longer own SE parameters.
        nl: non-linearity, ``'RE'`` for ReLU or ``'HS'`` for ``Hswish``.
        group: number of groups for both 1x1 convolutions and for the
            running-sum channel mixing in ``forward``.

    Raises:
        ValueError: if ``nl`` is neither ``'RE'`` nor ``'HS'``.
    """

    def __init__(self, in_channels,hidden,out_channels,kernel_size,stride,se=True,nl='RE',group=4):
        super(Block,self).__init__()
        if nl == 'RE':
            nlin_layer = nn.ReLU # or ReLU6
        elif nl == 'HS':
            nlin_layer = Hswish
        else:
            # Previously an unknown value surfaced later as an unrelated
            # UnboundLocalError on ``nlin_layer``.
            raise ValueError("nl must be 'RE' or 'HS', got {!r}".format(nl))

        self.hid = hidden
        self.inc = in_channels
        self.group = group

        self.conv1 = nn.Conv2d(in_channels,hidden,1,stride=1,groups = group)
        self.conv2 = nn.Conv2d(hidden,out_channels,1,stride=1,groups = group)

        # BN -> act -> depthwise conv -> BN -> [SE] -> act.
        # SE is appended after the parameterised layers, so their indices in
        # the Sequential (0, 2, 3) are the same with or without it.
        depthwise_stage = [
            nn.BatchNorm2d(hidden),
            nlin_layer(inplace=True),
            #MDConv(in_channels,out_channels,kernel_size=[3,5],stride),
            nn.Conv2d(hidden,hidden,kernel_size,stride, kernel_size//2, groups=hidden, bias=False),
            nn.BatchNorm2d(hidden),
        ]
        if se:
            depthwise_stage.append(SEModule(hidden))
        depthwise_stage.append(nlin_layer(inplace=True))
        self.operation1 = nn.Sequential(*depthwise_stage)

        self.operation2 = nn.Sequential(
            nn.BatchNorm2d(out_channels),
            nlin_layer(inplace=True)
        )

        if stride == 1 and in_channels == out_channels:
            # Shapes already match: identity shortcut.
            self.shortcut = nn.Sequential()
        else:
            # Depthwise 3x3 (carries the stride) + pointwise 1x1 projection.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels,in_channels,3,stride,1,groups=in_channels, bias=False),
                nn.BatchNorm2d(in_channels),
                nn.Conv2d(in_channels, out_channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(out_channels),
                nlin_layer(inplace=True)
            )

    @staticmethod
    def _running_sums(chunks):
        """Return ``[c0, c0 + c1, c0 + c1 + c2, ...]`` for an iterable of tensors."""
        sums = []
        total = None
        for chunk in chunks:
            total = chunk if total is None else total + chunk
            sums.append(total)
        return sums

    def forward(self,x):
        # Forward mixing: chunk i becomes the sum of chunks 0..i.
        in_chunks = torch.split(x, self.inc // self.group, 1)
        out = torch.cat(self._running_sums(in_chunks), 1)
        out = self.conv1(out)
        out = self.operation1(out)

        # Backward mixing: chunk i becomes the sum of chunks i..last
        # (accumulated starting from the last chunk).
        hid_chunks = torch.split(out, self.hid // self.group, 1)
        suffix_sums = self._running_sums(reversed(hid_chunks))
        out = torch.cat(suffix_sums[::-1], 1)

        out = self.conv2(out)
        out = self.operation2(out)

        return out+self.shortcut(x)

class MyNet(nn.Module):
    """Image classifier: conv stem, a stack of ``Block`` layers, pooled head.

    Args:
        num_classes: size of the final linear layer's output.
        width_mult: multiplier applied to the stem, block and squeeze channel
            counts (each rounded to a multiple of 4 with ``_make_divisible``).
            With the default ``1.`` all widths equal the values in ``cfgs``.
    """

    def __init__(self, num_classes=10, width_mult=1.):
        super(MyNet, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = [
            # k, exp, c,  se,     nl,  s,
            [3, 16,  16,  True,  'HS', 1],
            [3, 72,  24,  False, 'HS', 2],
            [3, 88,  24,  False, 'HS', 1],
            [5, 96,  40,  True,  'HS', 2],
            [5, 240, 40,  True,  'HS', 1],
            [5, 240, 40,  True,  'HS', 1],
            [5, 120, 48,  True,  'HS', 1],
            [5, 144, 48,  True,  'HS', 1],
            [5, 288, 96,  True,  'HS', 2],
            [5, 576, 96,  True,  'HS', 1],
            [5, 576, 96,  True,  'HS', 1],
            ]

        # building first layer
        output_channel = _make_divisible(16 * width_mult, 4)
        layers = [nn.Sequential(
            nn.Conv2d(3, output_channel, 3, 1, 1, bias=False),
            nn.BatchNorm2d(output_channel),
            nn.ReLU(inplace=True)
            )]
        # The first block consumes whatever the stem produced. This used to be
        # a hard-coded 16, which only matched when the stem width rounded to 16.
        input_channel = output_channel

        # building inverted residual blocks
        block = Block
        for k, exp_size, c, se, nl, s in self.cfgs:
            # Scale block widths like the stem and the squeeze layer so that
            # width_mult is applied consistently across the network.
            output_channel = _make_divisible(c * width_mult, 4)
            hidden_channel = _make_divisible(exp_size * width_mult, 4)
            layers.append(block(input_channel, hidden_channel, output_channel, k, s, se, nl))
            input_channel = output_channel
        self.features = nn.Sequential(*layers)

        # building last several layers
        # ``exp_size`` still holds the expansion size of the last cfg row.
        output_channel = _make_divisible(exp_size * width_mult, 4)
        self.squeeze = nn.Sequential(
            nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=False),
            nn.BatchNorm2d(output_channel),
            Hswish(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        input_channel = output_channel

        output_channel = 1280
        self.classifier = nn.Sequential(
            nn.Linear(input_channel, output_channel, bias=False),
            nn.BatchNorm1d(output_channel),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(output_channel, num_classes),
        )

        self._initialize_weights()

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.squeeze(x)
        # Global pooling left a 1x1 map; flatten it to (batch, channels).
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Initialise conv, 2-D batch-norm and linear layers in place.

        Conv weights: Kaiming-normal (fan_out, relu); conv biases keep their
        default initialisation. BatchNorm2d: weight 1, bias 0. Linear: weight
        N(0, 0.01), bias 0 when present.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)


In [0]:
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

class GradualWarmupScheduler(_LRScheduler):
    """ Gradually warm-up(increasing) learning rate in optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    During the first ``total_epoch`` epochs the learning rate grows linearly
    from ``base_lr`` to ``base_lr * multiplier``. Afterwards control is handed
    to ``after_scheduler`` (whose base learning rates are set to the warmed-up
    value), or the learning rate stays at ``base_lr * multiplier`` if no
    follow-up scheduler was given.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier
        total_epoch: target learning rate is reached at total_epoch, gradually
        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)

    Raises:
        ValueError: if ``multiplier <= 1`` or ``total_epoch <= 0``.
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier <= 1.:
            raise ValueError('multiplier should be greater than 1.')
        # The warm-up formula divides by total_epoch; fail with a clear message
        # instead of a ZeroDivisionError from inside the base-class constructor.
        if total_epoch <= 0:
            raise ValueError('total_epoch should be greater than 0.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        # The base constructor performs an initial step(), so every attribute
        # used by step()/get_lr() must be set before this call.
        super().__init__(optimizer)

    def _warmup_lrs(self):
        """Linearly interpolated learning rates for the current ``last_epoch``."""
        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    # Hand over once: the follow-up scheduler continues from
                    # the warmed-up learning rate, not from the original one.
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        return self._warmup_lrs()

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        """Step variant used when ``after_scheduler`` is a ReduceLROnPlateau."""
        # Remember whether the caller supplied an epoch before it is replaced
        # by a default; the old code tested ``epoch is None`` after the
        # reassignment, so that branch could never run.
        caller_gave_epoch = epoch is not None
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        if self.last_epoch <= self.total_epoch:
            for param_group, lr in zip(self.optimizer.param_groups, self._warmup_lrs()):
                param_group['lr'] = lr
        elif caller_gave_epoch:
            self.after_scheduler.step(metrics, epoch - self.total_epoch)
        else:
            # Let the plateau scheduler advance its own epoch counter.
            self.after_scheduler.step(metrics)

    def step(self, epoch=None, metrics=None):
        """Advance the schedule by one epoch.

        Args:
            epoch: optional explicit epoch index.
            metrics: monitored value; only used when ``after_scheduler`` is a
                ReduceLROnPlateau.
        """
        # isinstance (rather than an exact type comparison) also routes
        # subclasses of ReduceLROnPlateau through the metrics-based path.
        if isinstance(self.after_scheduler, ReduceLROnPlateau):
            self.step_ReduceLROnPlateau(metrics, epoch)
            return None

        if self.finished and self.after_scheduler:
            if epoch is None:
                self.after_scheduler.step(None)
            else:
                self.after_scheduler.step(epoch - self.total_epoch)
            # Keep this scheduler's get_last_lr() in sync with the scheduler
            # that is now actually driving the optimizer.
            if hasattr(self.after_scheduler, 'get_last_lr'):
                self._last_lr = self.after_scheduler.get_last_lr()
            return None

        return super(GradualWarmupScheduler, self).step(epoch)

In [9]:
# Model, loss, optimizer and learning-rate schedule.
model = MyNet().to(device)
print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05,momentum = 0.9,weight_decay=4e-5)
# Cosine period follows num_epochs (previously a separate hard-coded 200), so
# changing the epoch budget keeps the schedule aligned with it.
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
# Linear warm-up from lr to 8 * lr over the first 5 epochs, then cosine decay.
scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=5, after_scheduler=scheduler_cosine)
def test():
    """Print the top-1 accuracy of the global ``model`` over ``test_loader``.

    Switches ``model`` to eval mode and leaves it there; the caller is
    responsible for calling ``model.train()`` again before training.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch_images, batch_labels in test_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_images)
            # torch.max over dim 1 returns (values, indices); the indices are the predicted classes.
            predicted = torch.max(logits.data, 1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    accuracy = 100 * n_correct / n_seen
    print('Accuracy of the model on the test images: {} %'.format(accuracy))
        
# Train the model: one optimizer step per mini-batch, one scheduler step per
# epoch, and an evaluation pass every 10th epoch.
total_step = len(train_loader)
LOG_EVERY = 100
EVAL_EVERY = 10
for epoch in range(num_epochs):
    model.train()
    for step_idx, (images, labels) in enumerate(train_loader, start=1):
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step_idx % LOG_EVERY == 0:
            message = "Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(
                epoch+1, num_epochs, step_idx, total_step, loss.item())
            print (message)

    # The learning-rate schedule advances once per epoch.
    scheduler_warmup.step()
    if (epoch+1) % EVAL_EVERY == 0:
        test()

Total params: 1.44M
Epoch [1/200], Step [100/391] Loss: 2.2323
Epoch [1/200], Step [200/391] Loss: 2.5061
Epoch [1/200], Step [300/391] Loss: 2.1744
Epoch [2/200], Step [100/391] Loss: 1.8185
Epoch [2/200], Step [200/391] Loss: 1.6450
Epoch [2/200], Step [300/391] Loss: 1.6641
Epoch [3/200], Step [100/391] Loss: 1.6299
Epoch [3/200], Step [200/391] Loss: 1.5823
Epoch [3/200], Step [300/391] Loss: 1.4070
Epoch [4/200], Step [100/391] Loss: 1.3996
Epoch [4/200], Step [200/391] Loss: 1.4407
Epoch [4/200], Step [300/391] Loss: 1.8379
Epoch [5/200], Step [100/391] Loss: 1.5263
Epoch [5/200], Step [200/391] Loss: 1.2320
Epoch [5/200], Step [300/391] Loss: 1.6552
Epoch [6/200], Step [100/391] Loss: 1.6621
Epoch [6/200], Step [200/391] Loss: 1.5583
Epoch [6/200], Step [300/391] Loss: 1.3467
Epoch [7/200], Step [100/391] Loss: 1.2393
Epoch [7/200], Step [200/391] Loss: 1.2308
Epoch [7/200], Step [300/391] Loss: 1.1792
Epoch [8/200], Step [100/391] Loss: 1.2978
Epoch [8/200], Step [200/391] Loss

In [0]:
# Persist the last epoch index, model and optimizer state, and the last
# mini-batch loss to Google Drive.
PATH = './drive/My Drive/COLAB/ELNetV2_200EPOCH.pth'
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, PATH)