In [1]:
import torch
import torchvision
import torch.nn as nn
from cells import *
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### CIFAR dataset preprocessing 

In [8]:
# define epochs, batch_size to use and learning rate
num_epochs = 8
batch_size = 128
learning_rate = 0.025

#define transforms to use while making data
img_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = CIFAR10('./data', transform=img_transform, download = True, train = True)

test_dataset = CIFAR10('./data', transform=img_transform, download = True, train = False)

# create dataloaders for train and test which will be used later to get batches during training 
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


Files already downloaded and verified
Files already downloaded and verified


In [9]:
#dataset sizes for both train and test
dataset_sizes = {'train': len(train_dataset), 'test': len(test_dataset)}

### Network
Construct a network using reduction and normal cells. The architecture is same as shown in the papers 
https://arxiv.org/pdf/1802.01548.pdf  and https://arxiv.org/pdf/1712.00559.pdf

In [5]:
class Network(nn.Module):
    
    def __init__(self, total_layers = 9, initial_ch = 32):
        super(Network, self).__init__()
        
        assert total_layers < 10
        
        # the first layer is not either type of a cell, it's a normal convolution layer to create initial feature map
        # which will be used by normal and reduction cells further
        self.first_layer = nn.Sequential(nn.Conv2d(3, initial_ch, 3, stride = 1, padding = 1),
                                   nn.ReLU())
        
        c_kminus1, c_kminus2 = initial_ch, initial_ch
        
        reduction = False
        #cells will be added to the ModuleDict by torch
        self.cells = nn.ModuleDict()
        for i in range(1, total_layers+ 1):
            if i % 3 == 0:
                # add reduction cell at every third layer in the network 
                self.cells['cell_reduction' + str(i)] = Reduction_cell(c_kminus1, c_kminus2 * 2)
                reduction = True 
                c_kminus1 = c_kminus1 * 2
            else:
                #add normal cell to network
                self.cells['cell_normal' + str(i)] = Normal_cell(c_kminus1, c_kminus2, reduction)
                reduction = False
                if c_kminus1 != c_kminus2:
                    c_kminus2 *= 2
                c_kminus1  = c_kminus1
                c_kminus2 = c_kminus2
                
        # after all the cells, global average pooling reduces the feature map size to 4x4
        self.global_pooling = nn.AdaptiveAvgPool2d(4)
        # two fully connected layers are added to make prediction in one of 10 classes
        self.final = nn.Sequential(nn.Linear(c_kminus1*4*4, 2048),
                                   nn.ReLU(),
                                   nn.Linear(2048, 10))
        
        
    def forward(self, x):
        k_minus_1 = self.first_layer(x)
        k_minus_2 = self.first_layer(x)
        
        for item, val in (self.cells).items():
            
            out = val(k_minus_1, k_minus_2)
            
            k_minus_2 = k_minus_1
            k_minus_1 = out
        
        out = self.global_pooling(out)
        out = self.final(out.view(out.size(0), -1))
        return F.softmax(out,1)


In [6]:
#initialize network a
model = Network().cuda()
model.train()

images, labels = next(iter(train_dataloader))
images = images.to(device)
grid = torchvision.utils.make_grid(images)

# using tensorboard to visualize a batch of images and Network
tb = SummaryWriter()
tb.add_image('images', grid)
tb.add_graph(model, images)
tb.close()

  assert k_minus_1.size(1) == k_minus_2.size(1)
  assert k_minus_1.size(1) == k_minus_2.size(1)
  assert k_minus_1.size(2) == k_minus_2.size(2)


### Training 
For training, to get the good results, paramters used in paper itself are used like learning rate, optimizer, scheduler

In [10]:
# define loss, optimizer and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum = 0.9, weight_decay= 3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(num_epochs))

In [11]:

train_loss = 0
correct = 0
total = 0
for epoch in range(num_epochs):
    
    print('-'*15 , 'Epoch' ,epoch +1,'-'*15)
    correct = 0
    train_loss = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(train_dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        if batch_idx % 100 == 0:
            print(batch_idx, len(train_dataloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
            
    scheduler.step()

--------------- Epoch 1 ---------------
0 391 Loss: 2.306 | Acc: 7.812% (10/128)
100 391 Loss: 2.178 | Acc: 26.942% (3483/12928)
200 391 Loss: 2.117 | Acc: 33.450% (8606/25728)
300 391 Loss: 2.081 | Acc: 37.189% (14328/38528)
--------------- Epoch 2 ---------------
0 391 Loss: 1.870 | Acc: 60.156% (77/128)
100 391 Loss: 1.929 | Acc: 52.970% (6848/12928)
200 391 Loss: 1.917 | Acc: 54.046% (13905/25728)
300 391 Loss: 1.910 | Acc: 54.830% (21125/38528)
--------------- Epoch 3 ---------------
0 391 Loss: 1.843 | Acc: 60.938% (78/128)
100 391 Loss: 1.843 | Acc: 61.649% (7970/12928)
200 391 Loss: 1.840 | Acc: 61.991% (15949/25728)
300 391 Loss: 1.832 | Acc: 62.791% (24192/38528)
--------------- Epoch 4 ---------------
0 391 Loss: 1.739 | Acc: 71.875% (92/128)
100 391 Loss: 1.780 | Acc: 68.224% (8820/12928)
200 391 Loss: 1.773 | Acc: 68.878% (17721/25728)
300 391 Loss: 1.769 | Acc: 69.241% (26677/38528)
--------------- Epoch 5 ---------------
0 391 Loss: 1.735 | Acc: 72.656% (93/128)
100 391 

### Evaluate on test data

In [12]:
model.eval()

Network(
  (first_layer): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (cells): ModuleDict(
    (cell_normal1): Normal_cell(
      (maintain_ch): Compatible(
        (layer): Sequential(
          (0): ReLU()
          (1): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1))
          (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (kminus1to0): Separable(
        (layer): Sequential(
          (0): ReLU()
          (1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
          (2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): ReLU()
          (5): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
          (6): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          (7): BatchNorm2d(32, eps=1

In [13]:

test_loss = 0
correct = 0
total = 0
for batch_idx, (inputs, targets) in enumerate(test_dataloader):
    
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    test_loss += loss.item()
    _, predicted = outputs.max(1)
    total += targets.size(0)
    correct += predicted.eq(targets).sum().item()

print('loss ', test_loss/batch_idx)
print('Accuracy ', 100. * correct/dataset_sizes['test'])

loss  1.720286910350506
Accuracy  76.11
