# Kaggle MNIST Dataset

In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torchvision.transforms as tt
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import DataLoader

In [2]:
# Seed the global NumPy and PyTorch RNGs so that later random operations
# (e.g. the np.random.permutation used for the train/validation split) are
# repeatable from run to run.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x234156785f0>

In [3]:
# Load the training CSV; later cells read its 'label' column and treat every
# remaining column as pixel data.
mnist_train_df = pd.read_csv('train.csv')

In [4]:
# Augmentation pipeline applied to each training image tensor.
train_transform = tt.Compose([
    tt.ToPILImage(),               # the random transforms below operate on PIL images
    tt.RandomRotation(10),         # rotate by a random angle of up to 10 degrees either way
    tt.RandomCrop(28, padding=4),  # pad by 4 px, then take a random 28x28 crop (random shift)
    tt.ToTensor()                  # convert back to a tensor
])

In [5]:
# Labels stay a NumPy array; every non-label column is reshaped into
# (N, 1, 28, 28) images and divided by 255 (presumably mapping 8-bit pixel
# values into [0, 1] -- confirm against the CSV).
mnist_train_labels = mnist_train_df['label'].values
mnist_train_data = torch.Tensor(mnist_train_df.drop('label', axis=1).values.reshape(-1, 1, 28, 28)) / 255.

# Pair each augmented image with its label as [image, label].
# NOTE(review): train_transform is evaluated exactly once per image here, so
# the "random" rotation/crop is frozen into the list. Every epoch, and every
# loader built on mnist_train (including the validation one), sees the same
# augmented images. Re-sampling the augmentation per epoch would need a
# Dataset that applies the transform in __getitem__.
mnist_train = [
    [train_transform(image), target]
    for image, target in zip(mnist_train_data, mnist_train_labels)
]

In [6]:
# Sanity check: unpack the first [image, label] pair and display the dataset
# size, the image shape and the label.
sample, label = mnist_train[0]
len(mnist_train), sample.shape, label

(42000, torch.Size([1, 28, 28]), 1)

In [7]:
def split_indices(n, val_pct):
    """Randomly partition ``range(n)`` into training and validation indices.

    Args:
        n: Total number of examples.
        val_pct: Fraction of the examples to hold out for validation; the
            hold-out size is ``int(val_pct * n)``.

    Returns:
        A ``(train_indices, val_indices)`` tuple of NumPy arrays. The
        validation indices are the leading entries of one random permutation
        of ``range(n)`` and the training indices are the remainder.
    """
    holdout_count = int(val_pct * n)
    shuffled = np.random.permutation(n)
    # Cut the permutation once: the leading chunk is validation, the rest is training.
    val_part, train_part = np.split(shuffled, [holdout_count])
    return train_part, val_part

In [8]:
# Hold out 20% of the example indices for validation; the rest are for training.
train_idxs, val_idxs = split_indices(len(mnist_train_data), 0.2)

In [9]:
batch_size = 128

# Both loaders read from the same mnist_train list; each sampler restricts its
# loader to one index subset and draws those indices in random order.
train_sampler = SubsetRandomSampler(train_idxs)
train_loader = DataLoader(mnist_train, batch_size=batch_size, sampler=train_sampler)

val_sampler = SubsetRandomSampler(val_idxs)
val_loader = DataLoader(mnist_train, batch_size=batch_size, sampler=val_sampler)

In [10]:
# Peek at a single validation batch to confirm the batched tensor shapes and
# show the first few labels; the loop stops after the first batch.
for sample, label in val_loader:
    print(sample.shape, label.shape, label[:5])
    break

torch.Size([128, 1, 28, 28]) torch.Size([128]) tensor([4, 8, 2, 8, 6])


In [11]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move a tensor/module, or a nested list/tuple of them, to ``device``.

    Lists and tuples are walked recursively and always come back as lists;
    anything else is moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader():
    """Wrap an iterable of batches so each batch is moved to a device when iterated."""

    def __init__(self, dl, device):
        # dl: the wrapped loader; device: where every yielded batch is sent.
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches reported by the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the wrapped loader's batches, each transferred to ``self.device``."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [12]:
# Choose the compute device once; the bare expression displays it in the notebook.
device = get_default_device()
device

device(type='cuda')

In [13]:
# Rebind the loader names to wrappers that move every batch to `device` as it
# is iterated.
train_loader = DeviceDataLoader(train_loader, device)
val_loader = DeviceDataLoader(val_loader, device)

In [14]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        outputs: Per-class scores; the predicted class is taken along dim 1.
        labels: Target class indices, one per row of ``outputs``.

    Returns:
        A 0-d tensor (created on the CPU) holding ``correct / len(predictions)``.
    """
    predicted = torch.max(outputs, dim=1).indices
    hits = (predicted == labels).sum().item()
    return torch.tensor(hits / len(predicted))

class ImageClassificationBase(nn.Module):
    """Base class supplying the train/validation step logic for classifiers.

    Subclasses only implement ``forward``; batches are ``(images, labels)``
    pairs and the loss is cross-entropy.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss of one training batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch):
        """Return the detached loss and the accuracy of one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        return {
            'val_loss': F.cross_entropy(logits, targets).detach(),
            'val_acc': accuracy(logits, targets),
        }

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation metrics into plain Python floats."""
        def _mean_of(key):
            # Unweighted mean over batches (a smaller final batch counts equally).
            return torch.stack([entry[key] for entry in outputs]).mean()

        mean_loss = _mean_of('val_loss')
        mean_acc = _mean_of('val_acc')
        return {'val_loss': mean_loss.item(), 'val_acc': mean_acc.item()}

    def epoch_end(self, epoch, result):
        """Print a one-line summary of the epoch (``epoch`` is zero-based)."""
        last_lr = result['lrs'][-1]
        if 'val_loss' not in result:
            # No validation was run for this epoch: report training metrics only.
            print("Epoch [{}], last_lr: {:.6f}, train_loss: {:.6f}".format(
                epoch + 1, last_lr, result['train_loss']))
            return
        print("Epoch [{}], last_lr: {:.6f}, train_loss: {:.6f}, val_loss: {:.6f}, val_acc: {:.6f}".format(
            epoch + 1, last_lr, result['train_loss'], result['val_loss'], result['val_acc']))

In [15]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Run the model over ``val_loader`` in eval mode and return aggregated metrics.

    The model is switched to eval mode and left there; gradient tracking is
    disabled for the duration of the call.
    """
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader=None, weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    """Train ``model`` with a one-cycle learning-rate schedule.

    Args:
        epochs: Number of passes over ``train_loader``.
        max_lr: Peak learning rate of the schedule (also the optimizer's
            initial ``lr`` argument).
        model: Module providing ``training_step`` and ``epoch_end`` (and the
            validation hooks used by ``evaluate`` when ``val_loader`` is given).
        train_loader: Sized iterable of training batches.
        val_loader: Optional iterable of validation batches; when ``None`` no
            validation metrics are computed.
        weight_decay: Weight decay forwarded to the optimizer.
        grad_clip: When truthy, gradients are clipped element-wise to
            ``[-grad_clip, grad_clip]`` before each optimizer step.
        opt_func: Optimizer constructor, called as
            ``opt_func(params, max_lr, weight_decay=...)``.

    Returns:
        A list with one dict per epoch holding ``train_loss``, ``lrs`` (the
        learning rate used for every batch) and, if validation ran,
        ``val_loss`` and ``val_acc``.
    """
    torch.cuda.empty_cache()
    history = []

    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # The scheduler is stepped once per batch, so it needs the total step count.
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs, steps_per_epoch=len(train_loader))

    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        train_losses = []
        lrs = []
        for batch in train_loader:
            loss = model.training_step(batch)
            # Keep only the value for reporting; storing the attached tensor
            # would hold on to its autograd history for the whole epoch.
            train_losses.append(loss.detach())
            loss.backward()

            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()
            optimizer.zero_grad()

            # Record the rate that was just used, then advance the schedule.
            lrs.append(get_lr(optimizer))
            sched.step()

        result = {}
        if val_loader is not None:
            result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [16]:
def conv_block(in_channels, out_channels, pool=False, drop=0.25):
    """Build a 3x3 conv -> batch-norm -> ReLU stage, optionally followed by pooling.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the convolution.
        pool: When true, append a 2x2 max-pool and a ``Dropout2d`` layer.
        drop: Dropout probability used only when ``pool`` is true.

    Returns:
        An ``nn.Sequential`` containing the layers in order.
    """
    stages = [
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    if pool:
        stages += [nn.MaxPool2d(2), nn.Dropout2d(p=drop)]
    return nn.Sequential(*stages)

def linear_block(in_number, out_number):
    """Build a fully connected stage: linear -> batch-norm -> ReLU.

    Args:
        in_number: Number of input features.
        out_number: Number of output features.

    Returns:
        An ``nn.Sequential`` containing the three layers in order.
    """
    return nn.Sequential(
        nn.Linear(in_number, out_number),
        nn.BatchNorm1d(out_number),
        nn.ReLU(inplace=True),
    )

class MnistConvolutionalNeuralNetwork(ImageClassificationBase):
    """Four conv blocks followed by a configurable stack of dense blocks.

    Args:
        input_channels: Channels of the input images.
        hidden_layers: Non-empty sequence with the width of each dense block.
        output_size: Number of output classes.
    """

    def __init__(self, input_channels, hidden_layers, output_size):
        super().__init__()

        feature_extractor = [
            conv_block(input_channels, 32),
            conv_block(32, 32, pool=True),
            conv_block(32, 64),
            conv_block(64, 64, pool=True),
            nn.Flatten(),
        ]

        # Two 2x2 max-pools halve the spatial size twice, so 64 * 7 * 7 is the
        # flattened size for 28x28 inputs (other input sizes are not supported).
        widths = [64 * 7 * 7, *hidden_layers]
        dense_stack = [
            linear_block(fan_in, fan_out)
            for fan_in, fan_out in zip(widths, widths[1:])
        ]

        classifier_head = [
            nn.Dropout(p=0.3),
            nn.Linear(hidden_layers[-1], output_size),
        ]

        self.connections = nn.Sequential(*feature_extractor, *dense_stack, *classifier_head)

    def forward(self, xb):
        """Return the raw class scores for the batch ``xb``."""
        return self.connections(xb)

In [17]:
# Single input channel, three dense hidden layers, ten output classes.
input_channel = 1
hidden_layers = [256, 256, 128]
output_size = 10
model = MnistConvolutionalNeuralNetwork(input_channel, hidden_layers, output_size)
# Move the model to the selected device; the returned module is what the
# notebook displays below.
to_device(model, device)

MnistConvolutionalNeuralNetwork(
  (connections): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Dropout2d(p=0.25, inplace=False)
    )
    (2): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05

In [18]:
epochs = 40
max_lr = 0.001
# Gradient clipping and weight decay are left disabled for this run, so
# fit_one_cycle uses its defaults (grad_clip=None, weight_decay=0).
#grad_clip = 0.1
#weight_decay = 1e-4
# NOTE(review): opt_func is assigned here, but the call below passes
# torch.optim.Adam directly rather than this variable.
opt_func = torch.optim.Adam

# Train on the training split and report validation metrics after every epoch.
history = fit_one_cycle(epochs, max_lr, model, train_loader, val_loader=val_loader, opt_func=torch.optim.Adam)

Epoch [1], last_lr: 0.000056, train_loss: 1.869431, val_loss: 1.298114, val_acc: 0.649905
Epoch [2], last_lr: 0.000104, train_loss: 1.004244, val_loss: 0.494770, val_acc: 0.903930
Epoch [3], last_lr: 0.000180, train_loss: 0.439910, val_loss: 0.187152, val_acc: 0.957481
Epoch [4], last_lr: 0.000280, train_loss: 0.220764, val_loss: 0.114114, val_acc: 0.968774
Epoch [5], last_lr: 0.000395, train_loss: 0.143632, val_loss: 0.080328, val_acc: 0.976823
Epoch [6], last_lr: 0.000520, train_loss: 0.112954, val_loss: 0.072821, val_acc: 0.978977
Epoch [7], last_lr: 0.000644, train_loss: 0.093481, val_loss: 0.067949, val_acc: 0.977865
Epoch [8], last_lr: 0.000760, train_loss: 0.084364, val_loss: 0.064602, val_acc: 0.981108
Epoch [9], last_lr: 0.000859, train_loss: 0.073882, val_loss: 0.060685, val_acc: 0.980516
Epoch [10], last_lr: 0.000936, train_loss: 0.067685, val_loss: 0.064445, val_acc: 0.979948
Epoch [11], last_lr: 0.000984, train_loss: 0.062165, val_loss: 0.053730, val_acc: 0.983475
Epoch [1

In [30]:
# Build a second, freshly initialised model with the same architecture.
input_channel = 1
hidden_layers = [256, 256, 128]
output_size = 10
model_2 = MnistConvolutionalNeuralNetwork(input_channel, hidden_layers, output_size)
# Move the model to the selected device; the returned module is what the
# notebook displays below.
to_device(model_2, device)

MnistConvolutionalNeuralNetwork(
  (connections): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Dropout2d(p=0.25, inplace=False)
    )
    (2): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05

In [20]:
# Loader over every entry of mnist_train (no validation hold-out), reshuffled
# on each pass and moved to `device` batch by batch.
full_loader = DeviceDataLoader(DataLoader(mnist_train, batch_size=batch_size, shuffle=True), device)

In [31]:
epochs = 40
max_lr = 0.001
grad_clip = 0.1
weight_decay = 1e-4
# NOTE(review): opt_func is assigned here, but the call below passes
# torch.optim.Adam directly rather than this variable.
opt_func = torch.optim.Adam

# Train on the full data set. No val_loader is passed, so each epoch reports
# only the training loss. This rebinds `history` from the earlier run.
history = fit_one_cycle(epochs, max_lr, model_2, full_loader, weight_decay=weight_decay, grad_clip=grad_clip, opt_func=torch.optim.Adam)

Epoch [1], last_lr: 0.000056, train_loss: 1.748376
Epoch [2], last_lr: 0.000104, train_loss: 0.858887
Epoch [3], last_lr: 0.000180, train_loss: 0.353549
Epoch [4], last_lr: 0.000280, train_loss: 0.189252
Epoch [5], last_lr: 0.000396, train_loss: 0.129978
Epoch [6], last_lr: 0.000520, train_loss: 0.105757
Epoch [7], last_lr: 0.000644, train_loss: 0.091392
Epoch [8], last_lr: 0.000760, train_loss: 0.078775
Epoch [9], last_lr: 0.000859, train_loss: 0.079163
Epoch [10], last_lr: 0.000936, train_loss: 0.067745
Epoch [11], last_lr: 0.000984, train_loss: 0.060252
Epoch [12], last_lr: 0.001000, train_loss: 0.049691
Epoch [13], last_lr: 0.000997, train_loss: 0.049376
Epoch [14], last_lr: 0.000987, train_loss: 0.043813
Epoch [15], last_lr: 0.000972, train_loss: 0.040959
Epoch [16], last_lr: 0.000950, train_loss: 0.037468
Epoch [17], last_lr: 0.000923, train_loss: 0.038825
Epoch [18], last_lr: 0.000891, train_loss: 0.033598
Epoch [19], last_lr: 0.000854, train_loss: 0.027038
Epoch [20], last_lr: 

In [32]:
# Load the test CSV and reshape every column into (N, 1, 28, 28) images,
# divided by 255 exactly like the training pixels (presumably the file holds
# pixel columns only -- no 'label' column is dropped here).
test_df = pd.read_csv('test.csv')
mnist_test_data = torch.Tensor(test_df.values.reshape(-1, 1, 28, 28)) / 255.
mnist_test_data.shape

torch.Size([28000, 1, 28, 28])

In [33]:
# shuffle=False keeps the batches in row order, so predictions line up with
# the rows of the test CSV.
test_loader = DeviceDataLoader(DataLoader(mnist_test_data, batch_size=batch_size, shuffle=False), device)

In [34]:
# Print the same 5x5 patch from the first loaded batch and from the raw tensor
# so the two can be compared by eye (checks that the loader preserves order).
for sample in test_loader:
    print(sample[0, 0, 15:20, 15:20])
    break
print(mnist_test_data[0, 0, 15:20, 15:20])

tensor([[0.7412, 0.7412, 0.8549, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9412, 0.4784, 0.4784, 0.7451],
        [0.9020, 0.2745, 0.0000, 0.0000, 0.0667]], device='cuda:0')
tensor([[0.7412, 0.7412, 0.8549, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9412, 0.4784, 0.4784, 0.7451],
        [0.9020, 0.2745, 0.0000, 0.0000, 0.0667]])


In [35]:
# Inference must run with dropout disabled and batch-norm using its running
# statistics. model_2 was trained without a validation pass, so nothing has
# switched it out of training mode before this point.
model_2.eval()

preds = []
# no_grad avoids building an autograd graph for every test batch.
with torch.no_grad():
    for sample in test_loader:
        # The index of the highest score along dim 1 is the predicted class.
        _, tmp_preds = torch.max(model_2(sample), dim=1)
        # extend() appends the per-image 0-d tensors in place instead of
        # rebuilding the whole list on every batch.
        preds.extend(tmp_preds.to('cpu'))

In [36]:
# Convert each 0-d tensor into a plain Python number, then show the first five.
preds = [x.item() for x in preds]
preds[:5]

[2, 0, 9, 0, 3]

In [37]:
# Assemble the submission table: ImageId is 1-based and follows the order of
# the test rows.
output = pd.DataFrame({
    'ImageId': range(1, len(mnist_test_data) + 1),
    'Label': preds
})

In [38]:
# index=False keeps the DataFrame index out of the file, leaving only the
# ImageId and Label columns.
output.to_csv('self_output_cnn_optim.csv', index=False)

In [29]:
# Save only the state_dict, not the model object; loading it back requires
# rebuilding MnistConvolutionalNeuralNetwork with the same arguments first.
torch.save(model_2.state_dict(), 'mnist-cnn-optim-1.pth')