# Kaggle MNIST Dataset

In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import DataLoader

In [2]:
# Load the labelled training set; a later cell reads its 'label' column and
# treats every remaining column as pixel data.
mnist_train_df = pd.read_csv('train.csv')

In [3]:
# Separate the label column from the pixel columns, and view every row of
# pixels as a single-channel 28x28 image.
mnist_train_labels = mnist_train_df['label'].values
mnist_train_data = torch.Tensor(
    mnist_train_df.drop('label', axis=1).values.reshape(-1, 1, 28, 28)
)

# Pair each image (divided by 255) with its label; this list is what the
# DataLoaders index into.
mnist_train = [
    [image / 255., target]
    for image, target in zip(mnist_train_data, mnist_train_labels)
]

In [4]:
# Unpack the first (image, label) pair and display the dataset length, the
# image shape and the label as a quick sanity check.
sample, label = mnist_train[0]
len(mnist_train), sample.shape, label

(42000, torch.Size([1, 28, 28]), 1)

In [5]:
def split_indices(n, val_pct):
    """Randomly partition ``range(n)`` into training and validation indices.

    Args:
        n: Total number of samples to split.
        val_pct: Fraction of the samples reserved for validation. The
            validation size is ``int(val_pct * n)``, i.e. truncated.

    Returns:
        A ``(train_indices, val_indices)`` pair of NumPy arrays, both sliced
        from one random permutation of ``range(n)``.
    """
    val_count = int(val_pct * n)
    shuffled = np.random.permutation(n)
    # The leading ``val_count`` shuffled entries form the validation set and
    # everything after them is used for training.
    val_part = shuffled[:val_count]
    train_part = shuffled[val_count:]
    return train_part, val_part

In [6]:
# Hold out 20% of the sample indices for validation; the rest are for training.
train_idxs, val_idxs = split_indices(len(mnist_train_data), 0.2)

In [7]:
# Mini-batch size shared by every loader in this notebook.
batch_size = 128

# Each sampler draws, in random order, only from its own subset of indices.
train_sampler = SubsetRandomSampler(train_idxs)
val_sampler = SubsetRandomSampler(val_idxs)

# Both loaders read from the same underlying list; the sampler decides which
# rows each of them sees.
train_loader = DataLoader(dataset=mnist_train, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(dataset=mnist_train, batch_size=batch_size, sampler=val_sampler)

In [8]:
# Pull one validation batch to check the batched tensor shapes and a few
# labels, then stop iterating.
for sample, label in val_loader:
    print(sample.shape, label.shape, label[:5])
    break

torch.Size([128, 1, 28, 28]) torch.Size([128]) tensor([5, 0, 6, 9, 5])


In [9]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move ``data`` to ``device``, recursing into lists and tuples.

    Args:
        data: An object exposing ``.to(device, non_blocking=...)``, or a
            (possibly nested) list/tuple of such objects.
        device: Target device handed to every ``.to`` call.

    Returns:
        The result of ``data.to(device, non_blocking=True)`` for a single
        object. For a list or tuple, a new ``list`` of the moved elements
        (tuples come back as lists).
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Wrap an iterable of batches so each batch is moved to a device when yielded."""

    def __init__(self, dl, device):
        # dl: the wrapped loader; device: where every batch is sent.
        self.device = device
        self.dl = dl

    def __len__(self):
        """Return the length of the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the wrapped loader's batches one at a time, after moving each to the device."""
        for batch in self.dl:
            moved = to_device(batch, self.device)
            yield moved

In [10]:
# Pick the compute device once; the bare name below displays it as cell output.
device = get_default_device()
device

device(type='cuda')

In [11]:
# Rewrap both loaders so every batch is moved to `device` as it is yielded.
train_loader = DeviceDataLoader(train_loader, device)
val_loader = DeviceDataLoader(val_loader, device)

In [12]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        outputs: 2-D score tensor; the predicted class of a row is the index
            of its maximum along ``dim=1``.
        labels: Tensor of target class indices, compared element-wise with
            the predictions.

    Returns:
        A 0-dim tensor built from the Python float ``correct / len(preds)``.
    """
    preds = torch.max(outputs, dim=1).indices
    num_correct = (preds == labels).sum().item()
    return torch.tensor(num_correct / len(preds))

class ImageClassificationBase (nn.Module):
    def training_step(self, batch):
        images, labels = batch
        out = self(images)
        loss = F.cross_entropy(out, labels)
        return loss
    
    def validation_step(self, batch):
        images, labels = batch 
        out = self(images)
        loss = F.cross_entropy(out, labels)
        acc = accuracy(out, labels)
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        if 'val_loss' in result:
            print("Epoch [{}], train_loss: {:.5f}, val_loss: {:.5f}, val_acc: {:.5f}".format(
                epoch + 1, result['train_loss'], result['val_loss'], result['val_acc']))
        else:
            print("Epoch [{}], train_loss: {:.5f}".format(
                epoch + 1, result['train_loss']))

In [13]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Run the model over ``val_loader`` with gradients disabled and return the epoch summary.

    Args:
        model: Object exposing ``eval()``, ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the list of
        per-batch results. The model is left in evaluation mode.
    """
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def fit_one_cycle(epochs, lr, model, train_loader, val_loader=None, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Despite the name, no learning-rate schedule is applied: a single
    optimizer is created with a constant ``lr`` and used for every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to ``opt_func``.
        model: Module exposing ``training_step(batch)`` (returns a scalar
            loss tensor) and ``epoch_end(epoch, result)``.
        train_loader: Iterable of training batches.
        val_loader: Optional iterable of validation batches. When given,
            ``evaluate`` is run after every epoch and its metrics are merged
            into that epoch's result.
        opt_func: Optimizer factory called as
            ``opt_func(model.parameters(), lr=lr)``.

    Returns:
        A list with one result dict per epoch. Each dict holds
        ``'train_loss'`` (mean of the per-batch losses) plus whatever
        ``evaluate`` returned when a validation loader was supplied.
    """
    torch.cuda.empty_cache()
    history = []

    optimizer = opt_func(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            # Keep only the value for reporting. Storing the attached tensor
            # would keep every batch's autograd history referenced until the
            # end of the epoch.
            train_losses.append(loss.detach())
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        result = {}
        if val_loader is not None:
            result = evaluate(model, val_loader)
        # Stack once per epoch so the conversion to a Python float happens
        # a single time per epoch rather than per batch.
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [14]:
def conv_block(input_channel, output_channel, stride=1, kernel_size=3):
    """Build a Conv-ReLU-Conv-ReLU-MaxPool stack.

    Args:
        input_channel: Number of channels entering the first convolution.
        output_channel: Two-element sequence; ``output_channel[0]`` is the
            width after the first convolution and ``output_channel[1]`` the
            width after the second.
        stride: Stride applied to both convolutions.
        kernel_size: Kernel size of both convolutions; padding is
            ``kernel_size // 2``.

    Returns:
        An ``nn.Sequential`` of the two convolutions (each followed by ReLU)
        and a final 2x2 max-pool with stride 2.
    """
    pad = kernel_size // 2
    first_conv = nn.Conv2d(
        input_channel, output_channel[0],
        kernel_size=kernel_size, stride=stride, padding=pad,
    )
    second_conv = nn.Conv2d(
        output_channel[0], output_channel[1],
        kernel_size=kernel_size, stride=stride, padding=pad,
    )
    return nn.Sequential(
        first_conv,
        nn.ReLU(),
        second_conv,
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
    )

class MnistConvolutionalNeuralNetwork(ImageClassificationBase):
    """Two convolutional blocks followed by a configurable fully connected head.

    Args:
        input_channels: Number of channels in the input images.
        hidden_layers: Non-empty sequence of fully connected layer widths.
        output_size: Number of outputs of the final linear layer.
    """

    def __init__(self, input_channels, hidden_layers, output_size):
        super().__init__()
        stages = [
            # Convolutional feature extractor: channels go 32 -> 64 -> 128 -> 256.
            conv_block(input_channels, [32, 64]),
            conv_block(64, [128, 256]),
            nn.Flatten(),
            # 256 * 7 * 7 is hard-coded for 28x28 inputs: each block's 2x2
            # pooling halves the spatial size (28 -> 14 -> 7).
            nn.Linear(256 * 7 * 7, hidden_layers[0]),
            nn.ReLU(),
        ]
        # Chain the remaining hidden widths pairwise, with a ReLU after each.
        for fan_in, fan_out in zip(hidden_layers, hidden_layers[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # The final projection has no activation; it outputs raw scores.
        stages.append(nn.Linear(hidden_layers[-1], output_size))
        self.connections = nn.Sequential(*stages)

    def forward(self, xb):
        """Pass the batch ``xb`` through the whole layer stack and return the result."""
        return self.connections(xb)

In [15]:
# Configuration for the first model: one input channel, three hidden fully
# connected widths, and ten outputs.
input_channel = 1
hidden_layers = [256, 128, 32]
output_size = 10
model = MnistConvolutionalNeuralNetwork(input_channel, hidden_layers, output_size)
# Move the model to `device`; as the last expression of the cell, the
# returned module is displayed.
to_device(model, device)

MnistConvolutionalNeuralNetwork(
  (connections): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Flatten()
    (3): Linear(in_features=12544, out_features=256, bias=True)
    (4): ReLU()
    (5): Linear(in_features=256, out_features=128, bias=True)
    (6): ReLU()
    (7): Linear(in_features=128, out_features=32, bias=True)
    (8): ReLU()
    (9): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [16]:
# First training run for `model`: 20 epochs with Adam at lr=1e-3, evaluating
# on `val_loader` after every epoch.
epochs = 20
lr = 0.001

history = fit_one_cycle(epochs, lr, model, train_loader, val_loader=val_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.35431, val_loss: 0.09437, val_acc: 0.97069
Epoch [2], train_loss: 0.06362, val_loss: 0.05753, val_acc: 0.98430
Epoch [3], train_loss: 0.04136, val_loss: 0.05516, val_acc: 0.98269
Epoch [4], train_loss: 0.03210, val_loss: 0.05022, val_acc: 0.98627
Epoch [5], train_loss: 0.02338, val_loss: 0.06172, val_acc: 0.98411
Epoch [6], train_loss: 0.02111, val_loss: 0.04966, val_acc: 0.98525
Epoch [7], train_loss: 0.01731, val_loss: 0.04901, val_acc: 0.98672
Epoch [8], train_loss: 0.01645, val_loss: 0.05997, val_acc: 0.98556
Epoch [9], train_loss: 0.01169, val_loss: 0.06393, val_acc: 0.98554
Epoch [10], train_loss: 0.01165, val_loss: 0.04698, val_acc: 0.98857
Epoch [11], train_loss: 0.00838, val_loss: 0.05407, val_acc: 0.98660
Epoch [12], train_loss: 0.01094, val_loss: 0.06032, val_acc: 0.98572
Epoch [13], train_loss: 0.00989, val_loss: 0.05311, val_acc: 0.98686
Epoch [14], train_loss: 0.01081, val_loss: 0.05147, val_acc: 0.98970
Epoch [15], train_loss: 0.00647, val_loss: 

In [17]:
# Continue training the same `model` for 20 more epochs at a 10x smaller
# learning rate.
# NOTE: fit_one_cycle builds a fresh optimizer on each call, and rebinding
# `history` here discards the per-epoch results of the previous run.
epochs = 20
lr = 0.0001

history = fit_one_cycle(epochs, lr, model, train_loader, val_loader=val_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.00232, val_loss: 0.05180, val_acc: 0.99062
Epoch [2], train_loss: 0.00019, val_loss: 0.05193, val_acc: 0.99100
Epoch [3], train_loss: 0.00004, val_loss: 0.05559, val_acc: 0.99107
Epoch [4], train_loss: 0.00002, val_loss: 0.05456, val_acc: 0.99148
Epoch [5], train_loss: 0.00001, val_loss: 0.05643, val_acc: 0.99129
Epoch [6], train_loss: 0.00001, val_loss: 0.05881, val_acc: 0.99105
Epoch [7], train_loss: 0.00001, val_loss: 0.06032, val_acc: 0.99124
Epoch [8], train_loss: 0.00000, val_loss: 0.06221, val_acc: 0.99100
Epoch [9], train_loss: 0.00000, val_loss: 0.06400, val_acc: 0.99100
Epoch [10], train_loss: 0.00000, val_loss: 0.06559, val_acc: 0.99124
Epoch [11], train_loss: 0.00000, val_loss: 0.06715, val_acc: 0.99112
Epoch [12], train_loss: 0.00000, val_loss: 0.06891, val_acc: 0.99105
Epoch [13], train_loss: 0.00000, val_loss: 0.07009, val_acc: 0.99105
Epoch [14], train_loss: 0.00000, val_loss: 0.07176, val_acc: 0.99105
Epoch [15], train_loss: 0.00000, val_loss: 

In [18]:
# Build a second, freshly initialised model with the same configuration; it
# is trained below on the full labelled set.
input_channel = 1
hidden_layers = [256, 128, 32]
output_size = 10
model_2 = MnistConvolutionalNeuralNetwork(input_channel, hidden_layers, output_size)
# Move the model to `device`; as the last expression of the cell, the
# returned module is displayed.
to_device(model_2, device)

MnistConvolutionalNeuralNetwork(
  (connections): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Flatten()
    (3): Linear(in_features=12544, out_features=256, bias=True)
    (4): ReLU()
    (5): Linear(in_features=256, out_features=128, bias=True)
    (6): ReLU()
    (7): Linear(in_features=128, out_features=32, bias=True)
    (8): ReLU()
    (9): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [19]:
# Loader over the entire labelled set (no sampler, so no validation hold-out),
# shuffled and with batches moved to `device`.
full_loader = DeviceDataLoader(DataLoader(mnist_train, batch_size=batch_size, shuffle=True), device)

In [20]:
# First training run for `model_2` on the full labelled set. No validation
# loader is passed, so only the training loss is reported per epoch.
epochs = 15
lr = 0.001

history = fit_one_cycle(epochs, lr, model_2, full_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.27799
Epoch [2], train_loss: 0.05341
Epoch [3], train_loss: 0.03587
Epoch [4], train_loss: 0.02833
Epoch [5], train_loss: 0.02013
Epoch [6], train_loss: 0.01985
Epoch [7], train_loss: 0.01370
Epoch [8], train_loss: 0.01186
Epoch [9], train_loss: 0.00872
Epoch [10], train_loss: 0.00807
Epoch [11], train_loss: 0.01024
Epoch [12], train_loss: 0.00936
Epoch [13], train_loss: 0.00667
Epoch [14], train_loss: 0.00833
Epoch [15], train_loss: 0.00974


In [21]:
# Continue training `model_2` for 15 more epochs at a 10x smaller learning
# rate. `history` is rebound, so the previous run's results are discarded.
epochs = 15
lr = 0.0001

history = fit_one_cycle(epochs, lr, model_2, full_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.00072
Epoch [2], train_loss: 0.00008
Epoch [3], train_loss: 0.00003
Epoch [4], train_loss: 0.00001
Epoch [5], train_loss: 0.00001
Epoch [6], train_loss: 0.00001
Epoch [7], train_loss: 0.00000
Epoch [8], train_loss: 0.00000
Epoch [9], train_loss: 0.00000
Epoch [10], train_loss: 0.00000
Epoch [11], train_loss: 0.00000
Epoch [12], train_loss: 0.00000
Epoch [13], train_loss: 0.00000
Epoch [14], train_loss: 0.00000
Epoch [15], train_loss: 0.00000


In [22]:
# Load the test CSV and reshape every row to a 1x28x28 image divided by 255,
# the same scaling applied to the training images.
test_df = pd.read_csv('test.csv')
mnist_test_data = torch.Tensor(test_df.values.reshape(-1, 1, 28, 28)) / 255.
mnist_test_data.shape

torch.Size([28000, 1, 28, 28])

In [23]:
# shuffle=False keeps batches in row order, so predictions collected from this
# loader line up with the row positions used for ImageId later on.
test_loader = DeviceDataLoader(DataLoader(mnist_test_data, batch_size=batch_size, shuffle=False), device)

In [24]:
# Print the same 5x5 patch of the first image from the loader's first batch
# and from the raw tensor, to confirm the loader preserves row order.
for sample in test_loader:
    print(sample[0, 0, 15:20, 15:20])
    break
print(mnist_test_data[0, 0, 15:20, 15:20])

tensor([[0.7412, 0.7412, 0.8549, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9412, 0.4784, 0.4784, 0.7451],
        [0.9020, 0.2745, 0.0000, 0.0000, 0.0667]], device='cuda:0')
tensor([[0.7412, 0.7412, 0.8549, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9922, 0.9922, 0.9922, 0.9922],
        [0.9922, 0.9412, 0.4784, 0.4784, 0.7451],
        [0.9020, 0.2745, 0.0000, 0.0000, 0.0667]])


In [27]:
# Predict a class for every test image, batch by batch, in loader order.
# `preds` ends up as a flat list of 0-dim CPU tensors, one per test row.
model_2.eval()  # inference mode for any layers that behave differently in training
preds = []
with torch.no_grad():  # no autograd bookkeeping is needed for prediction
    for sample in test_loader:
        _, tmp_preds = torch.max(model_2(sample), dim=1)
        # extend() appends in place; `preds + list(...)` copied the whole
        # accumulated list on every batch.
        preds.extend(tmp_preds.to('cpu'))

In [28]:
# Convert each 0-dim tensor into a plain Python number, then show the first five.
preds = [x.item() for x in preds]
preds[:5]

[2, 0, 9, 0, 3]

In [30]:
# Assemble the submission table: ImageId counts the test rows from 1, and
# Label is the predicted digit for the row at that position.
output = pd.DataFrame({
    'ImageId': range(1, len(mnist_test_data) + 1),
    'Label': preds
})

In [31]:
# Write the predictions to disk; index=False omits the DataFrame index so the
# file holds only the ImageId and Label columns.
output.to_csv('self_output_cnn.csv', index=False)