# MNIST Dataset (For Kaggle)

In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import DataLoader

In [2]:
# Load the Kaggle training CSV from the working directory into a DataFrame.
mnist_train_df = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to confirm the layout (a 'label' column plus pixel columns).
mnist_train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Check the overall value range of the frame to pick a normalization constant.
# NOTE: this spans every column, including 'label', not only the pixels.
np.max(mnist_train_df.values), np.min(mnist_train_df.values)

(255, 0)

In [5]:
# Labels stay a NumPy array; the pixel columns become one float32 tensor with
# one row per image.
mnist_train_labels = mnist_train_df['label'].values
mnist_train_data = torch.tensor(
    mnist_train_df.drop('label', axis=1).values, dtype=torch.float32
)

# Build the dataset as a list of [pixels, label] pairs, which DataLoader can
# index directly. The division by 255 is done once for the whole tensor
# instead of once per row; `mnist_train_data` itself is left unscaled.
mnist_train = [
    [pixels, label]
    for pixels, label in zip(mnist_train_data / 255., mnist_train_labels)
]

In [6]:
def split_indices(n, val_pct, seed=None):
    """Randomly partition ``range(n)`` into training and validation indices.

    Args:
        n: Total number of samples.
        val_pct: Fraction of samples to reserve for validation, in [0, 1].
        seed: Optional seed for a reproducible split. When ``None`` the
            global NumPy random state is used.

    Returns:
        A ``(train_idxs, val_idxs)`` tuple of disjoint index arrays that
        together cover ``range(n)``; ``val_idxs`` has ``int(val_pct * n)``
        entries.

    Raises:
        ValueError: If ``val_pct`` is outside [0, 1].
    """
    # A negative fraction would make the slices below wrap around and return
    # mis-sized splits without any error, so reject it explicitly.
    if not 0 <= val_pct <= 1:
        raise ValueError(f"val_pct must be within [0, 1], got {val_pct!r}")

    n_val = int(val_pct * n)
    if seed is None:
        idxs = np.random.permutation(n)
    else:
        # A private generator leaves the global random state untouched.
        idxs = np.random.RandomState(seed).permutation(n)
    return idxs[n_val:], idxs[:n_val]

In [7]:
# Hold out 20% of the training rows as a validation set.
train_idxs, val_idxs = split_indices(len(mnist_train_data), 0.2)

In [8]:
# Confirm the sizes of the two index sets.
len(train_idxs), len(val_idxs)

(33600, 8400)

In [9]:
# Mini-batch size shared by all loaders below.
batch_size = 128

# Both loaders read from the same `mnist_train` list; each sampler restricts
# its loader to one of the two index sets and draws from it in random order.
train_sampler = SubsetRandomSampler(train_idxs)
train_loader = DataLoader(mnist_train, batch_size=batch_size, sampler=train_sampler)

val_sampler = SubsetRandomSampler(val_idxs)
val_loader = DataLoader(mnist_train, batch_size=batch_size, sampler=val_sampler)

In [10]:
# Sanity check: pull a single batch and print the tensor shapes, then stop.
for samples, labels in train_loader:
    print(samples.shape, labels.shape)
    break

torch.Size([128, 784]) torch.Size([128])


In [11]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    kind = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(kind)
    
def to_device(data, device):
    """Move ``data`` to ``device``, recursing into lists and tuples.

    A list or tuple is returned as a new list with every element moved
    (tuples therefore come back as lists). Anything else is moved through
    its own ``.to(device, non_blocking=True)`` method.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Wrap an iterable of batches so each batch is moved to a device when yielded."""

    def __init__(self, dl, device):
        # Underlying loader and the device every batch is sent to.
        self.device = device
        self.dl = dl

    def __len__(self):
        """Return the number of batches reported by the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the wrapped loader's batches one at a time, moved to ``self.device``."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [12]:
# Pick the compute device once (CUDA if available, else CPU) and display it.
device = get_default_device()
device

device(type='cuda')

In [13]:
# Rebind the loader names to wrappers that move each batch to `device`
# as it is yielded.
train_loader = DeviceDataLoader(train_loader, device)
val_loader = DeviceDataLoader(val_loader, device)

In [14]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``outputs`` holds one row of class scores per sample; the predicted class
    is the index of the largest score along dim 1. The result is a new
    0-dim tensor built from a Python float.
    """
    predicted = torch.max(outputs, dim=1).indices
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

class ImageClassificationBase(nn.Module):
    """Base module providing training/validation steps and epoch reporting.

    Subclasses supply ``forward``; batches are ``(images, labels)`` pairs and
    the loss is cross-entropy over the model's outputs.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss for one training batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch):
        """Return the detached loss and the accuracy for one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        return {
            'val_loss': F.cross_entropy(logits, targets).detach(),
            'val_acc': accuracy(logits, targets),
        }

    def validation_epoch_end(self, outputs):
        """Average the per-batch results into plain Python floats.

        Each batch counts equally in the mean, regardless of its size.
        """
        mean_loss = torch.stack([step['val_loss'] for step in outputs]).mean()
        mean_acc = torch.stack([step['val_acc'] for step in outputs]).mean()
        return {'val_loss': mean_loss.item(), 'val_acc': mean_acc.item()}

    def epoch_end(self, epoch, result):
        """Print a one-line summary for ``epoch`` (shown 1-based)."""
        # Without validation metrics only the training loss is reported.
        if 'val_loss' not in result:
            print("Epoch [{}], train_loss: {:.5f}".format(
                epoch + 1, result['train_loss']))
            return
        print("Epoch [{}], train_loss: {:.5f}, val_loss: {:.5f}, val_acc: {:.5f}".format(
            epoch + 1, result['train_loss'], result['val_loss'], result['val_acc']))

In [15]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Run ``model`` over ``val_loader`` and return its aggregated metrics.

    The model is put in eval mode and gradient tracking is disabled for the
    whole call. Per-batch results from ``model.validation_step`` are gathered
    and handed to ``model.validation_epoch_end``.
    """
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def fit_one_cycle(epochs, lr, model, train_loader, val_loader=None, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Despite the name, no learning-rate schedule is applied: a single
    optimizer is created with a constant ``lr`` and used for every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to ``opt_func``.
        model: Module providing ``training_step(batch)`` and
            ``epoch_end(epoch, result)``; when ``val_loader`` is given it is
            also passed to ``evaluate``.
        train_loader: Iterable of training batches.
        val_loader: Optional iterable of validation batches. When ``None``
            only the training loss is recorded.
        opt_func: Optimizer factory called as ``opt_func(params, lr=lr)``.

    Returns:
        A list with one result dict per epoch. Each dict holds
        ``'train_loss'`` plus whatever ``evaluate`` returned.
    """
    torch.cuda.empty_cache()
    history = []

    # A fresh optimizer per call: any optimizer state from an earlier call
    # (e.g. Adam's moment estimates) is not carried over.
    optimizer = opt_func(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # Keep only the value: storing the un-detached loss would hold on
            # to autograd history for every batch until the epoch ends.
            train_losses.append(loss.detach())

        result = {}
        if val_loader is not None:
            result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [16]:
class MnistNetwork(ImageClassificationBase):
    """Fully connected classifier: Linear/ReLU pairs followed by a Linear output layer."""

    def __init__(self, input_size, hidden_layers, output_size):
        """Build the layer stack.

        Args:
            input_size: Number of input features.
            hidden_layers: Sequence of hidden-layer widths, in order. May be
                empty, in which case the network is a single linear layer.
            output_size: Number of output scores.
        """
        super().__init__()
        # Consecutive width pairs define each hidden Linear layer. Starting
        # the list with the input size removes the special case for the first
        # layer and avoids indexing into an empty `hidden_layers`.
        widths = [input_size, *hidden_layers]
        layers = []
        for n_in, n_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        # Final projection produces raw scores; no activation is applied.
        layers.append(nn.Linear(widths[-1], output_size))
        self.connections = nn.Sequential(*layers)

    def forward(self, xb):
        """Return the network's output for the input batch ``xb``."""
        return self.connections(xb)

In [17]:
# Network shape: 784 inputs, two hidden layers (256 and 32 units), 10 outputs.
input_size = 784
hidden_layers = [256, 32]
output_size = 10
model = MnistNetwork(input_size, hidden_layers, output_size)
# Display the layer structure.
model

MnistNetwork(
  (connections): Sequential(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [18]:
# Move the model's parameters to `device`. The return value is not assigned:
# for an nn.Module, `.to()` updates the module in place and returns it.
to_device(model, device)

MnistNetwork(
  (connections): Sequential(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [19]:
# First training run: 25 epochs of Adam at lr=0.005, validating every epoch.
epochs = 25
lr = 0.005

history = fit_one_cycle(epochs, lr, model, train_loader, val_loader=val_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.33611, val_loss: 0.18513, val_acc: 0.94247
Epoch [2], train_loss: 0.13425, val_loss: 0.11731, val_acc: 0.96480
Epoch [3], train_loss: 0.08629, val_loss: 0.11339, val_acc: 0.96499
Epoch [4], train_loss: 0.06619, val_loss: 0.10275, val_acc: 0.96868
Epoch [5], train_loss: 0.05246, val_loss: 0.11968, val_acc: 0.96636
Epoch [6], train_loss: 0.03972, val_loss: 0.12809, val_acc: 0.96494
Epoch [7], train_loss: 0.03214, val_loss: 0.14668, val_acc: 0.96622
Epoch [8], train_loss: 0.03790, val_loss: 0.12022, val_acc: 0.97218
Epoch [9], train_loss: 0.03136, val_loss: 0.12558, val_acc: 0.97280
Epoch [10], train_loss: 0.02360, val_loss: 0.13341, val_acc: 0.97282
Epoch [11], train_loss: 0.02363, val_loss: 0.16308, val_acc: 0.96468
Epoch [12], train_loss: 0.02501, val_loss: 0.15304, val_acc: 0.96913
Epoch [13], train_loss: 0.02146, val_loss: 0.14381, val_acc: 0.97192
Epoch [14], train_loss: 0.02380, val_loss: 0.15200, val_acc: 0.97263
Epoch [15], train_loss: 0.01153, val_loss: 

In [20]:
# Second run on the same `model` at a lower learning rate (0.001).
# fit_one_cycle builds a new optimizer on every call, so Adam's state is not
# carried over from the previous run. `history` is overwritten here, so the
# earlier run's metrics are no longer kept in it.
epochs = 25
lr = 0.001

history = fit_one_cycle(epochs, lr, model, train_loader, val_loader=val_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.00347, val_loss: 0.16128, val_acc: 0.97621
Epoch [2], train_loss: 0.00031, val_loss: 0.16423, val_acc: 0.97756
Epoch [3], train_loss: 0.00009, val_loss: 0.16413, val_acc: 0.97675
Epoch [4], train_loss: 0.00005, val_loss: 0.16708, val_acc: 0.97663
Epoch [5], train_loss: 0.00003, val_loss: 0.17284, val_acc: 0.97701
Epoch [6], train_loss: 0.00002, val_loss: 0.17665, val_acc: 0.97725
Epoch [7], train_loss: 0.00001, val_loss: 0.17895, val_acc: 0.97786
Epoch [8], train_loss: 0.00001, val_loss: 0.18608, val_acc: 0.97758
Epoch [9], train_loss: 0.00001, val_loss: 0.18984, val_acc: 0.97741
Epoch [10], train_loss: 0.00001, val_loss: 0.19348, val_acc: 0.97784
Epoch [11], train_loss: 0.00000, val_loss: 0.19661, val_acc: 0.97765
Epoch [12], train_loss: 0.00000, val_loss: 0.19892, val_acc: 0.97746
Epoch [13], train_loss: 0.00000, val_loss: 0.20014, val_acc: 0.97741
Epoch [14], train_loss: 0.00000, val_loss: 0.20074, val_acc: 0.97756
Epoch [15], train_loss: 0.00000, val_loss: 

In [21]:
# Build a second, freshly initialized network with the same architecture
# and move it to `device`.
input_size = 784
hidden_layers = [256, 32]
output_size = 10
model_2 = MnistNetwork(input_size, hidden_layers, output_size)
to_device(model_2, device)

MnistNetwork(
  (connections): Sequential(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [22]:
# Loader over the entire `mnist_train` list (no sampler, so no held-out
# validation subset), shuffled each epoch and moved to `device` per batch.
full_loader = DeviceDataLoader(DataLoader(mnist_train, batch_size=batch_size, shuffle=True), device)

In [23]:
# Train model_2 on the full training set: 25 epochs of Adam at lr=0.005.
# No val_loader is passed, so only the training loss is reported.
epochs = 25
lr = 0.005

history = fit_one_cycle(epochs, lr, model_2, full_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.30448
Epoch [2], train_loss: 0.11336
Epoch [3], train_loss: 0.07668
Epoch [4], train_loss: 0.05831
Epoch [5], train_loss: 0.04907
Epoch [6], train_loss: 0.04420
Epoch [7], train_loss: 0.03326
Epoch [8], train_loss: 0.03123
Epoch [9], train_loss: 0.03469
Epoch [10], train_loss: 0.02420
Epoch [11], train_loss: 0.02513
Epoch [12], train_loss: 0.02784
Epoch [13], train_loss: 0.02053
Epoch [14], train_loss: 0.02158
Epoch [15], train_loss: 0.01923
Epoch [16], train_loss: 0.02093
Epoch [17], train_loss: 0.02122
Epoch [18], train_loss: 0.02060
Epoch [19], train_loss: 0.01640
Epoch [20], train_loss: 0.02341
Epoch [21], train_loss: 0.01446
Epoch [22], train_loss: 0.01186
Epoch [23], train_loss: 0.01926
Epoch [24], train_loss: 0.01210
Epoch [25], train_loss: 0.01061


In [24]:
# Continue training model_2 for 25 more epochs at lr=0.001. As before, a new
# optimizer is created inside fit_one_cycle and `history` is overwritten.
epochs = 25
lr = 0.001

history = fit_one_cycle(epochs, lr, model_2, full_loader, opt_func=torch.optim.Adam)

Epoch [1], train_loss: 0.00314
Epoch [2], train_loss: 0.00036
Epoch [3], train_loss: 0.00009
Epoch [4], train_loss: 0.00004
Epoch [5], train_loss: 0.00003
Epoch [6], train_loss: 0.00002
Epoch [7], train_loss: 0.00002
Epoch [8], train_loss: 0.00001
Epoch [9], train_loss: 0.00001
Epoch [10], train_loss: 0.00001
Epoch [11], train_loss: 0.00001
Epoch [12], train_loss: 0.00000
Epoch [13], train_loss: 0.00000
Epoch [14], train_loss: 0.00000
Epoch [15], train_loss: 0.00000
Epoch [16], train_loss: 0.00000
Epoch [17], train_loss: 0.00000
Epoch [18], train_loss: 0.00000
Epoch [19], train_loss: 0.00000
Epoch [20], train_loss: 0.00000
Epoch [21], train_loss: 0.00000
Epoch [22], train_loss: 0.00000
Epoch [23], train_loss: 0.00000
Epoch [24], train_loss: 0.00000
Epoch [25], train_loss: 0.00000


In [25]:
# Load the Kaggle test CSV from the working directory.
test_df = pd.read_csv('test.csv')

In [26]:
# Convert every test column to one float tensor and apply the same /255
# scaling that was used for the training pixels.
mnist_test_data = torch.Tensor(test_df.values) / 255.
# Display the shape as a sanity check.
mnist_test_data.shape

torch.Size([28000, 784])

In [27]:
# Move the whole test tensor to the same device as the model.
mnist_test_data = to_device(mnist_test_data, device)

In [28]:
# Predict the class of every test image in one forward pass. The model is put
# in eval mode and gradient tracking is disabled: this is inference only, so
# recording autograd history for the whole test set would just waste memory.
model_2.eval()
with torch.no_grad():
    _, preds = torch.max(model_2(mnist_test_data), dim=1)

In [29]:
# One predicted class index per test row.
preds.shape

torch.Size([28000])

In [30]:
# Assemble the submission table: 1-based image ids alongside the predicted
# labels. The predictions are converted to a NumPy array on the CPU so pandas
# receives a plain integer column rather than a torch tensor.
output = pd.DataFrame({
    'ImageId': range(1, len(mnist_test_data) + 1),
    'Label': preds.detach().cpu().numpy(),
})

In [31]:
# Preview the first rows of the submission table.
output.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [32]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
output.to_csv('self_output_1.csv', index=False)