In [None]:
import torch
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torchvision as tv

In [None]:
# Number of samples per mini-batch; shared by the train and test DataLoaders.
BATCH_SIZE = 256

In [None]:
# FashionMNIST is fetched into the current directory if it is not already there;
# ToTensor() makes every sample image a tensor.
train_dataset = tv.datasets.FashionMNIST('.', train=True, transform=tv.transforms.ToTensor(), download=True)
test_dataset = tv.datasets.FashionMNIST('.', train=False, transform=tv.transforms.ToTensor(), download=True)

# Reshuffle the training set every epoch so the optimizer (and any BatchNorm
# layer) does not see the exact same mini-batches in the same order each time.
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# The test loader stays sequential so that evaluation is deterministic.
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Shape of the first training image (index [0][0] = first sample, image part of the (image, label) pair).
train_dataset[0][0].shape

torch.Size([1, 28, 28])

### Base Model

In [None]:
# Baseline network: one hidden layer of 256 ReLU units between the flattened
# 784-value input and the 10 output scores.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),  # keep the batch dim, flatten the rest
    torch.nn.Linear(in_features=784, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=10),
)

In [None]:
# Display the layer-by-layer structure of the network built in the previous cell.
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Linear(in_features=256, out_features=10, bias=True)
)

In [None]:
# Adam over every parameter of the current `model`, with step size 1e-2.
trainer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
# Criterion that train_model applies to the model outputs and the target labels.
loss = torch.nn.CrossEntropyLoss()


In [None]:
def train_model(num_epochs, net=None, optimizer=None, criterion=None,
                train_loader=None, test_loader=None):
    """Train for `num_epochs` epochs, evaluating and printing metrics after each one.

    Every epoch performs one optimisation pass over the training loader and
    then one gradient-free pass over the test loader. Afterwards a line is
    printed with the epoch index, the elapsed wall-clock time (training plus
    evaluation), and for both passes the mean of the per-batch losses and the
    fraction of correctly classified samples.

    Args:
        num_epochs: Number of epochs to run.
        net: Module to train. Defaults to the notebook-level `model`.
        optimizer: Optimizer that updates `net`. Defaults to the notebook-level
            `trainer`.
        criterion: Callable `criterion(predictions, targets)` returning a scalar
            loss tensor. Defaults to the notebook-level `loss`.
        train_loader: Iterable of `(X, y)` batches used for training. Defaults
            to the notebook-level `train`.
        test_loader: Iterable of `(X, y)` batches used for evaluation. Defaults
            to the notebook-level `test`.

    Returns:
        None. Metrics are printed; `net` is updated in place and is in eval
        mode after each epoch's evaluation pass.
    """
    # Fall back to the notebook globals so existing `train_model(n)` calls behave as before.
    net = model if net is None else net
    optimizer = trainer if optimizer is None else optimizer
    criterion = loss if criterion is None else criterion
    train_loader = train if train_loader is None else train_loader
    test_loader = test if test_loader is None else test_loader

    for ep in range(num_epochs):
        train_iters, train_passed = 0, 0
        train_loss, train_acc = 0., 0.
        start = time.time()

        net.train()
        for X, y in train_loader:
            optimizer.zero_grad()
            y_pred = net(X)
            batch_loss = criterion(y_pred, y)
            batch_loss.backward()
            optimizer.step()
            train_loss += batch_loss.item()
            # Predicted class = index of the largest score along dim 1.
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)

        test_iters, test_passed = 0, 0
        test_loss, test_acc = 0., 0.
        net.eval()
        # Evaluation never calls backward(), so skip building the autograd graph.
        with torch.no_grad():
            for X, y in test_loader:
                y_pred = net(X)
                batch_loss = criterion(y_pred, y)
                test_loss += batch_loss.item()
                test_acc += (y_pred.argmax(dim=1) == y).sum().item()
                test_iters += 1
                test_passed += len(X)

        # Losses are averaged over batches, accuracies over samples.
        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed)
        )

In [None]:
# Train the baseline network for 10 epochs; one metrics line is printed per epoch.
train_model(num_epochs=10)

ep: 0, taked: 2.551, train_loss: 0.5255452503549292, train_acc: 0.8119, test_loss: 0.4641087159514427, test_acc: 0.8278
ep: 1, taked: 2.914, train_loss: 0.3807825371940085, train_acc: 0.8624333333333334, test_loss: 0.4323874294757843, test_acc: 0.8414
ep: 2, taked: 2.493, train_loss: 0.35115152413540696, train_acc: 0.8717333333333334, test_loss: 0.4140103034675121, test_acc: 0.8482
ep: 3, taked: 2.804, train_loss: 0.33045497232295096, train_acc: 0.8773666666666666, test_loss: 0.4110592074692249, test_acc: 0.8554
ep: 4, taked: 4.254, train_loss: 0.31521974041106854, train_acc: 0.8826, test_loss: 0.4480829246342182, test_acc: 0.8509
ep: 5, taked: 3.386, train_loss: 0.30565745354966917, train_acc: 0.88575, test_loss: 0.42148313485085964, test_acc: 0.8558
ep: 6, taked: 3.585, train_loss: 0.29487083351358456, train_acc: 0.8900666666666667, test_loss: 0.4125928644090891, test_acc: 0.8591
ep: 7, taked: 2.861, train_loss: 0.286420871919774, train_acc: 0.8930833333333333, test_loss: 0.398257839

### More layers

In [None]:
# Deeper MLP: three hidden ReLU layers (512 -> 256 -> 128) ahead of the
# 10-output layer. Layers are created in the same order as they are listed.
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    *(
        layer
        for n_in, n_out in ((784, 512), (512, 256), (256, 128))
        for layer in (torch.nn.Linear(n_in, n_out), torch.nn.ReLU())
    ),
    torch.nn.Linear(128, 10),
)

In [None]:
# Build a new Adam optimizer over the parameters of the current `model`;
# train_model reads the notebook-level `trainer`, so it must be rebound here.
# NOTE(review): `model` is only re-created by the cell above, so re-running just
# this cell continues training the existing weights -- the recorded epoch-0
# metrics below look like that happened; confirm with a fresh run.
trainer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
train_model(num_epochs=10)

ep: 0, taked: 5.502, train_loss: 0.2673619069951646, train_acc: 0.9025333333333333, test_loss: 0.4105944700539112, test_acc: 0.8707
ep: 1, taked: 5.781, train_loss: 0.2579672835608746, train_acc: 0.9064166666666666, test_loss: 0.4042858507484198, test_acc: 0.8775
ep: 2, taked: 5.516, train_loss: 0.265577502199944, train_acc: 0.9042833333333333, test_loss: 0.4069808002561331, test_acc: 0.8718
ep: 3, taked: 5.951, train_loss: 0.25379588324972924, train_acc: 0.9075666666666666, test_loss: 0.40211314633488654, test_acc: 0.8778
ep: 4, taked: 5.465, train_loss: 0.2492960200664845, train_acc: 0.9085333333333333, test_loss: 0.3941861007362604, test_acc: 0.8769
ep: 5, taked: 5.499, train_loss: 0.2448240192012584, train_acc: 0.9100666666666667, test_loss: 0.4137551229447126, test_acc: 0.879
ep: 6, taked: 5.419, train_loss: 0.23531387017128316, train_acc: 0.91295, test_loss: 0.40246985424309967, test_acc: 0.8766
ep: 7, taked: 5.297, train_loss: 0.24798560478585832, train_acc: 0.90905, test_loss: 

### BatchNorm + 15 Epochs

In [None]:
# Same 512 -> 256 -> 128 hidden stack, with BatchNorm1d inserted between each
# hidden Linear layer and its ReLU.
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    *(
        layer
        for n_in, n_out in ((784, 512), (512, 256), (256, 128))
        for layer in (
            torch.nn.Linear(n_in, n_out),
            torch.nn.BatchNorm1d(n_out),
            torch.nn.ReLU(),
        )
    ),
    torch.nn.Linear(128, 10),
)

In [None]:
# Build a new Adam optimizer over the parameters of the current `model`;
# train_model reads the notebook-level `trainer`, so it must be rebound here.
trainer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
# This variant is trained for 15 epochs.
train_model(num_epochs=15)

ep: 0, taked: 3.884, train_loss: 0.4697841873828401, train_acc: 0.82845, test_loss: 0.397604320012033, test_acc: 0.8569
ep: 1, taked: 3.580, train_loss: 0.34653277283019207, train_acc: 0.87245, test_loss: 0.40305748693645, test_acc: 0.8509
ep: 2, taked: 3.620, train_loss: 0.30170548146075393, train_acc: 0.8897166666666667, test_loss: 0.38787022288888695, test_acc: 0.8504
ep: 3, taked: 3.602, train_loss: 0.2742162881379432, train_acc: 0.8976, test_loss: 0.36961090844124556, test_acc: 0.8633
ep: 4, taked: 3.777, train_loss: 0.25171961162952666, train_acc: 0.9064833333333333, test_loss: 0.3615937980823219, test_acc: 0.8663
ep: 5, taked: 3.646, train_loss: 0.2309481660102276, train_acc: 0.9149, test_loss: 0.3837518675252795, test_acc: 0.867
ep: 6, taked: 4.342, train_loss: 0.21535365904899353, train_acc: 0.92015, test_loss: 0.38387817549519243, test_acc: 0.8666
ep: 7, taked: 4.099, train_loss: 0.200710488126633, train_acc: 0.9237166666666666, test_loss: 0.3837200918700546, test_acc: 0.8785

### Dropout + 17 Epochs

In [None]:
# BatchNorm network with Dropout(0.5) added after the ReLU of every hidden layer.
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    *(
        layer
        for n_in, n_out in ((784, 512), (512, 256), (256, 128))
        for layer in (
            torch.nn.Linear(n_in, n_out),
            torch.nn.BatchNorm1d(n_out),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),
        )
    ),
    torch.nn.Linear(128, 10),
)

In [None]:
# Build a new Adam optimizer over the parameters of the current `model`;
# train_model reads the notebook-level `trainer`, so it must be rebound here.
# NOTE(review): `model` is only re-created by the cell above, so re-running just
# this cell continues training the existing weights -- the recorded epoch-0
# metrics below look like that happened; confirm with a fresh run.
trainer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
# This variant is trained for 17 epochs.
train_model(num_epochs=17)

ep: 0, taked: 3.350, train_loss: 0.2774702658678623, train_acc: 0.9014, test_loss: 0.32211491782218216, test_acc: 0.8872
ep: 1, taked: 3.945, train_loss: 0.27521205639585533, train_acc: 0.90165, test_loss: 0.3158109850250185, test_acc: 0.8847
ep: 2, taked: 3.407, train_loss: 0.26866213755404694, train_acc: 0.9031833333333333, test_loss: 0.30800861176103356, test_acc: 0.888
ep: 3, taked: 3.573, train_loss: 0.26639323792559033, train_acc: 0.90365, test_loss: 0.3064592137001455, test_acc: 0.8896
ep: 4, taked: 3.633, train_loss: 0.25836192115824275, train_acc: 0.90765, test_loss: 0.3134343011304736, test_acc: 0.8865
ep: 5, taked: 4.048, train_loss: 0.25737611803602667, train_acc: 0.9080166666666667, test_loss: 0.30954485554248096, test_acc: 0.8888
ep: 6, taked: 4.169, train_loss: 0.24996127023341808, train_acc: 0.90945, test_loss: 0.31773582808673384, test_acc: 0.8869
ep: 7, taked: 4.031, train_loss: 0.2511639891786778, train_acc: 0.9103, test_loss: 0.32120816633105276, test_acc: 0.8857
ep