In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mnist-dataset/train-images.idx3-ubyte
/kaggle/input/mnist-dataset/t10k-labels.idx1-ubyte
/kaggle/input/mnist-dataset/t10k-images.idx3-ubyte
/kaggle/input/mnist-dataset/train-labels.idx1-ubyte
/kaggle/input/mnist-dataset/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte
/kaggle/input/mnist-dataset/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte
/kaggle/input/mnist-dataset/train-labels-idx1-ubyte/train-labels-idx1-ubyte
/kaggle/input/mnist-dataset/train-images-idx3-ubyte/train-images-idx3-ubyte


In [2]:
import numpy as np

from tqdm import tqdm, trange

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader

from torchvision.transforms import ToTensor
from torchvision.datasets.mnist import MNIST

np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x78aeedac5890>

In [3]:
def main():
    transform = ToTensor()

    train_set = MNIST(root='./../datasets', train=True, download=True, transform=transform)
    test_set = MNIST(root='./../datasets', train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=128)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")
    model = MyViT((1, 28, 28), n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10).to(device)
    N_EPOCHS = 5
    LR = 0.005

    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss()
    for epoch in trange(N_EPOCHS, desc="Training"):
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} in training", leave=False):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)

            train_loss += loss.detach().cpu().item() / len(train_loader)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f}")

    with torch.no_grad():
        correct, total = 0, 0
        test_loss = 0.0
        for batch in tqdm(test_loader, desc="Testing"):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)
            test_loss += loss.detach().cpu().item() / len(test_loader)

            correct += torch.sum(torch.argmax(y_hat, dim=1) == y).detach().cpu().item()
            total += len(x)
        print(f"Test loss: {test_loss:.2f}")
        print(f"Test accuracy: {correct / total * 100:.2f}%")

In [4]:
class MyViT(nn.Module):
  def __init__(self):
    super(MyViT, self).__init__()

  def forward(self, images):
    pass

In [5]:
def patchify(images, n_patches):
    n, c, h, w = images.shape

    assert h == w, "Patchify method is implemented for square images only"

    patches = torch.zeros(n, n_patches ** 2, h * w * c // n_patches ** 2)
    patch_size = h // n_patches

    for idx, image in enumerate(images):
        for i in range(n_patches):
            for j in range(n_patches):
                patch = image[:, i * patch_size: (i + 1) * patch_size, j * patch_size: (j + 1) * patch_size]
                patches[idx, i * n_patches + j] = patch.flatten()
    return patches

In [6]:
class MyViT(nn.Module):
  def __init__(self, chw=(1, 28, 28), n_patches=7):
    super(MyViT, self).__init__()

    self.chw = chw 
    self.n_patches = n_patches

    assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"

  def forward(self, images):
    patches = patchify(images, self.n_patches)
    return patches

In [7]:
if __name__ == '__main__':
  model = MyViT(
    chw=(1, 28, 28),
    n_patches=7
  )

  x = torch.randn(7, 1, 28, 28)
  print(model(x).shape) 

torch.Size([7, 49, 16])


In [8]:
class MyViT(nn.Module):
  def __init__(self, chw=(1, 28, 28), n_patches=7):
    super(MyViT, self).__init__()

    self.chw = chw
    self.n_patches = n_patches

    assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)
    self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
    self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)

  def forward(self, images):
    patches = patchify(images, self.n_patches)
    tokens = self.linear_mapper(patches)
    return tokens

In [9]:
class MyViT(nn.Module):
  def __init__(self, chw=(1, 28, 28), n_patches=7):
    super(MyViT, self).__init__()
    self.chw = chw
    self.n_patches = n_patches

    assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)
    self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
    self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
    self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))

  def forward(self, images):
    patches = patchify(images, self.n_patches)
    tokens = self.linear_mapper(patches)
    tokens = torch.stack([torch.vstack((self.class_token, tokens[i])) for i in range(len(tokens))])
    return tokens

In [10]:
def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d)
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin(i / (10000 ** (j / d))) if j % 2 == 0 else np.cos(i / (10000 ** ((j - 1) / d)))
    return result

In [11]:
class MyViT(nn.Module):
  def __init__(self, chw=(1, 28, 28), n_patches=7):
    super(MyViT, self).__init__()
    self.chw = chw 
    self.n_patches = n_patches

    assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)

    self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
    self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
    self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
    self.pos_embed = nn.Parameter(torch.tensor(get_positional_embeddings(self.n_patches ** 2 + 1, self.hidden_d)))
    self.pos_embed.requires_grad = False

  def forward(self, images):
    patches = patchify(images, self.n_patches)
    tokens = self.linear_mapper(patches)

    tokens = torch.stack([torch.vstack((self.class_token, tokens[i])) for i in range(len(tokens))])
    pos_embed = self.pos_embed.repeat(n, 1, 1)
    out = tokens + pos_embed
    return out

In [12]:
class MyMSA(nn.Module):
    def __init__(self, d, n_heads=2):
        super(MyMSA, self).__init__()
        self.d = d
        self.n_heads = n_heads

        assert d % n_heads == 0, f"Can't divide dimension {d} into {n_heads} heads"

        d_head = int(d / n_heads)
        self.q_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.k_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.v_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.d_head = d_head
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sequences):

        result = []
        for sequence in sequences:
            seq_result = []
            for head in range(self.n_heads):
                q_mapping = self.q_mappings[head]
                k_mapping = self.k_mappings[head]
                v_mapping = self.v_mappings[head]

                seq = sequence[:, head * self.d_head: (head + 1) * self.d_head]
                q, k, v = q_mapping(seq), k_mapping(seq), v_mapping(seq)

                attention = self.softmax(q @ k.T / (self.d_head ** 0.5))
                seq_result.append(attention @ v)
            result.append(torch.hstack(seq_result))
        return torch.cat([torch.unsqueeze(r, dim=0) for r in result])

In [13]:
class MyViTBlock(nn.Module):
    def __init__(self, hidden_d, n_heads, mlp_ratio=4):
        super(MyViTBlock, self).__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads

        self.norm1 = nn.LayerNorm(hidden_d)
        self.mhsa = MyMSA(hidden_d, n_heads)
        self.norm2 = nn.LayerNorm(hidden_d)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, mlp_ratio * hidden_d),
            nn.GELU(),
            nn.Linear(mlp_ratio * hidden_d, hidden_d)
        )

    def forward(self, x):
        out = x + self.mhsa(self.norm1(x))
        out = out + self.mlp(self.norm2(out))
        return out

In [14]:
if __name__ == '__main__':
  model = MyViTBlock(hidden_d=8, n_heads=2)

  x = torch.randn(7, 50, 8)  
  print(model(x).shape) 

torch.Size([7, 50, 8])


In [15]:
class MyViT(nn.Module):
    def __init__(self, chw, n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10):
        super(MyViT, self).__init__()
        
        self.chw = chw 
        self.n_patches = n_patches
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_d = hidden_d
        assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)

        self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
        
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
        
        self.register_buffer('positional_embeddings', get_positional_embeddings(n_patches ** 2 + 1, hidden_d), persistent=False)
        
        self.blocks = nn.ModuleList([MyViTBlock(hidden_d, n_heads) for _ in range(n_blocks)])

    def forward(self, images):
        n, c, h, w = images.shape
        patches = patchify(images, self.n_patches).to(self.positional_embeddings.device)
        
        tokens = self.linear_mapper(patches)
        
        tokens = torch.cat((self.class_token.expand(n, 1, -1), tokens), dim=1)
        
        out = tokens + self.positional_embeddings.repeat(n, 1, 1)
        
        for block in self.blocks:
            out = block(out)
            
        return out

In [16]:
class MyViT(nn.Module):
    def __init__(self, chw, n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10):
        super(MyViT, self).__init__()
        
        self.chw = chw 
        self.n_patches = n_patches
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_d = hidden_d
        
        assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)

        self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
        
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
        
        self.register_buffer('positional_embeddings', get_positional_embeddings(n_patches ** 2 + 1, hidden_d), persistent=False)
        
        self.blocks = nn.ModuleList([MyViTBlock(hidden_d, n_heads) for _ in range(n_blocks)])
        
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_d, out_d),
            nn.Softmax(dim=-1)
        )

    def forward(self, images):
        n, c, h, w = images.shape
        patches = patchify(images, self.n_patches).to(self.positional_embeddings.device)
        
        tokens = self.linear_mapper(patches)
        
        tokens = torch.cat((self.class_token.expand(n, 1, -1), tokens), dim=1)
        
        out = tokens + self.positional_embeddings.repeat(n, 1, 1)
        
        for block in self.blocks:
            out = block(out)
            
        out = out[:, 0]
        
        return self.mlp(out) 

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MyViT((1, 28, 28), n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10).to(device)

In [18]:
import numpy as np
from tqdm import tqdm, trange
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchvision.datasets.mnist import MNIST

np.random.seed(0)
torch.manual_seed(0)


def main():
    transform = ToTensor()

    train_set = MNIST(root='./../datasets', train=True, download=True, transform=transform)
    test_set = MNIST(root='./../datasets', train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=128)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")
    model = MyViT((1, 28, 28), n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10).to(device)
    N_EPOCHS = 5
    LR = 0.005

    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss()
    for epoch in trange(N_EPOCHS, desc="Training"):
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} in training", leave=False):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)

            train_loss += loss.detach().cpu().item() / len(train_loader)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f}")

    with torch.no_grad():
        actual_labels = []
        predicted_labels = []

        for batch in tqdm(test_loader, desc="Making Predictions"):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)

            _, predicted = torch.max(y_hat, 1)
            actual_labels.extend(y.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

        print("Actual labels for the first 10 samples:")
        print(actual_labels[:10])
        print("Predicted labels for the first 10 samples:")
        print(predicted_labels[:10])
    with torch.no_grad():
        correct, total = 0, 0
        test_loss = 0.0
        predicted_labels = []

        for batch in tqdm(test_loader, desc="Testing"):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)
            test_loss += loss.detach().cpu().item() / len(test_loader)

            _, predicted = torch.max(y_hat, 1)
            predicted_labels.extend(predicted.cpu().numpy())

            correct += torch.sum(predicted == y).detach().cpu().item()
            total += len(x)

        print(f"Test loss: {test_loss:.2f}")
        print(f"Test accuracy: {correct / total * 100:.2f}%")

        print("Predicted labels for the first 10 samples:")
        print(predicted_labels[:10])


class MyViTBlock(nn.Module):
    def __init__(self, hidden_d, n_heads, mlp_ratio=4):
        super(MyViTBlock, self).__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads

        self.norm1 = nn.LayerNorm(hidden_d)
        self.mhsa = MyMSA(hidden_d, n_heads)
        self.norm2 = nn.LayerNorm(hidden_d)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, mlp_ratio * hidden_d),
            nn.GELU(),
            nn.Linear(mlp_ratio * hidden_d, hidden_d)
        )

    def forward(self, x):
        out = x + self.mhsa(self.norm1(x))
        out = out + self.mlp(self.norm2(out))
        return out


class MyViT(nn.Module):
    def __init__(self, chw, n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10):
        super(MyViT, self).__init__()

        self.chw = chw  
        self.n_patches = n_patches
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_d = hidden_d

        assert chw[1] % n_patches == 0, "Input shape not entirely divisible by the number of patches"
        assert chw[2] % n_patches == 0, "Input shape not entirely divisible by the number of patches"
        self.patch_size = (chw[1] // n_patches, chw[2] // n_patches)

        self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
        self.linear_mapper = nn.Linear(self.input_d, hidden_d)

        self.class_token = nn.Parameter(torch.rand(1, hidden_d))
        self.register_buffer('positional_embeddings', get_positional_embeddings(n_patches ** 2 + 1, hidden_d),
                             persistent=False)

        self.blocks = nn.ModuleList([MyViTBlock(hidden_d, n_heads) for _ in range(n_blocks)])
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, out_d),
        )

    def forward(self, images):
        n, c, h, w = images.shape
        patches = patchify(images, self.n_patches).to(self.positional_embeddings.device)

        tokens = self.linear_mapper(patches)
        tokens = torch.cat((self.class_token.expand(n, 1, -1), tokens), dim=1)
        out = tokens + self.positional_embeddings.repeat(n, 1, 1)

        for block in self.blocks:
            out = block(out)

        out = out[:, 0]
        out = self.mlp(out)

        return out


class MyViTBlock(nn.Module):
    def __init__(self, hidden_d, n_heads, mlp_ratio=4):
        super(MyViTBlock, self).__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads

        self.norm1 = nn.LayerNorm(hidden_d)
        self.mhsa = MyMSA(hidden_d, n_heads)
        self.norm2 = nn.LayerNorm(hidden_d)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, mlp_ratio * hidden_d),
            nn.GELU(),
            nn.Linear(mlp_ratio * hidden_d, hidden_d)
        )

    def forward(self, x):
        out = x + self.mhsa(self.norm1(x))
        out = out + self.mlp(self.norm2(out))
        return out


class MyMSA(nn.Module):
    def __init__(self, hidden_d, n_heads):
        super(MyMSA, self).__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads

        self.qkv = nn.Linear(hidden_d, hidden_d * 3)
        self.fc = nn.Linear(hidden_d, hidden_d)

    def forward(self, x):
        qkv = self.qkv(x)
        q, k, v = torch.chunk(qkv, 3, dim=-1)

        q = q.view(x.size(0), -1, self.n_heads, self.hidden_d // self.n_heads)
        k = k.view(x.size(0), -1, self.n_heads, self.hidden_d // self.n_heads)
        v = v.view(x.size(0), -1, self.n_heads, self.hidden_d // self.n_heads)

        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.hidden_d // self.n_heads) ** 0.5
        attn = torch.softmax(scores, dim=-1)

        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(x.size(0), -1, self.hidden_d)

        out = self.fc(out)

        return out


def patchify(images, n_patches):
    n, c, h, w = images.shape

    assert h == w, "Patchify method is implemented for square images only"

    patches = torch.zeros(n, n_patches ** 2, h * w * c // n_patches ** 2)
    patch_size = h // n_patches

    for idx, image in enumerate(images):
        for i in range(n_patches):
            for j in range(n_patches):
                patch = image[:, i * patch_size: (i + 1) * patch_size, j * patch_size: (j + 1) * patch_size]
                patches[idx, i * n_patches + j] = patch.flatten()
    return patches


def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d)
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin(i / (10000 ** (j / d))) if j % 2 == 0 else np.cos(i / (10000 ** ((j - 1) / d)))
    return result


if __name__ == '__main__':
    main()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./../datasets/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 135501244.16it/s]


Extracting ./../datasets/MNIST/raw/train-images-idx3-ubyte.gz to ./../datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./../datasets/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 43062813.30it/s]


Extracting ./../datasets/MNIST/raw/train-labels-idx1-ubyte.gz to ./../datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./../datasets/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 37460141.89it/s]


Extracting ./../datasets/MNIST/raw/t10k-images-idx3-ubyte.gz to ./../datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./../datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7161852.92it/s]


Extracting ./../datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./../datasets/MNIST/raw

Using device:  cuda (Tesla P100-PCIE-16GB)


Training:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 1 in training:   0%|          | 1/469 [00:00<05:54,  1.32it/s][A
Epoch 1 in training:   0%|          | 2/469 [00:01<03:50,  2.03it/s][A
Epoch 1 in training:   1%|          | 3/469 [00:01<03:11,  2.43it/s][A
Epoch 1 in training:   1%|          | 4/469 [00:01<02:53,  2.69it/s][A
Epoch 1 in training:   1%|          | 5/469 [00:02<02:42,  2.86it/s][A
Epoch 1 in training:   1%|▏         | 6/469 [00:02<02:35,  2.98it/s][A
Epoch 1 in training:   1%|▏         | 7/469 [00:02<02:31,  3.05it/s][A
Epoch 1 in training:   2%|▏         | 8/469 [00:02<02:27,  3.12it/s][A
Epoch 1 in training:   2%|▏         | 9/469 [00:03<02:26,  3.15it/s][A
Epoch 1 in training:   2%|▏         | 10/469 [00:03<02:24,  3.18it/s][A
Epoch 1 in training:   2%|▏         | 11/469 [00:03<02:23,  3.18it/s][A
Epoch 1 in training:   3%|▎         | 12/469 [00:04<02:23,  3.19it/s][A
Epoch 1 in training: 

Epoch 1/5 loss: 1.62



Epoch 2 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 2 in training:   0%|          | 1/469 [00:00<02:27,  3.17it/s][A
Epoch 2 in training:   0%|          | 2/469 [00:00<02:25,  3.21it/s][A
Epoch 2 in training:   1%|          | 3/469 [00:00<02:24,  3.23it/s][A
Epoch 2 in training:   1%|          | 4/469 [00:01<02:24,  3.21it/s][A
Epoch 2 in training:   1%|          | 5/469 [00:01<02:23,  3.23it/s][A
Epoch 2 in training:   1%|▏         | 6/469 [00:01<02:23,  3.24it/s][A
Epoch 2 in training:   1%|▏         | 7/469 [00:02<02:30,  3.07it/s][A
Epoch 2 in training:   2%|▏         | 8/469 [00:02<02:31,  3.04it/s][A
Epoch 2 in training:   2%|▏         | 9/469 [00:02<02:29,  3.09it/s][A
Epoch 2 in training:   2%|▏         | 10/469 [00:03<02:27,  3.11it/s][A
Epoch 2 in training:   2%|▏         | 11/469 [00:03<02:25,  3.14it/s][A
Epoch 2 in training:   3%|▎         | 12/469 [00:03<02:24,  3.15it/s][A
Epoch 2 in training:   3%|▎         | 13/469 [00:04<02:23,  3.18it/s

Epoch 2/5 loss: 0.83



Epoch 3 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 3 in training:   0%|          | 1/469 [00:00<02:24,  3.25it/s][A
Epoch 3 in training:   0%|          | 2/469 [00:00<02:24,  3.23it/s][A
Epoch 3 in training:   1%|          | 3/469 [00:00<02:23,  3.25it/s][A
Epoch 3 in training:   1%|          | 4/469 [00:01<02:23,  3.23it/s][A
Epoch 3 in training:   1%|          | 5/469 [00:01<02:24,  3.22it/s][A
Epoch 3 in training:   1%|▏         | 6/469 [00:01<02:23,  3.22it/s][A
Epoch 3 in training:   1%|▏         | 7/469 [00:02<02:23,  3.22it/s][A
Epoch 3 in training:   2%|▏         | 8/469 [00:02<02:27,  3.12it/s][A
Epoch 3 in training:   2%|▏         | 9/469 [00:02<02:25,  3.16it/s][A
Epoch 3 in training:   2%|▏         | 10/469 [00:03<02:24,  3.19it/s][A
Epoch 3 in training:   2%|▏         | 11/469 [00:03<02:23,  3.20it/s][A
Epoch 3 in training:   3%|▎         | 12/469 [00:03<02:22,  3.21it/s][A
Epoch 3 in training:   3%|▎         | 13/469 [00:04<02:20,  3.24it/s

Epoch 3/5 loss: 0.61



Epoch 4 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 4 in training:   0%|          | 1/469 [00:00<02:25,  3.21it/s][A
Epoch 4 in training:   0%|          | 2/469 [00:00<02:24,  3.23it/s][A
Epoch 4 in training:   1%|          | 3/469 [00:00<02:24,  3.23it/s][A
Epoch 4 in training:   1%|          | 4/469 [00:01<02:23,  3.23it/s][A
Epoch 4 in training:   1%|          | 5/469 [00:01<02:26,  3.18it/s][A
Epoch 4 in training:   1%|▏         | 6/469 [00:01<02:24,  3.20it/s][A
Epoch 4 in training:   1%|▏         | 7/469 [00:02<02:23,  3.21it/s][A
Epoch 4 in training:   2%|▏         | 8/469 [00:02<02:23,  3.22it/s][A
Epoch 4 in training:   2%|▏         | 9/469 [00:02<02:22,  3.23it/s][A
Epoch 4 in training:   2%|▏         | 10/469 [00:03<02:21,  3.23it/s][A
Epoch 4 in training:   2%|▏         | 11/469 [00:03<02:21,  3.24it/s][A
Epoch 4 in training:   3%|▎         | 12/469 [00:03<02:22,  3.22it/s][A
Epoch 4 in training:   3%|▎         | 13/469 [00:04<02:21,  3.22it/s

Epoch 4/5 loss: 0.48



Epoch 5 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 5 in training:   0%|          | 1/469 [00:00<02:30,  3.11it/s][A
Epoch 5 in training:   0%|          | 2/469 [00:00<02:37,  2.97it/s][A
Epoch 5 in training:   1%|          | 3/469 [00:01<02:39,  2.91it/s][A
Epoch 5 in training:   1%|          | 4/469 [00:01<02:36,  2.97it/s][A
Epoch 5 in training:   1%|          | 5/469 [00:01<02:32,  3.04it/s][A
Epoch 5 in training:   1%|▏         | 6/469 [00:01<02:29,  3.10it/s][A
Epoch 5 in training:   1%|▏         | 7/469 [00:02<02:26,  3.14it/s][A
Epoch 5 in training:   2%|▏         | 8/469 [00:02<02:25,  3.17it/s][A
Epoch 5 in training:   2%|▏         | 9/469 [00:02<02:24,  3.18it/s][A
Epoch 5 in training:   2%|▏         | 10/469 [00:03<02:23,  3.21it/s][A
Epoch 5 in training:   2%|▏         | 11/469 [00:03<02:22,  3.21it/s][A
Epoch 5 in training:   3%|▎         | 12/469 [00:03<02:21,  3.22it/s][A
Epoch 5 in training:   3%|▎         | 13/469 [00:04<02:20,  3.24it/s

Epoch 5/5 loss: 0.41


Making Predictions: 100%|██████████| 79/79 [00:23<00:00,  3.31it/s]


Actual labels for the first 10 samples:
[7, 2, 1, 0, 4, 1, 4, 9, 5, 9]
Predicted labels for the first 10 samples:
[7, 2, 1, 0, 4, 1, 4, 4, 0, 9]


Testing: 100%|██████████| 79/79 [00:23<00:00,  3.31it/s]

Test loss: 0.36
Test accuracy: 88.57%
Predicted labels for the first 10 samples:
[7, 2, 1, 0, 4, 1, 4, 4, 0, 9]



