In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader

# Preprocessing pipeline applied to every image when it is read from the dataset.
transform = transforms.Compose([
        # force a 28x28 spatial size
        transforms.Resize(size=(28, 28)),
        # PIL image -> float tensor with values in [0, 1]
        transforms.ToTensor(),
        # (x - 0.5) / 0.5 maps [0, 1] onto [-1, 1] for the single channel
        transforms.Normalize(mean=(0.5, ), std=(0.5, )),
])

#load training and test FashionMNIST dataset
'''
root --> save location if download is needed
train --> which portion of the dataset to load (True = training, False = testing)
download --> download dataset if not present in root
transform --> apply transformations to each image as it is being accessed
'''
# Both splits share every argument except `train`, so build them from one expression:
# the training split (train=True) first, then the held-out test split (train=False).
full_train_dataset, test_dataset = (
        torchvision.datasets.FashionMNIST(
                root='./data', train=is_train, download=True, transform=transform
        )
        for is_train in (True, False)
)

In [3]:
def train_val_split(train_ratio, batch_size=512, num_workers=2, seed=None):
        '''
        Split the training data into train/validation subsets and wrap each split
        (plus the test set) in a DataLoader.

        Reads the notebook-level `full_train_dataset` and `test_dataset`.

        train_ratio --> fraction of full_train_dataset used for training, in (0, 1];
                        the remainder becomes the validation set (empty when 1.0)
        batch_size  --> samples per batch for all three loaders (default 512)
        num_workers --> parallel worker processes per loader (default 2)
        seed        --> optional int; when given, train/validation membership is drawn
                        from a dedicated generator seeded with it, so repeated calls
                        produce the same split. None (default) keeps the original
                        behaviour of drawing the split from torch's global RNG.

        Returns (train_loader, val_loader, test_loader).
        Raises ValueError if train_ratio is outside (0, 1].
        '''
        if not 0 < train_ratio <= 1:
                raise ValueError(f"train_ratio must be in (0, 1], got {train_ratio!r}")

        total_size = len(full_train_dataset)
        train_size = int(train_ratio * total_size)
        val_size = total_size - train_size

        # Only hand random_split a generator when a seed was requested, so the
        # unseeded path behaves exactly as before.
        split_kwargs = {}
        if seed is not None:
                split_kwargs["generator"] = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(
                full_train_dataset, [train_size, val_size], **split_kwargs
        )

        # Only the training loader shuffles; validation and test keep a fixed order
        # so that evaluation is consistent between epochs and runs.
        loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}
        train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
        val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
        test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

        print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")

        return train_loader, val_loader, test_loader

In [4]:
import torch.nn.functional as F

class FashionMNISTNet(nn.Module):
        '''
        Three conv/ReLU/max-pool stages followed by a three-layer classifier head.

        Shape trace for one 1x28x28 input:
                conv1 (3x3, pad 1) -> 32x28x28    pool -> 32x14x14
                conv2 (3x3, pad 2) -> 64x16x16    pool -> 64x8x8
                conv3 (3x3, pad 1) -> 64x8x8      pool -> 64x4x4
                flatten            -> 1024
                fc1 -> 256, fc2 -> 128, fc3 -> 10 raw class scores (logits)
        '''
        def __init__(self):
                super().__init__()
                # Feature extractor; conv2 uses padding=2, which grows 14x14 to 16x16
                # so that the final feature map flattens to exactly 1024 values.
                self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
                self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=2)
                self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
                # One pooling layer is shared by all three stages (it has no parameters).
                self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
                # Classifier head
                self.fc1 = nn.Linear(1024, 256)
                self.fc2 = nn.Linear(256, 128)
                self.fc3 = nn.Linear(128, 10)

        def forward(self, x):
                # conv -> ReLU -> 2x2 max-pool, three times
                for conv in (self.conv1, self.conv2, self.conv3):
                        x = self.pool(F.relu(conv(x)))
                # keep the batch dimension, collapse everything else
                x = x.flatten(start_dim=1)
                for hidden in (self.fc1, self.fc2):
                        x = F.relu(hidden(x))
                # The last layer returns raw scores: no ReLU and no softmax here.
                return self.fc3(x)
        
# Build one instance and print its layer summary to sanity-check the architecture.
net = FashionMNISTNet()
print(net)

FashionMNISTNet(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)


In [5]:
def evaluate_model(model, test_loader, device):
        '''
        Compute and print the classification accuracy of `model` over `test_loader`.

        model       --> nn.Module mapping a batch of inputs to per-class scores
        test_loader --> iterable yielding (inputs, labels) batches
        device      --> device the batches are moved to before the forward pass

        The model is switched to eval mode for the duration of the evaluation and
        its previous train/eval mode is restored afterwards.

        Returns the accuracy as a percentage (float). If the loader yields no
        samples, a notice is printed and NaN is returned instead of raising
        ZeroDivisionError.
        '''
        correct = 0
        total = 0

        # Remember the current mode so this function has no lasting effect on it.
        was_training = model.training
        model.eval()
        try:
                # Disable gradient calculation
                with torch.no_grad():
                        for inputs, labels in test_loader:
                                # Move the inputs and labels to the GPU if available
                                inputs = inputs.to(device)
                                labels = labels.to(device)

                                # Predicted class = index of the largest score in each row
                                predicted = model(inputs).argmax(dim=1)

                                # Update the total number of samples and correct predictions
                                total += labels.size(0)
                                correct += (predicted == labels).sum().item()
        finally:
                model.train(was_training)

        if total == 0:
                print("Accuracy: n/a (no samples to evaluate)")
                return float("nan")

        # Calculate the accuracy
        accuracy = 100 * correct / total
        print(f"Accuracy: {accuracy:.2f}%")
        return accuracy

In [6]:
def train_net(model, train_loader, val_loader, device, criterion, optimizer, num_epochs=15):
        '''
        Train `model` and report the training loss / validation accuracy after every epoch.

        model        --> nn.Module to optimise (updated in place)
        train_loader --> iterable of (inputs, labels) batches used for the gradient steps
        val_loader   --> iterable of (inputs, labels) batches scored after each epoch;
                         may be empty, in which case only the loss is reported
        device       --> device the batches are moved to
        criterion    --> loss function called as criterion(outputs, labels)
        optimizer    --> optimizer already bound to model.parameters()
        num_epochs   --> number of passes over train_loader (default 15)

        Returns a list with one dict per epoch:
                {"epoch": 1-based epoch number,
                 "loss": mean of the per-batch training losses (NaN if there were no batches),
                 "val_acc": validation accuracy in percent, or None if val_loader was empty}
        The model is left in eval mode once at least one epoch has run.
        '''
        history = []
        for epoch in range(num_epochs):
                # --- training ---
                model.train()
                running_loss = 0.0
                num_batches = 0
                for inputs, labels in train_loader:
                        inputs = inputs.to(device)
                        labels = labels.to(device)

                        optimizer.zero_grad()
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()

                        running_loss += loss.item()
                        num_batches += 1

                # Count batches explicitly: relying on the loop index would raise
                # when train_loader yields nothing.
                avg_loss = running_loss / num_batches if num_batches else float("nan")

                # --- validation ---
                model.eval()
                correct = 0
                total = 0
                with torch.no_grad():
                        for inputs, labels in val_loader:
                                inputs, labels = inputs.to(device), labels.to(device)
                                predicted = model(inputs).argmax(dim=1)
                                total += labels.size(0)
                                correct += (predicted == labels).sum().item()

                if total == 0:
                        # No validation samples (e.g. the whole training set is used for training)
                        val_acc = None
                        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")
                else:
                        val_acc = 100 * correct / total
                        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Acc: {val_acc:.2f}%")

                history.append({"epoch": epoch + 1, "loss": avg_loss, "val_acc": val_acc})

        return history

                


In [None]:
'''
Below - Training with differing validation set sizes:

The test-set accuracy for each validation-set size is as follows:
0% validation set: 87.28%
10% validation set: 86.78%
20% validation set: 86.57%
30% validation set: 78.45%
40% validation set: 84.71%

One general trend between accuracy and the size of the validation set was that as the size of 
the validation set increased, the accuracy of the trained model decreased. After further research,
I learned that the decrease in accuracy is likely because less data is left for the 
model to train on when more of the full training set is allocated for validation. The 
difference in accuracy from 0%-20% was minimal, but accuracy took a big dip from 20%-30% validation, likely
indicating that the model underfit when 30% of the full training set was used for validation, as 
the model did not have enough data to work with.

Initially, one confusing part was the rebound in accuracy at 40%. However, I believe this is because
the training/validation split was made randomly (no seed was fixed), so each model was trained on 
different data, which results in normal run-to-run fluctuation of the accuracy, although the general trend 
remains. The 30% run also had very unstable validation accuracy (e.g. 39.94% at epoch 10), which suggests
its low score is partly this kind of noise. It is difficult to get a consistent comparison between these runs
because both the size and the contents of the training set change from run to run. The validation accuracy
itself, however, should be a more reliable estimate the larger the validation set is, since more data is used to 
validate the model during training.

All in all, the size of the validation set is a trade-off between the risk of underfitting and the reliability of the
validation accuracy (as well as per-epoch training time). The larger the validation set, the more reliable the validation
accuracy, but the smaller the training set, which results in shorter training time but potentially a less accurate model.
If the dataset is large enough, though, this shouldn't be a big issue.

Moving forward to the learning-rate experiments, I will use the 10-90 validation-training split, as the 0%-20% 
validation set sizes did not differ significantly in accuracy. 
'''

In [30]:
# 1. Trained with no validation set
# Baseline run: train_ratio=1.0 puts the whole training set into the training split,
# so val_loader is empty and train_net reports the loss only.
no_val_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
no_val_net.to(device)

# Define the loss function and the optimizer (SGD, 0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(no_val_net.parameters(), lr=0.1)

# 100% training / 0% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=1.0)

train_net(no_val_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(no_val_net, test_loader, device)


Using device: cuda
Train: 60000 | Val: 0 | Test: 10000
Epoch [1/15], Loss: 1.8478
Epoch [2/15], Loss: 0.8143
Epoch [3/15], Loss: 0.6209
Epoch [4/15], Loss: 0.5441
Epoch [5/15], Loss: 0.4826
Epoch [6/15], Loss: 0.4376
Epoch [7/15], Loss: 0.4045
Epoch [8/15], Loss: 0.3815
Epoch [9/15], Loss: 0.3633
Epoch [10/15], Loss: 0.3413
Epoch [11/15], Loss: 0.3274
Epoch [12/15], Loss: 0.3168
Epoch [13/15], Loss: 0.3060
Epoch [14/15], Loss: 0.2961
Epoch [15/15], Loss: 0.2844
Accuracy: 87.28%


In [31]:
# 2. Trained with 10% validation set
# Same setup as the baseline run, but 10% of the training data is held out for validation.
ten_val_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
ten_val_net.to(device)

# Define the loss function and the optimizer (SGD, 0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(ten_val_net.parameters(), lr=0.1)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(ten_val_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(ten_val_net, test_loader, device)

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.0532, Val Acc: 56.17%
Epoch [2/15], Loss: 0.9493, Val Acc: 64.67%
Epoch [3/15], Loss: 0.6862, Val Acc: 75.27%
Epoch [4/15], Loss: 0.5957, Val Acc: 73.23%
Epoch [5/15], Loss: 0.5247, Val Acc: 78.47%
Epoch [6/15], Loss: 0.4793, Val Acc: 81.72%
Epoch [7/15], Loss: 0.4418, Val Acc: 83.47%
Epoch [8/15], Loss: 0.4159, Val Acc: 81.20%
Epoch [9/15], Loss: 0.3917, Val Acc: 84.45%
Epoch [10/15], Loss: 0.3716, Val Acc: 84.95%
Epoch [11/15], Loss: 0.3531, Val Acc: 85.60%
Epoch [12/15], Loss: 0.3390, Val Acc: 86.67%
Epoch [13/15], Loss: 0.3260, Val Acc: 86.10%
Epoch [14/15], Loss: 0.3159, Val Acc: 85.85%
Epoch [15/15], Loss: 0.3035, Val Acc: 87.15%
Accuracy: 86.78%


In [32]:
# 3. Trained with 20% validation set
# Same setup as the previous runs, but 20% of the training data is held out for validation.
twenty_val_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
twenty_val_net.to(device)

# Define the loss function and the optimizer (SGD, 0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(twenty_val_net.parameters(), lr=0.1)

# 80% training / 20% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.8)

train_net(twenty_val_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(twenty_val_net, test_loader, device)

Using device: cuda
Train: 48000 | Val: 12000 | Test: 10000
Epoch [1/15], Loss: 2.1238, Val Acc: 26.60%
Epoch [2/15], Loss: 1.1016, Val Acc: 61.98%
Epoch [3/15], Loss: 0.7309, Val Acc: 70.56%
Epoch [4/15], Loss: 0.6272, Val Acc: 75.03%
Epoch [5/15], Loss: 0.5598, Val Acc: 80.89%
Epoch [6/15], Loss: 0.5135, Val Acc: 81.32%
Epoch [7/15], Loss: 0.4776, Val Acc: 81.37%
Epoch [8/15], Loss: 0.4495, Val Acc: 82.20%
Epoch [9/15], Loss: 0.4237, Val Acc: 81.81%
Epoch [10/15], Loss: 0.3964, Val Acc: 84.22%
Epoch [11/15], Loss: 0.3843, Val Acc: 86.17%
Epoch [12/15], Loss: 0.3685, Val Acc: 83.92%
Epoch [13/15], Loss: 0.3505, Val Acc: 86.00%
Epoch [14/15], Loss: 0.3419, Val Acc: 86.66%
Epoch [15/15], Loss: 0.3290, Val Acc: 87.78%
Accuracy: 86.57%


In [33]:
# 4. Trained with 30% validation set
# Same setup as the previous runs, but 30% of the training data is held out for validation.
thirty_val_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
thirty_val_net.to(device)

# Define the loss function and the optimizer (SGD, 0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(thirty_val_net.parameters(), lr=0.1)

# 70% training / 30% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.7)

train_net(thirty_val_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(thirty_val_net, test_loader, device)

Using device: cuda
Train: 42000 | Val: 18000 | Test: 10000
Epoch [1/15], Loss: 2.1463, Val Acc: 23.76%
Epoch [2/15], Loss: 1.0852, Val Acc: 39.68%
Epoch [3/15], Loss: 0.8286, Val Acc: 59.61%
Epoch [4/15], Loss: 0.6396, Val Acc: 65.60%
Epoch [5/15], Loss: 0.5743, Val Acc: 69.29%
Epoch [6/15], Loss: 0.5401, Val Acc: 55.15%
Epoch [7/15], Loss: 0.5093, Val Acc: 68.13%
Epoch [8/15], Loss: 0.4687, Val Acc: 73.27%
Epoch [9/15], Loss: 0.4304, Val Acc: 75.32%
Epoch [10/15], Loss: 0.4190, Val Acc: 39.94%
Epoch [11/15], Loss: 0.5142, Val Acc: 73.03%
Epoch [12/15], Loss: 0.3934, Val Acc: 67.86%
Epoch [13/15], Loss: 0.3894, Val Acc: 83.32%
Epoch [14/15], Loss: 0.3502, Val Acc: 80.28%
Epoch [15/15], Loss: 0.3403, Val Acc: 79.33%
Accuracy: 78.45%


In [34]:
# 5. Trained with 40% validation set
# Same setup as the previous runs, but 40% of the training data is held out for validation.
# NOTE: "fourty" is a misspelling of "forty"; the variable name is left unchanged here
# in case other cells refer to it.
fourty_val_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
fourty_val_net.to(device)

# Define the loss function and the optimizer (SGD, 0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(fourty_val_net.parameters(), lr=0.1)

# 60% training / 40% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.6)

train_net(fourty_val_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(fourty_val_net, test_loader, device)

Using device: cuda
Train: 36000 | Val: 24000 | Test: 10000
Epoch [1/15], Loss: 2.2716, Val Acc: 36.68%
Epoch [2/15], Loss: 1.4614, Val Acc: 49.98%
Epoch [3/15], Loss: 0.9178, Val Acc: 67.57%
Epoch [4/15], Loss: 0.7411, Val Acc: 72.68%
Epoch [5/15], Loss: 0.6485, Val Acc: 76.72%
Epoch [6/15], Loss: 0.5834, Val Acc: 78.47%
Epoch [7/15], Loss: 0.5371, Val Acc: 80.68%
Epoch [8/15], Loss: 0.5059, Val Acc: 80.10%
Epoch [9/15], Loss: 0.4834, Val Acc: 82.59%
Epoch [10/15], Loss: 0.4507, Val Acc: 83.62%
Epoch [11/15], Loss: 0.4373, Val Acc: 82.45%
Epoch [12/15], Loss: 0.4154, Val Acc: 81.84%
Epoch [13/15], Loss: 0.4028, Val Acc: 82.98%
Epoch [14/15], Loss: 0.3869, Val Acc: 85.95%
Epoch [15/15], Loss: 0.3763, Val Acc: 85.40%
Accuracy: 84.71%


In [None]:
'''
Below - Training with varying SGD learning rates using 10-90 validation-training split:

0.001 LR: 10.00%
0.01  LR: 73.89%
0.1   LR: 86.63%
1     LR: 10.00%
10    LR: 10.00%

The trend appears to be that any learning rate that is too low (0.001) or too high (1, 10) will 
result in the model completely failing within the 15 epochs, leaving it at the minimum accuracy of 10%. The 
minimum accuracy is 10% because FashionMNIST is a 10-way classification problem,
so a pure guess by the model is correct 10% of the time on average.

This result makes sense, as a learning rate that is too low will result in the parameters barely moving, 
because each step taken along the negative gradient is far too small. As a result, the loss of the model
barely decreases and the accuracy barely increases after each epoch. Similarly, a learning rate 
that is too large will move the parameters too far along the negative gradient, potentially
overshooting theta*, the parameters that minimize the cross entropy loss.

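One interesting result is that the CEL during training for the LR 10 model is NaN. With a step this 
large the weights blow up, so the logits overflow to +/-infinity. Intuitively, the model becomes so inaccurate that it
assigns the target a probability of 0 in the probability distribution when in reality the 
target was the correct answer, resulting in the -(1*log(0) + ...) calculation. Mathematically -(1*log(0))
is -log(0), which is +infinity rather than NaN; the NaN appears once infinities enter the computation,
because operations such as inf - inf or 0 * inf are undefined. After a single NaN update the weights
themselves are NaN, so every later loss is NaN as well.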

Here a learning rate of 0.1 had the best accuracy of 86.63%, so I will move forward with that value.
'''

'\nBelow - Training with varying SGD learning rates using 10-90 validation-training split:\n\n0.001 LR: 10.00%\n0.01  LR: 73.89%\n0.1   LR: 86.63%\n1     LR: 10.00%\n10    LR: 10.00%\n\nThe trend appears that that any learning rate too low (0.001) or too high (1, 10) will \nresult in the model completely failing, resulting in a minimum accuracy of 10%. The \nminimum accuracy is 10% due to the fact that the FashionMNIST is a 10-way classification,\nso a pure guess by the model results in a 10% accuracy on average.\n\nThis result makes sense as a learning rate that is too low will result in the parameter barely moving, \nas the step taken towards the negative gradient is far too small. This results in the loss of the model\nto barely decrease and the accuracy to barely increase after each epoch. Similarly, a learning rate \nthat is too large will result in the parameter moving too much towards the negative gradient, and even\npotentially past the minimum cross entropy loss theta*.\n\nOne

In [None]:
# 1. Trained with 0.001 Learning Rate
# Learning-rate sweep: every run in this series uses SGD and the 90/10 train/validation split.
point_o_o_one_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
point_o_o_one_net.to(device)

# Define the loss function and the optimizer (0.001 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_o_o_one_net.parameters(), lr=0.001)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(point_o_o_one_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(point_o_o_one_net, test_loader, device)

#time: 7m 15.8s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.3036, Val Acc: 9.63%
Epoch [2/15], Loss: 2.3031, Val Acc: 9.63%
Epoch [3/15], Loss: 2.3026, Val Acc: 9.63%
Epoch [4/15], Loss: 2.3022, Val Acc: 9.63%
Epoch [5/15], Loss: 2.3017, Val Acc: 9.63%
Epoch [6/15], Loss: 2.3013, Val Acc: 9.63%
Epoch [7/15], Loss: 2.3009, Val Acc: 9.63%
Epoch [8/15], Loss: 2.3006, Val Acc: 9.63%
Epoch [9/15], Loss: 2.3002, Val Acc: 9.63%
Epoch [10/15], Loss: 2.2998, Val Acc: 9.63%
Epoch [11/15], Loss: 2.2994, Val Acc: 9.63%
Epoch [12/15], Loss: 2.2989, Val Acc: 9.63%
Epoch [13/15], Loss: 2.2985, Val Acc: 9.63%
Epoch [14/15], Loss: 2.2981, Val Acc: 9.63%
Epoch [15/15], Loss: 2.2977, Val Acc: 9.63%
Accuracy: 10.00%


In [None]:
# 2. Trained with 0.01 Learning Rate
# Learning-rate sweep: SGD with the 90/10 train/validation split.
point_o_one_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
point_o_one_net.to(device)

# Define the loss function and the optimizer (0.01 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_o_one_net.parameters(), lr=0.01)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(point_o_one_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(point_o_one_net, test_loader, device)

#time: 8m 13.6s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.3024, Val Acc: 14.68%
Epoch [2/15], Loss: 2.2983, Val Acc: 17.22%
Epoch [3/15], Loss: 2.2935, Val Acc: 29.32%
Epoch [4/15], Loss: 2.2861, Val Acc: 42.05%
Epoch [5/15], Loss: 2.2716, Val Acc: 44.42%
Epoch [6/15], Loss: 2.2320, Val Acc: 39.12%
Epoch [7/15], Loss: 2.0406, Val Acc: 46.38%
Epoch [8/15], Loss: 1.3927, Val Acc: 59.17%
Epoch [9/15], Loss: 0.9809, Val Acc: 68.52%
Epoch [10/15], Loss: 0.8481, Val Acc: 67.80%
Epoch [11/15], Loss: 0.7912, Val Acc: 58.43%
Epoch [12/15], Loss: 0.7432, Val Acc: 68.08%
Epoch [13/15], Loss: 0.7028, Val Acc: 67.47%
Epoch [14/15], Loss: 0.6815, Val Acc: 71.98%
Epoch [15/15], Loss: 0.6592, Val Acc: 73.72%
Accuracy: 73.89%


In [None]:
# 3. Trained with 0.1 Learning Rate
# Learning-rate sweep: SGD with the 90/10 train/validation split.
point_one_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
point_one_net.to(device)

# Define the loss function and the optimizer (0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_one_net.parameters(), lr=0.1)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(point_one_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(point_one_net, test_loader, device)

#time: 8m 18.6s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.1207, Val Acc: 42.70%
Epoch [2/15], Loss: 0.9575, Val Acc: 70.85%
Epoch [3/15], Loss: 0.6703, Val Acc: 77.10%
Epoch [4/15], Loss: 0.5746, Val Acc: 79.47%
Epoch [5/15], Loss: 0.5241, Val Acc: 80.88%
Epoch [6/15], Loss: 0.4710, Val Acc: 80.80%
Epoch [7/15], Loss: 0.4352, Val Acc: 81.25%
Epoch [8/15], Loss: 0.4080, Val Acc: 81.75%
Epoch [9/15], Loss: 0.3834, Val Acc: 85.43%
Epoch [10/15], Loss: 0.3656, Val Acc: 79.75%
Epoch [11/15], Loss: 0.3504, Val Acc: 84.85%
Epoch [12/15], Loss: 0.3385, Val Acc: 86.47%
Epoch [13/15], Loss: 0.3239, Val Acc: 85.92%
Epoch [14/15], Loss: 0.3156, Val Acc: 86.23%
Epoch [15/15], Loss: 0.3080, Val Acc: 86.78%
Accuracy: 86.63%


In [None]:
# 4. Trained with 1.0 Learning Rate
# Learning-rate sweep: SGD with the 90/10 train/validation split.
one_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
one_net.to(device)

# Define the loss function and the optimizer (1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(one_net.parameters(), lr=1)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(one_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(one_net, test_loader, device)

#time: 8m 2.7s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.4302, Val Acc: 10.27%
Epoch [2/15], Loss: 2.3031, Val Acc: 9.58%
Epoch [3/15], Loss: 2.3032, Val Acc: 9.93%
Epoch [4/15], Loss: 2.3033, Val Acc: 10.05%
Epoch [5/15], Loss: 2.3031, Val Acc: 9.83%
Epoch [6/15], Loss: 2.3032, Val Acc: 10.17%
Epoch [7/15], Loss: 2.3030, Val Acc: 10.05%
Epoch [8/15], Loss: 2.3033, Val Acc: 9.58%
Epoch [9/15], Loss: 2.3033, Val Acc: 9.93%
Epoch [10/15], Loss: 2.3033, Val Acc: 10.17%
Epoch [11/15], Loss: 2.3032, Val Acc: 9.58%
Epoch [12/15], Loss: 2.3031, Val Acc: 9.57%
Epoch [13/15], Loss: 2.3031, Val Acc: 9.83%
Epoch [14/15], Loss: 2.3033, Val Acc: 10.05%
Epoch [15/15], Loss: 2.3031, Val Acc: 10.27%
Accuracy: 10.00%


In [None]:
# 5. Trained with 10 Learning Rate
# Learning-rate sweep: SGD with the 90/10 train/validation split.
ten_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
ten_net.to(device)

# Define the loss function and the optimizer (10 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(ten_net.parameters(), lr=10)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(ten_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(ten_net, test_loader, device)

#time: 8m 12.8s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: nan, Val Acc: 10.62%
Epoch [2/15], Loss: nan, Val Acc: 10.62%
Epoch [3/15], Loss: nan, Val Acc: 10.62%
Epoch [4/15], Loss: nan, Val Acc: 10.62%
Epoch [5/15], Loss: nan, Val Acc: 10.62%
Epoch [6/15], Loss: nan, Val Acc: 10.62%
Epoch [7/15], Loss: nan, Val Acc: 10.62%
Epoch [8/15], Loss: nan, Val Acc: 10.62%
Epoch [9/15], Loss: nan, Val Acc: 10.62%
Epoch [10/15], Loss: nan, Val Acc: 10.62%
Epoch [11/15], Loss: nan, Val Acc: 10.62%
Epoch [12/15], Loss: nan, Val Acc: 10.62%
Epoch [13/15], Loss: nan, Val Acc: 10.62%
Epoch [14/15], Loss: nan, Val Acc: 10.62%
Epoch [15/15], Loss: nan, Val Acc: 10.62%
Accuracy: 10.00%


In [None]:
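''' 
Below - Training with the Adam algorithm using 10-90 validation-training split:

0.1   LR: 10.00%
0.001 LR: 91.24%

With the learning rate of 0.1 that worked best for SGD, Adam fails completely (10% accuracy), while
with a learning rate of 0.001 it reaches 91.24%, the best accuracy of all the runs.
'''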

In [None]:
# Trained with Adam
# Same 90/10 split as the SGD learning-rate runs, with the optimizer swapped to Adam
# at the learning rate (0.1) that worked best for SGD.
adam_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
adam_net.to(device)

# Define the loss function and the optimizer (0.1 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net.parameters(), lr=0.1)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(adam_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(adam_net, test_loader, device)

#time: 7m 26.4s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 171.5309, Val Acc: 10.13%
Epoch [2/15], Loss: 2.3058, Val Acc: 10.00%
Epoch [3/15], Loss: 2.3063, Val Acc: 10.02%
Epoch [4/15], Loss: 2.3054, Val Acc: 10.02%
Epoch [5/15], Loss: 2.3046, Val Acc: 10.02%
Epoch [6/15], Loss: 2.3049, Val Acc: 9.87%
Epoch [7/15], Loss: 2.3050, Val Acc: 9.70%
Epoch [8/15], Loss: 2.3048, Val Acc: 10.13%
Epoch [9/15], Loss: 2.3052, Val Acc: 10.40%
Epoch [10/15], Loss: 2.3049, Val Acc: 9.70%
Epoch [11/15], Loss: 2.3046, Val Acc: 9.87%
Epoch [12/15], Loss: 2.3048, Val Acc: 10.40%
Epoch [13/15], Loss: 2.3054, Val Acc: 10.40%
Epoch [14/15], Loss: 2.3056, Val Acc: 10.00%
Epoch [15/15], Loss: 2.3059, Val Acc: 9.82%
Accuracy: 10.00%


In [None]:
# Trained with Adam 1e-3 learning rate
# NOTE: this rebinds `adam_net`, so the model from the lr=0.1 Adam run is no longer
# reachable under that name after this cell runs.
adam_net = FashionMNISTNet()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
adam_net.to(device)

# Define the loss function and the optimizer (0.001 LR)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net.parameters(), lr=1e-3)

# 90% training / 10% validation
train_loader, val_loader, test_loader = train_val_split(train_ratio=0.9)

train_net(adam_net, train_loader, val_loader, device, criterion, optimizer)

# Final score on the held-out test set
evaluate_model(adam_net, test_loader, device)

#time: 7m 49.2s

Using device: cpu
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 0.8612, Val Acc: 80.37%
Epoch [2/15], Loss: 0.4806, Val Acc: 84.98%
Epoch [3/15], Loss: 0.3848, Val Acc: 86.75%
Epoch [4/15], Loss: 0.3395, Val Acc: 88.03%
Epoch [5/15], Loss: 0.3019, Val Acc: 88.75%
Epoch [6/15], Loss: 0.2814, Val Acc: 89.65%
Epoch [7/15], Loss: 0.2612, Val Acc: 90.47%
Epoch [8/15], Loss: 0.2481, Val Acc: 90.92%
Epoch [9/15], Loss: 0.2339, Val Acc: 91.02%
Epoch [10/15], Loss: 0.2252, Val Acc: 91.23%
Epoch [11/15], Loss: 0.2109, Val Acc: 91.03%
Epoch [12/15], Loss: 0.1980, Val Acc: 91.67%
Epoch [13/15], Loss: 0.1891, Val Acc: 91.12%
Epoch [14/15], Loss: 0.1835, Val Acc: 91.58%
Epoch [15/15], Loss: 0.1733, Val Acc: 91.48%
Accuracy: 91.24%
