In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader

# Per-image preprocessing pipeline, applied in this order:
#   Resize    -> resize the input to 28x28
#   ToTensor  -> convert the image to a PyTorch tensor in the [0, 1] range
#   Normalize -> with mean 0.5 and std 0.5, rescale values to the [-1, 1] range
transform = transforms.Compose(
    [
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Load the FashionMNIST training and test datasets.
#   root      --> save location if a download is needed
#   train     --> which portion of the dataset to load (True: training, False: testing)
#   download  --> download the dataset if it is not present in root
#   transform --> transformations applied to each image as it is being accessed
full_train_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

test_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

In [14]:
def train_val_split(train_ratio, batch_size=512, num_workers=2, seed=None):
    """Split the full training set into train/validation parts and build the DataLoaders.

    Reads the notebook-level ``full_train_dataset`` and ``test_dataset``.

    Args:
        train_ratio: fraction of ``full_train_dataset`` kept for training, in (0, 1].
            The remaining samples form the validation set (empty when the ratio is 1.0).
        batch_size: number of samples per batch for all three loaders (default 512).
        num_workers: number of parallel worker processes each loader uses (default 2).
        seed: optional integer. When given, the split is drawn from a dedicated
            ``torch.Generator`` seeded with it, so repeated calls place the same
            samples in the same subset. ``None`` (default) uses the global RNG,
            exactly as before. Only the split is seeded, not the training shuffle.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.

    Raises:
        ValueError: if ``train_ratio`` is not in (0, 1].
    """
    if not 0.0 < train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in (0, 1], got {train_ratio!r}")

    # Sizes of the two subsets; whatever is not used for training becomes validation
    n_total = len(full_train_dataset)
    train_size = int(train_ratio * n_total)
    val_size = n_total - train_size

    # Only pass a generator when a seed was requested, so the default call
    # consumes the global RNG exactly like the original implementation.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        full_train_dataset, [train_size, val_size], **split_kwargs
    )

    # shuffle=True only for training so the sample order changes between epochs;
    # validation/test keep a fixed order for consistent, reproducible evaluation.
    loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")

    return train_loader, val_loader, test_loader

In [15]:
import torch.nn.functional as F

class FashionMNISTNet(nn.Module):
    """CNN classifier: three conv + ReLU + max-pool stages, then three linear layers.

    Shape trace for a single 1x28x28 input (channels x height x width):
        conv1 (padding 1) -> 32x28x28, max-pool -> 32x14x14
        conv2 (padding 2) -> 64x16x16, max-pool -> 64x8x8
        conv3 (padding 1) -> 64x8x8,   max-pool -> 64x4x4
        flatten           -> 1024 (64 * 4 * 4)
        fc1 -> 256, fc2 -> 128, fc3 -> 10 output scores
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor (all 3x3 kernels, stride 1)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)

        # One 2x2 / stride-2 pooling layer, reused after every convolution
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected classifier head
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 raw (un-normalised) class scores for each sample in ``x``."""
        # conv -> ReLU -> pool, three times
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Collapse everything except the batch dimension
        x = x.flatten(start_dim=1)

        # Hidden linear layers use ReLU; the last layer has no activation (no softmax here)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)


net = FashionMNISTNet()
print(net)

FashionMNISTNet(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)


In [16]:
def evaluate_model(model, test_loader, device):
    """Compute, print and return the classification accuracy of ``model`` on ``test_loader``.

    Args:
        model: network whose forward pass returns one score per class for each sample.
        test_loader: iterable yielding ``(inputs, labels)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a percentage (float), or ``None`` when the loader yields no samples.
    """
    # Put the model in inference mode so the result does not depend on whatever
    # mode the caller left it in.
    model.eval()

    correct = 0
    total = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Move the batch to the target device (GPU if available)
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Predicted class = index of the highest score in each row
            predicted = model(inputs).argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader previously crashed with ZeroDivisionError
    if total == 0:
        print("Accuracy: n/a (no samples to evaluate)")
        return None

    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

In [17]:
def train_net(model, train_loader, val_loader, device, criterion, optimizer, num_epochs=15):
    """Train ``model`` and report the training loss and validation accuracy after every epoch.

    Args:
        model: network to train (updated in place).
        train_loader: iterable yielding ``(inputs, labels)`` training batches.
        val_loader: iterable yielding ``(inputs, labels)`` validation batches; may be
            empty, in which case no validation accuracy is reported.
        device: device the batches are moved to.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer holding the parameters of ``model``.
        num_epochs: number of passes over ``train_loader`` (default 15).

    Returns:
        A list with one ``(avg_loss, val_acc)`` tuple per epoch. ``avg_loss`` is the
        mean of the per-batch losses; ``val_acc`` is a percentage, or ``None`` when
        the validation loader yielded no samples.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    history = []
    for epoch in range(num_epochs):
        # --- training ---
        model.train()
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # forward pass --> loss --> backward pass --> parameter update
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Counting batches explicitly avoids relying on the loop index, which is
        # undefined when the loader is empty.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")
        avg_loss = running_loss / num_batches

        # --- validation ---
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                predicted = model(inputs).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        if total == 0:
            # No validation data (e.g. a 100% training split): report the loss only
            val_acc = None
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")
        else:
            val_acc = 100 * correct / total
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Acc: {val_acc:.2f}%")

        history.append((avg_loss, val_acc))

    return history

                


In [None]:
'''
Below - Training with differing validation set sizes:

The accuracy of the different validation sets is as follows:
0% validation set: 85.00%
10% validation set: 87.15%
20% validation set: 87.48%
30% validation set: 82.67%
40% validation set: 83.93%

One general trend between accuracy and the size of the validation set was that as the size of 
the validation set increased, the accuracy of the trained model decreased. Doing further research
I learned that the decrease in accuracy is likely due to the fact that there is less data for the 
model to train on when more data is allocated for validation within the full training set. The 
difference in accuracy from 0%-20% was minimal, but took a big dip from 20%-30% validation, likely
indicating that the model underfit when 30% of the full training set was used for validation, as 
the model did not have enough data to work with.

Initially, one confusing part was the rebound in accuracy at 40%, however I believe this is due to the fact that
the training/validation split was made randomly, and therefore the model might have been trained on 
different data, resulting in a normal fluctuation of the accuracy, although the general trend 
remains. It might be difficult to get a consistent evaluation of these due to the fact that the size 
and contents of the training set are hard to normalize due to the difference in size, although the validation
set should be more accurate the larger the validation set is, due to there being more data used to 
validate the model during training.

All in all, the size of the validation set is a balance between the risk of underfitting vs validation accuracy/
per epoch training time. The larger the validation set, the more accurate the validation, but the smaller the training 
set, which results in shorter training time, but potentially a less accurate model. Although if the database is large
enough it shouldn't be a big issue.

Moving forward to the learning-rate experiments, I will pick the 10-90 validation-training split, as the 0%-20% 
validation set sizes did not differ significantly in accuracy. 
'''

"\nBelow - Training with differing validation set sizes:\n\nThe accuracy of the different validation sets are as following:\n0% validation set: 87.28%\n10% validation set: 86.78%\n20% validation set: 86.57%\n30% validation set: 78.45%\n40% validation set: 84.71%\n\nOne general trend between acuraccy and the size of the validation set was that as the size of \nthe validation set increased, the accuracy of the trained model decreased. Doing further research\nI learned that the decrease in accuracy is likely due to the fact that there is less data for the \nmodel to train on when more data is allocated for validation within the full training set. The \ndifference in accuracy from 0%-20% was minimal, but took a big dip from 20%-30% validation, likely\nindicating that the model underfit when 30% of the full training set was used for validation, as \nthe model did not have enough data to work with.\n\nInitially, one confusing part was the rebound in accuracy at 40%, however I believe this is

In [19]:
# 1. Validation-size experiment: no validation set (train_ratio=1.0), SGD with lr=0.1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
no_val_net = FashionMNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(no_val_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(1.0)
train_net(
    no_val_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(no_val_net, test_loader, device)

# Keep the trained weights
torch.save(no_val_net.state_dict(), 'q2_val_train_test/no_val_net.pth')


Using device: cuda
Train: 60000 | Val: 0 | Test: 10000
Epoch [1/15], Loss: 1.8668
Epoch [2/15], Loss: 0.8099
Epoch [3/15], Loss: 0.6262
Epoch [4/15], Loss: 0.5419
Epoch [5/15], Loss: 0.4858
Epoch [6/15], Loss: 0.4446
Epoch [7/15], Loss: 0.4199
Epoch [8/15], Loss: 0.3936
Epoch [9/15], Loss: 0.3750
Epoch [10/15], Loss: 0.3632
Epoch [11/15], Loss: 0.3464
Epoch [12/15], Loss: 0.3341
Epoch [13/15], Loss: 0.3209
Epoch [14/15], Loss: 0.3091
Epoch [15/15], Loss: 0.3042
Accuracy: 85.00%


In [20]:
# 2. Validation-size experiment: 10% validation set (train_ratio=0.9), SGD with lr=0.1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
ten_val_net = FashionMNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(ten_val_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    ten_val_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(ten_val_net, test_loader, device)

# Keep the trained weights
torch.save(ten_val_net.state_dict(), 'q2_val_train_test/ten_val_net.pth')


Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.1868, Val Acc: 48.85%
Epoch [2/15], Loss: 1.0661, Val Acc: 71.62%
Epoch [3/15], Loss: 0.7014, Val Acc: 73.35%
Epoch [4/15], Loss: 0.5883, Val Acc: 75.47%
Epoch [5/15], Loss: 0.5273, Val Acc: 80.05%
Epoch [6/15], Loss: 0.4797, Val Acc: 82.98%
Epoch [7/15], Loss: 0.4390, Val Acc: 83.93%
Epoch [8/15], Loss: 0.4105, Val Acc: 85.57%
Epoch [9/15], Loss: 0.3863, Val Acc: 85.73%
Epoch [10/15], Loss: 0.3673, Val Acc: 84.67%
Epoch [11/15], Loss: 0.3484, Val Acc: 86.10%
Epoch [12/15], Loss: 0.3362, Val Acc: 82.45%
Epoch [13/15], Loss: 0.3244, Val Acc: 87.78%
Epoch [14/15], Loss: 0.3099, Val Acc: 87.32%
Epoch [15/15], Loss: 0.3007, Val Acc: 88.03%
Accuracy: 87.15%


In [21]:
# 3. Validation-size experiment: 20% validation set (train_ratio=0.8), SGD with lr=0.1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
twenty_val_net = FashionMNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(twenty_val_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.8)
train_net(
    twenty_val_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(twenty_val_net, test_loader, device)

# Keep the trained weights
torch.save(twenty_val_net.state_dict(), 'q2_val_train_test/twenty_val_net.pth')

Using device: cuda
Train: 48000 | Val: 12000 | Test: 10000
Epoch [1/15], Loss: 2.1865, Val Acc: 35.18%
Epoch [2/15], Loss: 1.1363, Val Acc: 69.84%
Epoch [3/15], Loss: 0.7202, Val Acc: 76.31%
Epoch [4/15], Loss: 0.5965, Val Acc: 78.87%
Epoch [5/15], Loss: 0.5308, Val Acc: 79.99%
Epoch [6/15], Loss: 0.4901, Val Acc: 82.32%
Epoch [7/15], Loss: 0.4520, Val Acc: 83.09%
Epoch [8/15], Loss: 0.4254, Val Acc: 83.52%
Epoch [9/15], Loss: 0.4021, Val Acc: 84.47%
Epoch [10/15], Loss: 0.3833, Val Acc: 85.32%
Epoch [11/15], Loss: 0.3683, Val Acc: 84.58%
Epoch [12/15], Loss: 0.3528, Val Acc: 86.78%
Epoch [13/15], Loss: 0.3416, Val Acc: 86.84%
Epoch [14/15], Loss: 0.3238, Val Acc: 87.59%
Epoch [15/15], Loss: 0.3162, Val Acc: 87.60%
Accuracy: 87.48%


In [22]:
# 4. Validation-size experiment: 30% validation set (train_ratio=0.7), SGD with lr=0.1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
thirty_val_net = FashionMNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(thirty_val_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.7)
train_net(
    thirty_val_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(thirty_val_net, test_loader, device)

# Keep the trained weights
torch.save(thirty_val_net.state_dict(), 'q2_val_train_test/thirty_val_net.pth')

Using device: cuda
Train: 42000 | Val: 18000 | Test: 10000
Epoch [1/15], Loss: 2.1410, Val Acc: 34.98%
Epoch [2/15], Loss: 1.1080, Val Acc: 45.98%
Epoch [3/15], Loss: 0.8034, Val Acc: 47.35%
Epoch [4/15], Loss: 0.7052, Val Acc: 63.43%
Epoch [5/15], Loss: 0.5904, Val Acc: 60.37%
Epoch [6/15], Loss: 0.5515, Val Acc: 60.93%
Epoch [7/15], Loss: 0.5060, Val Acc: 69.52%
Epoch [8/15], Loss: 0.4745, Val Acc: 44.48%
Epoch [9/15], Loss: 0.5440, Val Acc: 64.92%
Epoch [10/15], Loss: 0.4445, Val Acc: 81.33%
Epoch [11/15], Loss: 0.4093, Val Acc: 64.04%
Epoch [12/15], Loss: 0.4114, Val Acc: 73.24%
Epoch [13/15], Loss: 0.3766, Val Acc: 83.37%
Epoch [14/15], Loss: 0.3535, Val Acc: 80.84%
Epoch [15/15], Loss: 0.3443, Val Acc: 83.48%
Accuracy: 82.67%


In [23]:
# 5. Validation-size experiment: 40% validation set (train_ratio=0.6), SGD with lr=0.1

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
fourty_val_net = FashionMNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(fourty_val_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.6)
train_net(
    fourty_val_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(fourty_val_net, test_loader, device)

# Keep the trained weights
torch.save(fourty_val_net.state_dict(), 'q2_val_train_test/fourty_val_net.pth')

Using device: cuda
Train: 36000 | Val: 24000 | Test: 10000
Epoch [1/15], Loss: 2.2839, Val Acc: 41.00%
Epoch [2/15], Loss: 1.5134, Val Acc: 61.35%
Epoch [3/15], Loss: 0.8790, Val Acc: 62.75%
Epoch [4/15], Loss: 0.7132, Val Acc: 71.54%
Epoch [5/15], Loss: 0.6276, Val Acc: 77.56%
Epoch [6/15], Loss: 0.5721, Val Acc: 76.14%
Epoch [7/15], Loss: 0.5363, Val Acc: 79.83%
Epoch [8/15], Loss: 0.4933, Val Acc: 79.44%
Epoch [9/15], Loss: 0.4659, Val Acc: 80.95%
Epoch [10/15], Loss: 0.4491, Val Acc: 81.20%
Epoch [11/15], Loss: 0.4221, Val Acc: 83.83%
Epoch [12/15], Loss: 0.4101, Val Acc: 83.34%
Epoch [13/15], Loss: 0.3923, Val Acc: 84.37%
Epoch [14/15], Loss: 0.3817, Val Acc: 86.28%
Epoch [15/15], Loss: 0.3637, Val Acc: 84.94%
Accuracy: 83.93%


In [None]:
'''
Below - Training with varying SGD learning rates using 10-90 validation-training split:

0.001 LR: 10.00%
0.01  LR: 74.11%
0.1   LR: 86.78%
1     LR: 10.00%
10    LR: 10.00%

The trend appears to be that any learning rate too low (0.001) or too high (1, 10) will 
result in the model completely failing, resulting in a minimum accuracy of 10%. The 
minimum accuracy is 10% due to the fact that the FashionMNIST is a 10-way classification,
so a pure guess by the model results in a 10% accuracy on average.

This result makes sense as a learning rate that is too low will result in the parameter barely moving, 
as the step taken towards the negative gradient is far too small. This results in the loss of the model
to barely decrease and the accuracy to barely increase after each epoch. Similarly, a learning rate 
that is too large will result in the parameter moving too much towards the negative gradient, and even
potentially past the minimum cross entropy loss theta*.

One interesting result is that the CEL during training for the LR 10 model is NaN. With a step size
this large, the first updates overshoot so badly that the weights (and therefore the model outputs) 
blow up to inf. Once that happens, the softmax/log inside the CEL ends up evaluating undefined 
expressions such as inf - inf, which are reported as NaN (a plain -log(0) on its own would be +inf, 
not NaN). After the first NaN, every later gradient and weight is NaN as well.

Here a learning rate of 0.1 had the best accuracy of 86.78%, so I will move forward with that value.
'''

'\nBelow - Training with varying SGD learning rates using 10-90 validation-training split:\n\n0.001 LR: 10.00%\n0.01  LR: 73.89%\n0.1   LR: 86.63%\n1     LR: 10.00%\n10    LR: 10.00%\n\nThe trend appears that that any learning rate too low (0.001) or too high (1, 10) will \nresult in the model completely failing, resulting in a minimum accuracy of 10%. The \nminimum accuracy is 10% due to the fact that the FashionMNIST is a 10-way classification,\nso a pure guess by the model results in a 10% accuracy on average.\n\nThis result makes sense as a learning rate that is too low will result in the parameter barely moving, \nas the step taken towards the negative gradient is far too small. This results in the loss of the model\nto barely decrease and the accuracy to barely increase after each epoch. Similarly, a learning rate \nthat is too large will result in the parameter moving too much towards the negative gradient, and even\npotentially past the minimum cross entropy loss theta*.\n\nOne

In [25]:
# 1. Learning-rate experiment: SGD with lr=0.001, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
point_o_o_one_net = FashionMNISTNet().to(device)

# Loss function and SGD optimizer (learning rate 0.001)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_o_o_one_net.parameters(), lr=0.001)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    point_o_o_one_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(point_o_o_one_net, test_loader, device)

# Keep the trained weights
torch.save(point_o_o_one_net.state_dict(), 'q3_learning_rate_test/point_o_o_one_net.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.3030, Val Acc: 10.05%
Epoch [2/15], Loss: 2.3027, Val Acc: 10.05%
Epoch [3/15], Loss: 2.3025, Val Acc: 10.05%
Epoch [4/15], Loss: 2.3022, Val Acc: 10.05%
Epoch [5/15], Loss: 2.3019, Val Acc: 10.05%
Epoch [6/15], Loss: 2.3016, Val Acc: 10.05%
Epoch [7/15], Loss: 2.3014, Val Acc: 10.05%
Epoch [8/15], Loss: 2.3011, Val Acc: 10.05%
Epoch [9/15], Loss: 2.3009, Val Acc: 10.05%
Epoch [10/15], Loss: 2.3006, Val Acc: 10.05%
Epoch [11/15], Loss: 2.3003, Val Acc: 10.05%
Epoch [12/15], Loss: 2.3000, Val Acc: 10.05%
Epoch [13/15], Loss: 2.2998, Val Acc: 10.05%
Epoch [14/15], Loss: 2.2995, Val Acc: 10.05%
Epoch [15/15], Loss: 2.2992, Val Acc: 10.05%
Accuracy: 10.00%


In [26]:
# 2. Learning-rate experiment: SGD with lr=0.01, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
point_o_one_net = FashionMNISTNet().to(device)

# Loss function and SGD optimizer (learning rate 0.01)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_o_one_net.parameters(), lr=0.01)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    point_o_one_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(point_o_one_net, test_loader, device)

# Keep the trained weights
torch.save(point_o_one_net.state_dict(), 'q3_learning_rate_test/point_o_one_net.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.3028, Val Acc: 10.42%
Epoch [2/15], Loss: 2.2996, Val Acc: 10.42%
Epoch [3/15], Loss: 2.2962, Val Acc: 11.32%
Epoch [4/15], Loss: 2.2918, Val Acc: 22.05%
Epoch [5/15], Loss: 2.2847, Val Acc: 29.57%
Epoch [6/15], Loss: 2.2717, Val Acc: 49.15%
Epoch [7/15], Loss: 2.2396, Val Acc: 42.30%
Epoch [8/15], Loss: 2.1038, Val Acc: 43.83%
Epoch [9/15], Loss: 1.4761, Val Acc: 57.53%
Epoch [10/15], Loss: 1.0358, Val Acc: 66.03%
Epoch [11/15], Loss: 0.8905, Val Acc: 63.67%
Epoch [12/15], Loss: 0.8015, Val Acc: 67.48%
Epoch [13/15], Loss: 0.7541, Val Acc: 68.13%
Epoch [14/15], Loss: 0.7082, Val Acc: 71.82%
Epoch [15/15], Loss: 0.6752, Val Acc: 75.03%
Accuracy: 74.11%


In [27]:
# 3. Learning-rate experiment: SGD with lr=0.1, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
point_one_net = FashionMNISTNet().to(device)

# Loss function and SGD optimizer (learning rate 0.1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(point_one_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    point_one_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(point_one_net, test_loader, device)

# Keep the trained weights
torch.save(point_one_net.state_dict(), 'q3_learning_rate_test/point_one_net.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.0065, Val Acc: 40.28%
Epoch [2/15], Loss: 0.9320, Val Acc: 74.72%
Epoch [3/15], Loss: 0.6597, Val Acc: 75.85%
Epoch [4/15], Loss: 0.5552, Val Acc: 73.90%
Epoch [5/15], Loss: 0.4968, Val Acc: 81.77%
Epoch [6/15], Loss: 0.4536, Val Acc: 83.03%
Epoch [7/15], Loss: 0.4197, Val Acc: 84.47%
Epoch [8/15], Loss: 0.3915, Val Acc: 84.83%
Epoch [9/15], Loss: 0.3667, Val Acc: 84.65%
Epoch [10/15], Loss: 0.3549, Val Acc: 84.70%
Epoch [11/15], Loss: 0.3357, Val Acc: 84.42%
Epoch [12/15], Loss: 0.3269, Val Acc: 86.42%
Epoch [13/15], Loss: 0.3115, Val Acc: 87.78%
Epoch [14/15], Loss: 0.3055, Val Acc: 87.35%
Epoch [15/15], Loss: 0.2971, Val Acc: 87.35%
Accuracy: 86.78%


In [28]:
# 4. Learning-rate experiment: SGD with lr=1, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
one_net = FashionMNISTNet().to(device)

# Loss function and SGD optimizer (learning rate 1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(one_net.parameters(), lr=1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    one_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(one_net, test_loader, device)

# Keep the trained weights
torch.save(one_net.state_dict(), 'q3_learning_rate_test/one_net.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 2.4207, Val Acc: 9.92%
Epoch [2/15], Loss: 2.2105, Val Acc: 9.92%
Epoch [3/15], Loss: 2.2850, Val Acc: 10.08%
Epoch [4/15], Loss: 2.3041, Val Acc: 9.60%
Epoch [5/15], Loss: 2.3037, Val Acc: 9.60%
Epoch [6/15], Loss: 2.3037, Val Acc: 9.60%
Epoch [7/15], Loss: 2.3038, Val Acc: 9.90%
Epoch [8/15], Loss: 2.3036, Val Acc: 10.13%
Epoch [9/15], Loss: 2.3034, Val Acc: 10.35%
Epoch [10/15], Loss: 2.3037, Val Acc: 9.90%
Epoch [11/15], Loss: 2.3035, Val Acc: 9.60%
Epoch [12/15], Loss: 2.3036, Val Acc: 9.92%
Epoch [13/15], Loss: 2.3035, Val Acc: 10.35%
Epoch [14/15], Loss: 2.3035, Val Acc: 9.90%
Epoch [15/15], Loss: 2.3034, Val Acc: 10.08%
Accuracy: 10.00%


In [29]:
# 5. Learning-rate experiment: SGD with lr=10, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
ten_net = FashionMNISTNet().to(device)

# Loss function and SGD optimizer (learning rate 10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(ten_net.parameters(), lr=10)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    ten_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(ten_net, test_loader, device)

# Keep the trained weights
torch.save(ten_net.state_dict(), 'q3_learning_rate_test/ten_net.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: nan, Val Acc: 9.82%
Epoch [2/15], Loss: nan, Val Acc: 9.82%
Epoch [3/15], Loss: nan, Val Acc: 9.82%
Epoch [4/15], Loss: nan, Val Acc: 9.82%
Epoch [5/15], Loss: nan, Val Acc: 9.82%
Epoch [6/15], Loss: nan, Val Acc: 9.82%
Epoch [7/15], Loss: nan, Val Acc: 9.82%
Epoch [8/15], Loss: nan, Val Acc: 9.82%
Epoch [9/15], Loss: nan, Val Acc: 9.82%
Epoch [10/15], Loss: nan, Val Acc: 9.82%
Epoch [11/15], Loss: nan, Val Acc: 9.82%
Epoch [12/15], Loss: nan, Val Acc: 9.82%
Epoch [13/15], Loss: nan, Val Acc: 9.82%
Epoch [14/15], Loss: nan, Val Acc: 9.82%
Epoch [15/15], Loss: nan, Val Acc: 9.82%
Accuracy: 10.00%


In [None]:
''' 
Below - Training with Adam Algorithm

LR 0.1: 10.00%
LR 0.01: 91.26%
LR 0.001: 91.24%
LR 0.0001: 85.16% 

Using the best learning rate from the SGD algorithm (0.1), the Adam algorithm performed poorly. The model 
was at the minimum probability accuracy of 10%. This was confusing, but after some research, I learned 
that the adam algorithm has different requirements for learning rates, and unlike the SGD algorithm, Adam 
typically performs better with a learning rate between 0.001~0.01. Therefore I also ran extra tests with 
those learning rates.

Surprisingly, when using a smaller learning rate, the accuracy of the model using those learning rates 
and the adam algorithm were much better than the models trained with SGD, with a maximum accuracy being 
achieved with a learning rate of 0.01, at 91.26%. However the accuracy of the model decreased when a 
learning rate of 1e-4 was used.

One additional finding was that during the 0.01 and 0.001 learning rate training, the accuracy of the 
model seemed to plateau at around 90% after 6-7 epochs, which may be a performance ceiling 
for the current setup of the project.

'''

' \nBelow - Training with Adam Algorithm\n\nLR 0.1: 10.00%\nLR 0.01: 90.98%\nLR 0.001: 91.24%\nLR 0.0001: 85.57% \n\nUsing the best learning rate from the SGD algorithm (0.1), the Adam algorithm performed poorly. The model \nwas at the minimum probability accuracy of 10%. This was confusing, but after some research, I learned \nthat the adam algorithm has different requirements for learning rates, and unlike the SGD algorithm, Adam \ntypically performs better with a learning rate between 0.001~0.01. Therefore I also ran extra tests with \nthose learning rates.\n\nSurprisingly, when using a smaller learning rate, the accuracy of the model using those learning rates \nand the adam algorithm were much better than the models trained with SGD, with a maximum accuracy being \nachieved with a learning rate of 0.001, at 91.24%. However the accuracy of the model decreased when a \nlearning rate of 1e-4 was used.\n\nOne additional finding was that during the 0.01 and 0.001 learning rate training

In [31]:
# Adam experiment: lr=0.1 (the best SGD learning rate), 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
adam_net = FashionMNISTNet().to(device)

# Loss function and Adam optimizer (learning rate 0.1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net.parameters(), lr=0.1)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    adam_net,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(adam_net, test_loader, device)

# Keep the trained weights
torch.save(adam_net.state_dict(), 'q4_adam_test/adam_net.pth')


Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 413.6102, Val Acc: 10.23%
Epoch [2/15], Loss: 2.3049, Val Acc: 9.50%
Epoch [3/15], Loss: 2.3053, Val Acc: 10.23%
Epoch [4/15], Loss: 2.3046, Val Acc: 10.35%
Epoch [5/15], Loss: 2.3056, Val Acc: 10.38%
Epoch [6/15], Loss: 2.3052, Val Acc: 9.97%
Epoch [7/15], Loss: 2.3051, Val Acc: 10.38%
Epoch [8/15], Loss: 2.3050, Val Acc: 10.02%
Epoch [9/15], Loss: 2.3057, Val Acc: 10.35%
Epoch [10/15], Loss: 2.3053, Val Acc: 10.23%
Epoch [11/15], Loss: 2.3053, Val Acc: 9.52%
Epoch [12/15], Loss: 2.3055, Val Acc: 9.52%
Epoch [13/15], Loss: 2.3057, Val Acc: 9.50%
Epoch [14/15], Loss: 2.3053, Val Acc: 10.23%
Epoch [15/15], Loss: 2.3065, Val Acc: 10.38%
Accuracy: 10.00%


In [32]:
# Adam experiment: lr=1e-2, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
adam_net_o_one = FashionMNISTNet().to(device)

# Loss function and Adam optimizer (learning rate 0.01)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net_o_one.parameters(), lr=1e-2)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    adam_net_o_one,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(adam_net_o_one, test_loader, device)

# Keep the trained weights
torch.save(adam_net_o_one.state_dict(), 'q4_adam_test/adam_net_o_one.pth')


Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 0.7722, Val Acc: 84.75%
Epoch [2/15], Loss: 0.3665, Val Acc: 87.55%
Epoch [3/15], Loss: 0.3001, Val Acc: 90.15%
Epoch [4/15], Loss: 0.2712, Val Acc: 90.57%
Epoch [5/15], Loss: 0.2515, Val Acc: 90.80%
Epoch [6/15], Loss: 0.2372, Val Acc: 91.07%
Epoch [7/15], Loss: 0.2272, Val Acc: 91.03%
Epoch [8/15], Loss: 0.2125, Val Acc: 91.48%
Epoch [9/15], Loss: 0.1999, Val Acc: 92.27%
Epoch [10/15], Loss: 0.1932, Val Acc: 91.28%
Epoch [11/15], Loss: 0.1824, Val Acc: 91.73%
Epoch [12/15], Loss: 0.1764, Val Acc: 91.25%
Epoch [13/15], Loss: 0.1779, Val Acc: 91.77%
Epoch [14/15], Loss: 0.1745, Val Acc: 92.23%
Epoch [15/15], Loss: 0.1664, Val Acc: 92.07%
Accuracy: 91.26%


In [33]:
# Adam experiment: lr=1e-3, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
adam_net_o_o_one = FashionMNISTNet().to(device)

# Loss function and Adam optimizer (learning rate 0.001)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net_o_o_one.parameters(), lr=1e-3)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    adam_net_o_o_one,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(adam_net_o_o_one, test_loader, device)

# Keep the trained weights
torch.save(adam_net_o_o_one.state_dict(), 'q4_adam_test/adam_net_o_o_one.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 0.8743, Val Acc: 78.30%
Epoch [2/15], Loss: 0.4730, Val Acc: 83.58%
Epoch [3/15], Loss: 0.3811, Val Acc: 87.07%
Epoch [4/15], Loss: 0.3358, Val Acc: 87.75%
Epoch [5/15], Loss: 0.3034, Val Acc: 88.82%
Epoch [6/15], Loss: 0.2773, Val Acc: 89.38%
Epoch [7/15], Loss: 0.2608, Val Acc: 89.73%
Epoch [8/15], Loss: 0.2475, Val Acc: 89.67%
Epoch [9/15], Loss: 0.2313, Val Acc: 90.07%
Epoch [10/15], Loss: 0.2241, Val Acc: 91.23%
Epoch [11/15], Loss: 0.2098, Val Acc: 91.55%
Epoch [12/15], Loss: 0.1987, Val Acc: 91.23%
Epoch [13/15], Loss: 0.1877, Val Acc: 91.42%
Epoch [14/15], Loss: 0.1775, Val Acc: 91.57%
Epoch [15/15], Loss: 0.1747, Val Acc: 91.33%
Accuracy: 91.24%


In [34]:
# Adam experiment: lr=1e-4, 10% validation set

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fresh network, moved onto the selected device
adam_net_o_o_o_one = FashionMNISTNet().to(device)

# Loss function and Adam optimizer (learning rate 0.0001)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(adam_net_o_o_o_one.parameters(), lr=1e-4)

# Build the loaders, train, then measure test accuracy
train_loader, val_loader, test_loader = train_val_split(0.9)
train_net(
    adam_net_o_o_o_one,
    train_loader,
    val_loader,
    device,
    criterion,
    optimizer,
)
evaluate_model(adam_net_o_o_o_one, test_loader, device)

# Keep the trained weights
torch.save(adam_net_o_o_o_one.state_dict(), 'q4_adam_test/adam_net_o_o_o_one.pth')

Using device: cuda
Train: 54000 | Val: 6000 | Test: 10000
Epoch [1/15], Loss: 1.6981, Val Acc: 67.80%
Epoch [2/15], Loss: 0.7749, Val Acc: 74.50%
Epoch [3/15], Loss: 0.6513, Val Acc: 77.12%
Epoch [4/15], Loss: 0.5922, Val Acc: 78.05%
Epoch [5/15], Loss: 0.5553, Val Acc: 79.35%
Epoch [6/15], Loss: 0.5283, Val Acc: 81.28%
Epoch [7/15], Loss: 0.5074, Val Acc: 82.07%
Epoch [8/15], Loss: 0.4838, Val Acc: 82.68%
Epoch [9/15], Loss: 0.4675, Val Acc: 83.32%
Epoch [10/15], Loss: 0.4517, Val Acc: 84.17%
Epoch [11/15], Loss: 0.4399, Val Acc: 84.08%
Epoch [12/15], Loss: 0.4244, Val Acc: 84.88%
Epoch [13/15], Loss: 0.4120, Val Acc: 85.20%
Epoch [14/15], Loss: 0.3994, Val Acc: 85.58%
Epoch [15/15], Loss: 0.3914, Val Acc: 86.07%
Accuracy: 85.16%


In [None]:
'''
Below - Area Under Curve Testing 

using the best model: 10-90 validation-training split, Adam algorithm, 0.01 LR.
'''

'\nBelow - Area Under Curve Testing \n\nusing the best model: 10-90 validation-training split, Adam algorithm, 0.001 LR.\n'

In [36]:
from sklearn.metrics import roc_auc_score #computes false-positive rates and true positive rates
import torch.nn.functional as F 

def evaluate_model_auc(model, test_loader, device, positive_class):
    """Compute, print and return the one-vs-rest ROC AUC of ``model`` for one class.

    Every sample is relabelled as 1 when its label equals ``positive_class`` and 0
    otherwise; the score for each sample is the softmax probability the model
    assigns to ``positive_class``.

    Args:
        model: network whose forward pass returns one score per class for each sample.
        test_loader: iterable yielding ``(inputs, labels)`` batches.
        device: device the batches are moved to before the forward pass.
        positive_class: index of the class treated as the positive label.

    Returns:
        The area under the ROC curve as computed by ``roc_auc_score``.

    Note:
        The AUC is undefined when the labels contain only one of the two binary
        classes; how ``roc_auc_score`` reports that depends on the installed
        scikit-learn version -- TODO confirm.
    """
    model.eval()

    y_true = []   # 0/1 ground truth: 1 where the label is positive_class
    y_score = []  # predicted probability of positive_class for each sample

    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Move the batch to the target device (GPU if available)
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass; softmax turns the raw outputs into class probabilities
            probabilities = F.softmax(model(inputs), dim=1)

            # Accumulate the binary targets and the matching scores on the CPU
            y_true.extend((labels == positive_class).int().cpu().tolist())
            y_score.extend(probabilities[:, positive_class].cpu().tolist())

    # Area under the ROC curve
    auc_val = roc_auc_score(y_true, y_score)

    print(f"AUC: {auc_val:.4f}")
    return auc_val