In [1]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision import datasets
from torchvision import transforms
import torchvision

import matplotlib.pyplot as plt
from PIL import Image

### Hyperparameters

In [2]:
# random seed
SEED = 1 
NUM_CLASS = 10

# Training
BATCH_SIZE = 128
NUM_EPOCHS = 60
EVAL_INTERVAL=1
SAVE_DIR = './log'

# Optimizer
LEARNING_RATE = 1e-2
MOMENTUM = 0.9
STEP=5
GAMMA=0.5


### Device

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")

In [4]:
import torch
print(torch.cuda.is_available())  # Check if CUDA is available
print(torch.cuda.get_device_name(0))  # Check the name of the CUDA device

True
NVIDIA A30 MIG 1g.6gb



### Dataset


In [5]:
transform= transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

train_set = torchvision.datasets.CIFAR100(root='../data', train=True,
                                        download=True, transform=transform)
train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

test_set = torchvision.datasets.CIFAR100(root='../data', train=False,
                                       download=True, transform=transform)
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

class_names = train_set.classes


Files already downloaded and verified
Files already downloaded and verified


### Model

In [6]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU()
        
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.relu2 = nn.ReLU()
        
        self.pool = nn.MaxPool2d(2, 2)
        
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.relu3 = nn.ReLU()
        
        self.fc1 = nn.Linear(64 * 4 * 4, 256)
        self.dropout1 = nn.Dropout(0.5)
        self.relu4 = nn.ReLU()
        
        self.fc2 = nn.Linear(256, 100)

    def forward(self, x):
        x = self.pool(self.relu1(self.bn1(self.conv1(x))))
        x = self.pool(self.relu2(self.bn2(self.conv2(x))))
        x = self.pool(self.relu3(self.bn3(self.conv3(x))))
        
        x = x.view(-1, 64 * 4 * 4)
        
        x = self.relu4(self.dropout1(self.fc1(x)))
        x = self.fc2(x)
        return x

In [7]:
# class ConvNet(nn.Module):
#     def __init__(self):
#         super(ConvNet, self).__init__()
#         self.conv1 = nn.Conv2d(3, 4, 3)  
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv2 = nn.Conv2d(4, 8, 3)  
#         self.fc1 = nn.Linear(8 * 6 * 6, 32)
#         self.fc2 = nn.Linear(32, 100)

#     def forward(self, x):
#         x = self.pool(torch.relu(self.conv1(x)))
#         x = self.pool(torch.relu(self.conv2(x)))
#         x = x.view(-1, 8 * 6 * 6)
#         x = torch.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x

In [8]:
model = ConvNet()
model.to(device)

ConvNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (relu4): ReLU()
  (fc2): Linear(in_features=256, out_features=100, bias=True)
)

### Optimizer

In [9]:
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP, gamma=GAMMA)

### Task 1: per batch training/testing
---

Please denfine two function named ``train_batch`` and ``test_batch``. These functions are essential for training and evaluating machine learning models using batched data from dataloaders.

**To do**: 
1. Define the loss function i.e [nn.CrossEntropyLoss()](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html).
2. Take the image as the input and generate the output using the pre-defined SimpleNet.
3. Calculate the loss between the output and the corresponding label using the loss function.

### One-Hot Encoding

In [10]:
def one_hot_encoding(target, num_classes):
    return F.one_hot(target, num_classes=num_classes).float()

### Focal Loss Function

In [11]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=1.0):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, y_pred, y_true):
        """
        y_pred: raw scores (logits) for each class. [batch_size, num_classes]
        y_true: ground truth labels. [batch_size]
        """
        # Convert y_true labels into one-hot encoding
        y_true_onehot = torch.zeros(y_pred.size(), device=y_pred.device).scatter_(1, y_true.unsqueeze(1).long(), 1)
        
        # Calculate softmax over y_pred for calculating probabilities
        probs = F.softmax(y_pred, dim=1)
        
        # Calculate focal loss
        focal_loss = -self.alpha * (y_true_onehot * torch.log(probs) * (1 - probs) ** self.gamma).sum(dim=1).mean()
        
        return focal_loss

### Loss Function define

In [12]:
##################### Write your answer here ##################
# Define the loss function
criterion = nn.CrossEntropyLoss()
#criterion = FocalLoss()
#criterion = nn.L1Loss()
#lossfunction = 'epo100FC201'
###############################################################

In [13]:
def train_batch(model, image, target):
    """
    Perform one training batch iteration.

    Args:
        model (torch.nn.Module): The machine learning model to train.
        image (torch.Tensor): Batch of input data (images).
        target (torch.Tensor): Batch of target labels.

    Returns:
        torch.Tensor: Model output (predictions) for the batch.
        torch.Tensor: Loss value calculated by the defined loss function loss_fn().
    """
    
    ##################### Write your answer here ##################
    output = model(image)
    loss = criterion(output, target)
    ###############################################################

    return output, loss

In [14]:

def test_batch(model, image, target):
    """
    Perform one testing batch iteration.

    Args:
        model (torch.nn.Module): The machine learning model to evaluate.
        image (torch.Tensor): Batch of input data (images).
        target (torch.Tensor): Batch of target labels.

    Returns:
        torch.Tensor: Model output (predictions) for the batch.
        torch.Tensor: Loss value calculated for the batch.
    """

    ##################### Write your answer here ##################
    output = model(image)
    loss = criterion(output, target)
    ###############################################################

    return output, loss

### Model Training

In [15]:
training_loss = []
training_acc = []
testing_loss = []
testing_acc = []

run_num = 1

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Now run your training code
for epoch in range(NUM_EPOCHS):
    model.train()

    running_cls_loss = 0.0
    running_cls_corrects = 0
    
    for batch_idx, (image, target) in enumerate(train_dataloader):
        
#         print("Image shape:", image.shape)
#         print("Image dtype:", image.dtype)
#         print("Contains NaN:", torch.isnan(image).any())
#         print("Contains Inf:", torch.isinf(image).any())

#         print(run_num)
#         run_num += 1
#         print(target.shape)
#         print(target)
#         print('\n')
        
        image = image.to(device)
        target = target.to(device)

        optimizer.zero_grad()  # Clear gradients before the forward pass

        outputs, loss = train_batch(model, image, target)
        _, preds = torch.max(outputs, 1)

        loss_data = loss.item()
        if np.isnan(loss_data):
            raise ValueError('Loss is NaN while training')

        running_cls_loss += loss_data
        running_cls_corrects += torch.sum(preds == target.data)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

    epoch_loss = running_cls_loss / len(train_set)
    epoch_acc = running_cls_corrects.double() / len(train_set)

    #print(f'Epoch: {epoch+1}/{NUM_EPOCHS} Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

    # Rest of your code for testing and checkpoint saving...

    training_loss.append(epoch_loss)
    training_acc.append(epoch_acc.cpu().detach().numpy())

    # change learning rate
    scheduler.step()


    ##########################
    ### Testing
    ##########################
    # # eval model during training or in the last epoch
    if (epoch + 1) % EVAL_INTERVAL == 0 or (epoch +1) == NUM_EPOCHS:
        print(f'epoch:{epoch} Begin test......')
        model.eval()
    
        val_loss = 0.0
        val_corrects = 0
        for batch_idx, (image, target) in enumerate(test_dataloader):  #CE-Loss/FCLoss


            image = image.to(device)
            target = target.to(device)

            # test model
            outputs, loss = test_batch(model, image, target)
            _, preds = torch.max(outputs, 1)
            
            val_loss += loss.item()
            val_corrects += torch.sum(preds == target.data)           #CE-Loss/FCLoss
      

        val_loss = val_loss / len(test_set)
        val_acc = val_corrects.double() / len(test_set)
        #print(f'Test Loss: {val_loss:.4f} Acc: {val_acc:.4f}')
        print(f'epoch:{epoch} finished')
        testing_loss.append(val_loss)
        testing_acc.append(val_acc.cpu().detach().numpy())

        # save the model in last epoch
        if (epoch +1) == NUM_EPOCHS:
            
            state = {
            'state_dict': model.state_dict(),
            'acc': epoch_acc,
            'epoch': (epoch+1),
            }

            # check the dir
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)

            # save the state
            torch.save(state, osp.join(SAVE_DIR, 'checkpoint_%s.pth' % (str(epoch+1))))

epoch:0 Begin test......
epoch:0 finished
epoch:1 Begin test......
epoch:1 finished
epoch:2 Begin test......
epoch:2 finished
epoch:3 Begin test......
epoch:3 finished
epoch:4 Begin test......
epoch:4 finished
epoch:5 Begin test......
epoch:5 finished
epoch:6 Begin test......
epoch:6 finished
epoch:7 Begin test......
epoch:7 finished
epoch:8 Begin test......
epoch:8 finished
epoch:9 Begin test......
epoch:9 finished
epoch:10 Begin test......
epoch:10 finished
epoch:11 Begin test......
epoch:11 finished
epoch:12 Begin test......
epoch:12 finished
epoch:13 Begin test......
epoch:13 finished
epoch:14 Begin test......
epoch:14 finished
epoch:15 Begin test......
epoch:15 finished
epoch:16 Begin test......
epoch:16 finished
epoch:17 Begin test......
epoch:17 finished
epoch:18 Begin test......
epoch:18 finished
epoch:19 Begin test......
epoch:19 finished
epoch:20 Begin test......
epoch:20 finished
epoch:21 Begin test......
epoch:21 finished
epoch:22 Begin test......
epoch:22 finished
epoch:23

In [16]:
print(f'NN Accuracy on CIFAR-100 test images: {testing_acc[-1] * 100:.2f}%')

NN Accuracy on CIFAR-100 test images: 39.71%
