# HW3 Image Classification
## We strongly recommend that you run with Kaggle for this homework
https://www.kaggle.com/c/ml2022spring-hw3b/code?competitionId=34954&sortBy=dateCreated

# Get Data
Notes: if the links are dead, you can download the data directly from Kaggle and upload it to the workspace, or you can use the Kaggle API to directly download the data into colab.


In [4]:
# cd

In [5]:
# ! wget https://www.dropbox.com/s/6l2vcvxl54b0b6w/food11.zip

In [6]:
# ! unzip food11.zip

# Training

In [7]:
_exp_name = "sample"

In [8]:
# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset

# This is for the progress bar.
from tqdm.auto import tqdm
import random
from sklearn.model_selection import KFold

In [9]:
myseed = 6666  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

## **Transforms**
Torchvision provides lots of useful utilities for image preprocessing, data wrapping as well as data augmentation.

Please refer to PyTorch official website for details about different transforms.

In [10]:
# Normally, We don't need augmentations in testing and validation.
# All we need here is to resize the PIL image and transform it into Tensor.
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# However, it is also possible to use augmentation in the testing phase.
# You may use train_tfm to produce a variety of images and then test using ensemble methods
# train_tfm = transforms.Compose([
#     # Resize the image into a fixed shape (height = width = 128)
#     transforms.Resize((128, 128)),
#     # You may add some transforms here.
#     # ToTensor() should be the last one of the transforms.
#     transforms.ToTensor(),
# ])

# 变换相关的网站：https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_illustrations.html#sphx-glr-auto-examples-transforms-plot-transforms-illustrations-py
train_tfm2 = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((128, 128)),
    # You may add some transforms here.
    # 这个变换可以去一些地方找一些新方法
    transforms.RandomChoice(transforms=[
        # Apply TrivialAugmentWide data augmentation method
        transforms.TrivialAugmentWide(),
        # Return original image
        transforms.Lambda(lambda x: x),
    ],p=[0.95, 0.05]),

    # ToTensor() should be the last one of the transfo·rms.
    transforms.ToTensor(),
])


## **Datasets**
The data is labelled by the name, so we load images and label while calling '__getitem__'

In [11]:
class FoodDataset(Dataset):

    def __init__(self, path=None, tfm=test_tfm, files=None):
        super(FoodDataset, self).__init__()
        if path:
            self.path = path
            self.files = sorted([os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg")])
        if files is not None:
            self.files = files
        if self.files:  # 添加一个检查以确保self.files不为空
            print(f"One sample", self.files[0])
        else:
            print(f"No samples found in {path}")
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        try:
            label = int(fname.split("/")[-1].split("_")[0])
        except:
            label = -1 # test has no label
        return im, label

class MySubset(Subset):
    def __init__(self, dataset, indices, tfm=test_tfm):
        super().__init__(dataset, indices)
        self.transform = tfm

    def __getitem__(self, idx):
        if isinstance(idx, list):
            return self.dataset[[self.indices[i] for i in idx]]
        self.dataset.transform = self.transform
        return self.dataset[self.indices[idx]]
        # do something with self.dataset[self.indices[idx]]
        # return a modified item

# Model

In [12]:
from torch import nn
# ResNet
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()

        self.cnn_layer1 = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 2, 1),
            nn.BatchNorm2d(128),
        )

        self.cnn_layer4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
        )
        self.cnn_layer5 = nn.Sequential(
            nn.Conv2d(128, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
        )
        self.cnn_layer6 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
        )
        self.fc_layer = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256* 32* 32, 256),
            nn.ReLU(),
            nn.Linear(256, 11)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        # input (x): [batch_size, 3, 128, 128]
        # output: [batch_size, 11]

        # Extract features by convolutional layers.
        x1 = self.cnn_layer1(x)

        x1 = self.relu(x1)
        
        Residual = x1 
        x2 = self.cnn_layer2(x1)
        x2 = x2 + Residual
        x2 = self.relu(x2)

        x3 = self.cnn_layer3(x2)
        
        x3 = self.relu(x3)
        Residual = x3 
        
        x4 = self.cnn_layer4(x3)
        x4 = x4 + Residual
        x4 = self.relu(x4)

        x5 = self.cnn_layer5(x4)
        
        x5 = self.relu(x5)
        Residual = x5
        x6 = self.cnn_layer6(x5)
        x6 = x6 + Residual
        x6 = self.relu(x6)

        # The extracted feature map must be flatten before going to fully-connected layers.
        xout = x6.flatten(1)

        # The features are transformed by fully-connected layers to obtain the final logits.
        xout = self.fc_layer(xout)
        return xout

# VGG16
class VGG16(nn.Module):
    def __init__(self):
        super(VGG16, self).__init__()
        # 修改vgg16的最后一层，因为原来的vgg16是1000分类，我们需要11分类
        self.model = vgg16(weights=None)
        num_features = self.model.classifier[6].in_features
        self.model.classifier[6] = nn.Linear(num_features, 11)
        self.model.num_classes = 11

    def forward(self, x):
        return self.model(x)
    
class EnsembleModel(nn.Module):
    def __init__(self, base_models):
        super(EnsembleModel, self).__init__()
        self.base_models = nn.ModuleList(base_models)
        # self.classifier = nn.Linear(len(base_models) * 11, 11)

    def forward(self, x):
        outputs = [model(x) for model in self.base_models]
        # outputs = torch.cat(outputs, dim=1)
        # outputs = self.classifier(outputs)     
        outputs = torch.stack(outputs)
        outputs = torch.mean(outputs, dim=0)
        return outputs


In [13]:
# mixup
def mixup_data(x, y, alpha=1.0):
    '''Returns mixed inputs, targets, and lambda
    Parameters
    ----------
    x: input data
    y: target
    alpha: value of alpha and beta in beta distribution 
    '''
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size) # shuffle index
    mixed_x = lam * x + (1 - lam) * x[index, :] # mixup between original image order and shuffled image order
    y_a, y_b = y, y[index] # return target of both images order
    
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """ Updated loss for mixup.
    Args:
    -----
    criterion: loss function to use, example: crossentropy loss
    preds: predictions from network
    y_a: original labels
    y_b: labels of the shuffled batch
    lam: alpha used for mixup
    """
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

import torchmetrics.functional as tmf
def mixup_accuracy(metric, preds, y_a, y_b, lam,num_classes,task='multiclass',):
    """
    Updated metric calculation:
    Args:
    -----
    metric: metric to use, example: accuracy
    preds: predictions from network
    y_a: original labels
    y_b: labels of the shuffled batch
    lam: alpha used for mixup
    """
    return lam * metric(preds, y_a,num_classes=num_classes,task=task) + (1 - lam) * metric(preds, y_b,num_classes=num_classes,task=task)

# Configurations

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

batch_size = 64
n_epochs = 200
patience = 20 # If no improvement in 'patience' epochs, early stop
k_fold = 5

model = Classifier().to(device)

Resume = False
if Resume:
    model.load_state_dict(torch.load(f"{_exp_name}_best.ckpt", map_location='cuda'))
    
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5) # 3e-4
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.8, patience=patience/2, threshold=0.05)


cuda


# Dataloader

In [15]:
train_dir = "/kaggle/input/ml2022spring-hw3b/food11/training"
val_dir = "/kaggle/input/ml2022spring-hw3b/food11/validation"
test_dir = "/kaggle/input/ml2022spring-hw3b/food11/test"

train_files = [os.path.join(train_dir, x) for x in os.listdir(train_dir) if x.endswith('.jpg')]
val_files = [os.path.join(val_dir, x) for x in os.listdir(val_dir) if x.endswith('.jpg')]
test_files = [os.path.join(test_dir, x) for x in os.listdir(test_dir) if x.endswith('.jpg')]

total_files = train_files + val_files


# tta

In [16]:
def create_tta_loaders():
    # Initialize an empty list to store the TTA test loaders
    tta_test_loaders = []
    for i in range(num_tta):
        # Create a FoodDataset instance with the test folder and the train transformation
        test_set_i = FoodDataset(tfm = train_tfm2,files=test_files)
        tta_test_loader = DataLoader(test_set_i, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
        tta_test_loaders.append(test_loader)
    return tta_test_loaders
    

def tta(num_folds=5, model=None, ):
    # Create an empty list to store the best model for each fold
    base_models = []

    # Load the best model for each fold
    for fold in range(num_folds):
        # Create a model instance and move it to the device (GPU or CPU)
        model_best_i = model().to(device)
        # Load the state dictionary of the best model for this fold from a checkpoint file
        model_best_i.load_state_dict(torch.load(f"models/{_exp_name}_{model_best_i.__class__.__name__}_fold{fold+1}_best.ckpt"))
        # Append the model to the base_models list
        base_models.append(model_best_i)
    # Create an ensemble model that combines the base models and move it to the device
    model_best = EnsembleModel(base_models=base_models).to(device)

    # Set the model to evaluation mode
    model_best.eval()

    # Initialize an empty list to store the prediction without test-time augmentation (TTA)
    prediction = []
    with torch.no_grad():
        for data,_ in tqdm(test_loader):
            # Get the model output on the data and move it to CPU
            test_pred = model_best(data.to(device)).cpu().data.numpy()
            # Add the output to the prediction list
            prediction += test_pred.squeeze().tolist()
    # Convert the prediction list to a numpy array
    prediction = np.array(prediction)

    # Initialize an empty list to store the predictions with TTA
    tta_predictions = []
    with torch.no_grad():
        for i in range(num_tta):
            # Initialize an empty list to store the prediction for this iteration
            tta_prediction = []
            for data,_ in tqdm(tta_test_loaders[i]):
                # Get the model output on the data and move it to CPU
                test_pred = model_best(data.to(device)).cpu().data.numpy()
                # Add the output to the tta_prediction list
                tta_prediction += test_pred.squeeze().tolist()
            # Append the tta_prediction list to the tta_predictions list
            tta_predictions.append(tta_prediction)

    # Initialize a zero array with the same shape as prediction to store the total TTA predictions
    total_tta_predictions = np.zeros_like(prediction)
    for i in range(num_tta):
        total_tta_predictions += tta_predictions[i]
    # Divide the total TTA predictions by t to get the average TTA predictions
    avg_tta_predictions = np.divide(total_tta_predictions, num_tta)

    # Get the weighted average of prediction and TTA prediction
    prediction = [0.8 * prediction[i] + 0.2 * avg_tta_predictions[i] for i in range(len(prediction))]
    # Get the predicted labels
    prediction = np.argmax(np.array(prediction), axis=1)
    return prediction

In [24]:
# cd /kaggle/working
#os.makedirs("models")

!ls

models	sample_log.txt


# Stacking

In [None]:
def stack(dataset,test_set,n_splits,models=[],tb=False):
    # 判断谁否有多个模型
    if not isinstance(models,list):
        models = [models]
    elif not len(models):
        raise ValueError("No models to stack")
    
    ntrain = len(dataset)
    ntest = len(test_set)
    
    G = torch.Generator()
    G.manual_seed(myseed)
    
    # 记录交叉验证的模型
    oof_dict = {}
    # 记录交叉验证的分数
    val_acc_dict = {}
    
    for i in range(len(models)):
        
        oof_train = np.zeros((ntrain,))
        oof_test = np.zeros((ntest,))
        
        # 记录？
        oof_test_skf = np.empty((n_splits, ntest))
        
        # 创建k折交叉验证
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=myseed)
        
        # 用于所有折的验证数据取平均
        val_accs = []
        
        # 交叉验证
        for fold,(train_idx,valid_idx) in enumerate(kf.split(dataset)):
            
            model = models[i]().to(device)
            
            print(f"Model: {models[i].__name__},fold:{fold+1}")
            
            print(f"Train Index (len: {len(train_idx)}): {train_idx}")
            print(f"Valid Index (len: {len(valid_idx)}): {valid_idx}")
            
            # 每次训练要重新定义optim和scheduler
            optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5) # 3e-4
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.8, patience=patience/2, threshold=0.05)     
            
            # Create a subset of the dataset based on the train indices
            # 用Subset的原因：如果直接在原来上面tfm会导致验证集的tfm也被改变
            # 所以先借用Subset分割
            train_set = MySubset(dataset, train_idx, train_tfm2)
            train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True, generator=G)
            # Create a subset of the dataset based on the valid indices
            valid_set = MySubset(dataset, valid_idx, test_tfm)
            valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)    
            
            stale = 0
            best_acc = 0
            
            for epoch in range(n_epochs):
                
                # ---------- Training ----------------------------------------
                model.train()
                
                train_loss = []
                train_accs = []
                
                correct = 0
                total = 0
                
                for imgs,labels in tqdm(train_loader):
            
                    # A batch consists of image data and corresponding labels.
                    imgs, labels = imgs.to(device),labels.to(device)
                    imgs,labels_a,labels_b,lam=mixup_data(imgs, labels)
                    
                    # Forward the data. (Make sure data and model are on the same device.)
                    logits = model(imgs)
            
                    # Calculate the cross-entropy loss.
                    # We don't need to apply softmax before computing cross-entropy as it is done automatically.
                    loss = mixup_criterion(criterion, logits, labels_a, labels_b, lam)
            
                    # Gradients stored in the parameters in the previous step should be cleared out first.
                    optimizer.zero_grad()
            
                    # Compute the gradients for parameters.
                    loss.backward()
                    
                    # Clip the gradient norms for stable training.
                    grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
            
                    # Update the parameters with computed gradients.
                    optimizer.step()
            
                    # Compute the accuracy for current batch.
                    _,predicted = logits.max(1)
                    correct += (lam * predicted.eq(labels_a).sum().item()+ (1 - lam) * predicted.eq(labels_b).sum().item())
                    total+=labels.size(0)
                    
                    # Record the loss and accuracy.
                    train_loss.append(loss.item())
            
                train_loss = sum(train_loss) / len(train_loss)
                train_acc = correct / total
            
                print(f"Fold {fold+1}: [ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
                
                
                # ---------- Validation -----------------------------------------
                # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.                
                model.eval()
                
                # These are used to record information in validation.
                valid_loss = []
                valid_accs = []
                
                # Iterate the validation set by batches.
                for batch in tqdm(valid_loader):

                    # A batch consists of image data and corresponding labels.
                    imgs, labels = batch
                    #imgs = imgs.half()

                    # We don't need gradient in validation.
                    # Using torch.no_grad() accelerates the forward process.
                    with torch.no_grad():
                        logits = model(imgs.to(device))

                    # We can still compute the loss (but not the gradient).
                    loss = criterion(logits, labels.to(device))

                    # Compute the accuracy for current batch.
                    acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

                    # Record the loss and accuracy.
                    valid_loss.append(loss.item())
                    valid_accs.append(acc)
                    #break

                # The average loss and accuracy for entire validation set is the average of the recorded values.
                valid_loss = sum(valid_loss) / len(valid_loss)
                valid_acc = sum(valid_accs) / len(valid_accs)
                
                # Update logs
                if valid_acc > best_acc:
                    with open(f"./{_exp_name}_log.txt","a"):
                        print(f"Fold {fold+1}: [ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
                else:
                    with open(f"./{_exp_name}_log.txt","a"):
                        print(f"Fold {fold+1}: [ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


                # Save models
                if valid_acc > best_acc:
                    print(f"Best model found at epoch {epoch}, saving model")
                    torch.save(model.state_dict(), f"models/{_exp_name}_{models[i].__name__}_fold{fold+1}_best.ckpt") # only save best to prevent output memory exceed error
                    best_acc = valid_acc
                    stale = 0
                else:
                    stale += 1
                    if stale > patience:
                        print(f"No improvment {patience} consecutive epochs, early stopping")
                        break
                
                # 每个训练周期之后调用
                scheduler.step(best_acc)
            
            # ---------- Use the same training loop as in the previous section ----------        
            # Append the best accuracy to the list
            val_accs.append(best_acc)

            # Out-of-Fold Predictions
            model_best = model
            print(f"{_exp_name}_{models[i].__name__}_fold{fold+1}_best.ckpt")
            model_best.load_state_dict(torch.load(f"/kaggle/working/models/{_exp_name}_{models[i].__name__}_fold{fold+1}_best.ckpt"))
            model_best.eval()

            val_prediction, test_prediction = [], []
            
            test_prediction = tta(num_folds=NSPLITS, model = models[i])
            
            with torch.no_grad():
                for data,_ in tqdm(valid_loader):
                    val_pred = np.argmax(model_best(data.to(device)).cpu().data.numpy(), axis=1)
                    val_prediction += val_pred.squeeze().tolist()
                oof_train[valid_idx] = val_prediction

        oof_test[:] = oof_test_skf.mean(axis=0)
        oof_dict[f"{models[i].__name__}_oof_train"] = oof_train.reshape(-1, 1)
        oof_dict[f"{models[i].__name__}_oof_test"] = oof_test.reshape(-1, 1)
        
        # Compute the average validation accuracy across all folds
        avg_val_acc = sum(val_accs) / len(val_accs)
        val_acc_dict[f"{models[i].__name__}_val_accs"] = val_accs
        val_acc_dict[f"{models[i].__name__}_avg_val_acc"] = avg_val_acc
        
        # Print the average validation accuracy
        print(f"Average validation accuracy: {avg_val_acc:.4f}")
        
    return oof_dict, val_acc_dict

dataset = FoodDataset(tfm=test_tfm,files=total_files)

test_set = FoodDataset(tfm=test_tfm,files=test_files)
test_loader = DataLoader(dataset,batch_size = batch_size,shuffle=False,num_workers=0,pin_memory=True)

print("begin----")
NSPLITS = 5

num_tta = 5

tta_test_loaders = create_tta_loaders()
print("--")
models = [Classifier,VGG16]
                
oof_dict,val_acc_dict = stack(dataset,test_set,NSPLITS,models=models)

One sample /kaggle/input/ml2022spring-hw3b/food11/training/8_248.jpg
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
begin----
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
One sample /kaggle/input/ml2022spring-hw3b/food11/test/0664.jpg
--
Model: Classifier,fold:1
Train Index (len: 10636): [    0     2     3 ... 13293 13294 13295]
Valid Index (len: 2660): [    1     5     7 ... 13268 13277 13292]


  0%|          | 0/167 [00:00<?, ?it/s]

## Cross validation 交叉验证
You can implement cross validation entirely using the cross_val_score in sklearn.model_selection, but I'll show you another way to implement it here.

# 保存模型

In [None]:
orch.save(model.state_dict(), "Classifier1.pth")

# 加载测试集

In [None]:
test_set = FoodDataset(os.path.join(_dataset_dir,"test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

# Testing and generate prediction CSV

In [None]:
!cd /kaggle/working/

In [None]:
model_best = Classifier().to(device)
model_best.load_state_dict(torch.load(f"{_exp_name}_best.ckpt"))
model_best.eval()
prediction = []
with torch.no_grad():
    for data,_ in test_loader:
        test_pred = model_best(data.to(device))
        test_label = np.argmax(test_pred.cpu().data.numpy(), axis=1)
        prediction += test_label.squeeze().tolist()

In [None]:
#create test csv
def pad4(i):
    return "0"*(4-len(str(i)))+str(i)
df = pd.DataFrame()
df["Id"] = [pad4(i) for i in range(1,len(test_set)+1)]
df["Category"] = prediction
df.to_csv("submission.csv",index = False)

# Q1. Augmentation Implementation
## Implement augmentation by finishing train_tfm in the code with image size of your choice.
## Directly copy the following block and paste it on GradeScope after you finish the code
### Your train_tfm must be capable of producing 5+ different results when given an identical image multiple times.
### Your  train_tfm in the report can be different from train_tfm in your training code.


In [None]:
train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((128, 128)),
    # You need to add some transforms here.
    transforms.ToTensor(),
])

# Q2. Residual Implementation
![](https://i.imgur.com/GYsq1Ap.png)
## Directly copy the following block and paste it on GradeScope after you finish the code


In [None]:
from torch import nn
class Residual_Network(nn.Module):
    def __init__(self):
        super(Residual_Network, self).__init__()

        self.cnn_layer1 = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 2, 1),
            nn.BatchNorm2d(128),
        )

        self.cnn_layer4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
        )
        self.cnn_layer5 = nn.Sequential(
            nn.Conv2d(128, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
        )
        self.cnn_layer6 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
        )
        self.fc_layer = nn.Sequential(
            nn.Linear(256* 32* 32, 256),
            nn.ReLU(),
            nn.Linear(256, 11)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        # input (x): [batch_size, 3, 128, 128]
        # output: [batch_size, 11]

        # Extract features by convolutional layers.
        x1 = self.cnn_layer1(x)

        x1 = self.relu(x1)
        
        Residual = x1 
        x2 = self.cnn_layer2(x1)
        x2 = x2 + Residual
        x2 = self.relu(x2)

        x3 = self.cnn_layer3(x2)
        
        x3 = self.relu(x3)
        Residual = x3 
        
        x4 = self.cnn_layer4(x3)
        x4 = x4 + Residual
        x4 = self.relu(x4)

        x5 = self.cnn_layer5(x4)
        
        x5 = self.relu(x5)
        Residual = x5
        x6 = self.cnn_layer6(x5)
        x6 = x6 + Residual
        x6 = self.relu(x6)

        # The extracted feature map must be flatten before going to fully-connected layers.
        xout = x6.flatten(1)

        # The features are transformed by fully-connected layers to obtain the final logits.
        xout = self.fc_layer(xout)
        return xout