# COMP34212 Summative Lab Task2


In [None]:
import torch
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch.optim as optim

from tqdm import tqdm
import numpy as np
from torchvision import datasets
import torchvision.transforms as transforms
import random
import os
# Select the compute device once; the rest of the notebook reads this global.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

# Folder that receives the saved model checkpoints.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and is a no-op when the folder is already there.
folder_name = 'checkpoint'
os.makedirs(folder_name, exist_ok=True)

# from torch.utils.tensorboard import SummaryWriter
# writer = SummaryWriter(log_dir='./log') # Tensorboard writer

In [None]:
seed = 42


def seed_torch(seed=1029):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to its deterministic, non-benchmarking
    mode.

    Args:
        seed: integer seed applied to all generators.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case

    # Trade cuDNN auto-tuning for repeatable kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_torch(seed=seed)

In [None]:
def init_fn(worker_id):
    """Seed NumPy's global RNG with ``seed + worker_id``.

    Reads the module-level ``seed``. Written to be passed as a DataLoader
    ``worker_init_fn`` (see the usage sketch below) so that each worker gets
    its own NumPy seed.

    Args:
        worker_id: integer id of the worker, added to the global seed.
    """
    worker_seed = int(seed) + worker_id
    np.random.seed(worker_seed)
# Usage sketch (not wired up anywhere in this notebook):
# trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, worker_init_fn=init_fn)


In [None]:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 ``nn.Conv2d``.

    The padding is set equal to ``dilation``, so with ``stride=1`` the
    spatial size of the input is preserved.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride.
        groups: number of channel groups.
        dilation: dilation rate (also used as the padding).

    Returns:
        The configured ``nn.Conv2d`` module.
    """
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

# TODO: change variable names
def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2d``.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride.

    Returns:
        The configured ``nn.Conv2d`` module.
    """
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise

In [None]:
class BasicBlock(nn.Module):
    """
    Two-convolution residual block (the block type used for ResNet18/34 here).

    The main branch is conv3x3 -> norm -> ReLU -> conv3x3 -> norm; the
    (optionally down-sampled) input is added to it and a final ReLU is applied.

    Args:
    inplanes: int, number of input channels
    planes: int, number of output channels
    stride: int, stride of the first convolutional layer
    downsample: nn.Module or None, applied to the input before the addition
    groups: int, must be 1 for this block
    base_width: int, must be 64 for this block
    dilation: int, values > 1 are rejected
    norm_layer: callable building a normalization layer (default nn.BatchNorm2d)

    Raises:
    ValueError: if groups != 1 or base_width != 64
    NotImplementedError: if dilation > 1
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # When stride != 1, both conv1 and the downsample module reduce the
        # spatial size, so the two branches stay shape-compatible.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ReLU(main_branch(x) + shortcut(x))."""
        residual = self.conv1(x)
        residual = self.bn1(residual)
        residual = self.relu(residual)

        residual = self.conv2(residual)
        residual = self.bn2(residual)

        # Identity shortcut unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)

In [None]:
class Bottleneck(nn.Module):
    """
    Three-convolution "bottleneck" residual block, used for the deeper models
    to reduce the number of parameters:

    1. A 1x1 convolution reduces the channel count to a bottleneck width.
    2. A 3x3 convolution works on the bottleneck representation.
    3. A 1x1 convolution expands the channels to ``planes * expansion``.
    4. The (optionally down-sampled) input is added to the result.
    5. A ReLU is applied to the sum.

    This block is used by the ResNet-50, ResNet-101 and ResNet-152 builders.
    """
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Channel count inside the bottleneck, and the block's output channels.
        width = int(planes * (base_width / 64.)) * groups
        out_planes = planes * self.expansion

        # When stride != 1, both conv2 and the downsample module reduce the
        # spatial size of their input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ReLU(main_branch(x) + shortcut(x))."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))

        # Identity shortcut unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)


In [None]:
class ResNet(nn.Module):
    """
    ResNet backbone with the max-pooling layer removed.

    Forward pipeline as implemented here:
    1. A 7x7 convolution with stride 2, followed by normalization and ReLU.
       (The usual 3x3 max-pool after the stem is intentionally left out.)
    2. Four stages of residual blocks (layer1 .. layer4).
    3. Global average pooling to 1x1.
    4. A fully connected layer that returns raw class scores (no softmax is
       applied inside the model).

    The concrete architecture is defined by the residual block type and the
    number of blocks per stage: ResNet-18/34 use ``BasicBlock``, ResNet-50/101/152
    use ``Bottleneck``.

    Args:
    block: residual block class (BasicBlock or Bottleneck)
    layers: sequence of four ints, number of blocks in each stage
    num_classes: int, size of the final linear layer's output
    zero_init_residual: bool, zero the last norm weight of every residual branch
    groups: int, forwarded to every block
    width_per_group: int, forwarded to every block as base_width
    replace_stride_with_dilation: None or 3 booleans, one per stage 2-4
    norm_layer: callable building a normalization layer (default nn.BatchNorm2d)
    """
    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        # Running bookkeeping consumed and updated by _make_layer.
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # One flag per strided stage: True swaps its stride for dilation.
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        # Stem (no max-pool in this variant).
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)

        # Stage 1 keeps the resolution; stages 2-4 use stride 2 (or dilation).
        self.layer1 = self._make_layer(block, 64, layers[0])
        for stage_no, planes in ((2, 128), (3, 256), (4, 512)):
            stage = self._make_layer(block, planes, layers[stage_no - 1], stride=2,
                                     dilate=replace_stride_with_dilation[stage_no - 2])
            setattr(self, 'layer{}'.format(stage_no), stage)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialisation for convolutions and normalization layers.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for module in self.modules():
                if isinstance(module, Bottleneck):
                    nn.init.constant_(module.bn3.weight, 0)
                elif isinstance(module, BasicBlock):
                    nn.init.constant_(module.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """
        Create one stage of residual blocks and update self.inplanes / self.dilation.

        Args:
        block: nn.Module, type of residual block: BasicBlock or Bottleneck
        planes: int, base channel count of the stage (output is planes * block.expansion)
        blocks: int, number of blocks
        stride: int, stride of the first block in the stage
        dilate: bool, if True the stride is replaced by an increased dilation

        Returns:
        nn.Sequential holding the blocks of the stage
        """
        norm_layer = self._norm_layer
        first_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        out_planes = planes * block.expansion

        # A projection shortcut is needed whenever the first block changes the
        # spatial size or the channel count.
        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            shortcut = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                norm_layer(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, shortcut, self.groups,
                       self.base_width, first_dilation, norm_layer)]
        self.inplanes = out_planes
        # The remaining blocks keep resolution and channel count.
        stage.extend(
            block(self.inplanes, planes, groups=self.groups,
                  base_width=self.base_width, dilation=self.dilation,
                  norm_layer=norm_layer)
            for _ in range(1, blocks)
        )

        return nn.Sequential(*stage)

    def _forward_impl(self, x):
        """Run stem, the four stages, pooling and the linear head."""
        x = self.relu(self.bn1(self.conv1(x)))
        # No max-pool here: that layer is removed in this variant.

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)

    def forward(self, x):
        """Return the class scores for the input batch ``x``."""
        return self._forward_impl(x)


def _resnet(block, layers, **kwargs):
    """Instantiate ``ResNet`` with the given block type and per-stage depths.

    Any extra keyword arguments are forwarded to the ``ResNet`` constructor.
    """
    return ResNet(block, layers, **kwargs)


def ResNet18(**kwargs):
    """Build a ResNet with ``BasicBlock`` and stage depths [2, 2, 2, 2]."""
    stage_depths = [2, 2, 2, 2]
    return _resnet(BasicBlock, stage_depths, **kwargs)

def ResNet34(**kwargs):
    """Build a ResNet with ``BasicBlock`` and stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return _resnet(BasicBlock, stage_depths, **kwargs)

def ResNet50(**kwargs):
    """Build a ResNet with ``Bottleneck`` and stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return _resnet(Bottleneck, stage_depths, **kwargs)

def ResNet101(**kwargs):
    """Build a ResNet with ``Bottleneck`` and stage depths [3, 4, 23, 3]."""
    stage_depths = [3, 4, 23, 3]
    return _resnet(Bottleneck, stage_depths, **kwargs)

def ResNet152(**kwargs):
    """Build a ResNet with ``Bottleneck`` and stage depths [3, 8, 36, 3]."""
    stage_depths = [3, 8, 36, 3]
    return _resnet(Bottleneck, stage_depths, **kwargs)

In [None]:
class Cutout(object):
    """
    Randomly mask out one or more square patches from an image tensor.

    The patch centres are drawn with ``np.random.randint``; patches that reach
    over the image border are clipped, so they can be smaller than
    ``length`` x ``length``.

    Args:
        n_holes (int): Number of patches to cut out of each image.
        length (int): The length (in pixels) of each square patch.
    """
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        """
        Args:
            img (Tensor): image whose dimensions 1 and 2 are treated as
                height and width; the same mask is applied along dimension 0.

        Returns:
            Tensor: a new tensor equal to ``img`` with the patches set to 0.
        """
        h = img.size(1)
        w = img.size(2)
        half = self.length // 2

        # Build the mask on the image's own device, and in its dtype when it
        # is floating point, so CUDA and half-precision inputs work too.
        # Non-float images keep the previous float32-mask behaviour.
        mask_dtype = img.dtype if img.is_floating_point() else torch.float32
        mask = torch.ones((h, w), dtype=mask_dtype, device=img.device)

        for _ in range(self.n_holes):
            # (x, y) is the centre of the hole; y is drawn before x.
            y = np.random.randint(h)
            x = np.random.randint(w)

            # Clip the patch to the image bounds.
            y1, y2 = max(y - half, 0), min(y + half, h)
            x1, x2 = max(x - half, 0), min(x + half, w)

            mask[y1:y2, x1:x2] = 0

        return img * mask.expand_as(img)

## Fetch data for training/validation/test
(Load CIFAR-10 and apply data augmentation)

In [None]:
def read_dataset(batch_size=16, valid_size=0.2, num_workers=0, pic_path='dataset'):
    """
    Download CIFAR-10 (if needed) and build train / validation / test loaders.

    The training and validation loaders both draw from the ``train=True``
    split; a shuffled index list is cut so that the first ``valid_size``
    fraction becomes validation data. Only the training loader applies the
    augmentation pipeline.

    batch_size: Number of images loaded per batch
    valid_size: Fraction of the training set to use as validation
    num_workers: Number of subprocesses to use for data loading
    pic_path: The directory where the dataset is stored / downloaded

    Returns: (train_loader, valid_loader, test_loader)
    """
    # Normalization statistics taken from the ImageNet dataset.
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]

    # Training pipeline: pad by 4 and crop back to 32x32, random horizontal
    # flip (p=0.5), tensor conversion, normalization, then one 16px Cutout hole.
    augmenting = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=norm_mean, std=norm_std),
        Cutout(n_holes=1, length=16),
    ])

    # Evaluation pipeline: tensor conversion and normalization only.
    plain = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=norm_mean, std=norm_std),
    ])

    def load_split(is_train, transform):
        return datasets.CIFAR10(pic_path, train=is_train,
                                download=True, transform=transform)

    train_data = load_split(True, augmenting)
    valid_data = load_split(True, plain)
    test_data = load_split(False, plain)

    # Shuffle the training indices and cut off the validation share.
    num_train = len(train_data)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    n_valid = int(np.floor(valid_size * num_train))
    valid_idx = indices[:n_valid]
    train_idx = indices[n_valid:]

    # Each sampler draws its index subset in random order without repetition.
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
    train_loader = torch.utils.data.DataLoader(train_data, sampler=train_sampler,
                                               **loader_kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_data, sampler=valid_sampler,
                                               **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(test_data, **loader_kwargs)

    return train_loader, valid_loader, test_loader

# TRAINING

In [None]:
def train_model(model, criterion, optimizer, train_loader):
    """Train ``model`` for one pass over ``train_loader``.

    Batches are moved to the device of the model's parameters, so the function
    does not depend on a module-level ``device`` variable.

    Args:
        model: the network to train (put into training mode here).
        criterion: loss function called as ``criterion(output, target)``.
        optimizer: optimizer updating the model's parameters.
        train_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        (train_loss, train_accuracy): the per-sample average loss and the
        fraction of correctly classified samples, both computed over the
        samples actually processed.

    Raises:
        ZeroDivisionError: if ``train_loader`` yields no samples.
    """
    model.train()  # batch-norm / dropout layers work in training mode

    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for data, target in train_loader:
        if first_param is not None:
            data = data.to(first_param.device)
            target = target.to(first_param.device)
        batch = data.size(0)

        optimizer.zero_grad()   # clear the gradients of all optimized variables
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that a smaller last
        # batch does not skew the average.
        loss_sum += loss.item() * batch

        _, pred = torch.max(output, 1)
        n_correct += pred.eq(target).sum().item()
        n_seen += batch

    # Normalise by the samples seen rather than len(sampler): the two differ
    # when the loader drops an incomplete last batch.
    train_loss = loss_sum / n_seen
    train_accuracy = n_correct / n_seen

    return train_loss, train_accuracy

In [None]:
def evaluate_model(model, criterion, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Batches are moved to the device of the model's parameters, so the function
    does not depend on a module-level ``device`` variable.

    Args:
        model: the network to evaluate (put into evaluation mode here).
        criterion: loss function called as ``criterion(output, target)``.
        valid_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        (valid_loss, valid_accuracy): the per-sample average loss and the
        fraction of correctly classified samples, both computed over the
        samples actually processed.

    Raises:
        ZeroDivisionError: if ``valid_loader`` yields no samples.
    """
    model.eval()

    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for data, target in valid_loader:
            if first_param is not None:
                data = data.to(first_param.device)
                target = target.to(first_param.device)
            batch = data.size(0)

            output = model(data)
            loss = criterion(output, target)

            # Weight the batch loss by the batch size.
            loss_sum += loss.item() * batch

            # Index of the highest score is the predicted class.
            _, pred = torch.max(output, 1)
            # One reduction per batch instead of a Python loop over every
            # element (each element test forces a device sync on GPU).
            n_correct += pred.eq(target.view_as(pred)).sum().item()
            n_seen += batch

    valid_loss = loss_sum / n_seen
    valid_accuracy = n_correct / n_seen

    return valid_loss, valid_accuracy

In [None]:
def train_and_validate(model, optimizer, criterion, trainloader, valloader, num_epochs,
                       save_path='checkpoint/resnet18_cifar10.pt', start_epoch=50):
    """Run the train / validate loop with checkpointing, LR decay and early stopping.

    Each epoch trains on ``trainloader`` and evaluates on ``valloader``.
    Whenever the validation loss does not exceed the best value seen so far,
    the model's ``state_dict`` is written to ``save_path``. After every 10
    consecutive epochs without improvement the learning rate of every
    optimizer parameter group is halved; after 50 such epochs training stops.

    Args:
        model: network to train; it is expected to be on its target device already.
        optimizer: optimizer whose ``param_groups`` learning rates are decayed.
        criterion: loss function.
        trainloader: loader for the training data.
        valloader: loader for the validation data.
        num_epochs: number of the last epoch (inclusive).
        save_path: file the best model parameters are saved to.
        start_epoch: number of the first epoch. The default of 50 keeps the
            previous hard-coded behaviour, so ``num_epochs - 49`` epochs run.

    Returns:
        (train_loss, train_accuracy, valid_loss, valid_accuracy) of the last
        epoch that ran; all 0.0 if no epoch ran.
    """
    early_stop_threshold = 50   # stagnant epochs tolerated before stopping
    lr_patience = 10            # stagnant epochs between two LR reductions
    lr_decay = 0.5

    valid_loss_min = float('inf')   # best validation loss so far
    stagnant_epochs = 0             # consecutive epochs without improvement

    train_loss = 0.0
    train_accuracy = 0.0
    valid_loss = 0.0
    valid_accuracy = 0.0

    for epoch in tqdm(range(start_epoch, num_epochs + 1)):

        # Dynamic learning rate: decay inside the optimizer (rebinding a
        # plain `lr` variable would never reach it). The stagnation counter
        # is NOT reset here, otherwise early stopping could never trigger.
        if stagnant_epochs > 0 and stagnant_epochs % lr_patience == 0:
            for group in optimizer.param_groups:
                group['lr'] *= lr_decay

        train_loss, train_accuracy = train_model(model, criterion, optimizer, trainloader)
        valid_loss, valid_accuracy = evaluate_model(model, criterion, valloader)

        print("Train Accuracy:",100.*train_accuracy,"%  \n","Evaluation Accuracy:",100.*valid_accuracy,"%")

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))

        # if the validation loss has decreased, save the model
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss
            stagnant_epochs = 0
        else:
            stagnant_epochs += 1
            # Early stopping
            if stagnant_epochs >= early_stop_threshold:
                print("Early stopping")
                break

    return train_loss, train_accuracy, valid_loss, valid_accuracy

In [None]:
# NOTE(review): num_workers is defined but not forwarded to read_dataset, so
# loading still runs with read_dataset's default (0 subprocesses). Confirm that
# worker processes can pickle the notebook-defined transforms before wiring it in.
num_workers = 8 # number of subprocesses to use for data loading
batch_size = 128 # how many samples per batch to load
valid_size = 0.2    # fraction of the training set to use as validation

train_loader, valid_loader, test_loader = read_dataset(
    batch_size=batch_size, valid_size=valid_size, pic_path='dataset')
n_class = 10

# The 7x7 down-sampling convolution and the pooling of ResNet18 tend to lose
# part of the information, so in this experiment the 7x7 down-sampling layer
# and the max-pooling layer are removed and replaced by a 3x3 convolution
# with a smaller stride and padding.
model = ResNet18()
model.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
model.fc = torch.nn.Linear(512, n_class) # modify the last layer for ResNet18
# model.fc = torch.nn.Linear(2048, n_class) # modify the last layer for ResNet50

# Load the model parameters from disk, if a checkpoint exists.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
checkpoint_path = 'checkpoint/resnet18_cifar10.pt'
if os.path.exists(checkpoint_path):
    model_state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(model_state_dict)
    print("Loaded model parameters from disk.")

model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)    # Use cross entropy loss function
n_epochs = 250
lr = 0.1
optimizer_SGD = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)

In [None]:
# Launch training; the best parameters are written to the checkpoint file.
train_and_validate(
    model,
    optimizer_SGD,
    criterion,
    train_loader,
    valid_loader,
    num_epochs=n_epochs,
    save_path='checkpoint/resnet18_cifar10.pt',
)

# TEST


In [None]:
n_class = 10
batch_size = 100
train_loader,valid_loader,test_loader = read_dataset(batch_size=batch_size,pic_path='dataset')

# Rebuild the same architecture that was trained: ResNet18 with a 3x3 stem
# convolution and a 10-class head.
model = ResNet18()
model.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
model.fc = torch.nn.Linear(512, n_class) # modify the last layer for ResNet18
# model.fc = torch.nn.Linear(2048, n_class) # modify the last layer for ResNet50

# Load the model parameters from disk.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load('checkpoint/resnet18_cifar10.pt', map_location=device))
model = model.to(device)

In [None]:
def test_model(model, test_loader):
    model.eval()
    total_sample = 0
    right_sample = 0
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data).to(device)
        # convert output probabilities to predicted class
        _, pred = torch.max(output, 1)    
        # compare predictions to true label
        correct_tensor = pred.eq(target.data.view_as(pred))
        # correct = np.squeeze(correct_tensor.to(device).numpy())
        total_sample += batch_size
        for i in correct_tensor:
            if i:
                right_sample += 1

    print("Accuracy:",100*right_sample/total_sample,"%")