In [1]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the assignment folder on the mounted Drive;
# later cells read train.csv and write log.txt through relative paths.
cd /content/drive/MyDrive/UCCD3074_Labs/Assignment2/

/content/drive/MyDrive/UCCD3074_Labs/Assignment2


In [3]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

#install TPU dependencies
# 1) download the environment setup script from the pytorch/xla repository
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py #transfer data to the notebook
# 2) run that script requesting version 1.7 (the recorded output shows the preinstalled torch being uninstalled)
!python pytorch-xla-env-setup.py --version 1.7 #get and setup torch_xla version
# 3) install timm, which supplies the model created later via timm.create_model
# NOTE(review): timm is not version-pinned, so a fresh run may resolve a different release - confirm compatibility
!pip install timm 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6034  100  6034    0     0   9763      0 --:--:-- --:--:-- --:--:--  9763
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-1.7 ...
Found existing installation: torch 1.12.1+cu113
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cloud-tpu-client
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.8 MB/s 
Uninstalling torch-1.12.1+cu113:
Installing collected packages: google-api-python-client, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.11
    Uninstalling google-api-python-client-1.12.1

In [4]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use("ggplot")

import torch
import torch.nn as nn
import torchvision.transforms as transforms

import albumentations

import torch_xla #to connect notebook to use Cloud TPU device
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl

import timm #to collect newest computer vision model

import gc #garbage collector
import os #operating system
import time
import random #generate random number
from datetime import datetime

from PIL import Image
from tqdm.notebook import tqdm #to create progress bar
from sklearn import model_selection, metrics

# XLA runtime settings, exported through the process environment.
# NOTE(review): XLA_USE_BF16 presumably makes float tensors use bfloat16 on the TPU - confirm in the torch_xla docs
os.environ.update(
    {
        "XLA_USE_BF16": "1",
        "XLA_TENSOR_ALLOCATOR_MAXSIZE": "100000000",
    }
)

# Show which torch build is active after the TPU setup step
print("torchversion:", torch.__version__)



torchversion: 1.7.0a0+7e71a98


In [5]:
############################
#Coded by Ng Jiun Shen
############################

# Global settings read by the later cells: split seed, timm architecture name,
# input resolution, number of epochs, batch sizes and base learning rate.
Set = dict(
    seed=3074,
    model_arch='vit_base_patch16_224',
    img_size=224,
    epochs=10,
    train_bs=16,
    valid_bs=16,
    lr=2e-05,
)

In [6]:
############################
#Coded by Ng Jiun Shen
############################

# Load the label table (one row per image: file name and class label)
df = pd.read_csv(filepath_or_buffer="../Assignment2/train.csv")

# Show both ends of the table to confirm the file loaded as expected
print("top 5 records\n", df.head(n=5))
print("\nlast 5 records\n", df.tail(n=5))

top 5 records
          image_id  label
0  1000015157.jpg      0
1  1000201771.jpg      3
2   100042118.jpg      1
3  1000723321.jpg      1
4  1000812911.jpg      3

last 5 records
             image_id  label
21392  999068805.jpg      3
21393  999329392.jpg      3
21394  999474432.jpg      1
21395  999616605.jpg      4
21396  999998473.jpg      4


In [7]:
############################
#Coded by Leong Wai Yin
############################

#Split into train,valid,test set: 70% for training, the remainder halved into test and valid
df_train = df.sample(frac=0.7, random_state=Set['seed'])
val_test = df.drop(index=df_train.index)  # rows that were not drawn for training
df_test = val_test.sample(frac=0.5, random_state=Set['seed'])
df_valid = val_test.drop(index=df_test.index)  # what is left after taking the test rows

# Report the size of every partition
print("dataset's length is", df.shape[0])
print("trainset's length is", df_train.shape[0])
print("validset's length is", df_valid.shape[0])
print("testset's length is", df_test.shape[0])

dataset's length is 21397
trainset's length is 14978
validset's length is 3209
testset's length is 3210


In [8]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

class CassavaDataset(torch.utils.data.Dataset): #class for dataset
    """Map-style dataset that pairs each image file with its label.

    Args:
        df: table whose rows unpack to ``(image file name, label)``.
        data_path: directory that contains the image folders.
        mode: ``"train"`` reads from ``train_images``; any other value reads
            from ``test_images``.
        transforms: optional callable applied to the loaded RGB PIL image.
            When ``None`` the PIL image itself is returned.
    """

    def __init__(self, df, data_path="../Assignment2", mode="train", transforms=None):
        super().__init__()
        self.df_data = df.values
        self.data_path = data_path
        self.transforms = transforms
        self.mode = mode
        self.data_dir = "train_images" if mode == "train" else "test_images"

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.df_data)

    def __getitem__(self, index):
        """Load the image of row ``index`` and return ``(image, label)``."""
        img_name, label = self.df_data[index]
        img_path = os.path.join(self.data_path, self.data_dir, img_name)
        image = Image.open(img_path).convert("RGB")

        # The result is bound before the optional transform so that a dataset
        # built without transforms returns the raw image instead of raising
        # UnboundLocalError.
        if self.transforms is not None:
            image = self.transforms(image)

        return image, label

In [9]:
############################
#Adapted from <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

# create image augmentations
# NOTE(review): a scalar `degrees` is presumably read as the range (-0.1, +0.1) degrees - confirm this tiny rotation is intended
transforms_train = transforms.Compose([
    transforms.Resize(size=(Set['img_size'], Set['img_size'])),
    transforms.RandomRotation(degrees=0.1),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomResizedCrop(size=Set['img_size']),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# validation pipeline: resize, tensor conversion and the same normalisation, without random augmentation
transforms_valid = transforms.Compose([
    transforms.Resize(size=(Set['img_size'], Set['img_size'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

In [10]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

class ViTBase16(nn.Module): #class for VIT module
    """Image classifier that wraps a timm model and swaps in a new linear head.

    The architecture name is read from ``Set['model_arch']``. Besides
    ``forward`` the class carries the per-epoch training and validation loops
    used by the notebook.

    Args:
        n_classes: number of outputs of the replacement ``head`` layer.
        pretrained: when true, weights are loaded from the checkpoint file
            under ``../Assignment2/vit-base-models-pretrained-pytorch/``
            before the head is replaced.
    """

    def __init__(self, n_classes, pretrained=False):

        super(ViTBase16, self).__init__()

        self.model = timm.create_model(Set['model_arch'], pretrained=False)
        if pretrained:
            # map_location="cpu" makes the load independent of the device the
            # checkpoint was saved from; the caller moves the model afterwards.
            state_dict = torch.load(
                "../Assignment2/vit-base-models-pretrained-pytorch/jx_vit_base_p16_224-80ecf9dd.pth",
                map_location="cpu",
            )
            self.model.load_state_dict(state_dict)

        self.model.head = nn.Linear(self.model.head.in_features, n_classes)

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output."""
        return self.model(x)

    @staticmethod
    def _move_batch(data, target, device):
        """Move one batch to ``device``; on XLA also cast to float32 / int64."""
        if device.type == "cuda":
            return data.to(device), target.to(device)
        if device.type == "xla":
            return (
                data.to(device, dtype=torch.float32),
                target.to(device, dtype=torch.int64),
            )
        # any other device type (e.g. cpu): leave the tensors where they are
        return data, target

    def train_one_epoch(self, train_loader, criterion, optimizer, device):
        """Train for one pass over ``train_loader``.

        Args:
            train_loader: sized iterable of ``(data, target)`` batches.
            criterion: loss callable taking ``(output, target)``.
            optimizer: optimizer over this module's parameters.
            device: ``torch.device``; its ``type`` selects the transfer and
                optimizer-step path.

        Returns:
            tuple: ``(mean batch loss, mean batch accuracy)``, i.e. the sums of
            the per-batch values divided by ``len(train_loader)``.
        """
        epoch_loss = 0.0
        epoch_accuracy = 0.0

        self.model.train()
        for i, (data, target) in enumerate(train_loader):
            data, target = self._move_batch(data, target, device)

            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            output = self.forward(data)
            loss = criterion(output, target)
            loss.backward()

            accuracy = (output.argmax(dim=1) == target).float().mean()
            # Accumulate detached values: summing the live loss would keep the
            # running total attached to every batch's autograd graph.
            epoch_loss += loss.detach()
            epoch_accuracy += accuracy.detach()

            # perform a single optimization step (parameter update)
            if device.type == "xla":
                xm.optimizer_step(optimizer)

                if i % 20 == 0:
                    xm.master_print(f"\tBATCH {i+1}/{len(train_loader)} - LOSS: {loss}")

            else:
                optimizer.step()

        return epoch_loss / len(train_loader), epoch_accuracy / len(train_loader)

    def validate_one_epoch(self, valid_loader, criterion, device):
        """Evaluate on ``valid_loader`` without gradient tracking.

        Args:
            valid_loader: sized iterable of ``(data, target)`` batches.
            criterion: loss callable taking ``(output, target)``.
            device: ``torch.device`` used to select the transfer path.

        Returns:
            tuple: ``(mean batch loss, mean batch accuracy)`` over
            ``len(valid_loader)`` batches.
        """
        valid_loss = 0.0
        valid_accuracy = 0.0

        self.model.eval()
        for data, target in valid_loader:
            data, target = self._move_batch(data, target, device)

            with torch.no_grad():
                output = self.model(data)
                loss = criterion(output, target)
                accuracy = (output.argmax(dim=1) == target).float().mean()
                valid_loss += loss
                valid_accuracy += accuracy

        return valid_loss / len(valid_loader), valid_accuracy / len(valid_loader)

In [11]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################
def fit_tpu(
    model, epochs, device, criterion, optimizer, train_loader, valid_loader=None
):
    """Run the train/validate loop for ``epochs`` epochs on one XLA device.

    Each epoch wraps the loaders in ``pl.ParallelLoader`` for ``device``, calls
    ``model.train_one_epoch`` and, when ``valid_loader`` is given,
    ``model.validate_one_epoch``. Every validation result is appended as one
    line to ``log.txt`` in the working directory; there is no rank guard, so
    each process that calls this function writes its own line.

    Args:
        model: object exposing ``train_one_epoch`` and ``validate_one_epoch``.
        epochs: number of epochs to run (epochs are numbered from 1).
        device: device handed to the parallel loaders and to the model loops.
        criterion: loss callable forwarded to the model loops.
        optimizer: optimizer forwarded to ``train_one_epoch``.
        train_loader: training data loader.
        valid_loader: optional validation data loader.

    Returns:
        dict: per-epoch histories under the keys ``"train_loss"``,
        ``"valid_losses"``, ``"train_acc"`` and ``"valid_acc"``.
    """
    # float("inf") instead of np.Inf: the alias was removed in NumPy 2.0
    valid_loss_min = float("inf")  # lowest validation loss seen so far

    # keeping track of losses as it happen
    train_losses, valid_losses = [], []
    train_accs, valid_accs = [], []

    # A DistributedSampler derives its shuffle order from the epoch number, so
    # it has to be told the epoch or every epoch replays the same order.
    train_sampler = getattr(train_loader, "sampler", None)

    for epoch in range(1, epochs + 1):
        gc.collect() #collect garbage data
        if hasattr(train_sampler, "set_epoch"):
            train_sampler.set_epoch(epoch)
        para_train_loader = pl.ParallelLoader(train_loader, [device])

        xm.master_print(f"{'='*50}")
        xm.master_print(f"EPOCH {epoch} - TRAINING...")
        train_loss, train_acc = model.train_one_epoch(
            para_train_loader.per_device_loader(device), criterion, optimizer, device
        )
        xm.master_print(
            f"\n\t[TRAIN] EPOCH {epoch} - LOSS: {train_loss}, ACCURACY: {train_acc}\n "
        )
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        gc.collect()

        if valid_loader is not None:
            para_valid_loader = pl.ParallelLoader(valid_loader, [device])
            xm.master_print(f"EPOCH {epoch} - VALIDATING...")
            valid_loss, valid_acc = model.validate_one_epoch(
                para_valid_loader.per_device_loader(device), criterion, device
            )
            xm.master_print(f"\t[VALID] LOSS: {valid_loss}, ACCURACY: {valid_acc}\n")
            valid_losses.append(valid_loss)
            valid_accs.append(valid_acc)
            gc.collect()

            content = time.ctime() + ' ' + f'EPOCH {epoch}, [VALID] LOSS: {valid_loss},ACCURACY: {valid_acc}'

            with open('log.txt', 'a') as appender:
                appender.write(content + '\n')

            # Track the true running minimum: the stored value only moves when
            # the loss improves, so later epochs are compared against the best
            # epoch rather than merely the previous one.
            if valid_loss <= valid_loss_min:
                if epoch != 1:  # the first epoch has nothing to compare against
                    # No checkpoint is written here, so the message reports the
                    # improvement only.
                    xm.master_print(
                        "Validation loss decreased ({:.4f} --> {:.4f}).".format(
                            valid_loss_min, valid_loss
                        )
                    )
                valid_loss_min = valid_loss

    return {
        "train_loss": train_losses,
        "valid_losses": valid_losses,
        "train_acc": train_accs,
        "valid_acc": valid_accs,
    }

In [12]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

# Build the classifier with a 5-output head; pretrained=True makes the constructor load the locally stored checkpoint.
model = ViTBase16(n_classes=5, pretrained=True)

In [13]:
############################
#Adapted from: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

def _run():
    """Build the data pipeline, train the global ``model`` on this process's XLA device and save it.

    The train and validation tables are wrapped in ``CassavaDataset`` and
    sharded across processes with ``DistributedSampler``. The base learning
    rate ``Set['lr']`` is multiplied by the number of participating processes.
    After training, the model's state dict is written with ``xm.save`` to a
    time-stamped ``.pth`` file in the working directory.

    Returns:
        dict: the per-epoch history produced by ``fit_tpu``.
    """
    train_dataset = CassavaDataset(df_train, transforms=transforms_train)
    valid_dataset = CassavaDataset(df_valid, transforms=transforms_valid)

    # Each process reads its own shard of the data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True,
    )

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False,
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=Set['train_bs'],
        sampler=train_sampler,
        drop_last=True,
        num_workers=8,
    )

    # NOTE(review): drop_last=True also discards the final partial validation
    # batch, so validation metrics skip those samples - confirm this is intended.
    valid_loader = torch.utils.data.DataLoader(
        dataset=valid_dataset,
        batch_size=Set['valid_bs'],
        sampler=valid_sampler,
        drop_last=True,
        num_workers=8,
    )

    criterion = nn.CrossEntropyLoss()
    device = xm.xla_device()
    model.to(device)

    # scale the base learning rate by the number of participating processes
    lr = Set['lr'] * xm.xrt_world_size()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    xm.master_print(f"INITIALIZING TRAINING ON {xm.xrt_world_size()} TPU CORES")
    start_time = datetime.now()
    xm.master_print(f"Start Time: {start_time}")

    logs = fit_tpu(
        model=model,
        epochs=Set['epochs'],
        device=device,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        valid_loader=valid_loader,
    )

    xm.master_print(f"Execution time: {datetime.now() - start_time}")

    xm.master_print("Saving Model")
    # NOTE(review): the "5e" in the file name does not follow Set['epochs'] - confirm the intended naming
    xm.save(
        model.state_dict(), f'model_5e_{datetime.now().strftime("%Y%m%d-%H%M")}.pth'
    )

    # Hand the training history back instead of discarding it
    return logs

In [14]:
############################
#Reference: <https://www.kaggle.com/code/abhinand05/vision-transformer-vit-tutorial-baseline/notebook>
############################

# Start training processes
def _mp_fn(rank, flags):
    """Entry point executed in every process started by xmp.spawn.

    Sets the default tensor type to torch.FloatTensor and then runs the
    training routine. ``rank`` and ``flags`` are accepted but not used.
    """
    torch.set_default_tensor_type("torch.FloatTensor")
    _run()


# Launch 8 forked processes, each receiving the (empty) FLAGS dict
FLAGS = dict()
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method="fork")

INITIALIZING TRAINING ON 8 TPU CORES
Start Time: 2022-09-03 13:57:49.172921
EPOCH 1 - TRAINING...
	BATCH 1/117 - LOSS: 1.3671875
	BATCH 21/117 - LOSS: 0.85546875
	BATCH 41/117 - LOSS: 0.546875
	BATCH 61/117 - LOSS: 0.60546875
	BATCH 81/117 - LOSS: 0.67578125
	BATCH 101/117 - LOSS: 0.75390625

	[TRAIN] EPOCH 1 - LOSS: 0.65234375, ACCURACY: 0.78515625
 
EPOCH 1 - VALIDATING...
	[VALID] LOSS: 0.55078125, ACCURACY: 0.7890625

EPOCH 2 - TRAINING...
	BATCH 1/117 - LOSS: 0.1767578125
	BATCH 21/117 - LOSS: 0.390625
	BATCH 41/117 - LOSS: 0.3125
	BATCH 61/117 - LOSS: 0.466796875
	BATCH 81/117 - LOSS: 0.3203125
	BATCH 101/117 - LOSS: 0.7890625

	[TRAIN] EPOCH 2 - LOSS: 0.453125, ACCURACY: 0.87109375
 
EPOCH 2 - VALIDATING...
	[VALID] LOSS: 0.50390625, ACCURACY: 0.82421875

Validation loss decreased (0.5508 --> 0.5039).  Saving model ...
EPOCH 3 - TRAINING...
	BATCH 1/117 - LOSS: 0.2158203125
	BATCH 21/117 - LOSS: 0.50390625
	BATCH 41/117 - LOSS: 0.228515625
	BATCH 61/117 - LOSS: 0.287109375
	BATC

In [15]:
############################
#Coded by Ng Jiun Shen
############################

# Print the validation log that the training loop appended to log.txt.
# The context manager closes the file handle as soon as the text has been read.
with open('./log.txt', "r") as f:
    print(f.read())

.LOGSat Sep  3 13:14:36 2022 EPOCH 1, [VALID] LOSS: 0.455078125,ACCURACY: 0.8515625
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.52734375,ACCURACY: 0.80859375
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.44921875,ACCURACY: 0.85546875
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.5234375,ACCURACY: 0.828125
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.50390625,ACCURACY: 0.828125
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.431640625,ACCURACY: 0.8203125
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.55078125,ACCURACY: 0.8203125
Sat Sep  3 13:14:37 2022 EPOCH 1, [VALID] LOSS: 0.404296875,ACCURACY: 0.87109375
Sat Sep  3 13:21:10 2022 EPOCH 2, [VALID] LOSS: 0.515625,ACCURACY: 0.83984375
Sat Sep  3 13:21:10 2022 EPOCH 2, [VALID] LOSS: 0.390625,ACCURACY: 0.86328125
Sat Sep  3 13:21:10 2022 EPOCH 2, [VALID] LOSS: 0.45703125,ACCURACY: 0.828125
Sat Sep  3 13:21:10 2022 EPOCH 2, [VALID] LOSS: 0.4453125,ACCURACY: 0.83984375
Sat Sep  3 13:21:10 2022 EPOCH 2, [VALID] LOSS: 