<a href="https://colab.research.google.com/github/XinhengLyu/The-application-of-Vision-Transformers-in-Digital-Pathology/blob/main/The_application_of_Vision_Transformers_in_Digital_Pathology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [None]:
!pip install optuna==2.10.1
!pip install torch-cka

In [None]:
import matplotlib.pyplot as plt
import torch
import torchvision
from torch import nn
import pandas as pd
import time
import os
import math
import argparse
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from torchvision.models.resnet import ResNet, BasicBlock
import sys
import json
import pickle
import random
from functools import partial
from collections import OrderedDict
from tqdm import tqdm
import optuna
import logging
import numpy as np

# Model and Method

## ViT

In [None]:
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Randomly zero entire samples of ``x`` (stochastic depth).

    One keep/drop decision is drawn per entry along dim 0 and broadcast over
    the remaining dims. Kept samples are divided by ``1 - drop_prob``.

    :param x: input tensor; dim 0 is treated as the sample dimension
    :param drop_prob: probability of dropping a sample
    :param training: when False the input is returned untouched
    :return: ``x`` itself when nothing is dropped, otherwise a new tensor
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0.:
        # Everything is dropped. Dividing by keep_prob == 0 would yield
        # inf * 0 = NaN, so return zeros directly.
        return torch.zeros_like(x)
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    # keep_prob + U[0, 1) floors to 1 with probability keep_prob, else 0.
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    return x.div(keep_prob) * random_tensor


class DropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).

    :param drop_prob: probability of dropping a sample; ``None`` (the default)
        is treated as ``0.`` so a default-constructed module is a no-op
        instead of failing on ``1 - None`` in training mode.
    """
    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = 0. if drop_prob is None else drop_prob

    def forward(self, x):
        # self.training is managed by nn.Module.train() / eval().
        return drop_path(x, self.drop_prob, self.training)


class PatchEmbed(nn.Module):
    """Cut an image into non-overlapping patches and project each to ``embed_dim``.

    The projection is a ``Conv2d`` whose kernel size and stride both equal the
    patch size.

    :param img_size: expected input size, an int (square) or an ``(h, w)`` pair
    :param patch_size: patch size, an int (square) or an ``(h, w)`` pair
    :param in_c: number of input channels
    :param embed_dim: output channels of the projection (token width)
    :param norm_layer: optional callable ``norm_layer(embed_dim)`` applied to the
        tokens; ``nn.Identity`` is used when omitted
    """

    def __init__(self, img_size=224, patch_size=16, in_c=3, embed_dim=768, norm_layer=None):
        super().__init__()
        img_size = self._pair(img_size)
        patch_size = self._pair(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    @staticmethod
    def _pair(size):
        """Return ``size`` as an ``(h, w)`` tuple, duplicating a scalar."""
        if isinstance(size, (tuple, list)):
            if len(size) != 2:
                raise ValueError(f"expected an int or a pair of ints, got {size!r}")
            return tuple(size)
        return (size, size)

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."

        # proj:      [B, C, H, W] -> [B, embed_dim, grid_h, grid_w]
        # flatten:   -> [B, embed_dim, num_patches]
        # transpose: -> [B, num_patches, embed_dim]
        x = self.proj(x).flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x


class Attention(nn.Module):
    """Multi-head self-attention over a token sequence.

    :param dim: total embedding width of each token
    :param num_heads: number of heads; each head sees ``dim // num_heads`` channels
    :param qkv_bias: whether the fused q/k/v projection has a bias
    :param qk_scale: score multiplier; falls back to ``head_dim ** -0.5`` when falsy
    :param attn_drop_ratio: dropout applied to the attention weights
    :param proj_drop_ratio: dropout applied after the output projection
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
                 attn_drop_ratio=0., proj_drop_ratio=0.):
        super().__init__()
        per_head = dim // num_heads
        self.num_heads = num_heads
        self.scale = qk_scale if qk_scale else per_head ** -0.5
        # A single linear layer produces q, k and v in one matmul.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_ratio)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop_ratio)

    def forward(self, x):
        # x: [batch, tokens, channels]
        batch, tokens, channels = x.shape
        per_head = channels // self.num_heads

        # [batch, tokens, 3 * channels] -> [batch, tokens, 3, heads, per_head]
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, per_head)
        # -> [3, batch, heads, tokens, per_head]
        packed = packed.permute(2, 0, 3, 1, 4)
        # Plain indexing (not tuple unpacking of a tensor) keeps TorchScript happy.
        q = packed[0]
        k = packed[1]
        v = packed[2]

        # Scaled dot-product scores: [batch, heads, tokens, tokens]
        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # Weighted sum of values, then merge heads back to [batch, tokens, channels].
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(merged))


class Mlp(nn.Module):
    """
    Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    :param in_features: width of the input tokens
    :param hidden_features: width of the hidden layer; defaults to ``in_features``
    :param out_features: width of the output; defaults to ``in_features``
    :param act_layer: activation class, instantiated with no arguments
    :param drop: dropout probability, shared by both dropout applications
    """
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features if hidden_features else in_features
        out_width = out_features if out_features else in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, out_width)
        # One Dropout module is reused after both linear layers.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


class Block(nn.Module):
    """Transformer encoder block: normalised attention and MLP branches, each added residually.

    :param dim: token width
    :param num_heads: number of attention heads
    :param mlp_ratio: hidden width of the MLP as a multiple of ``dim``
    :param qkv_bias: forwarded to ``Attention``
    :param qk_scale: forwarded to ``Attention``
    :param drop_ratio: dropout used for the attention projection and inside the MLP
    :param attn_drop_ratio: dropout on the attention weights
    :param drop_path_ratio: stochastic-depth rate; ``nn.Identity`` is used when not > 0
    :param act_layer: activation class for the MLP
    :param norm_layer: normalisation constructor, called as ``norm_layer(dim)``
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop_ratio=0., attn_drop_ratio=0., drop_path_ratio=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        hidden_width = int(dim * mlp_ratio)
        uses_drop_path = drop_path_ratio > 0.

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_ratio=attn_drop_ratio,
            proj_drop_ratio=drop_ratio,
        )
        # The same drop-path module is shared by both residual branches.
        self.drop_path = DropPath(drop_path_ratio) if uses_drop_path else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=hidden_width,
            act_layer=act_layer,
            drop=drop_ratio,
        )

    def forward(self, x):
        attn_branch = self.attn(self.norm1(x))
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_branch)


class VisionTransformer(nn.Module):
    """Vision Transformer classifier with an optional distillation token.

    Pipeline: patch embedding -> prepend class (and distillation) token ->
    add learned position embedding -> ``depth`` encoder blocks -> final norm ->
    classifier head(s).
    """
    def __init__(self, img_size=224, patch_size=16, in_c=3, num_classes=1000,
                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=True,
                 qk_scale=None, representation_size=None, distilled=False, drop_ratio=0.,
                 attn_drop_ratio=0., drop_path_ratio=0., embed_layer=PatchEmbed, norm_layer=None,
                 act_layer=None):
        """
        :param img_size: input image size, forwarded to ``embed_layer``
        :param patch_size: patch size, forwarded to ``embed_layer``
        :param in_c: number of input channels
        :param num_classes: classifier outputs; heads become ``nn.Identity`` when not > 0
        :param embed_dim: token width
        :param depth: number of encoder blocks
        :param num_heads: attention heads per block
        :param mlp_ratio: MLP hidden width as a multiple of ``embed_dim``
        :param qkv_bias: whether the q/k/v projection has a bias
        :param qk_scale: optional override of the attention score scale
        :param representation_size: if set (and not distilled), a Linear+Tanh
            "pre-logits" layer of this width is inserted before the head
        :param distilled: add a distillation token and a second head
        :param drop_ratio: dropout after the position embedding and inside blocks
        :param attn_drop_ratio: dropout on attention weights
        :param drop_path_ratio: maximum stochastic-depth rate (reached at the last block)
        :param embed_layer: patch-embedding class; note ``norm_layer`` is not passed to it
        :param norm_layer: normalisation constructor; defaults to ``LayerNorm(eps=1e-6)``
        :param act_layer: activation class; defaults to ``nn.GELU``
        """

        super(VisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.in_c=in_c
        self.img_size=img_size
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        # Number of extra tokens prepended to the patch sequence (class [+ distillation]).
        self.num_tokens = 2 if distilled else 1
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_c=in_c, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable tokens and position embedding; created as zeros, re-initialised below.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_ratio)

        # Drop-path rate increases linearly from 0 (first block) to drop_path_ratio (last block).
        dpr = [x.item() for x in torch.linspace(0, drop_path_ratio, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(*[
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                  drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio, drop_path_ratio=dpr[i],
                  norm_layer=norm_layer, act_layer=act_layer)
            for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Representation layer
        if representation_size and not distilled:
            self.has_logits = True
            # The head below is sized from num_features, so it must be updated first.
            self.num_features = representation_size
            self.pre_logits = nn.Sequential(OrderedDict([
                ("fc", nn.Linear(embed_dim, representation_size)),
                ("act", nn.Tanh())
            ]))
        else:
            self.has_logits = False
            self.pre_logits = nn.Identity()

        # Classifier head(s)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.head_dist = None
        if distilled:
            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()

        # Weight init
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        if self.dist_token is not None:
            nn.init.trunc_normal_(self.dist_token, std=0.02)

        nn.init.trunc_normal_(self.cls_token, std=0.02)
        # Applies _init_vit_weights to every submodule (Linear / Conv2d / LayerNorm).
        self.apply(_init_vit_weights)

    def forward_features(self, x):
        """Encode images and return the token feature(s) used for classification.

        :param x: image batch ``[B, C, H, W]``
        :return: the (pre-logits) class-token feature when there is no
            distillation token, otherwise the pair
            ``(class-token feature, distillation-token feature)``
        """
        # [B, C, H, W] -> [B, num_patches, embed_dim]
        x = self.patch_embed(x)  # e.g. [B, 196, 768] with the default sizes
        # [1, 1, embed_dim] -> [B, 1, embed_dim]
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)
        if self.dist_token is None:
            x = torch.cat((cls_token, x), dim=1)  # e.g. [B, 197, 768] with the default sizes
        else:
            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)

        # pos_embed has batch dim 1 and broadcasts over the batch.
        x = self.pos_drop(x + self.pos_embed)
        x = self.blocks(x)
        x = self.norm(x)
        if self.dist_token is None:
            return self.pre_logits(x[:, 0])
        else:
            # Token 0 is the class token, token 1 the distillation token.
            return x[:, 0], x[:, 1]

    def forward(self, x):
        """Return class scores for an image batch.

        Without a distillation head this is ``head(features)``. With one, the
        two head outputs are returned as a tuple while training (and not
        scripting), and averaged otherwise.
        """
        x = self.forward_features(x)
        if self.head_dist is not None:
            x, x_dist = self.head(x[0]), self.head_dist(x[1])
            if self.training and not torch.jit.is_scripting():
                # during training, return both classifier predictions separately
                return x, x_dist
            else:
                # during inference, return the average of both classifier predictions
                return (x + x_dist) / 2
        else:
            x = self.head(x)
        return x


def _init_vit_weights(m):
    """
    ViT weight initialization
    :param m: module
    """
    if isinstance(m, nn.Linear):
        nn.init.trunc_normal_(m.weight, std=.01)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out")
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        nn.init.zeros_(m.bias)
        nn.init.ones_(m.weight)


def ViT(img_size: int = 224, num_classes: int = 1000, patch_size: int = 16, embed_dim: int = 768,
        depth: int = 12, num_heads: int = 12, in_c: int = 3):
    """Build a ``VisionTransformer`` without a pre-logits (representation) layer.

    All arguments are forwarded unchanged; every other ``VisionTransformer``
    option keeps its default.
    """
    config = {
        "img_size": img_size,
        "patch_size": patch_size,
        "embed_dim": embed_dim,
        "depth": depth,
        "num_heads": num_heads,
        "representation_size": None,
        "in_c": in_c,
        "num_classes": num_classes,
    }
    return VisionTransformer(**config)


## Train and Evaluate

In [None]:
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    """Train ``model`` for one pass over ``data_loader`` with cross-entropy loss.

    :param model: network returning per-class scores for a batch of images
    :param optimizer: optimizer already bound to the model's parameters
    :param data_loader: iterable yielding ``(images, labels)`` batches
    :param device: device the batches are moved to
    :param epoch: epoch index, used only in the progress-bar text
    :return: ``(mean loss per batch, accuracy over all samples)``;
        ``(0.0, 0.0)`` when the loader yields no batches
    """
    model.train()
    loss_function = torch.nn.CrossEntropyLoss()
    accu_loss = torch.zeros(1).to(device)  # running sum of batch losses
    accu_num = torch.zeros(1).to(device)   # running count of correct predictions
    optimizer.zero_grad()

    sample_num = 0
    num_batches = 0
    data_loader = tqdm(data_loader, file=sys.stdout)
    for step, data in enumerate(data_loader):
        images, labels = data
        # Move each tensor to the device once and reuse it below.
        images = images.to(device)
        labels = labels.to(device)
        sample_num += images.shape[0]
        num_batches = step + 1

        pred = model(images)
        loss = loss_function(pred, labels)

        # Bail out before backward() so a NaN/inf loss never reaches the
        # gradients or the running statistics.
        if not torch.isfinite(loss):
            print('WARNING: non-finite loss, ending training ', loss)
            sys.exit(1)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        pred_classes = torch.max(pred, dim=1)[1]
        accu_num += torch.eq(pred_classes, labels).sum()
        accu_loss += loss.detach()

        data_loader.desc = "[train epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
                                                                               accu_loss.item() / num_batches,
                                                                               accu_num.item() / sample_num)

    # max(..., 1) keeps an empty loader from raising (unbound `step` / division by zero).
    return accu_loss.item() / max(num_batches, 1), accu_num.item() / max(sample_num, 1)


@torch.no_grad()
def evaluate(model, data_loader, device, epoch):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    :param model: network returning per-class scores for a batch of images
    :param data_loader: iterable yielding ``(images, labels)`` batches
    :param device: device the batches are moved to
    :param epoch: epoch index, used only in the progress-bar text
    :return: ``(mean loss per batch, accuracy over all samples)``;
        ``(0.0, 0.0)`` when the loader yields no batches
    """
    loss_function = torch.nn.CrossEntropyLoss()

    model.eval()

    accu_num = torch.zeros(1).to(device)   # running count of correct predictions
    accu_loss = torch.zeros(1).to(device)  # running sum of batch losses

    sample_num = 0
    num_batches = 0
    data_loader = tqdm(data_loader, file=sys.stdout)
    for step, data in enumerate(data_loader):
        images, labels = data
        # Move each tensor to the device once and reuse it below.
        images = images.to(device)
        labels = labels.to(device)
        sample_num += images.shape[0]
        num_batches = step + 1

        pred = model(images)
        pred_classes = torch.max(pred, dim=1)[1]
        accu_num += torch.eq(pred_classes, labels).sum()

        loss = loss_function(pred, labels)
        accu_loss += loss

        data_loader.desc = "[valid epoch {}] loss: {:.3f}, acc: {:.3f}".format(epoch,
                                                                               accu_loss.item() / num_batches,
                                                                               accu_num.item() / sample_num)

    # max(..., 1) keeps an empty loader from raising (unbound `step` / division by zero).
    return accu_loss.item() / max(num_batches, 1), accu_num.item() / max(sample_num, 1)

## ResNet

In [None]:
class MyResNet(ResNet):
    """Small ResNet (BasicBlock, layers [2, 2, 1, 1]) for 1-channel, 10-class input.

    ``forward`` returns raw class scores (logits). The training and evaluation
    helpers in this file use ``CrossEntropyLoss``, which applies log-softmax
    itself; applying softmax here as well would normalise twice and flatten
    the gradients. Predicted classes (argmax) are unaffected by this. Call
    ``torch.softmax(model(x), dim=-1)`` where probabilities are needed.
    """

    def __init__(self):
        super().__init__(BasicBlock, [2, 2, 1, 1], num_classes=10)
        # Replace the stock 3-channel stem with a 1-channel one (same kernel/stride/padding).
        self.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

    def forward(self, x):
        # Return logits; the loss function owns the softmax.
        return super().forward(x)

# MNIST

## Get Dataset

In [None]:
# Mini-batch size shared by the MNIST train and test loaders.
Mnist_batch_size = 128

# Only tensor conversion is applied to each image.
Mnist_transform = transforms.Compose([
    transforms.ToTensor(),
])

In [None]:
# Directory the MNIST files are downloaded to and read from.
Mnist_DOWNLOAD_PATH = './'

# Training split: shuffled every epoch.
Mnist_train_set = torchvision.datasets.MNIST(
    Mnist_DOWNLOAD_PATH,
    train=True,
    download=True,
    transform=Mnist_transform,
)
Mnist_train_loader = torch.utils.data.DataLoader(
    Mnist_train_set,
    batch_size=Mnist_batch_size,
    shuffle=True,
    num_workers=4,
)

# Test split: fixed order so evaluation is repeatable.
Mnist_test_set = torchvision.datasets.MNIST(
    Mnist_DOWNLOAD_PATH,
    train=False,
    download=True,
    transform=Mnist_transform,
)
Mnist_test_loader = torch.utils.data.DataLoader(
    Mnist_test_set,
    batch_size=Mnist_batch_size,
    shuffle=False,
    num_workers=4,
)

In [None]:
# Preview the first ten digits of one training batch, each titled with its label.
images, labels = next(iter(Mnist_train_loader))
print(images.size())

plt.figure(figsize=(9, 9))
for i in range(10):
    plt.subplot(1, 10, i + 1)
    plt.title(labels[i].item())
    # [C, H, W] -> [H, W, C], then drop the channel axis for a grayscale plot.
    img = images[i].permute(1, 2, 0)
    plt.imshow(img.squeeze(dim=2), cmap='gray')
    plt.axis('off')
plt.show()

## Train-ViT

In [None]:
# Single training run of a small ViT on MNIST, logged to TensorBoard.
torch.manual_seed(4)
epochs = 20

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

MNIST_ViT_model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=64,
                      depth=3, num_heads=8, in_c=1).to(device)

optimizer = optim.Adam(MNIST_ViT_model.parameters(), lr=0.0003)

# TensorBoard tag names are constant, so build the list once rather than every epoch.
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

try:
    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=MNIST_ViT_model,
                                                optimizer=optimizer,
                                                data_loader=Mnist_train_loader,
                                                device=device,
                                                epoch=epoch)

        # validate
        val_loss, val_acc = evaluate(model=MNIST_ViT_model,
                                     data_loader=Mnist_test_loader,
                                     device=device,
                                     epoch=epoch)

        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
finally:
    # Flush pending events and release the event file even if training fails.
    tb_writer.close()


In [None]:
# Five repeated ViT runs on MNIST (seeds 0-4); per-epoch metrics are collected
# under "train_exp_<i>" / "val_exp_<i>" keys.
Mnist_ViT_loss = {}
Mnist_ViT_acc = {}

# Loop invariants, set once instead of on every run.
epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in range(5):
    print("train", i)
    torch.manual_seed(i)

    model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=256,
                depth=3, num_heads=8, in_c=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0003)

    train_key = f"train_exp_{i}"
    val_key = f"val_exp_{i}"
    Mnist_ViT_loss[train_key] = []
    Mnist_ViT_acc[train_key] = []
    Mnist_ViT_loss[val_key] = []
    Mnist_ViT_acc[val_key] = []

    # One writer per run, closed when the run ends so event files do not
    # pile up as open handles across runs.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            Mnist_ViT_loss[train_key].append(train_loss)
            Mnist_ViT_acc[train_key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            Mnist_ViT_loss[val_key].append(val_loss)
            Mnist_ViT_acc[val_key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()


In [None]:
import json

# Persist the repeated-run ViT metrics as a single JSON document.
mnist_vit = {"Mnist_ViT_loss": Mnist_ViT_loss, "Mnist_ViT_acc": Mnist_ViT_acc}
mnist_vit_js = json.dumps(mnist_vit)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/mnist_vit.json', 'w') as mnist_vit_fileObject:
    mnist_vit_fileObject.write(mnist_vit_js)

## Train-ResNet

In [None]:
# Single training run of MyResNet on MNIST, logged to TensorBoard.
torch.manual_seed(4)

epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

MNIST_res_model = MyResNet().to(device)

optimizer = optim.Adam(MNIST_res_model.parameters(), lr=0.0003)

# TensorBoard tag names are constant, so build the list once rather than every epoch.
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

try:
    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=MNIST_res_model,
                                                optimizer=optimizer,
                                                data_loader=Mnist_train_loader,
                                                device=device,
                                                epoch=epoch)

        # validate
        val_loss, val_acc = evaluate(model=MNIST_res_model,
                                     data_loader=Mnist_test_loader,
                                     device=device,
                                     epoch=epoch)

        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
finally:
    # Flush pending events and release the event file even if training fails.
    tb_writer.close()

In [None]:
# Five repeated MyResNet runs on MNIST (seeds 0-4); per-epoch metrics are
# collected under "train_exp_<i>" / "val_exp_<i>" keys.
Mnist_Res_loss = {}
Mnist_Res_acc = {}

# Loop invariants, set once instead of on every run.
epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in range(5):
    print("train", i)
    torch.manual_seed(i)

    model = MyResNet().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00018)

    train_key = f"train_exp_{i}"
    val_key = f"val_exp_{i}"
    Mnist_Res_loss[train_key] = []
    Mnist_Res_acc[train_key] = []
    Mnist_Res_loss[val_key] = []
    Mnist_Res_acc[val_key] = []

    # One writer per run, closed when the run ends so event files do not
    # pile up as open handles across runs.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            Mnist_Res_loss[train_key].append(train_loss)
            Mnist_Res_acc[train_key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            Mnist_Res_loss[val_key].append(val_loss)
            Mnist_Res_acc[val_key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()


In [None]:
import json

# Persist the repeated-run ResNet metrics as a single JSON document.
mnist_res = {"Mnist_Res_loss": Mnist_Res_loss, "Mnist_Res_acc": Mnist_Res_acc}
mnist_res_js = json.dumps(mnist_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/mnist_res.json', 'w') as mnist_res_fileObject:
    mnist_res_fileObject.write(mnist_res_js)

## Experiments with different parameters

### diff_patch_size

In [None]:
# Patch sizes to sweep, plus one metric history dict per split/metric
# (filled in by the sweep, keyed by "patch_size<value>").
diff_patch_size = [2, 4, 7, 14]
diff_patch_size_train_loss, diff_patch_size_train_acc = dict(), dict()
diff_patch_size_val_loss, diff_patch_size_val_acc = dict(), dict()

In [None]:
# Patch-size sweep: train a fresh small ViT for each value in diff_patch_size.
# The seed is set once, before the sweep, not per configuration.
torch.manual_seed(4)

# Loop invariants, set once instead of on every configuration.
epochs = 10
lrf = 0.01  # not read in this cell; kept defined in case other cells use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in diff_patch_size:
    model = ViT(img_size=28, num_classes=10, patch_size=i, embed_dim=32,
                depth=3, num_heads=4, in_c=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    key = f"patch_size{i}"
    diff_patch_size_train_loss[key] = []
    diff_patch_size_train_acc[key] = []
    diff_patch_size_val_loss[key] = []
    diff_patch_size_val_acc[key] = []

    # One writer per configuration, closed when its run ends.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            diff_patch_size_train_loss[key].append(train_loss)
            diff_patch_size_train_acc[key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            diff_patch_size_val_loss[key].append(val_loss)
            diff_patch_size_val_acc[key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()

In [None]:
# Save the patch-size sweep results as a single JSON document.
diff_patch_size_res = {"diff_patch_size_train_loss": diff_patch_size_train_loss,
                       "diff_patch_size_train_acc": diff_patch_size_train_acc,
                       "diff_patch_size_val_loss": diff_patch_size_val_loss,
                       "diff_patch_size_val_acc": diff_patch_size_val_acc}
diff_patch_size_res_js = json.dumps(diff_patch_size_res)

# The context manager closes the file even if the write raises.
with open('diff_patch_size.json', 'w') as diff_patch_size_fileObject:
    diff_patch_size_fileObject.write(diff_patch_size_res_js)

### diff_dim

In [None]:
# Embedding widths to sweep, plus one metric history dict per split/metric
# (filled in by the sweep, keyed by "embed_dim<value>").
diff_dim = [64, 128, 256, 512, 1024]
diff_dim_train_loss, diff_dim_train_acc = dict(), dict()
diff_dim_val_loss, diff_dim_val_acc = dict(), dict()

In [None]:
# Embedding-width sweep: train a fresh 6-block ViT for each value in diff_dim.
# The seed is set once, before the sweep, not per configuration.
torch.manual_seed(4)

# Loop invariants, set once instead of on every configuration.
epochs = 20
lrf = 0.01  # not read in this cell; kept defined in case other cells use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in diff_dim:
    model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=i,
                depth=6, num_heads=8, in_c=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    key = f"embed_dim{i}"
    diff_dim_train_loss[key] = []
    diff_dim_train_acc[key] = []
    diff_dim_val_loss[key] = []
    diff_dim_val_acc[key] = []

    # One writer per configuration, closed when its run ends.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            diff_dim_train_loss[key].append(train_loss)
            diff_dim_train_acc[key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            diff_dim_val_loss[key].append(val_loss)
            diff_dim_val_acc[key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()

In [None]:
# Save the embedding-width sweep results as a single JSON document.
diff_dim_res = {"diff_dim_train_loss": diff_dim_train_loss,
                "diff_dim_train_acc": diff_dim_train_acc,
                "diff_dim_val_loss": diff_dim_val_loss,
                "diff_dim_val_acc": diff_dim_val_acc}
diff_dim_res_js = json.dumps(diff_dim_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/diff_dim.json', 'w') as diff_dim_fileObject:
    diff_dim_fileObject.write(diff_dim_res_js)

### diff_depth

In [None]:
# Encoder depths to sweep, plus one metric history dict per split/metric
# (filled in by the sweep, keyed by "diff_depth<value>").
diff_depth = [3, 6, 9, 12]
diff_depth_train_loss, diff_depth_train_acc = dict(), dict()
diff_depth_val_loss, diff_depth_val_acc = dict(), dict()

In [None]:
# Depth sweep: train a fresh 64-dim ViT for each block count in diff_depth.
# The seed is set once, before the sweep, not per configuration.
torch.manual_seed(4)

# Loop invariants, set once instead of on every configuration.
epochs = 20
lrf = 0.01  # not read in this cell; kept defined in case other cells use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in diff_depth:
    model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=64,
                depth=i, num_heads=8, in_c=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    key = f"diff_depth{i}"
    diff_depth_train_loss[key] = []
    diff_depth_train_acc[key] = []
    diff_depth_val_loss[key] = []
    diff_depth_val_acc[key] = []

    # One writer per configuration, closed when its run ends.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            diff_depth_train_loss[key].append(train_loss)
            diff_depth_train_acc[key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            diff_depth_val_loss[key].append(val_loss)
            diff_depth_val_acc[key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()

In [None]:
# Save the depth sweep results as a single JSON document.
diff_depth_res = {"diff_depth_train_loss": diff_depth_train_loss,
                  "diff_depth_train_acc": diff_depth_train_acc,
                  "diff_depth_val_loss": diff_depth_val_loss,
                  "diff_depth_val_acc": diff_depth_val_acc}
diff_depth_res_js = json.dumps(diff_depth_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/diff_depth.json', 'w') as diff_depth_fileObject:
    diff_depth_fileObject.write(diff_depth_res_js)

### diff_num_heads

In [None]:
# Head counts to sweep, plus one metric history dict per split/metric
# (filled in by the sweep, keyed by "num_heads<value>").
diff_num_heads = [8, 16, 32]
diff_num_heads_train_loss, diff_num_heads_train_acc = dict(), dict()
diff_num_heads_val_loss, diff_num_heads_val_acc = dict(), dict()

In [None]:
# Head-count sweep: train a fresh 6-block, 64-dim ViT for each value in diff_num_heads.
# The seed is set once, before the sweep, not per configuration.
torch.manual_seed(4)

# Loop invariants, set once instead of on every configuration.
epochs = 20
lrf = 0.01  # not read in this cell; kept defined in case other cells use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]

for i in diff_num_heads:
    model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=64,
                depth=6, num_heads=i, in_c=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    key = f"num_heads{i}"
    diff_num_heads_train_loss[key] = []
    diff_num_heads_train_acc[key] = []
    diff_num_heads_val_loss[key] = []
    diff_num_heads_val_acc[key] = []

    # One writer per configuration, closed when its run ends.
    tb_writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=Mnist_train_loader,
                                                    device=device,
                                                    epoch=epoch)
            diff_num_heads_train_loss[key].append(train_loss)
            diff_num_heads_train_acc[key].append(train_acc)

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=Mnist_test_loader,
                                         device=device,
                                         epoch=epoch)
            diff_num_heads_val_loss[key].append(val_loss)
            diff_num_heads_val_acc[key].append(val_acc)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)
    finally:
        tb_writer.close()

In [None]:
# Save the head-count sweep results as a single JSON document.
diff_num_heads_res = {"diff_num_heads_train_loss": diff_num_heads_train_loss,
                      "diff_num_heads_train_acc": diff_num_heads_train_acc,
                      "diff_num_heads_val_loss": diff_num_heads_val_loss,
                      "diff_num_heads_val_acc": diff_num_heads_val_acc}
diff_num_heads_res_js = json.dumps(diff_num_heads_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/diff_num_heads.json', 'w') as diff_num_heads_fileObject:
    diff_num_heads_fileObject.write(diff_num_heads_res_js)

### diff_lr

In [None]:
# Learning rates to sweep, plus one metric history dict per split/metric
# (filled in by the sweep, keyed by "lr<value>").
diff_lr = [0.0001, 0.0005, 0.001, 0.003, 0.005, 0.01]
diff_lr_train_loss, diff_lr_train_acc = dict(), dict()
diff_lr_val_loss, diff_lr_val_acc = dict(), dict()

In [None]:
# Learning-rate sweep on the Mnist_* loaders: a fresh ViT is trained for each
# rate and its per-epoch train/validation metrics are stored under "lr<rate>".
torch.manual_seed(4)
for i in diff_lr:
    epochs = 20
    lrf = 0.01
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    model = ViT(img_size=28, num_classes=10, patch_size=7, embed_dim=64,
                depth=6, num_heads=8, in_c=1).to(device)

    pg = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adam(model.parameters(), lr=i)

    # Start an empty history for this learning rate in every metric dict.
    run_key = f"lr{i}"
    for history in (diff_lr_train_loss, diff_lr_train_acc,
                    diff_lr_val_loss, diff_lr_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=Mnist_train_loader,
            device=device, epoch=epoch)
        diff_lr_train_loss[run_key].append(train_loss)
        diff_lr_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=Mnist_test_loader, device=device, epoch=epoch)
        diff_lr_val_loss[run_key].append(val_loss)
        diff_lr_val_acc[run_key].append(val_acc)

        # Mirror the epoch metrics and the current learning rate to TensorBoard.
        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        scalars = [train_loss, train_acc, val_loss, val_acc,
                   optimizer.param_groups[0]["lr"]]
        for scalar_tag, scalar_value in zip(tags, scalars):
            tb_writer.add_scalar(scalar_tag, scalar_value, epoch)

In [None]:
# Bundle the learning-rate sweep histories and persist them as one JSON document.
diff_lr_res = {
    "diff_lr_train_loss": diff_lr_train_loss,
    "diff_lr_train_acc": diff_lr_train_acc,
    "diff_lr_val_loss": diff_lr_val_loss,
    "diff_lr_val_acc": diff_lr_val_acc,
}
diff_lr_res_js = json.dumps(diff_lr_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/diff_lr.json', 'w') as diff_lr_fileObject:
    diff_lr_fileObject.write(diff_lr_res_js)

# CIFAR-10

## Get Dataset

In [None]:
# Convert images to tensors, then shift/scale every RGB channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

batch_size = 128

# Training split: downloaded into ./data if missing, reshuffled every epoch.
CIF_trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
CIF_trainloader = torch.utils.data.DataLoader(
    CIF_trainset, batch_size=batch_size, shuffle=True, num_workers=4)

# Test split: same preprocessing, iterated in a fixed order.
CIF_testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
CIF_testloader = torch.utils.data.DataLoader(
    CIF_testset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
# Preview the first nine images of one training batch, titled with their
# numeric class labels.
images, labels = next(iter(CIF_trainloader))
print(images.size())
plt.figure(figsize=(9, 9))
for i in range(9):
    plt.subplot(3, 3, i+1)
    plt.title(labels[i].item())
    # Undo Normalize(mean=0.5, std=0.5) so pixel values are back in [0, 1];
    # imshow clips RGB float data outside that range, which distorts colours.
    # The result is permuted from channel-first to channel-last for imshow.
    img = (images[i] * 0.5 + 0.5).clamp(0, 1).permute(1, 2, 0)
    plt.imshow(img)
    plt.axis('off')
plt.show()

## Train-ViT

In [None]:
# Single baseline run: train one ViT on the CIFAR loaders for 20 epochs,
# evaluating on the test loader after every epoch.
torch.manual_seed(4)
epochs = 20

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

CIF_vit_model = ViT(img_size=32, num_classes=10, patch_size=4, embed_dim=96,
                    depth=6, num_heads=8, in_c=3).to(device)

optimizer = optim.Adam(CIF_vit_model.parameters(), lr=0.0003)

for epoch in range(epochs):
    # train
    train_loss, train_acc = train_one_epoch(
        model=CIF_vit_model, optimizer=optimizer, data_loader=CIF_trainloader,
        device=device, epoch=epoch)

    # validate
    val_loss, val_acc = evaluate(
        model=CIF_vit_model, data_loader=CIF_testloader, device=device, epoch=epoch)

      

In [None]:
# Repeat the CIFAR ViT experiment with seeds 0-4. Per-epoch histories are kept
# under "train_exp_<seed>" / "val_exp_<seed>" in the loss and accuracy dicts.
CIF_ViT_loss = {}
CIF_ViT_acc = {}

for i in range(5):
    print("train", i)
    torch.manual_seed(i)
    epochs = 20

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    model = ViT(img_size=32, num_classes=10, patch_size=4, embed_dim=96,
                depth=6, num_heads=8, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.0003)

    train_key = f"train_exp_{i}"
    val_key = f"val_exp_{i}"
    for split_key in (train_key, val_key):
        CIF_ViT_loss[split_key] = []
        CIF_ViT_acc[split_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        CIF_ViT_loss[train_key].append(train_loss)
        CIF_ViT_acc[train_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        end_time = time.time()
        CIF_ViT_loss[val_key].append(val_loss)
        CIF_ViT_acc[val_key].append(val_acc)

        # Mirror the epoch metrics and the current learning rate to TensorBoard.
        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        scalars = [train_loss, train_acc, val_loss, val_acc,
                   optimizer.param_groups[0]["lr"]]
        for scalar_tag, scalar_value in zip(tags, scalars):
            tb_writer.add_scalar(scalar_tag, scalar_value, epoch)


In [None]:
import json

# Persist the repeated-run ViT histories as one JSON document.
CIF_vit = {"CIF_ViT_loss": CIF_ViT_loss, "CIF_ViT_acc": CIF_ViT_acc}
CIF_vit_js = json.dumps(CIF_vit)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_vit.json', 'w') as CIF_vit_fileObject:
    CIF_vit_fileObject.write(CIF_vit_js)

## Train-Resnet

In [None]:
# Single baseline run: train MyResNet on the CIFAR loaders for 20 epochs and
# log every epoch's metrics to TensorBoard.
torch.manual_seed(4)

epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

CIF_res_model = MyResNet().to(device)

optimizer = optim.Adam(CIF_res_model.parameters(), lr=0.0003)

for epoch in range(epochs):
    # train
    train_loss, train_acc = train_one_epoch(
        model=CIF_res_model, optimizer=optimizer, data_loader=CIF_trainloader,
        device=device, epoch=epoch)

    # validate
    val_loss, val_acc = evaluate(
        model=CIF_res_model, data_loader=CIF_testloader, device=device, epoch=epoch)

    tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
    scalars = [train_loss, train_acc, val_loss, val_acc,
               optimizer.param_groups[0]["lr"]]
    for scalar_tag, scalar_value in zip(tags, scalars):
        tb_writer.add_scalar(scalar_tag, scalar_value, epoch)

In [None]:
# Repeat the CIFAR ResNet experiment with seeds 0-4. Per-epoch histories are
# kept under "train_exp_<seed>" / "val_exp_<seed>" in the loss and accuracy dicts.
CIF_Res_loss = {}
CIF_Res_acc = {}

for i in range(5):
    print("train", i)
    torch.manual_seed(i)
    epochs = 20

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Create a writer per run, as the other repeated-run cells do. Without it
    # this cell relied on a `tb_writer` left over from an earlier cell and
    # logged all five runs into the same event file under the same tags.
    tb_writer = SummaryWriter()

    model = MyResNet().to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.0003)

    CIF_Res_loss["train_exp_"+str(i)] = []
    CIF_Res_acc["train_exp_"+str(i)] = []
    CIF_Res_loss["val_exp_"+str(i)] = []
    CIF_Res_acc["val_exp_"+str(i)] = []
    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=CIF_trainloader,
                                                device=device,
                                                epoch=epoch)

        CIF_Res_loss["train_exp_"+str(i)].append(train_loss)
        CIF_Res_acc["train_exp_"+str(i)].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=CIF_testloader,
                                     device=device,
                                     epoch=epoch)

        CIF_Res_loss["val_exp_"+str(i)].append(val_loss)
        CIF_Res_acc["val_exp_"+str(i)].append(val_acc)

        # Mirror the epoch metrics and the current learning rate to TensorBoard.
        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)


In [None]:
import json

# Persist the repeated-run ResNet histories as one JSON document.
CIF_res = {"CIF_Res_loss": CIF_Res_loss, "CIF_Res_acc": CIF_Res_acc}
CIF_res_js = json.dumps(CIF_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_res.json', 'w') as CIF_res_fileObject:
    CIF_res_fileObject.write(CIF_res_js)

## Experiments with different parameters

### diff_patch_size

In [None]:
# Patch sizes to sweep, plus one empty history dict per recorded metric.
diff_patch_size = [2, 4, 8]
diff_patch_size_train_loss = dict()
diff_patch_size_train_acc = dict()
diff_patch_size_val_loss = dict()
diff_patch_size_val_acc = dict()

In [None]:
# Patch-size sweep on the CIFAR loaders: a fresh ViT is trained per patch size
# and its per-epoch metrics are stored under "patch_size<size>".
torch.manual_seed(4)
for i in diff_patch_size:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = ViT(img_size=32, num_classes=10, patch_size=i, embed_dim=96,
                depth=6, num_heads=6, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.003)

    # Start an empty history for this patch size in every metric dict.
    run_key = f"patch_size{i}"
    for history in (diff_patch_size_train_loss, diff_patch_size_train_acc,
                    diff_patch_size_val_loss, diff_patch_size_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_patch_size_train_loss[run_key].append(train_loss)
        diff_patch_size_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_patch_size_val_loss[run_key].append(val_loss)
        diff_patch_size_val_acc[run_key].append(val_acc)


In [None]:
# Save the results: bundle the patch-size sweep histories into one JSON document.
diff_patch_size_res = {
    "diff_patch_size_train_loss": diff_patch_size_train_loss,
    "diff_patch_size_train_acc": diff_patch_size_train_acc,
    "diff_patch_size_val_loss": diff_patch_size_val_loss,
    "diff_patch_size_val_acc": diff_patch_size_val_acc,
}
diff_patch_size_res_js = json.dumps(diff_patch_size_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_diff_patch_size.json', 'w') as diff_patch_size_fileObject:
    diff_patch_size_fileObject.write(diff_patch_size_res_js)

### diff_dim

In [None]:
# Embedding widths to sweep, plus one empty history dict per recorded metric.
diff_dim = [48, 96, 192, 384, 768]
diff_dim_train_loss, diff_dim_train_acc = {}, {}
diff_dim_val_loss, diff_dim_val_acc = {}, {}

In [None]:
# Embedding-width sweep on the CIFAR loaders: a fresh ViT is trained per width
# and its per-epoch metrics are stored under "embed_dim<width>".
torch.manual_seed(4)
for i in diff_dim:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = ViT(img_size=32, num_classes=10, patch_size=8, embed_dim=i,
                depth=6, num_heads=8, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.003)

    # Start an empty history for this width in every metric dict.
    run_key = f"embed_dim{i}"
    for history in (diff_dim_train_loss, diff_dim_train_acc,
                    diff_dim_val_loss, diff_dim_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_dim_train_loss[run_key].append(train_loss)
        diff_dim_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_dim_val_loss[run_key].append(val_loss)
        diff_dim_val_acc[run_key].append(val_acc)


In [None]:
# Bundle the embedding-width sweep histories and persist them as one JSON document.
diff_dim_res = {
    "diff_dim_train_loss": diff_dim_train_loss,
    "diff_dim_train_acc": diff_dim_train_acc,
    "diff_dim_val_loss": diff_dim_val_loss,
    "diff_dim_val_acc": diff_dim_val_acc,
}
diff_dim_res_js = json.dumps(diff_dim_res)

# NOTE(review): the "CIR_" prefix differs from the "CIF_" prefix of the sibling
# result files and looks like a typo; kept as-is in case something reads this
# exact path - confirm before renaming.
# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIR_diff_dim.json', 'w') as diff_dim_fileObject:
    diff_dim_fileObject.write(diff_dim_res_js)

### diff_depth

In [None]:
# Encoder depths to sweep, plus one empty history dict per recorded metric.
diff_depth = [3, 6, 9, 12]
(diff_depth_train_loss,
 diff_depth_train_acc,
 diff_depth_val_loss,
 diff_depth_val_acc) = ({} for _ in range(4))

In [None]:
# Depth sweep on the CIFAR loaders: a fresh ViT is trained per depth and its
# per-epoch metrics are stored under "diff_depth<depth>".
torch.manual_seed(4)
for i in diff_depth:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = ViT(img_size=32, num_classes=10, patch_size=8, embed_dim=96,
                depth=i, num_heads=6, in_c=3).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    # Start an empty history for this depth in every metric dict.
    run_key = f"diff_depth{i}"
    for history in (diff_depth_train_loss, diff_depth_train_acc,
                    diff_depth_val_loss, diff_depth_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_depth_train_loss[run_key].append(train_loss)
        diff_depth_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_depth_val_loss[run_key].append(val_loss)
        diff_depth_val_acc[run_key].append(val_acc)


In [None]:
# Bundle the depth sweep histories and persist them as one JSON document.
diff_depth_res = {
    "diff_depth_train_loss": diff_depth_train_loss,
    "diff_depth_train_acc": diff_depth_train_acc,
    "diff_depth_val_loss": diff_depth_val_loss,
    "diff_depth_val_acc": diff_depth_val_acc,
}
diff_depth_res_js = json.dumps(diff_depth_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_diff_depth.json', 'w') as diff_depth_fileObject:
    diff_depth_fileObject.write(diff_depth_res_js)

### diff_num_heads

In [None]:
# Attention head counts to sweep, plus one empty history dict per recorded metric.
diff_num_heads = [6, 8, 12]
diff_num_heads_train_loss, diff_num_heads_train_acc = {}, {}
diff_num_heads_val_loss, diff_num_heads_val_acc = {}, {}

In [None]:
# Head-count sweep on the CIFAR loaders with Adam at lr=0.001: a fresh ViT is
# trained per head count and its per-epoch metrics go under "num_heads<count>".
torch.manual_seed(4)
for i in diff_num_heads:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = ViT(img_size=32, num_classes=10, patch_size=8, embed_dim=96,
                depth=6, num_heads=i, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Start an empty history for this head count in every metric dict.
    run_key = f"num_heads{i}"
    for history in (diff_num_heads_train_loss, diff_num_heads_train_acc,
                    diff_num_heads_val_loss, diff_num_heads_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_num_heads_train_loss[run_key].append(train_loss)
        diff_num_heads_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_num_heads_val_loss[run_key].append(val_loss)
        diff_num_heads_val_acc[run_key].append(val_acc)

In [None]:
# Reset the head-count sweep: same candidate list, fresh empty history dicts.
diff_num_heads = [6, 8, 12]
(diff_num_heads_train_loss,
 diff_num_heads_train_acc,
 diff_num_heads_val_loss,
 diff_num_heads_val_acc) = ({} for _ in range(4))

In [None]:
# Head-count sweep on the CIFAR loaders with Adam at lr=0.003: a fresh ViT is
# trained per head count and its per-epoch metrics go under "num_heads<count>".
torch.manual_seed(4)
for i in diff_num_heads:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = ViT(img_size=32, num_classes=10, patch_size=8, embed_dim=96,
                depth=6, num_heads=i, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.003)

    # Start an empty history for this head count in every metric dict.
    run_key = f"num_heads{i}"
    for history in (diff_num_heads_train_loss, diff_num_heads_train_acc,
                    diff_num_heads_val_loss, diff_num_heads_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_num_heads_train_loss[run_key].append(train_loss)
        diff_num_heads_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_num_heads_val_loss[run_key].append(val_loss)
        diff_num_heads_val_acc[run_key].append(val_acc)

In [None]:
# Bundle the head-count sweep histories and persist them as one JSON document.
diff_num_heads_res = {
    "diff_num_heads_train_loss": diff_num_heads_train_loss,
    "diff_num_heads_train_acc": diff_num_heads_train_acc,
    "diff_num_heads_val_loss": diff_num_heads_val_loss,
    "diff_num_heads_val_acc": diff_num_heads_val_acc,
}
diff_num_heads_res_js = json.dumps(diff_num_heads_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_diff_num_heads.json', 'w') as diff_num_heads_fileObject:
    diff_num_heads_fileObject.write(diff_num_heads_res_js)

### diff_lr

In [None]:
# Learning rates to sweep, plus one empty history dict per recorded metric.
diff_lr = [1e-4, 5e-4, 1e-3, 3e-3, 5e-3, 1e-2]
diff_lr_train_loss = dict()
diff_lr_train_acc = dict()
diff_lr_val_loss = dict()
diff_lr_val_acc = dict()

In [None]:
# Learning-rate sweep on the CIFAR loaders: a fresh ViT is trained per rate and
# its per-epoch metrics are stored under "lr<rate>".
torch.manual_seed(4)
for i in diff_lr:
    epochs = 20
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = ViT(img_size=32, num_classes=10, patch_size=8, embed_dim=96,
                depth=6, num_heads=6, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=i)

    # Start an empty history for this learning rate in every metric dict.
    run_key = f"lr{i}"
    for history in (diff_lr_train_loss, diff_lr_train_acc,
                    diff_lr_val_loss, diff_lr_val_acc):
        history[run_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=CIF_trainloader,
            device=device, epoch=epoch)
        diff_lr_train_loss[run_key].append(train_loss)
        diff_lr_train_acc[run_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=CIF_testloader, device=device, epoch=epoch)
        diff_lr_val_loss[run_key].append(val_loss)
        diff_lr_val_acc[run_key].append(val_acc)

In [None]:
# Bundle the learning-rate sweep histories and persist them as one JSON document.
diff_lr_res = {
    "diff_lr_train_loss": diff_lr_train_loss,
    "diff_lr_train_acc": diff_lr_train_acc,
    "diff_lr_val_loss": diff_lr_val_loss,
    "diff_lr_val_acc": diff_lr_val_acc,
}
diff_lr_res_js = json.dumps(diff_lr_res)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/CIF_diff_lr2.json', 'w') as diff_lr_fileObject:
    diff_lr_fileObject.write(diff_lr_res_js)

# PCAM

## Get Dataset

In [None]:
!cp -r /content/drive/MyDrive/data/PCAM/pcam ./pcam

In [None]:
batch_size = 128
# Both splits get the same preprocessing: tensor conversion followed by
# per-channel normalisation with mean 0.5 and std 0.5. Each split still owns
# its own Compose instance.
data_transform = {
    split: transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])
    for split in ("train", "val")
}

In [None]:
# Build the PCAM train/val datasets rooted at the working directory and wrap
# them in loaders; only the training loader reshuffles.
PCAM_DOWNLOAD_PATH = './'
PCAM_train_set = torchvision.datasets.PCAM(
    PCAM_DOWNLOAD_PATH, split="train",
    transform=data_transform['train'], download=True)
PCAM_val_dataset = torchvision.datasets.PCAM(
    PCAM_DOWNLOAD_PATH, split="val",
    transform=data_transform['val'], download=True)

PCAM_train_loader = torch.utils.data.DataLoader(
    PCAM_train_set, batch_size=batch_size, shuffle=True, num_workers=4)
PCAM_val_loader = torch.utils.data.DataLoader(
    PCAM_val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
from torchvision.utils import save_image
# Preview the first nine images of one training batch, titled by class, and
# save the finished grid to ./test1.jpg.
images, labels = next(iter(PCAM_train_loader))
print(images.size())
plt.figure(figsize=(9, 9))
for i in range(9):
    ax = plt.subplot(3, 3, i+1)
    tit = "Tumor" if labels[i].item() == 1 else "No Tumor"
    plt.title(tit)
    # Undo Normalize(mean=0.5, std=0.5) so pixel values are back in [0, 1];
    # imshow clips RGB float data outside that range, which distorts colours.
    plt.imshow((images[i] * 0.5 + 0.5).clamp(0, 1).permute(1, 2, 0))
    plt.axis('off')

# Save once, after all nine panels are drawn, instead of rewriting the file on
# every loop iteration.
plt.savefig('./test1.jpg')
plt.show()


## Train--ViT

In [None]:
# Single baseline run: train one ViT on the PCAM loaders for 10 epochs and log
# every epoch's metrics to TensorBoard.
torch.manual_seed(4)
epochs = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

PCAM_ViT_model = ViT(img_size=96, num_classes=2, patch_size=8, embed_dim=288,
                     depth=3, num_heads=12, in_c=3).to(device)

optimizer = optim.Adam(PCAM_ViT_model.parameters(), lr=0.0003,
                       weight_decay=5E-5, amsgrad=True)

for epoch in range(epochs):
    # train
    train_loss, train_acc = train_one_epoch(
        model=PCAM_ViT_model, optimizer=optimizer, data_loader=PCAM_train_loader,
        device=device, epoch=epoch)

    # validate
    val_loss, val_acc = evaluate(
        model=PCAM_ViT_model, data_loader=PCAM_val_loader, device=device, epoch=epoch)

    tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
    scalars = [train_loss, train_acc, val_loss, val_acc,
               optimizer.param_groups[0]["lr"]]
    for scalar_tag, scalar_value in zip(tags, scalars):
        tb_writer.add_scalar(scalar_tag, scalar_value, epoch)


In [None]:
# Repeat the PCAM ViT experiment with seeds 0-4. Per-epoch histories are kept
# under "train_exp_<seed>" / "val_exp_<seed>" in the loss and accuracy dicts.
PCAM_ViT_loss = {}
PCAM_ViT_acc = {}

for i in range(5):
    print("train", i)
    torch.manual_seed(i)
    epochs = 10

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    model = ViT(img_size=96, num_classes=2, patch_size=8, embed_dim=288,
                depth=3, num_heads=12, in_c=3).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_key = f"train_exp_{i}"
    val_key = f"val_exp_{i}"
    for split_key in (train_key, val_key):
        PCAM_ViT_loss[split_key] = []
        PCAM_ViT_acc[split_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=PCAM_train_loader,
            device=device, epoch=epoch)
        PCAM_ViT_loss[train_key].append(train_loss)
        PCAM_ViT_acc[train_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=PCAM_val_loader, device=device, epoch=epoch)
        end_time = time.time()
        PCAM_ViT_loss[val_key].append(val_loss)
        PCAM_ViT_acc[val_key].append(val_acc)

        # Mirror the epoch metrics and the current learning rate to TensorBoard.
        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        scalars = [train_loss, train_acc, val_loss, val_acc,
                   optimizer.param_groups[0]["lr"]]
        for scalar_tag, scalar_value in zip(tags, scalars):
            tb_writer.add_scalar(scalar_tag, scalar_value, epoch)


In [None]:
# Save the result: persist the repeated-run ViT histories as one JSON document.
PCAM_vit = {"PCAM_ViT_loss": PCAM_ViT_loss, "PCAM_ViT_acc": PCAM_ViT_acc}
PCAM_vit_js = json.dumps(PCAM_vit)

# The context manager closes the file even if the write raises.
with open('/content/drive/MyDrive/PCAM_vit2.json', 'w') as PCAM_vit_fileObject:
    PCAM_vit_fileObject.write(PCAM_vit_js)

## hyperparameter optimization

In [None]:
def PCAM_objective(trial):
    """Optuna objective: train a ViT on the PCAM loaders with sampled hyperparameters.

    The learning rate, patch size, embedding width, depth and head count are
    drawn from ``trial``. After every epoch the validation accuracy is reported
    so the study's pruner can stop the trial early. The trial is also pruned
    when one epoch or the whole run exceeds its wall-clock budget.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters and to
            report intermediate values.

    Returns:
        The validation accuracy of the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner asks to stop, if an epoch
            takes longer than 600 seconds, or if the accumulated training time
            exceeds 3800 seconds.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 10
    max_epoch_seconds = 600
    max_total_seconds = 3800

    lr = trial.suggest_float("LR", 1e-4, 1e-2, log=True)
    patch_size = trial.suggest_categorical("Patch_Size", [8, 16, 32])
    embed_dim = trial.suggest_categorical("Hidden_Size", [96, 192, 288, 384, 480])
    depth = trial.suggest_categorical("Layers", [3, 6, 9])
    num_heads = trial.suggest_categorical("Heads", [6, 8, 12])

    # Pass the sampled head count through. It used to be hard-coded to 8, so
    # the "Heads" parameter was recorded by the study but never affected the
    # model. Every candidate embed_dim is divisible by every candidate head
    # count, so all combinations are valid.
    model = ViT(img_size=96, num_classes=2, patch_size=patch_size,
                embed_dim=embed_dim, depth=depth, num_heads=num_heads,
                in_c=3).to(device)
    print("patch_size", patch_size, "embed_dim", embed_dim, "depth", depth,
          "num_heads", num_heads, "epochs", epochs, "lr", lr)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=5E-5, amsgrad=True)

    total_time = 0
    for epoch in range(epochs):
        epoch_start_time = time.time()
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=PCAM_train_loader,
                                                device=device,
                                                epoch=epoch)

        val_loss, val_acc = evaluate(model=model,
                                     data_loader=PCAM_val_loader,
                                     device=device,
                                     epoch=epoch)

        epoch_time = time.time() - epoch_start_time
        total_time += epoch_time

        trial.report(val_acc, epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        # Give up on configurations that are too slow to train.
        if epoch_time > max_epoch_seconds or total_time > max_total_seconds:
            raise optuna.exceptions.TrialPruned()
    return val_acc

In [None]:
# Echo Optuna's log records to stdout, then name the study and the SQLite
# database that stores it.
optuna_logger = optuna.logging.get_logger("optuna")
optuna_logger.addHandler(logging.StreamHandler(sys.stdout))
study_name = "PCAM_ViT_study3"
storage_name = f"sqlite:////content/drive/MyDrive/{study_name}.db"


In [None]:
# Create the accuracy-maximising study in the configured storage and run
# 30 trials of the PCAM objective.
PCAM_ViT_study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction='maximize',
)
PCAM_ViT_study.optimize(PCAM_objective, n_trials=30)

In [None]:
# Show how much each searched hyperparameter contributed to the objective.
optuna.visualization.plot_param_importances(study=PCAM_ViT_study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters vs. the objective.
# The study object in this notebook is `PCAM_ViT_study`; the previous bare
# `study` name is not defined anywhere in view.
optuna.visualization.plot_parallel_coordinate(PCAM_ViT_study)

## Train--ResNet

In [None]:
# Single baseline run: train MyResNet on the PCAM loaders for 10 epochs and
# log every epoch's metrics to TensorBoard.
torch.manual_seed(4)

epochs = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tb_writer = SummaryWriter()

PCAM_ResNet_model = MyResNet().to(device)

optimizer = optim.Adam(PCAM_ResNet_model.parameters(), lr=0.0003,
                       weight_decay=5E-5, amsgrad=True)

for epoch in range(epochs):
    # train
    train_loss, train_acc = train_one_epoch(
        model=PCAM_ResNet_model, optimizer=optimizer,
        data_loader=PCAM_train_loader, device=device, epoch=epoch)
    # validate
    val_loss, val_acc = evaluate(
        model=PCAM_ResNet_model, data_loader=PCAM_val_loader,
        device=device, epoch=epoch)

    tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
    scalars = [train_loss, train_acc, val_loss, val_acc,
               optimizer.param_groups[0]["lr"]]
    for scalar_tag, scalar_value in zip(tags, scalars):
        tb_writer.add_scalar(scalar_tag, scalar_value, epoch)

In [None]:
# Repeat the PCAM ResNet experiment with seeds 0-4. Per-epoch histories are
# kept under "train_exp_<seed>" / "val_exp_<seed>" in the loss and accuracy dicts.
PCAM_Res_loss = {}
PCAM_Res_acc = {}

for i in range(5):
    print("train", i)
    torch.manual_seed(i)
    epochs = 10

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    model = MyResNet().to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.003)

    train_key = f"train_exp_{i}"
    val_key = f"val_exp_{i}"
    for split_key in (train_key, val_key):
        PCAM_Res_loss[split_key] = []
        PCAM_Res_acc[split_key] = []

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(
            model=model, optimizer=optimizer, data_loader=PCAM_train_loader,
            device=device, epoch=epoch)
        PCAM_Res_loss[train_key].append(train_loss)
        PCAM_Res_acc[train_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(
            model=model, data_loader=PCAM_val_loader, device=device, epoch=epoch)
        PCAM_Res_loss[val_key].append(val_loss)
        PCAM_Res_acc[val_key].append(val_acc)

        # Mirror the epoch metrics and the current learning rate to TensorBoard.
        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        scalars = [train_loss, train_acc, val_loss, val_acc,
                   optimizer.param_groups[0]["lr"]]
        for scalar_tag, scalar_value in zip(tags, scalars):
            tb_writer.add_scalar(scalar_tag, scalar_value, epoch)


In [None]:
import json

# Bundle the PCAM ResNet histories and persist them to Drive as one JSON document.
PCAM_res={"PCAM_Res_loss":PCAM_Res_loss,"PCAM_Res_acc":PCAM_Res_acc}
PCAM_res_js= json.dumps(PCAM_res)

# The context manager closes the file even when the write raises.
with open('/content/drive/MyDrive/PCAM_res.json', 'w') as PCAM_res_fileObject:
    PCAM_res_fileObject.write(PCAM_res_js)

# NCT-CRC-HE-100K

## Download Dataset

In [None]:
# Copy the dataset archive from Drive into the working directory.
# NOTE: the destination name has no ".zip" extension; the next cell renames it.
!cp -r /content/drive/MyDrive/data/NCT-CRC-HE-100K.zip ./NCT-CRC-HE-100K

In [None]:
# Alternative source (disabled): download the archive from Zenodo instead of copying it from Drive.
# !wget https://zenodo.org/record/1214456/files/NCT-CRC-HE-100K.zip?download=1
# The copy above was saved as "NCT-CRC-HE-100K"; give it back its .zip extension.
!mv NCT-CRC-HE-100K NCT-CRC-HE-100K.zip
# -o overwrites existing files without prompting; -d ./ extracts into the current directory.
# NOTE(review): the absolute path assumes the working directory is /content — confirm.
!unzip -o -d ./ /content/NCT-CRC-HE-100K.zip

## Get DataSet

In [None]:
# Seed the global RNG before the random splits below are drawn.
torch.manual_seed(4)

# Convert to tensors, then shift/scale every channel with mean 0.5 and std 0.5.
data_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5]),
    ]
)

all_dataset = torchvision.datasets.ImageFolder(
    './NCT-CRC-HE-100K',
    transform=data_transform,
)
batch_size=10

# Two-stage split: 100000 -> 90000 + 10000 (validation),
# then 90000 -> 80000 (train) + 10000 (test).
train_test_set, valid_set = torch.utils.data.random_split(
    dataset=all_dataset, lengths=[90000, 10000]
)
train_set, test_set = torch.utils.data.random_split(
    dataset=train_test_set, lengths=[80000, 10000]
)

# Train and validation batches are shuffled; the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, num_workers=4
)
valid_loader = torch.utils.data.DataLoader(
    valid_set, batch_size=batch_size, shuffle=True, num_workers=4
)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=batch_size, shuffle=False, num_workers=4
)

In [None]:
# Preview up to nine images of one training batch in a 3x3 grid.
torch.manual_seed(5)
images, labels = next(iter(train_loader))
print(images.size())
plt.figure(figsize=(9, 9))
for i in range(min(9, images.size(0))):
    plt.subplot(3, 3, i+1)
    # The loader applies Normalize(mean=0.5, std=0.5), so pixel values lie in
    # [-1, 1]. Undo that (x * std + mean) to get back to [0, 1], the range
    # imshow expects for float RGB data; otherwise dark pixels are clipped.
    img = (images[i].permute(1, 2, 0) * 0.5 + 0.5).clamp(0, 1)

    plt.imshow(img)
    plt.axis('off')
plt.show()

## Train-ViT

In [None]:
# Per-run histories for the ViT on NCT-CRC-HE-100K. Keys are
# "train_exp_<run>" / "val_exp_<run>"; each value holds one entry per epoch.
NCT_Vit_loss={}
NCT_Vit_acc={}

for i in range(5):
    print("train",i)
    torch.manual_seed(i)  # each repetition uses its own seed
    epochs=10
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    NCT_ViT_model = ViT(img_size=224,num_classes=9,patch_size=16,embed_dim=192,depth=3,num_heads=12,in_c=3).to(device)

    optimizer = optim.Adam(NCT_ViT_model.parameters(), lr=0.0003,weight_decay=5E-5,amsgrad=True)

    train_key="train_exp_"+str(i)
    val_key="val_exp_"+str(i)
    NCT_Vit_loss[train_key]=[]
    NCT_Vit_acc[train_key]=[]
    NCT_Vit_loss[val_key]=[]
    NCT_Vit_acc[val_key]=[]

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=NCT_ViT_model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)
        NCT_Vit_loss[train_key].append(train_loss)
        NCT_Vit_acc[train_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(model=NCT_ViT_model,
                                     data_loader=valid_loader,
                                     device=device,
                                     epoch=epoch)

        # Record the validation metrics here. Storing the training values
        # again would make the saved "val_exp_*" curves copies of the
        # training curves.
        NCT_Vit_loss[val_key].append(val_loss)
        NCT_Vit_acc[val_key].append(val_acc)

        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

    # A new writer is created for every run; close this one so its pending
    # events are flushed and its event file is released before the next run.
    tb_writer.close()

In [None]:
# Bundle the NCT ViT histories and persist them to Drive as one JSON document.
NCT_Vit={"NCT_Vit_loss":NCT_Vit_loss,"NCT_Vit_acc":NCT_Vit_acc}
NCT_Vit_js= json.dumps(NCT_Vit)

# The context manager closes the file even when the write raises.
with open('/content/drive/MyDrive/NCT_Vit.json', 'w') as NCT_Vit_fileObject:
    NCT_Vit_fileObject.write(NCT_Vit_js)

## Train-ResNet

In [None]:
# Per-run histories for the ResNet baseline on NCT-CRC-HE-100K. Keys are
# "train_exp_<run>" / "val_exp_<run>"; each value holds one entry per epoch.
NCT_Res_loss={}
NCT_Res_acc={}

for i in range(5):
    print("train",i)
    torch.manual_seed(i)  # each repetition uses its own seed
    epochs=10
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tb_writer = SummaryWriter()

    NCT_ResNet_model = MyResNet().to(device)

    optimizer = optim.Adam(NCT_ResNet_model.parameters(), lr=0.003)

    train_key="train_exp_"+str(i)
    val_key="val_exp_"+str(i)
    NCT_Res_loss[train_key]=[]
    NCT_Res_acc[train_key]=[]
    NCT_Res_loss[val_key]=[]
    NCT_Res_acc[val_key]=[]

    for epoch in range(epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=NCT_ResNet_model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)
        NCT_Res_loss[train_key].append(train_loss)
        NCT_Res_acc[train_key].append(train_acc)

        # validate
        val_loss, val_acc = evaluate(model=NCT_ResNet_model,
                                     data_loader=valid_loader,
                                     device=device,
                                     epoch=epoch)
        NCT_Res_loss[val_key].append(val_loss)
        NCT_Res_acc[val_key].append(val_acc)

        tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

    # A new writer is created for every run; close this one so its pending
    # events are flushed and its event file is released before the next run.
    tb_writer.close()

In [None]:
# Bundle the NCT ResNet histories and persist them to Drive as one JSON document.
NCT_res={"NCT_Res_loss":NCT_Res_loss,"NCT_Res_acc":NCT_Res_acc}
NCT_res_js= json.dumps(NCT_res)

# The context manager closes the file even when the write raises.
with open('/content/drive/MyDrive/NCT_res.json', 'w') as NCT_res_fileObject:
    NCT_res_fileObject.write(NCT_res_js)

## hyperparameter optimization

In [None]:
import optuna


def objective(trial):
    """Train one sampled ViT configuration and return its last validation accuracy.

    The trial samples the learning rate (log scale), patch size, hidden size,
    number of layers and number of heads, then trains for up to 10 epochs on
    ``train_loader`` and evaluates on ``valid_loader`` after every epoch.

    Args:
        trial: the Optuna trial used to sample parameters and report
            intermediate values.

    Returns:
        The validation accuracy measured after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop, when a
            single epoch takes longer than 720 s, or when the accumulated
            epoch time exceeds 3800 s.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 10

    # Search space; the suggest calls stay in their original order.
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    patch_size = trial.suggest_categorical("Patch_Size", [16, 28, 32])
    embed_dim = trial.suggest_categorical("Hidden_Size", [96, 192, 288, 384, 480])
    depth = trial.suggest_categorical("Layers", [3, 6, 9])
    num_heads = trial.suggest_categorical("Heads", [6, 8, 12])

    model = ViT(
        img_size=224,
        num_classes=9,
        patch_size=patch_size,
        embed_dim=embed_dim,
        depth=depth,
        num_heads=num_heads,
        in_c=3,
    ).to(device)

    print("patch_size",patch_size,"embed_dim",embed_dim,"depth",depth,"epochs",epochs,"lr",lr)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=5E-5, amsgrad=True)

    total_time = 0
    for epoch in range(epochs):
        started_at = time.time()

        # Only the validation accuracy is used below; the other metrics are discarded.
        train_one_epoch(
            model=model,
            optimizer=optimizer,
            data_loader=train_loader,
            device=device,
            epoch=epoch,
        )
        _, val_acc = evaluate(
            model=model,
            data_loader=valid_loader,
            device=device,
            epoch=epoch,
        )

        epoch_time = time.time() - started_at
        total_time += epoch_time

        trial.report(val_acc, epoch)
        # The pruner's verdict is checked first, then the two wall-clock budgets (seconds).
        if trial.should_prune() or epoch_time > 720 or total_time > 3800:
            raise optuna.exceptions.TrialPruned()

    return val_acc

In [None]:
# Attach a stdout handler to Optuna's logger so its messages appear in the cell output.
optuna_logger = optuna.logging.get_logger("optuna")
optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

study_name = "NCT_ViT_study"
# "sqlite:///" followed by an absolute path gives four slashes: the study
# database is the file /content/drive/MyDrive/<study_name>.db.
storage_template = "sqlite:////content/drive/MyDrive/{}.db"
storage_name = storage_template.format(study_name)

In [None]:
# The study is persisted in a SQLite file on Drive, so a study with this name
# may already exist from an earlier session. load_if_exists=True resumes it
# instead of raising a duplicated-study error when the cell is run again.
study = optuna.create_study(direction='maximize',
                            study_name=study_name,
                            storage=storage_name,
                            load_if_exists=True)
# Runs 30 more trials; on a resumed study they are added to the stored ones.
study.optimize(objective, n_trials=30)

In [None]:
# Parallel-coordinate view of the study's trials; as the cell's last expression the figure is displayed.
optuna.visualization.plot_parallel_coordinate(study) 


In [None]:
# Hyperparameter-importance plot for the same study; displayed as the cell's last expression.
optuna.visualization.plot_param_importances(study)

# Data show

## tensorboard

In [None]:
%load_ext tensorboard

In [None]:
# Open TensorBoard on ./runs — presumably where the SummaryWriter() calls above write by default; confirm.
%tensorboard --logdir=runs

## Mean, Variance and Standard deviation

In [None]:
# Five values to summarise (presumably the accuracies, in percent, of five repeated runs).
arr = [95.26, 96.44, 97.43, 95.72, 96.95]

# NumPy defaults are used throughout, so variance and standard deviation
# are the population versions (ddof=0).
arr_mean, arr_var, arr_std = np.mean(arr), np.var(arr), np.std(arr)

for template, value in (
    ("Mean:%f", arr_mean),
    ("Variance:%f", arr_var),
    ("Standard deviation:%f", arr_std),
):
    print(template % value)

## T-test

In [None]:
from scipy import stats
from scipy.stats import ttest_ind


# Two independent samples to compare.
Sample_A = [79.92, 74.59, 78.39, 77.29, 75.63]
Sample_B = [79.48, 78.45, 82.64, 79.77, 82.36]

# Levene's test checks whether both samples have equal variances.
_, levene_p = stats.levene(Sample_A, Sample_B)
print(f"levene_p = {levene_p}")

# α = 0.05: equal variances are assumed only when Levene's test does not
# reject them; otherwise ttest_ind runs with equal_var=False.
assume_equal_variance = bool(levene_p > 0.05)
t, p = ttest_ind(Sample_A, Sample_B, equal_var=assume_equal_variance)

print(f"t = {t}, p = {p}")
verdict = (
    "The difference is statistically significant"
    if p < 0.05
    else "The difference is not statistically significant"
)
print(verdict)

## Plots of experiments using different hyperparameters on MNIST

In [None]:
# One figure per hyperparameter sweep on MNIST: training loss on the left,
# validation accuracy on the right, one line per swept value.
result_json = ["diff_Hidden_Size", "diff_Layers", "diff_lr", "diff_num_heads", "diff_patch_size"]
for doc in result_json:
    with open('/content/drive/MyDrive/'+doc+'.json', mode='r') as f:
        dicts = json.load(f)

    # Strip the leading "diff_" and title-case the remainder for the headings.
    swept = doc[5:].title()

    fig = plt.figure(figsize=(16,6))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    panels = (
        (ax1, 'Loss', "Different "+swept+" Loss"),
        (ax2, 'Accuracy', "Different "+swept+" Accuracy"),
    )
    for axis, y_label, heading in panels:
        axis.set_xlabel('Epoch(times)', fontsize=14)
        axis.set_ylabel(y_label, fontsize=16)
        axis.set_title(heading, fontsize=14)

    # Only "...train_loss" and "...val_acc" entries are drawn; other keys are ignored.
    for key in dicts:
        name = str(key)
        if name.endswith("train_loss"):
            target = ax1
        elif name.endswith("val_acc"):
            target = ax2
        else:
            continue
        for setting in dicts[name]:
            target.plot(dicts[name][setting], label=setting)

    ax1.set_yticks(np.arange(0.6,2), minor=True)
    ax2.set_yticks(np.arange(0,1), minor=True)

    epoch_ticks = np.arange(0,21,2)
    ax1.set_xticks(epoch_ticks)
    ax2.set_xticks(epoch_ticks)

    ax2.legend(loc='best')
    ax1.legend(loc='best')
    plt.suptitle('Different '+swept+' on MNIST', fontsize=18)
    plt.show()

## Plots of experiments using different hyperparameters on CIFAR-10

In [None]:
# One figure per hyperparameter sweep on CIFAR-10: training loss on the left,
# validation accuracy on the right, one line per swept value.
result_json = ["CIR_diff_Hidden_Size", "CIF_diff_Layers", "CIF_diff_LR", "CIF_diff_num_heads", "CIF_diff_patch_size"]
for doc in result_json:
    with open('/content/drive/MyDrive/'+doc+'.json', mode='r') as f:
        dicts = json.load(f)

    # Strip the 9-character "CIx_diff_" prefix and title-case the remainder.
    swept = doc[9:].title()

    fig = plt.figure(figsize=(16,6))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    panels = (
        (ax1, 'Loss', "Different "+swept+" Loss"),
        (ax2, 'Accuracy', "Different "+swept+" Accuracy"),
    )
    for axis, y_label, heading in panels:
        axis.set_xlabel('Epoch(times)', fontsize=14)
        axis.set_ylabel(y_label, fontsize=16)
        axis.set_title(heading, fontsize=14)

    # Only "...train_loss" and "...val_acc" entries are drawn; other keys are ignored.
    for key in dicts:
        name = str(key)
        if name.endswith("train_loss"):
            target = ax1
        elif name.endswith("val_acc"):
            target = ax2
        else:
            continue
        for setting in dicts[name]:
            target.plot(dicts[name][setting], label=setting)

    ax1.set_yticks(np.arange(0.5,2), minor=True)
    ax2.set_yticks(np.arange(0,1), minor=True)

    epoch_ticks = np.arange(0,21,2)
    ax1.set_xticks(epoch_ticks)
    ax2.set_xticks(epoch_ticks)

    ax2.legend(loc='best')
    ax1.legend(loc='best')
    plt.suptitle('Different '+swept+" on CIFAR-10", fontsize=18)
    plt.show()

## Results of five experiments

In [None]:
from sklearn import preprocessing
from scipy.ndimage import gaussian_filter1d

In [None]:
def loss_acc_img(dicts):
    """Plot training loss and validation accuracy of ViT against ResNet.

    Args:
        dicts: sequence of result dicts. The first element is drawn as "ViT",
            every later element as "ResNet". Each result dict maps a metric
            name to a dict of per-run curves. A metric name ending in "loss"
            is a loss metric and any other name is treated as accuracy. Run
            keys starting with "train" feed the loss panel and run keys
            starting with "val" feed the accuracy panel; all others are
            skipped.

    Loss curves are min-max scaled per run and accuracy curves are smoothed
    with a Gaussian filter (sigma=2) before plotting. The figure is shown,
    nothing is returned.
    """
    # The x-axis length depends on the first five characters of the first
    # metric name; an empty first dict falls back to the shorter axis.
    first_metric_name = next(iter(dicts[0]), "")
    xticks_range = 21 if first_metric_name[:5] == "Mnist" else 11

    fig = plt.figure(figsize=(16, 6))

    ax1 = fig.add_subplot(121)
    ax1.set_xlabel('Epoch(times)', fontsize=14)
    ax1.set_ylabel('Loss', fontsize=16)
    ax1.set_title("Loss", fontsize=14)

    ax2 = fig.add_subplot(122)
    ax2.set_xlabel('Epoch(times)', fontsize=14)
    ax2.set_ylabel('Accuracy', fontsize=16)
    ax2.set_title("Accuracy", fontsize=14)

    for model_index, dataset_dicts in enumerate(dicts):
        # Identify the model by position, not by comparing dict contents:
        # two result dicts that compare equal would both be labelled "ViT",
        # and the comparison would walk every stored curve for each metric.
        if model_index == 0:
            label, color = "ViT", "#EF8636"
        else:
            label, color = "ResNet", "#3B75AF"

        for dataset_dict, dict_ in dataset_dicts.items():
            is_loss = dataset_dict[-4:] == "loss"
            # Label only the first curve drawn for this metric so the legend
            # has one entry per model, whatever the order of the run keys.
            label_show = True
            for exp, curve in dict_.items():
                if is_loss and exp[:5] == "train":
                    ax1.plot(preprocessing.minmax_scale(curve),
                             alpha=0.7, color=color,
                             label=label if label_show else None)
                    label_show = False
                elif not is_loss and exp[:3] == "val":
                    ax2.plot(gaussian_filter1d(curve, sigma=2),
                             color=color,
                             label=label if label_show else None)
                    label_show = False

    ax1.set_yticks(np.arange(0, 1), minor=True)
    ax2.set_yticks(np.arange(0.9, 1), minor=True)
    ax1.grid(ls=':', color='gray', alpha=0.3)
    ax2.grid(ls=':', color='gray', alpha=0.3)
    ax1.set_xticks(np.arange(0, xticks_range, 2))
    ax2.set_xticks(np.arange(0, xticks_range, 2))

    ax2.legend(loc='best')
    ax1.legend(loc='best')
    plt.suptitle('', fontsize=18)
    plt.show()

### Mnist

In [None]:
# Load the saved MNIST histories (ViT first, then ResNet) and plot them.
with open('/content/drive/MyDrive/mnist_vit.json', mode='r') as vit_file:
    mnist_vit_dicts = json.load(vit_file)
with open('/content/drive/MyDrive/mnist_res.json', mode='r') as res_file:
    mnist_res_dicts = json.load(res_file)

# loss_acc_img labels the first entry "ViT" and the second "ResNet".
dicts = [mnist_vit_dicts, mnist_res_dicts]
loss_acc_img(dicts)

### CIFAR-10

In [None]:
# Load the saved CIFAR-10 histories (ViT first, then ResNet) and plot them.
with open('/content/drive/MyDrive/CIF_vit.json', mode='r') as vit_file:
    CIF_vit_dicts = json.load(vit_file)
with open('/content/drive/MyDrive/CIF_res.json', mode='r') as res_file:
    CIF_res_dicts = json.load(res_file)

# loss_acc_img labels the first entry "ViT" and the second "ResNet".
dicts = [CIF_vit_dicts, CIF_res_dicts]
loss_acc_img(dicts)

### 100K

In [None]:
# Load the saved NCT-CRC-HE-100K histories (ViT first, then ResNet) and plot them.
with open('/content/drive/MyDrive/NCT_Vit.json', mode='r') as vit_file:
    NCT_Vit_dicts = json.load(vit_file)
with open('/content/drive/MyDrive/NCT_res.json', mode='r') as res_file:
    NCT_res_dicts = json.load(res_file)

# loss_acc_img labels the first entry "ViT" and the second "ResNet".
dicts = [NCT_Vit_dicts, NCT_res_dicts]
loss_acc_img(dicts)

### PCAM

In [None]:
# Load the saved PCAM histories (ViT first, then ResNet) and plot them.
with open('/content/drive/MyDrive/PCAM_vit.json', mode='r') as vit_file:
    PCAM_vit_dicts = json.load(vit_file)
with open('/content/drive/MyDrive/PCAM_res.json', mode='r') as res_file:
    PCAM_res_dicts = json.load(res_file)

# loss_acc_img labels the first entry "ViT" and the second "ResNet".
dicts = [PCAM_vit_dicts, PCAM_res_dicts]
loss_acc_img(dicts)

# CKA

In [None]:
from torch_cka import CKA

def CKA_image(model_ViT,model_Res,dataset,device,Dataset_name):
    """Run three CKA comparisons and save one plot per comparison to Drive.

    The comparisons are ViT against ViT, ResNet against ResNet, and ViT
    against ResNet, in that order. Each one is computed on ``dataset`` and
    plotted to ``/content/drive/MyDrive/<pair>_compare_<Dataset_name>.png``.

    Args:
        model_ViT: the ViT model to compare.
        model_Res: the ResNet model to compare.
        dataset: passed unchanged to ``CKA.compare``.
        device: device handed to every ``CKA`` instance.
        Dataset_name: suffix used in the names of the saved plots.
    """
    # (first model, its name, second model, its name, path prefix of the plot)
    comparisons = (
        (model_ViT, "ViT", model_ViT, "ViT",
         "/content/drive/MyDrive/ViT-ViT_compare_"),
        (model_Res, "ResNet", model_Res, "ResNet",
         "/content/drive/MyDrive/Res-Res_compare_"),
        (model_ViT, "ViT", model_Res, "ResNet",
         "/content/drive/MyDrive/ViT-Res_compare_"),
    )

    for first, first_name, second, second_name, path_prefix in comparisons:
        cka = CKA(first, second,
                  model1_name=first_name,
                  model2_name=second_name,
                  device=device)
        cka.compare(dataset)
        # The exported results are not used; the call is kept so the sequence
        # of CKA calls stays the same.
        cka.export()
        save_path = path_prefix+Dataset_name+".png"
        cka.plot_results(save_path=save_path)


### CIFAR-10

In [None]:
# Use the first GPU when available, otherwise the CPU, for the CIFAR-10 CKA plots.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
CKA_image(
    model_ViT=CIF_vit_model,
    model_Res=CIF_res_model,
    dataset=CIF_testloader,
    device=device,
    Dataset_name="CIF",
)

### PCAM

In [None]:
# Use the first GPU when available, otherwise the CPU, for the PCAM CKA plots.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
CKA_image(
    model_ViT=PCAM_ViT_model,
    model_Res=PCAM_ResNet_model,
    dataset=PCAM_val_loader,
    device=device,
    Dataset_name="PCAM",
)

### 100K

In [None]:
# Use the first GPU when available, otherwise the CPU, for the NCT-CRC-HE-100K CKA plots.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
CKA_image(
    model_ViT=NCT_ViT_model,
    model_Res=NCT_ResNet_model,
    dataset=valid_loader,
    device=device,
    Dataset_name="100k",
)

# Release GPU memory

In [None]:
# List the processes that currently hold the NVIDIA device files open.
!sudo fuser /dev/nvidia*


/dev/nvidia0:         2726m
/dev/nvidiactl:       2726m
/dev/nvidia-uvm:      2726m


In [None]:
# Force-kill the process reported by fuser above. The PID 2726 comes from that
# particular output and differs between sessions — replace it before running.
!kill -9 2726

# Calculating model parameters

In [None]:
 # Build a ViT (not moved to any device) whose trainable parameters are counted in the next cell.
 model = ViT(
     img_size=224,
     num_classes=9,
     patch_size=16,
     embed_dim=192,
     depth=3,
     num_heads=8,
     in_c=3,
 )

In [None]:
# Add up the element counts of every parameter that requires gradients.
pytorch_total_params = 0
for param in model.parameters():
    if param.requires_grad:
        pytorch_total_params += param.numel()
pytorch_total_params

In [None]:
# Rebind `model` to a default-constructed MyResNet so the next cell counts its parameters.
model = MyResNet()

In [None]:
# Add up the element counts of every parameter that requires gradients.
pytorch_total_params = 0
for param in model.parameters():
    if param.requires_grad:
        pytorch_total_params += param.numel()
pytorch_total_params