## Engine1_Interpolation

### Check GPU Type

In [1]:
!nvidia-smi

Tue Oct  3 23:26:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.24       Driver Version: 528.24       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  Off |
|  0%   42C    P8    22W / 450W |   1506MiB / 24564MiB |     11%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Import Packages

In [2]:
# Experiment tag: the training cell uses it as the prefix of the log file
# ("<tag>_log.txt") and of the best-model checkpoint ("<tag>_best.ckpt").
_exp_name = "10_pos_v4_sample"

In [3]:
# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset
# This is for the progress bar.
from tqdm.auto import tqdm
import random
from random import shuffle

In [4]:
# Load the slice-count table. DADataset.__getitem__ reads its "ID" and
# "Slice_Count" columns to turn a slice number into a position value.
sc = pd.read_csv("slice_count.csv")

In [5]:
myseed = 6666  # set a random seed for reproducibility

# Request deterministic cuDNN kernels and keep the cuDNN auto-tuner off
# (benchmark=True would let cuDNN search for the fastest algorithm per
# hardware/input, which trades away run-to-run repeatability).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed every RNG this notebook imports: Python's `random`, NumPy and PyTorch.
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

### Datasets

In [6]:
class DADataset(Dataset):
    """Dataset of sample folders stored directly under one directory.

    The sorted listing of ``path`` is cut by position (``n`` = number of entries):
    the first ``n // 10`` entries form "train", the entries up to
    ``int(n // 10 * 1.5)`` form "val", and everything after that forms "test".

    The last path component of every entry is split on underscores: the final
    piece is parsed as the integer slice number and the piece before it is used
    as the ID. Each entry is a folder containing ``train.npy`` (returned as
    ``im``) and ``val.npy`` (returned as ``label``).

    Note: ``__getitem__`` reads the module-level DataFrame ``sc`` (columns
    "ID" and "Slice_Count"), so that table must be loaded before indexing.

    Args:
        path: Directory whose entries are the sample folders.
        part: One of "train", "val" or "test".
        files: Optional explicit list of sample folder paths. When given it
            replaces the split computed from ``path`` (``path`` is still listed
            and ``part`` is still validated).

    Raises:
        ValueError: If ``part`` is not "train", "val" or "test".
    """

    def __init__(self, path, part, files=None):
        super().__init__()
        self.path = path
        self.files = sorted(os.path.join(path, x) for x in os.listdir(path))
        # random.Random(8).shuffle(self.files)
        cutTrain = len(self.files) // 10 * 1
        cutTest = int(len(self.files) // 10 * 1.5)
        if part == "train":
            self.files = self.files[:cutTrain]
        elif part == "val":
            self.files = self.files[cutTrain:cutTest]
        elif part == "test":
            self.files = self.files[cutTest:]
        else:
            raise ValueError("Invalid part. Must be 'train', 'val', or 'test'.")
        # Identity check: `!= None` would be evaluated element-wise for array-like inputs.
        if files is not None:
            self.files = files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(pid, slice_num, pos, im, label)`` for the sample at ``idx``.

        ``pos`` is ``int(slice_num / Slice_Count * 512)``, where ``Slice_Count``
        is looked up in ``sc`` for the row whose "ID" equals ``pid``. ``im`` and
        ``label`` are float tensors loaded from ``train.npy`` and ``val.npy``.
        """
        fname = self.files[idx]
        # Take the last path component whichever separator the path uses, so the
        # parsing works on Windows ("\\") and on POSIX ("/") alike.
        sample_name = fname.replace("\\", "/").rstrip("/").rsplit("/", 1)[-1]
        # Split the folder name using the underscore as a delimiter.
        name_parts = sample_name.split("_")
        pid = name_parts[-2]
        slice_num = int(name_parts[-1])
        # `.item()` requires exactly one matching row in the slice-count table.
        slice_count = int(sc.loc[sc["ID"] == pid, 'Slice_Count'].item())
        pos = int((slice_num / slice_count) * 512)
        im = torch.from_numpy(np.load(os.path.join(fname, "train.npy"))).float()
        label = torch.from_numpy(np.load(os.path.join(fname, "val.npy"))).float()

        return pid, slice_num, pos, im, label

### Model

In [7]:
import torch
import torch.nn as nn

class UNetWithPositionalEncoding(nn.Module):
    """U-Net style encoder/decoder that takes a per-sample position value.

    The network takes a 4-channel input and produces a 1-channel output of the
    same spatial size. The encoder applies three 2x max-pools and the decoder
    three 2x bilinear upsamplings with skip concatenations, so the input height
    and width must be divisible by 8 for the concatenations to line up.

    The position is injected by writing it into pixel (0, 0) of every channel
    of each sample before the first convolution.
    """

    def __init__(self):
        super().__init__()
        # Not referenced anywhere else in this class; kept as a public attribute.
        self.num_categories = 512

        # Encoder: channel widths 4 -> 64 -> 128 -> 256 -> 512.
        self.down_conv1 = self.double_conv(4, 64)
        self.down_conv2 = self.double_conv(64, 128)
        self.down_conv3 = self.double_conv(128, 256)
        self.down_conv4 = self.double_conv(256, 512)

        # Decoder: each stage consumes the upsampled features concatenated with
        # the encoder features of matching resolution.
        self.up_conv1 = self.double_conv(512 + 256, 256)
        self.up_conv2 = self.double_conv(256 + 128, 128)
        self.up_conv3 = self.double_conv(128 + 64, 64)
        self.up_conv4 = nn.Conv2d(64, 1, kernel_size=1)

        self.maxpool = nn.MaxPool2d(2)
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    def double_conv(self, in_channels, out_channels):
        """Return two 3x3 same-padding convolutions, each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True)
        )

    def forward(self, x, pos):
        """Run the network.

        Args:
            x: Input batch indexed as (sample, channel, row, column).
            pos: Iterable with one position value per sample.

        Returns:
            The single-channel output of the final 1x1 convolution.
        """
        # Work on a copy so the caller's tensor is not modified in place (an
        # in-place write would also fail on a leaf tensor that requires grad).
        x = x.clone()

        # Iterate through the 'pos' values and stamp each one into pixel (0, 0)
        # of every channel of the corresponding sample.
        for i, position in enumerate(pos):
            x[i, :, 0, 0] = position

        # Downward path
        x1 = self.down_conv1(x)
        x2 = self.maxpool(x1)
        x3 = self.down_conv2(x2)
        x4 = self.maxpool(x3)
        x5 = self.down_conv3(x4)
        x6 = self.maxpool(x5)
        x7 = self.down_conv4(x6)

        # Upward path
        x = self.upsample(x7)
        x = torch.cat([x, x5], dim=1)
        x = self.up_conv1(x)
        x = self.upsample(x)
        x = torch.cat([x, x3], dim=1)
        x = self.up_conv2(x)
        x = self.upsample(x)
        x = torch.cat([x, x1], dim=1)
        x = self.up_conv3(x)
        x = self.up_conv4(x)

        return x

### Configurations

In [8]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network, then move its parameters to the selected device.
model = UNetWithPositionalEncoding()
model = model.to(device)

# Number of samples per mini-batch.
batch_size = 10

# Number of passes over the training set.
n_epochs = 10

# Early-stopping budget: how many epochs without a better validation loss are tolerated.
# NOTE(review): patience is not smaller than n_epochs, so early stopping is
# unlikely to ever trigger within a run — confirm this is intended.
patience = 10

# The model output is compared with the target using mean squared error.
criterion = nn.MSELoss()

# Adam optimizer; learning rate and weight decay are the knobs to tune here.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

### Dataloader

In [9]:
# Construct the train and validation datasets from the same folder; DADataset
# picks its share of the sorted folder listing according to the `part` argument.
data_dir = "../Luna16_data/split4_data"
train_set = DADataset(data_dir, "train")
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
valid_set = DADataset(data_dir, "val")
# Validation is iterated in a fixed order: the reported loss is a mean of
# per-batch means, so shuffling would make it depend on which samples land in
# the final (possibly smaller) batch.
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=False)

### Start Training

In [10]:
# Initialize trackers, these are not parameters and should not be changed
stale = 0
best_loss = float('inf')
log_path = f"./{_exp_name}_log.txt"


def _run_epoch(loader, train):
    """Run one pass over `loader` and return the mean of the per-batch losses.

    Uses the notebook-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: DataLoader yielding (pid, slice_num, pos, imgs, labels) batches.
        train: When True the model is put in train mode and the parameters are
            updated after every batch; when False the model is put in eval mode
            and no gradients are computed.
    """
    # model.train(False) is equivalent to model.eval().
    model.train(train)
    batch_losses = []

    for _pid, _slice_num, pos, imgs, labels in tqdm(loader):
        # Make sure data and model are on the same device.
        images = imgs.to(device)
        position = pos.to(device)
        targets = labels.to(device)

        # Gradients are only tracked during training.
        with torch.set_grad_enabled(train):
            logits = model(images, position)
            # Flatten everything after the batch dimension so prediction and
            # target line up without hard-coding the image resolution.
            loss = criterion(logits.reshape(logits.shape[0], -1),
                             targets.reshape(targets.shape[0], -1))

        if train:
            # Gradients stored in the parameters in the previous step should be cleared out first.
            optimizer.zero_grad()
            # Compute the gradients for parameters.
            loss.backward()
            # Clip the gradient norms for stable training.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
            # Update the parameters with computed gradients.
            optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)


for epoch in range(n_epochs):
    # ---------- Training ----------
    train_loss = _run_epoch(train_loader, train=True)
    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}")

    # ---------- Validation ----------
    valid_loss = _run_epoch(valid_loader, train=False)
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}")

    improved = valid_loss < best_loss

    # Update logs: one line per epoch, tagged when it is the best so far.
    with open(log_path, "a") as f:
        f.write(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}{' -> best' if improved else ''}\n")

    # Save models
    if improved:
        # Report the epoch 1-based, matching the Train/Valid lines above.
        print(f"Best model found at epoch {epoch + 1}, saving model")
        torch.save(model.state_dict(), f"{_exp_name}_best.ckpt")  # Only the best model is kept on disk.
        best_loss = valid_loss
        stale = 0
    else:
        stale += 1
        # Stop once `patience` consecutive epochs brought no improvement, which
        # is what the message below reports.
        if stale >= patience:
            print(f"No improvement for {patience} consecutive epochs. Early stopping.")
            break

  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 001/020 ] loss = 8723.91656


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 001/020 ] loss = 3488.82673
Best model found at epoch 0, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 002/020 ] loss = 3534.45163


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 002/020 ] loss = 3302.40693
Best model found at epoch 1, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 003/020 ] loss = 3012.81014


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 003/020 ] loss = 3172.01898
Best model found at epoch 2, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 004/020 ] loss = 2990.86507


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 004/020 ] loss = 3153.72088
Best model found at epoch 3, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 005/020 ] loss = 2909.54787


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 005/020 ] loss = 3157.14890


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 006/020 ] loss = 3086.67437


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 006/020 ] loss = 3221.50208


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 007/020 ] loss = 2974.38075


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 007/020 ] loss = 3113.71520
Best model found at epoch 6, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 008/020 ] loss = 3066.83996


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 008/020 ] loss = 3173.63361


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 009/020 ] loss = 2949.42894


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 009/020 ] loss = 3237.60275


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 010/020 ] loss = 2896.90107


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 010/020 ] loss = 3115.19744


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 011/020 ] loss = 2952.39610


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 011/020 ] loss = 3541.94576


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 012/020 ] loss = 2855.60235


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 012/020 ] loss = 3031.68368
Best model found at epoch 11, saving model


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 013/020 ] loss = 2897.14650


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 013/020 ] loss = 3087.99521


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 014/020 ] loss = 2913.02082


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 014/020 ] loss = 3069.61925


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 015/020 ] loss = 2873.92152


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 015/020 ] loss = 3086.43918


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 016/020 ] loss = 2886.23252


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 016/020 ] loss = 3112.15636


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 017/020 ] loss = 2873.82381


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 017/020 ] loss = 3098.38832


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 018/020 ] loss = 2860.99979


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 018/020 ] loss = 3095.83633


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 019/020 ] loss = 2881.37043


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 019/020 ] loss = 3104.15950


  0%|          | 0/2237 [00:00<?, ?it/s]

[ Train | 020/020 ] loss = 2840.26899


  0%|          | 0/1119 [00:00<?, ?it/s]

[ Valid | 020/020 ] loss = 3010.87404
Best model found at epoch 19, saving model
