In [9]:
#import necessary modules:
import os
import numpy as np
import pandas as pd
from glob import glob
from PIL import Image
from natsort import natsorted
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.models as models
import torchvision.datasets as datasets
import timm
import cv2
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import random
from torch.hub import load_state_dict_from_url
import time
from sklearn.model_selection import StratifiedKFold

In [2]:
# Load the tile table that backs every dataset/dataloader below.
# label: 1 = white, 0 = non-white. The classes are unbalanced (79271 zeros vs 195376 ones),
# so the cross-validation split further down is stratified on the label (StratifiedKFold).
train_df_src = r'\\fatherserverdw\Kevin\unstained_blank_classifier\train_df.xlsx'
train_df = (
    pd.read_excel(train_df_src)
    .drop(columns="Unnamed: 0")  # presumably the index column saved with the sheet -- not a feature
)
train_df

Unnamed: 0,imagepath,label
0,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0
1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0
2,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0
3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0
4,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0
...,...,...
274642,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1
274643,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1
274644,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1
274645,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1


### First find mean and std of dataset for image normalization:
### code for finding dataset std and mean from: https://kozodoi.me/blog/20210308/compute-image-stats

In [3]:
class Unstain2StainData(Dataset):
    """Image-only dataset used to estimate the per-channel mean/std of the tiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an "imagepath" column holding one image path per row.
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=...)`` whose
        result exposes the transformed image under the ``'image'`` key.

    Notes
    -----
    ``__len__`` deliberately exposes only the first third of the rows so that the
    statistics pass stays affordable; any pixel count derived from this dataset
    must therefore use ``len(dataset)``, not ``len(df)``.
    """
    def __init__(self,df,transform=None):
        self.df = df
        self.directory = df["imagepath"].tolist()
        self.transform = transform

    def __len__(self):
        # Floor division: identical to int(len / 3) for a non-negative length.
        return len(self.directory) // 3

    def __getitem__(self,idx):
        """Return the (optionally transformed) RGB image stored at row ``idx``.

        Raises
        ------
        FileNotFoundError
            If OpenCV cannot read the file (``cv2.imread`` returns ``None``).
        """
        path = self.directory[idx]
        # cv2.COLOR_BGR2RGB is a cvtColor conversion code, not an imread flag:
        # read as 3-channel BGR first, then convert explicitly.
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        if image is None:
            raise FileNotFoundError("could not read image: {}".format(path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image = image)['image']
        return image

In [4]:
# Settings for the mean/std estimation pass.
batch_size  = 4                    # images per batch
image_size  = 384                  # images are resized to image_size x image_size
num_workers = 0                    # passed to the DataLoader (0 = load in the main process)
device      = torch.device('cpu')

In [5]:
# Preprocessing for the statistics pass: resize to a square, normalize with a
# neutral mean of 0 and std of 1 (the real statistics are what this pass is
# meant to measure), then convert to a tensor.
augmentations = A.Compose(
    [
        A.Resize(height=image_size, width=image_size),
        A.Normalize(mean=(0, 0, 0), std=(1, 1, 1)),
        ToTensorV2(),
    ]
)

In [6]:
# Wrap the tile table in the image-only dataset and batch it for the statistics pass.
unstain2stain_dataset = Unstain2StainData(df=train_df, transform=augmentations)
image_loader = DataLoader(
    unstain2stain_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)

# Pull one batch as a smoke test of the pipeline and report its shape.
images = next(iter(image_loader))
print("Images have a tensor size of {}.".format(images.shape))

Images have a tensor size of torch.Size([4, 3, 384, 384]).


In [7]:
# from pathos.multiprocessing import ProcessingPool as Pool
#
# # placeholders
# psum    = torch.tensor([0.0, 0.0, 0.0])
# psum_sq = torch.tensor([0.0, 0.0, 0.0])
#
# def compute_sum_sq(inputs):
#     return inputs.sum(axis=[0,2,3]), (inputs**2).sum(axis=[0,2,3])
#
# # create a pool of workers
# pool = Pool()
#
# # loop through images and compute sums in parallel
# results = pool.imap(compute_sum_sq, image_loader)
#
# # accumulate results
# for psum_i, psum_sq_i in tqdm(results, total=len(image_loader), colour='red'):
#     psum += psum_i
#     psum_sq += psum_sq_i
#
# # close the pool
# pool.close()
#
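# # pixel count: only the images the dataset actually yields contribute to psum
# # (Unstain2StainData.__len__ exposes a third of train_df, so len(train_df) would overcount ~3x)
# count = len(unstain2stain_dataset) * image_size * image_size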
#
# # mean and std
# total_mean = psum / count
# total_var  = (psum_sq / count) - (total_mean ** 2)
# total_std  = torch.sqrt(total_var)
#
# # output
# print('mean: ' + str(total_mean))
# print('std:  ' + str(total_std))


In [8]:
# # code without multiprocessing
# # compute mean/std for 1/3 of the images for time's sake:
# # placeholders
# psum    = torch.tensor([0.0, 0.0, 0.0])
# psum_sq = torch.tensor([0.0, 0.0, 0.0])
#
# # loop through images
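# for inputs in tqdm(image_loader,colour='red'):
#     psum    += inputs.sum(axis = [0, 2, 3]) # sum over batch, height and width; keeps the channel axis (axis 1)
#     psum_sq += (inputs ** 2).sum(axis = [0, 2, 3]) # same reduction for the squared pixels
#
# # pixel count: only the images the dataset actually yields contribute to psum
# # (Unstain2StainData.__len__ exposes a third of train_df, so len(train_df) would overcount ~3x)
# count = len(unstain2stain_dataset) * image_size * image_size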
#
# # mean and std
# total_mean = psum / count
# total_var  = (psum_sq / count) - (total_mean ** 2)
# total_std  = torch.sqrt(total_var)
#
# # output
# print('mean: ' + str(total_mean))
# print('std:  ' + str(total_std))

### We can now use the mean and std values calculated above in the augmentations for the dataset. We also use StratifiedKFold to perform 5-fold CV, with each fold containing the same proportion of blank and non-blank images:


In [9]:
# add stratifiedkfold to df:
new_df_train = train_df.copy(deep=True)
strat_kfold = StratifiedKFold(shuffle = True, random_state = 42) #use default n_split = 5, random_state for reproducibility

# split on white and non-white and record, for every row, the fold in which it is held out.
# StratifiedKFold.split yields *positional* indices, so the fold ids are collected in a
# position-aligned integer array instead of being written through label-based .loc
# (which is only correct for a default RangeIndex and creates a float column full of NaN first).
fold_ids = np.full(len(new_df_train), -1, dtype=np.int64)
for each_fold, (_, val_idx) in enumerate(strat_kfold.split(X = new_df_train, y = new_df_train['label'])):
    fold_ids[val_idx] = each_fold

# every row must land in exactly one validation fold
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
new_df_train["fold"] = fold_ids # integer column, fold from 0~4

In [10]:
# Show the frame (rich display of the cell's last expression) to inspect the added "fold" column.
new_df_train

Unnamed: 0,imagepath,label,fold
0,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0,1
1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0,4
2,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0,1
3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0,0
4,\\shelter\Kyu\unstain2stain\tiles\registered_t...,0,1
...,...,...,...
274642,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1,3
274643,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1,2
274644,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1,4
274645,\\shelter\Kyu\unstain2stain\tiles\registered_t...,1,1


In [11]:
#check if stratification worked by grouping:
grouped = new_df_train.groupby(['fold','label']) # look how it's splitted
# Count once and reuse: the original recomputed this groupby count for every lookup.
fold_label_counts = grouped.fold.count()
display(fold_label_counts)

# Ratio of label-0 to label-1 rows per fold; iterate over the folds actually present
# instead of assuming there are exactly five.
ratio_list = []
for k in sorted(new_df_train['fold'].unique()):
    ratio = fold_label_counts.loc[(k, 0)] / fold_label_counts.loc[(k, 1)]
    ratio_list.append(ratio)
print("the ratios of the folds are: {}".format(ratio_list)) #ratios to check stratification

fold  label
0     0        15855
      1        39075
1     0        15854
      1        39076
2     0        15854
      1        39075
3     0        15854
      1        39075
4     0        15854
      1        39075
Name: fold, dtype: int64

the ratios of the folds are: [0.40575815738963533, 0.4057221824137578, 0.4057325655790147, 0.4057325655790147, 0.4057325655790147]


### As we can see above, stratification was successful. Now define transforms:

In [12]:
#define transforms/image augmentation for the dataset

# Normalization shared by the training and validation pipelines.
# NOTE(review): these statistics come from the commented-out statistics cell, which sums pixels
# over the Unstain2StainData loader (a third of the rows) but divides by len(train_df) pixels,
# so the values look roughly 3x too small for the mean -- recompute before relying on them.
dataset_normalize = transforms.Normalize(
    mean=[0.2966, 0.3003, 0.3049],
    std=[0.4215, 0.4267, 0.4332],
)

# Training: random 384 x 384 crop (efficientnetv2_s input size) plus random flips.
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(384),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.5),
        transforms.ToTensor(),
        dataset_normalize,
    ]
)

# Validation: no resize, i.e. images are validated at their native 1024 x 1024 size, to mirror
# the real-world application. Consider adding transforms.Resize(384) if performance is bad.
val_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        dataset_normalize,
    ]
)

In [13]:
# build train dataset
class TrainDataSet(Dataset):
    """Dataset of tile images and, optionally, their labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an "imagepath" column and, when ``label`` is true, a "label" column.
    label : bool, default True
        Whether samples carry a label. When true ``__getitem__`` returns
        ``(image, label)``; when false it returns the image alone and the
        "label" column is not required.
    transforms : callable, optional
        Callable applied to the PIL image (e.g. a torchvision ``Compose``).
        When ``None`` the PIL image is returned as-is.
    """
    # initialize df, label, imagepath and transforms
    def __init__(self, df, label=True, transforms = None):
        self.df = df
        # Keep the flag and the label values under separate names: storing the list
        # in ``self.label`` used to overwrite the flag and made ``label=False`` a no-op.
        self.has_label = bool(label)
        self.labels = df["label"].tolist() if self.has_label else None
        self.imagepaths = df["imagepath"].tolist()
        self.transforms = transforms

    # define length, which is simply length of all imagepaths
    def __len__(self):
        return len(self.imagepaths)

    # define main function to read image and label, apply transform function and return the transformed images.
    def __getitem__(self,idx):
        """Return the sample at ``idx`` as ``(image, label)``, or just ``image`` when unlabeled.

        Raises
        ------
        FileNotFoundError
            If OpenCV cannot read the file (``cv2.imread`` returns ``None``).
        """
        image_path = self.imagepaths[idx]
        # cv2.COLOR_BGR2RGB is a cvtColor conversion code, not an imread flag:
        # read as 3-channel BGR first, then convert explicitly so PIL receives RGB.
        img = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if img is None:
            raise FileNotFoundError("could not read image: {}".format(image_path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(img)

        # ``image`` is always bound, so a dataset without transforms no longer
        # fails with UnboundLocalError.
        if self.transforms is not None:
            image = self.transforms(image)

        if self.has_label:
            return image, self.labels[idx]
        return image

In [24]:
# all model configs go here so that they can be changed when we want to:
class model_config:
    """Single place for the tunable model/training settings used by the notebook."""
    # --- reproducibility ---
    seed = 42

    # --- architecture (name handed to timm.create_model in build_model) ---
    model_name = "efficientnetv2_l"

    # --- data loading ---
    train_batch_size = 4
    valid_batch_size = 8

    # --- optimisation ---
    epochs = 5
    learning_rate = 0.001
    scheduler = "CosineAnnealingLR"
    n_accumulate = 1

    # --- hardware: first CUDA device when available, CPU otherwise ---
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [25]:
# Sets the seed of the entire notebook so results are the same every time we run for reproducibility.
def set_seed(seed = 42):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy, Python's ``random`` and PyTorch (CPU and CUDA), switches cuDNN
    to deterministic mode with benchmarking disabled, and exports
    ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int, default 42
        Value used for every generator.
    """
    # Fixed value for the hash seed.
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this presumably
    # only affects processes launched afterwards -- confirm if hash ordering matters here.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

# Apply the seed from the shared config so every later cell starts from the same RNG state.
set_seed(model_config.seed)

In [15]:
# define dataloading function:
def load_dataset(fold):
    """Build the training and validation dataloaders for one cross-validation fold.

    Rows of ``new_df_train`` whose ``fold`` column equals ``fold`` form the
    validation split; all remaining rows form the training split.

    Parameters
    ----------
    fold : int
        Fold number to hold out for validation.

    Returns
    -------
    (train_dataloader, val_dataloader)
        The training loader applies ``train_transform`` and is shuffled; the
        validation loader applies ``val_transform`` and keeps a fixed order.
    """
    model_df_train = new_df_train.query("fold!=@fold").reset_index(drop=True)
    model_df_val = new_df_train.query("fold==@fold").reset_index(drop=True)
    train_dataset = TrainDataSet(df = model_df_train, transforms = train_transform)
    val_dataset = TrainDataSet(df = model_df_val, transforms = val_transform)
    train_dataloader = DataLoader(dataset = train_dataset,
        batch_size = model_config.train_batch_size, # pin_memory= true allows faster data transport from cpu to gpu
        num_workers = 0, pin_memory = True, shuffle = True)
    # Validation is not shuffled: the order does not influence the metrics, and a fixed
    # order keeps evaluation (and any per-batch inspection) reproducible between runs.
    val_dataloader = DataLoader(dataset = val_dataset,
        batch_size = model_config.valid_batch_size,
        num_workers = 0, pin_memory = True, shuffle = False)
    return train_dataloader, val_dataloader

In [16]:
# Build the fold-0 loaders and smoke-test the training pipeline on a single batch.
train_dataloader, val_dataloader = load_dataset(fold = 0)
image, labels = next(iter(train_dataloader))
print("Images have a tensor size of {}, and Labels have a tensor size of {}".format(image.shape, labels.shape))

Images have a tensor size of torch.Size([4, 3, 384, 384]), and Labels have a tensor size of torch.Size([4])


In [17]:
# Same smoke test for the validation pipeline (no resize is applied to these images).
images, labels = next(iter(val_dataloader))
print("Images have a tensor size of {}, and Labels have a tensor size of {}".format(images.shape, labels.shape))

Images have a tensor size of torch.Size([8, 3, 1024, 1024]), and Labels have a tensor size of torch.Size([8])


### Now move on to the model:

In [22]:
def build_model():
    """Create the binary classifier and move it to ``model_config.device``.

    The timm backbone named by ``model_config.model_name`` is created without
    pretrained weights and its ``classifier`` layer is replaced by a single-output
    linear layer.

    Returns
    -------
    torch.nn.Module
        Model whose forward pass returns one raw logit per image. Pair it with
        ``nn.BCEWithLogitsLoss`` for training and apply ``torch.sigmoid`` to the
        output when a probability between 0 and 1 is needed.
    """
    model = timm.create_model(model_config.model_name,pretrained=False)
    num_features = model.classifier.in_features
    # one output unit: a single logit for the 0 / 1 decision
    model.classifier = nn.Linear(num_features,1)
    # The earlier ``model.add_module('sigmoid', nn.Sigmoid())`` was dropped: a module that is
    # merely registered is never called by the backbone's forward(), so the outputs were
    # logits all along and the extra module only suggested otherwise.
    return model.to(model_config.device) # model to gpu (nn.Module.to returns the module itself)

In [None]:
# #define loss function, optimizer and device


# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.to(device)

### training loop:

In [None]:
def save_model(epoch, model, optimizer, criterion, path='outputs/model.pth'):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'loss'``. Note that ``'loss'`` stores the
    ``criterion`` object that was passed in, not a numeric loss value.

    Parameters
    ----------
    epoch : int
        Epoch number recorded in the checkpoint.
    model : torch.nn.Module
        Model whose ``state_dict()`` is saved.
    optimizer : torch.optim.Optimizer
        Optimizer whose ``state_dict()`` is saved.
    criterion
        Loss object stored under ``'loss'``.
    path : str, default 'outputs/model.pth'
        Destination file. Missing parent directories are created.
    """
    # torch.save does not create directories, so a fresh checkout without
    # an outputs/ folder used to fail here.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                }, path)

In [None]:
def _save_curve_figure(train_values, val_values, train_style, val_style, ylabel, path):
    """Plot one train/validation pair of per-epoch curves and save the figure to ``path``."""
    # savefig does not create directories; make sure the target folder exists.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.figure(figsize=(10, 7))
    plt.plot(train_values, linestyle='-', **train_style)
    plt.plot(val_values, linestyle='-', **val_style)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(path)


def save_plots(train_accuracy_list, val_accuracy_list,train_loss_list,val_loss_list):
    """Save the accuracy and loss curves of a training run as PNG files.

    Writes ``outputs/accuracy.png`` and ``outputs/loss_vs_epochs.png``, creating
    the ``outputs`` directory when it is missing.

    Parameters
    ----------
    train_accuracy_list, val_accuracy_list : sequence of float
        Per-epoch training / validation accuracy.
    train_loss_list, val_loss_list : sequence of float
        Per-epoch training / validation loss.
    """
    # accuracy plots
    _save_curve_figure(
        train_accuracy_list, val_accuracy_list,
        {'color': 'green', 'label': 'train accuracy'},
        {'color': 'blue', 'label': 'validation accuracy'},
        'Accuracy', 'outputs/accuracy.png',
    )

    # loss plots (legend label typo 'validataion' fixed)
    _save_curve_figure(
        train_loss_list, val_loss_list,
        {'color': 'orange', 'label': 'train loss'},
        {'color': 'red', 'label': 'validation loss'},
        'Loss', 'outputs/loss_vs_epochs.png',
    )

In [None]:
#training loop
# NOTE(review): model_config.epochs is 5 while this cell trains for 10 epochs -- confirm which is intended.
num_epochs = 10

# Everything the loop needs is created here so the cell runs on a fresh kernel
# (model / criterion / optimizer were previously only defined in a commented-out cell).
train_device = model_config.device  # the model lives on this device, so the batches must too
model = build_model()
criterion = nn.BCEWithLogitsLoss()  # expects raw logits and float targets of the same shape
optimizer = optim.Adam(model.parameters(), lr=model_config.learning_rate)


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    Both returned values are averaged per sample.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(is_training):
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            # The model emits one logit per image with shape (batch, 1): the integer
            # labels of shape (batch,) must become float targets of that shape.
            targets = labels.to(device).float().unsqueeze(1)

            if is_training:
                optimizer.zero_grad()
            logits = model(inputs)
            loss = criterion(logits, targets)
            if is_training:
                loss.backward()
                optimizer.step()

            n_batch = targets.size(0)
            # criterion returns the batch mean: weight it by the batch size so that
            # dividing by the number of samples gives a true per-sample average.
            loss_sum += loss.item() * n_batch
            # logit > 0 is equivalent to sigmoid(logit) > 0.5. (Taking the argmax over a
            # single output column always predicted class 0.)
            predicted = (logits.detach() > 0).float()
            n_correct += (predicted == targets).sum().item()
            n_seen += n_batch

    n_seen = max(n_seen, 1)  # guard against an empty dataloader
    return loss_sum / n_seen, n_correct / n_seen


train_loss_list, val_loss_list = [], []
train_accuracy_list, val_accuracy_list = [], []

for epoch in range(num_epochs):
    train_loss, train_accuracy = _run_epoch(model, train_dataloader, criterion, train_device, optimizer)
    train_loss_list.append(train_loss)
    train_accuracy_list.append(train_accuracy)

    val_loss, val_accuracy = _run_epoch(model, val_dataloader, criterion, train_device)
    val_loss_list.append(val_loss)
    val_accuracy_list.append(val_accuracy)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

save_model(epoch, model, optimizer, criterion)
# plot the full per-epoch histories, not just the scalars of the last epoch
save_plots(train_accuracy_list, val_accuracy_list, train_loss_list, val_loss_list)