# Introduction 

**This is a basic CNN Model training notebook**

It is based on: 
- Thumbnail images
- Basic data transformation (using Albumentation):
    - resizing images to 512x512
    - normalizing pixel values
- CNN Architecture


**Todos:**

- Learn about Dataset & DataLoader
- add augmentations (albumentation)
- gem pooling

In [1]:


import sys
sys.path.append('/kaggle/input/dummypy')



In [2]:
from main import print_hello_world

In [3]:
print_hello_world()

hello world!


In [1]:
!pip install --quiet torch_optimizer
import torch_optimizer as torch_optimizer


In [1]:
!pip install --quiet mlflow dagshub
import mlflow.pytorch 
from mlflow import MlflowClient


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.3.1 requires dacite>=1.8, but you have dacite 1.6.0 which is incompatible.
ydata-profiling 4.3.1 requires scipy<1.11,>=1.4.1, but you have scipy 1.11.2 which is incompatible.[0m[31m
[0m

In [2]:


import os
import gc
import cv2
import datetime
import math
import copy
import time
import random
import glob
from matplotlib import pyplot as plt
from skimage import io


# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.cuda import amp
import torchvision

import optuna
from optuna.trial import TrialState

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


from PIL import Image
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score
from torch.utils.data.sampler import WeightedRandomSampler

# For Image Models
import timm

import dagshub
from getpass import getpass

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
# warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
from cancer_utils_tiles import get_class_weights,UBCModel, get_optimizer, fetch_scheduler, EarlyStopping, print_logged_info, get_or_create_experiment_id


In [4]:
os.environ["MLFLOW_TRACKING_USERNAME"]="Niggl0n"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "7a3590e8c5558d4598dacc7810befa70a4baac9e"
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = "UBC_Cancer_Classification"
#dagshub.auth.add_app_token("7a3590e8c5558d4598dacc7810befa70a4baac9e")
mlflow.set_tracking_uri(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME'] + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')

In [6]:
CONFIG = {
    "is_submission": False,
    "weighted_loss": True,
    "datetime_now": datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), 
    "n_fold":5, 
    "test_fold": 0,
    "seed": 42,
    "img_size": 512,
    "model_name": "tf_efficientnet_b0_ns",   # "tf_efficientnet_b0_ns", # "tf_efficientnetv2_s_in21ft1k"
    "checkpoint_path": "/kaggle/input/tf-efficientnet-b0-aa-827b6e33-pth/tf_efficientnet_b0_aa-827b6e33.pth",
    "num_classes": 5,
    "train_batch_size": 8,
    "valid_batch_size": 8,
    "n_tiles": 10,
    "n_tiles_test": 10,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "num_epochs": 15,
    "early_stopping": True,
    "patience": 6,
    "optimizer": 'adam',
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 10,
    "momentum": 0.9,
    "weight_decay": 1e-4,
}

## 1. Data Preparation

In [7]:
ROOT_DIR = '/kaggle/input/UBC-OCEAN'
TRAIN_DIR = '/kaggle/input/tiles-of-cancer-2048px-scale-0-25'
INFER_DIR = '/kaggle/input/ubc-ocean-tiles-w-masks-2048px-scale-0-25'
df_orig = pd.read_csv("/kaggle/input/UBC-OCEAN/train.csv")
df_orig = df_orig.rename(columns={"label":"subtype"})
df_train = pd.read_csv("/kaggle/input/df-tiles-025x-cancer-tissue-binary-labels/tiles_labelled_binary_cancer.csv", index_col="Unnamed: 0")
display(df_orig.sample(5))
display(df_train.sample(5))

Unnamed: 0,image_id,subtype,image_width,image_height,is_tma
331,39208,MC,61530,28220,False
58,6359,CC,39280,37289,False
317,38349,CC,45540,42755,False
463,54949,HGSC,37903,28174,False
426,51032,EC,91965,40838,False


Unnamed: 0,image_path,mask_path,cancer_ratio,non_cancer_ratio,label
42402,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,0.0,5.7e-05,non-cancer
30867,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,0.0,0.0,unknown
12840,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,0.929276,0.0,cancer
21100,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,0.0,0.0,unknown
25482,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,/kaggle/input/ubc-ocean-tiles-w-masks-2048px-s...,0.0,0.0,unknown


In [None]:
df_train["image_id"] = df_train["image_path"].map(lambda x: int(x.split('/')[-2]))
df_train = df_train[~df_train["image_path"].str.lower().str.contains("tma")]
df_train = df_train[df_train["label"]!="unknown"].reset_index(drop=True)
df_train = pd.merge(df_train, df_orig, on="image_id", how="left")
df_train["group"] = df_train["label"] + "_" + df_train["subtype"] 
df_masks = df_train.copy()

encoder = LabelEncoder()
df_train['target_label'] = encoder.fit_transform(df_train['label'])
with open("label_encoder_"+ CONFIG["datetime_now"] +".pkl", "wb") as fp:
    joblib.dump(encoder, fp)
    
# use stratified K Fold for crossvalidation 
skf = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG["seed"],)

for fold, ( _, val_) in enumerate(skf.split(X=df_train, y=df_train.target_label,  groups=df_train.group)):
    df_train.loc[val_ , "kfold"] = int(fold)
display(df_train.head())

# separate train and test dataset
df_test = df_train[df_train["kfold"]==CONFIG["test_fold"]].reset_index(drop=True)
df_train = df_train[df_train["kfold"]!=CONFIG["test_fold"]].reset_index(drop=True)
print(f"Shape df_train: {df_train.shape}, Shape df_test: {df_test.shape} ")

In [8]:

def create_dataframe_from_directory(directory_path):
    image_paths = []
    image_ids = []

    for folder in os.listdir(directory_path):
        folder_path = os.path.join(directory_path, folder)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                    file_path = os.path.join(folder_path, filename)
                    image_paths.append(file_path)
                    image_ids.append(folder)

    df = pd.DataFrame({
        'image_path': image_paths,
        'image_id': image_ids
    })

    return df

directory_path = "/kaggle/input/tiles-of-cancer-2048px-scale-0-25"
df_images = create_dataframe_from_directory(directory_path)
df_images.head() 

Unnamed: 0,image_path,image_id
0,/kaggle/input/tiles-of-cancer-2048px-scale-0-2...,8805
1,/kaggle/input/tiles-of-cancer-2048px-scale-0-2...,8805
2,/kaggle/input/tiles-of-cancer-2048px-scale-0-2...,8805
3,/kaggle/input/tiles-of-cancer-2048px-scale-0-2...,8805
4,/kaggle/input/tiles-of-cancer-2048px-scale-0-2...,8805


In [13]:

df_images.iloc[1004,0], df_images.iloc[1004,1]

('/kaggle/input/tiles-of-cancer-2048px-scale-0-25/46444/000151_8-10.png',
 '46444')

In [None]:
img_color_mean=[0.8661704276539922, 0.7663107094675368, 0.8574260897185548]
img_color_std=[0.08670629753900036, 0.11646580094195522, 0.07164169171856792]

data_transforms = {
    "train": A.Compose([
        A.Resize(512, 512),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        # A.RandomBrightnessContrast(p=0.75),
        A.ShiftScaleRotate(p=0.75),
        A.OneOf([
        A.GaussNoise(var_limit=[10, 50]),
        A.GaussianBlur(),
        A.MotionBlur(),
        ], p=0.4),
        A.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        A.CoarseDropout(max_holes=5, max_width=int(512* 0.1), max_height=int(512* 0.1),
        mask_fill_value=0, p=0.5),
        A.Normalize(img_color_mean, img_color_std), 
        ToTensorV2()], p=1.),
    
    "valid": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(img_color_mean, img_color_std), 
        ToTensorV2()], p=1.)
}



## 3. Training

In [None]:
class InferBinaryTilesDataset(Dataset):
    def __init__(
        self,
        df_data,
        transforms = None,
        mode: str = 'valid',
        labels_lut = None,
        # train_val_split: float = 0.90,
        tissue_label_th: float = 0.2
    ):

        self.transforms = transforms
        self.mode = mode
        self.train_val_split = train_val_split

        self.data = df_data
        self.img_paths =  self.data.image_path.values.tolist()
        
    def __getitem__(self, idx: int) -> tuple:
        img_path = self.img_paths[idx]
        assert os.path.isfile(img_path), f"missing: {img_path}"
        tile = cv2.imread(img_path)
        tile = cv2.cvtColor(tile, cv2.COLOR_BGR2RGB)
        # augmentation
        if self.transforms:
            tile = self.transforms(image=tile)["image"]
        #print(f"img dim: {img.shape}")
        return {
            "image": tile,
            "image_path": img_path,
               }
    
    def __len__(self) -> int:
        return len(self.img_paths)


In [None]:
def infer_on_holdout_set(model, CONFIG, df_test, val_size=1.0):
    model.eval()
    test_dataset = InferBinaryTilesDataset(df_test, transforms=data_transforms["valid"], mode="test", train_val_split=1.0)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], num_workers=2, shuffle=False, pin_memory=True)
    print(f"Test-Dataset Size: {len(test_dataset)}")

    preds = []
    probs_list = []
    image_path_list = []
    test_acc = 0.0

    with torch.no_grad():
        bar = tqdm(enumerate(test_loader), total=len(test_loader))
        for step, data in bar: 
            # print(step)
            images = data['image'].to(CONFIG["device"], dtype=torch.float)
            image_paths = data['label'].to(CONFIG["device"], dtype=torch.float)

            outputs = model(images)        
            probs = torch.sigmoid(outputs)
            predicted = (probs > 0.5).int()
            preds.append(predicted.detach().cpu().numpy() )
            probs_list.append(probs.detach().cpu().numpy() )
            image_path_list.append(image_paths.detach().cpu().numpy() )
    pred_labels = encoder.inverse_transform(preds)
    return df_test


In [None]:
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + \
                '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
                ', ' + 'eps=' + str(self.eps) + ')'


class UBCBinaryModel(nn.Module):
    '''
    EfficientNet B0 fine-tune.
    '''
    def __init__(self, model_name, pretrained=False, checkpoint_path=None):
        '''
        Fine tune for EfficientNetB0
        Args
            learnable_modules : tuple - Names of the modules to fine-tune.
        Return
            
        '''
        super(UBCBinaryModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.linear = nn.Linear(in_features, 1)
        

    def forward(self, images):
        """
        Forward function for the fine-tuned model
        Args
            x: 
        Return
            result
        """
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        logits = self.linear(pooled_features)
        return logits

In [None]:
print(f"Shape df_train: {df_train.shape}, Shape df_test: {df_test.shape}")

model = UBCBinaryModel(CONFIG['model_name'], pretrained=False , checkpoint_path=CONFIG["checkpoint_path"])
# model.load_state_dict(torch.load(CONFIG["checkpoint_path"]))
model.load_state_dict(torch.load(save_model_path))
model.to(CONFIG['device']);

encoder

In [None]:
print("Infer on Holdout Set:")
df_test = test_on_holdout(model, CONFIG, df_test, val_size=1)
df_test_file_path = "df_test_results.csv"




In [None]:
"""
model = UBCModel(CONFIG['model_name'], CONFIG['num_classes'], pretrained=False , checkpoint_path=None)
model.load_state_dict(torch.load("/kaggle/input/effnet-version-28/best_model_checkpoint2023-11-21_15-47-39.pth"))
model.to(CONFIG['device']);
df_test = test_on_holdout(model, CONFIG, df_test, TRAIN_DIR, val_size=1, n_tiles=CONFIG["n_tiles_test"])
"""