**TODO :**
- Bigger image input / bigger mask
- Mixup
- Use out-of-fold (OOF) predictions to remove noisy annotations

In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

### Initialization

### Imports

In [None]:
import os
import sys
import torch
import warnings
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

# Make the project modules under ../code/ importable from this notebook.
sys.path.append("../code/")
# os.environ['CUDA_VISIBLE_DEVICES'] = "1,0"
# Silence UserWarning messages.
warnings.simplefilter("ignore", UserWarning)

In [None]:
from training.main import k_fold

from utils.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
    update_overall_logs,
)

from params import *

### Load

In [None]:
# Dataset-level information and training annotations, both read from DATA_PATH.
df_info = pd.read_csv(DATA_PATH + "HuBMAP-20-dataset_information.csv")
df_mask = pd.read_csv(DATA_PATH + "train.csv")

## Training

In [None]:
# Base batch size for each supported encoder name.
# Config looks its encoder up here, so every selectable encoder needs an entry.
BATCH_SIZES = {
    "resnet18": 64,
    "resnet34": 32,
    "resnext50_32x4d": 32,
    "se_resnext50_32x4d": 32,
    "efficientnet-b0": 32,
    "efficientnet-b1": 32,
    "efficientnet-b2": 32,
    "efficientnet-b3": 16,
    "efficientnet-b4": 16,
    "efficientnet-b5": 16,
    "efficientnet-b6": 8,
}

In [None]:
class Config:
    """
    Parameters used for training.

    Every setting is a class attribute. The class body is executed once, top
    to bottom, so derived values (batch_size, epochs, paths) depend on the
    order of the statements below.
    """
    # General
    seed = 42
    verbose = 1

    device = "cuda" if torch.cuda.is_available() else "cpu"
    save_weights = True
    sampling_mode = 'centered'  # choose between 'convhull', 'centered', 'random', 'visible'

    # Images
    tile_size = 512
    reduce_factor = 2
    on_spot_sampling = 0.99
    overlap_factor = 1.5

    # Tile and mask folders are named after the tile size and reduction factor.
    img_dir = DATA_PATH + f"train_{tile_size}_red_{reduce_factor}"
    mask_dir = DATA_PATH + f"masks_{tile_size}_red_{reduce_factor}"

    # k-fold
    cv_column = "5fold"
    random_state = 0
    selected_folds = [0, 1, 2, 3, 4]

    # Model
    encoder = "efficientnet-b1"  # "resnet18" "resnext50_32x4d", "resnet34", "efficientnet-b5"
    decoder = "Unet"  # "Unet", "DeepLabV3Plus"
    use_bot = False
    use_fpn = False
    double_model = False
    use_mixstyle = False
    encoder_weights = "imagenet"
    num_classes = 2

    # Training
    loss = "BCEWithLogitsLoss"  # "SoftDiceLoss" / "BCEWithLogitsLoss"  / "lovasz"
    activation = "none" if loss == "lovasz" else "sigmoid"

    optimizer = "Adam"

    # Base batch size comes from the per-encoder table ...
    batch_size = BATCH_SIZES[encoder]

    # ... and is halved for 512-pixel tiles.
    if tile_size == 512:
        batch_size = batch_size // 2

    # The smaller the batch size, the fewer epochs are run.
    if batch_size >= 32:
        epochs = 50
    elif batch_size >= 16:
        epochs = 40
    elif batch_size >= 6:
        epochs = 30
    else:
        epochs = 25

    iter_per_epoch = 5000
    lr = 1e-3
    swa_first_epoch = 50

    warmup_prop = 0.05
    val_bs = batch_size * 2

    first_epoch_eval = 0

    mix_proba = 0
    mix_alpha = 0.4

    # Triple the number of epochs whenever mixing is enabled.
    if mix_proba > 0:
        epochs *= 3

    use_fp16 = True

    oof_folder = None  # "../logs/2021-04-14/0/"
    loss_oof_weight = 0  # 0.5

    # NOTE(review): presumably sampling proportions for pseudo-labelled and
    # external data - confirm against InMemoryTrainDataset.
    use_pl = 0.25
    use_external = 0.25

    predict_fc = False
    if predict_fc:
        pl_path = "../output/submission_0938.csv"  # replace with one that predicts even more fcs ?
        extra_path = "../input/train_extra_fc.csv"
        # Must be named `rle_path` (it was misspelled `rle_paths`): the other
        # branch and get_dataset() both use `rle_path`.
        rle_path = f"../input/train_{reduce_factor}_fc.csv"
    else:
        pl_path = "../logs/2021-05-01/2/"
        extra_path = [
            "../input/train_extra.csv",
            "../input/train_extra_onlyfc.csv"
        ]
        rle_path = [
            f"../input/train_{reduce_factor}_fix.csv",
            f"../input/train_{reduce_factor}_onlyfc.csv",
        ]

    # Ten extra epochs whenever use_pl is enabled.
    if use_pl > 0:
        epochs += 10
        
#     epochs = 1
#     iter_per_epoch = 100

In [None]:
from data.dataset import InMemoryTrainDataset
from data.transforms import HE_preprocess

def get_dataset(config, log_folder=None):
    """
    Builds the in-memory training dataset described by a config.

    config.rle_path and config.extra_path may each be a single csv path or a
    list of csv paths; a list is loaded into a list of dataframes. Image names
    are the unique values of the "id" column of the first (or only) rle
    dataframe. Both transforms are created with augment=False and
    visualize=True.

    Args:
        config (Config): Parameters.
        log_folder (None or str, optional): Unused, kept so existing calls keep working. Defaults to None.

    Returns:
        InMemoryTrainDataset: The dataset built from the config.
    """
    def _read(paths):
        # One dataframe for a single path, a list of dataframes for a list of paths.
        if isinstance(paths, list):
            return [pd.read_csv(path) for path in paths]
        return pd.read_csv(paths)

    # Data preparation
    print("Creating in-memory dataset ...")

    df_rle = _read(config.rle_path)
    first_rle = df_rle[0] if isinstance(df_rle, list) else df_rle
    train_img_names = first_rle.id.unique()

    df_rle_extra = _read(config.extra_path)

    in_mem_dataset = InMemoryTrainDataset(
        train_img_names,
        df_rle,
        train_tile_size=config.tile_size,
        reduce_factor=config.reduce_factor,
        train_transfo=HE_preprocess(augment=False, size=config.tile_size, visualize=True),
        valid_transfo=HE_preprocess(augment=False, size=config.tile_size, visualize=True),
        train_path=f"../input/train_{config.reduce_factor}/",
        iter_per_epoch=config.iter_per_epoch,
        on_spot_sampling=config.on_spot_sampling,
        sampling_mode=config.sampling_mode,
        oof_folder=config.oof_folder,
        pl_path=config.pl_path,
        use_pl=config.use_pl,
        test_path=f"../input/test_{config.reduce_factor}/",
        df_rle_extra=df_rle_extra,
        use_external=config.use_external,
    )

    return in_mem_dataset

In [None]:
# Build the dataset from the settings defined on Config.
dataset = get_dataset(Config)

In [None]:
# Switch the dataset to fold 0.
# NOTE(review): presumably this picks the held-out fold - confirm in InMemoryTrainDataset.
dataset.update_fold_nb(0)

In [None]:
# Show the training image name stored at index 8.
dataset.train_img_names[8] 

In [None]:
# Draw 1000 random dataset indices and plot every sample whose second mask
# channel is non-empty and whose fourth returned value is truthy.
sampled_indices = np.random.choice(len(dataset), 1000)

for sample_idx in sampled_indices:
    img, mask, _, w = dataset[sample_idx]

    if mask[:, :, 1].sum() and w:
        plt.figure(figsize=(15, 5))

        # (pixels, draw a colorbar?) for the image and the two mask channels.
        panels = (
            (img.numpy().transpose(1, 2, 0), False),
            (mask[:, :, 0].numpy(), True),
            (mask[:, :, 1].numpy(), True),
        )

        for position, (pixels, with_colorbar) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.imshow(pixels)
            if with_colorbar:
                plt.colorbar()
            plt.axis(False)

        plt.show()
        
#         break