**About**: Exploration of the data, the model, and a training run.

In [None]:
# %load_ext nb_black
# Enable autoreload in mode 2 so edited modules are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import gc
import ast
import sys
import cv2
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from sklearn.model_selection import StratifiedKFold
warnings.simplefilter("ignore", UserWarning)
pd.options.display.max_rows = 999

In [None]:
from params import *
from utils.rle import *
from utils.plots import *
from utils.metrics import iou_map
from utils.rle import rles_to_mask_fix
from utils.logger import prepare_log_folder, create_logger, save_config

from data.preparation import prepare_data, prepare_extra_data
from data.dataset import SartoriusDataset
from data.transforms import define_pipelines

from training.main import k_fold
from inference.post_process import *
from utils.metrics import *
from utils.torch import *

In [None]:
import mmdet
import mmdet.models
from mmcv import Config

from mmcv.utils import build_from_cfg
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Compose

## Data

In [None]:
class Config:
    """
    Settings for the data exploration cells: pipeline config file and fold splitting.
    """
    # Images
    use_mosaic = False
    use_tta = False  # TODO

    # The mosaic variant has its own config file; otherwise use the default augmentation config.
    if use_mosaic:
        data_config = "configs/config_aug_mosaic.py"
    else:
        data_config = "configs/config_aug.py"
    # Alternative: data_config = "data/config_rescale.py"

    # Splitting parameters (this class is passed to get_splits)
    k = 5
    random_state = 0
    split = "sgkf"

In [None]:
# Load the main dataframe, requesting anomaly removal.
# The commented line loads the extra data instead.
df = prepare_data(remove_anomalies=True)
# df = prepare_extra_data()

In [None]:
from data.preparation import prepare_data, prepare_extra_data, get_splits

# Compute the folds from the exploration Config (k, random_state, split).
splits = get_splits(df, Config)
all_results = []

# For every fold, print the cell-type counts of its validation rows.
for i, (train_idx, val_idx) in enumerate(splits):
    print(f"\n-------------   Fold {i + 1} / {Config.k}  -------------\n")
    print(Counter(df['cell_type'].iloc[val_idx]))

In [None]:
# Number of rows per cell type.
Counter(df['cell_type'])

In [None]:
# Store the number of bounding boxes of each row in a new 'len' column.
df['len'] = df['ann'].map(lambda ann: len(ann['bboxes']))

# Histogram of that count, coloured by cell type.
plt.figure(figsize=(15, 5))
sns.histplot(
    data=df,
    x='len',
    hue="cell_type",
    bins=100,
)
plt.show()

In [None]:
# (number of distinct sample ids, number of rows)
len(df['sample_id'].unique()), len(df)

In [None]:
# Bar plot of the number of rows per cell type.
sns.countplot(x=df['cell_type'])

In [None]:
# Cell-type counts (same expression as an earlier cell).
Counter(df['cell_type'])

In [None]:
# Build the pipelines from the config file selected in Config.data_config.
pipelines = define_pipelines(Config.data_config)

In [None]:
# Dataset over the full dataframe using the 'val_viz' pipeline.
# The commented lines swap in the 'test_viz' / 'train_viz' pipelines instead.
# dataset = SartoriusDataset(df, pipelines['test_viz'], precompute_masks=False)
dataset = SartoriusDataset(df, pipelines['val_viz'], precompute_masks=False)
# dataset = SartoriusDataset(df, pipelines['train_viz'], precompute_masks=False)

In [None]:
# NOTE: this reassigns `df`, replacing the dataframe (and any columns added to it) from the cells above.
df = prepare_data(fix=True)

# Order rows by sample id and renumber the index from 0.
df = df.sort_values('sample_id').reset_index(drop=True)

# Rebuild the visualization dataset on the reloaded dataframe.
dataset = SartoriusDataset(df, pipelines['val_viz'], precompute_masks=False)

In [None]:
# 'plate': the part of sample_id before the first underscore.
df['plate'] = df['sample_id'].map(lambda sid: sid.partition('_')[0])
# 'plate_well': the part of sample_id before the first hyphen.
df['plate_well'] = df['sample_id'].map(lambda sid: sid.partition('-')[0])

In [None]:
# Distinct plate values.
df['plate'].unique()

In [None]:
# Keep only the rows of one plate, with the index renumbered from 0.
df_plot = df[df['plate'] == "astros[cereb]"].reset_index(drop=True)

# Visualization dataset restricted to that subset.
dataset = SartoriusDataset(df_plot, pipelines['val_viz'], precompute_masks=False)

In [None]:
# Show up to ten samples of `df_plot`, each with the masks decoded from its annotation.
for idx in range(min(10, len(df_plot))):

    data = dataset[idx]
    img = data['img']

    # `dataset[idx]` is indexed by position, so read the matching dataframe row by
    # position as well instead of relying on the index labels being 0..n-1.
    row = df_plot.iloc[idx]

    # One rle_decode output per entry of the row's annotation.
    masks = np.array([rle_decode(enc, ORIG_SIZE) for enc in row['annotation']])

    plt.figure(figsize=(15, 15))
    plot_sample(img, masks, plotly=False)
    plt.axis(False)
    plt.title(row['sample_id'])
    plt.show()

#     break

In [None]:
# Display up to ten samples of `df_plot` with the masks decoded from their annotations.
for idx in range(len(df_plot))[:10]:
    data = dataset[idx]
    img = data['img']
    boxes = data['gt_bboxes']

    # One rle_decode output per entry of the sample's annotation.
    masks = np.array([
        rle_decode(rle, ORIG_SIZE)
        for rle in df_plot['annotation'][idx]
    ])

    plt.figure(figsize=(15, 15))
    plot_sample(img, masks, plotly=False)
    plt.axis(False)
    plt.title(df_plot['sample_id'][idx])
    plt.show()

#     break

In [None]:
# lens = {}

# for idx in tqdm(range(len(dataset))):
#     cell_type = df['cell_type'][idx]
#     data = dataset[idx]
#     boxes = data['gt_bboxes']
    
#     if cell_type == "astro" and len(boxes) > 300:
#         img = data['img']
#         plt.figure(figsize=(15, 15))
#         plot_sample(img, data['gt_masks'])
#         plt.axis(False)
#         plt.show()
        
    
#     try:
#         lens[cell_type].append(len(boxes))
#     except:
#         lens[cell_type] = [len(boxes)]
    

In [None]:
# plt.figure(figsize=(15, 5))
# for i, c in enumerate(lens):
#     plt.subplot(1, 3, i + 1)
#     sns.histplot(lens[c])
# #     plt.axvline(1000, c="salmon")
#     plt.title(c)

# plt.show()

In [None]:
# ious = {}

# for idx in tqdm(range(len(dataset))):
#     cell_type = df['cell_type'][idx]
    
#     data = dataset[idx]
#     boxes = data['gt_bboxes']
    
#     for i, b1 in enumerate(boxes):
#         for b2 in boxes[:i]:
#             iou = 0 if (b1 == b2).all() else bbox_iou(b1, b2)
#             if iou:
#                 try:
#                     ious[cell_type].append(iou)
#                 except:
#                     ious[cell_type] = [iou]
    

In [None]:
# plt.figure(figsize=(15, 5))
# for i, c in enumerate(ious):
#     plt.subplot(1, 3, i + 1)
#     sns.histplot(ious[c])
#     plt.axvline(0.5, c="salmon")
#     plt.title(c)

# plt.show()

In [None]:
# for i, c in enumerate(ious):
#     print((np.array(ious[c]) > 0.5).sum(), (np.array(ious[c]) > 0.5).sum() / len(ious[c]))

In [None]:
# sizes_, ratios_, ns = [], [], []
# for i in tqdm(range(len(dataset))):
#     data = dataset[i]
# #     img = data['img']
#     boxes = data['gt_bboxes'].astype(float)

#     sizes = np.max([boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]], 0)
#     ratios = (boxes[:, 2] - boxes[:, 0]) / (boxes[:, 3] - boxes[:, 1])
#     ratios = np.max([ratios, 1 / ratios], 0)
#     ns.append(len(boxes))
    
# #     if np.max(ratios) > 10:
#     if len(boxes) > 600:
# #     if np.max(sizes) > 256:
#         plt.figure(figsize=(15, 15))
#         plot_sample(data['img'], data['gt_masks'], boxes, plotly=False)
#         plt.axis(False)
#         plt.show()
        
    
#     sizes_.append(sizes)
#     ratios_.append(ratios)

In [None]:
# dataset = SartoriusDataset(df, pipelines['train_viz'], precompute_masks=False)
# dataset = to_mosaic(Config, dataset, 'mosaic_viz')

In [None]:
# for _ in range(1):
#     plt.figure(figsize=(15, 15))
    
#     for i in range(4):
#         plt.subplot(2, 2, i + 1)
#         idx = np.random.choice(len(dataset))
# #         idx = 581

#         data = dataset[idx]
#         print(data['img'].shape)
#         plot_sample(data['img'], data['gt_masks'], data['gt_bboxes'], plotly=False)
# #         print(data['img'].shape)

#         plt.axis(False)
#     plt.show()

## Model

In [None]:
from data.loader import define_loaders
from training.optim import define_optimizer

from model_zoo.models import define_model

In [None]:
# Build the model from the custom Mask R-CNN config file with a resnet50 encoder.
model = define_model("configs/config_maskrcnn_custom.py", encoder="resnet50")

In [None]:
# Display the roi_head of the wrapped model.
model.module.roi_head

In [None]:
# Scratch arithmetic (= 12544); presumably a flattened feature size in the RoI head - TODO confirm.
256 * 7 * 7

In [None]:
# Rebuild the pipelines from a hard-coded config path (not from Config.data_config).
pipelines = define_pipelines("configs/config_aug.py")

In [None]:
# Training dataset limited to the first row of df.
train_dataset = SartoriusDataset(df.head(1), pipelines['train'], precompute_masks=False)
# train_dataset.sample_extra_data(0)

# Test dataset over the whole dataframe.
test_dataset = SartoriusDataset(
    df,
    pipelines['test'],
    precompute_masks=False,
)
# test_dataset = SartoriusDataset(df, pipelines['test_tta'], precompute_masks=False)

# Single-sample batches, loaded in the main process (num_workers=0).
train_loader, val_loader = define_loaders(
    train_dataset,
    test_dataset,
    batch_size=1,
    val_bs=1,
    num_workers=0,
)

In [None]:
# Smoke test: run the model once with return_loss=True on the first batch of train_loader, then stop.
for batch in tqdm(train_loader):
    results = model(**batch, return_loss=True)
    
#     print(batch['img'].data[0].mean())

#     print(results)

#     print(batch['img'].data[0].size())    
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

    # Only the first batch is needed.
    break

In [None]:
# Run the model with return_loss=True on every batch of train_loader (no break),
# printing the mean of batch['img'].data[0] for each one.
for batch in tqdm(train_loader):
    results = model(**batch, return_loss=True)
    
    print(batch['img'].data[0].mean())

#     print(batch['img'].data[0].size())    
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

#     break

    # Blank lines to separate the output of consecutive batches.
    print('''\n\n\n''')

In [None]:
# Switch to eval mode and run inference (return_loss=False, rescale=True)
# on the first batch of val_loader with gradients disabled.
model = model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader):
        results = model(**batch, return_loss=False, rescale=True)

        # Only the first batch is needed.
        break

In [None]:
# Print the shapes of results[0][0][c] and results[0][1][1][c], then append the latter
# as an extra last column of the former.
# NOTE(review): the unconditional `break` means only c == 0 is processed.
# NOTE(review): the assignment mutates `results` in place, so re-running this cell
# without recomputing `results` appends another column.
for c in range(len(results[0][0])):
    print(results[0][0][c].shape)
    print(results[0][1][1][c].shape)
    
    results[0][0][c] = np.concatenate([results[0][0][c], results[0][1][1][c][:, None]], -1)
    
    break

In [None]:
# Number of items in results[0][1][0][0].
len(results[0][1][0][0])

In [None]:
# for obj in gc.get_objects():
#     try:
#         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
#             print(type(obj), obj.size())
#     except:
#         pass

In [None]:
from model_zoo.ensemble import EnsembleModel
from mmcv.parallel import MMDataParallel

In [None]:
# Display the get_bboxes attribute of the bbox head.
model.module.roi_head.bbox_head.get_bboxes

In [None]:
# Wrap two references to the same model in an EnsembleModel and run it on one validation batch.
models = MMDataParallel(EnsembleModel([model, model]))

with torch.no_grad():
    for batch in tqdm(val_loader):
        # Wrap every batch entry in a single-element list.
        for b in batch:
            batch[b] = [batch[b]]  # no tta

    #     batch['img_metas'][0].data[0][0]['scale_factor'] = np.ones(4, dtype=np.float32)
        results = models(**batch, return_loss=False, rescale=True)

        # Only the first batch is needed.
        break

## Training

In [None]:
class Config:
    """
    Run options and hyper-parameters handed to ``k_fold`` for training.
    """
    # General
    seed = 42
    verbose = 1
    first_epoch_eval = 0
    compute_val_loss = False
    verbose_eval = 5

    # Use the GPU when one is available.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    save_weights = True

    # Images
    fix = False
    use_mosaic = False
    use_tta = False  # TODO
    # data_config = "data/config_mosaic.py" if use_mosaic else "data/config.py"
    data_config = "data/config.py"

    # k-fold
    k = 5
    random_state = 0
    selected_folds = [0, 1, 2, 3, 4]

    # Model
    name = "maskrcnn"  # "cascade"
    reduce_stride = False
    pretrain = False

    # The config file name gets a "_stride" and/or "_pretrain" suffix depending on the two flags.
    if reduce_stride:
        if pretrain:
            model_config = f"model_zoo/config_{name}_stride_pretrain.py"
        else:
            model_config = f"model_zoo/config_{name}_stride.py"
    elif pretrain:
        model_config = f"model_zoo/config_{name}_pretrain.py"
    else:
        model_config = f"model_zoo/config_{name}.py"

    pretrained_folder = None
    # pretrained_folder = "../logs/2021-11-04/6/"

    # Training
    optimizer = "Adam"
    # Scheduler and weight decay depend on the optimizer.
    if optimizer == "SGD":
        scheduler = "plateau"
        weight_decay = 0.0005
    else:
        scheduler = "linear"
        weight_decay = 0
    batch_size = 2  # same value with or without reduce_stride
    val_bs = batch_size

    epochs = 50

    lr = 5e-4
    warmup_prop = 0.01

    use_fp16 = False  # TODO

In [None]:
# With DEBUG set, the next cell skips log-folder creation and log_folder stays None.
DEBUG = True
log_folder = None

In [None]:
# Outside DEBUG mode, create a log folder and store the config and the log file in it.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    # os.path.join gives the same path as plain concatenation when log_folder ends with a
    # separator, and still gives a valid path when it does not.
    save_config(Config, os.path.join(log_folder, "config.json"))
    create_logger(directory=log_folder, name="logs.txt")

# Launch k-fold training with the Config class defined above; log_folder keeps its
# previous value (None in the flags cell) when DEBUG is set.
results = k_fold(Config, log_folder=log_folder)