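**About**: Exploration of the annotated data (per-instance shape features, sample visualisations) and a quick forward-pass check of the model.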

In [None]:
# %load_ext nb_black
# Load IPython's autoreload extension and select mode 2, so that edits made to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import gc
import ast
import sys
import cv2
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from sklearn.model_selection import StratifiedKFold
# Notebook-wide settings: let pandas print up to 999 rows and ignore UserWarning messages.
pd.options.display.max_rows = 999
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
from params import *
from utils.rle import *
from utils.plots import *
from utils.metrics import iou_map
from utils.rle import rles_to_mask_fix
from utils.logger import prepare_log_folder, create_logger, save_config

from data.preparation import *
from data.dataset import SartoriusDataset
from data.transforms import define_pipelines

from training.main import k_fold
from inference.post_process import *
from utils.metrics import *
from utils.torch import *

In [None]:
import mmdet
import mmdet.models
from mmcv import Config

from mmcv.utils import build_from_cfg
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Compose

## Data

In [None]:
class Config:
    """
    Settings used by this notebook for the data pipelines and the fold split.

    NOTE: defining this class rebinds the name `Config`, which the notebook
    also imports from mmcv in an earlier cell.
    """
    # Augmentation switches; the pipeline config file depends on use_mosaic.
    use_mosaic = False
    use_tta = False  # TODO
    if use_mosaic:
        data_config = "configs/config_aug_mosaic.py"
    else:
        data_config = "configs/config_aug.py"
    # Alternative kept for reference: data_config = "data/config_rescale.py"

    # Values handed to get_splits through this class
    # (presumably fold count, seed and split strategy -- confirm in get_splits).
    k = 5
    random_state = 0
    split = "gkf"

In [None]:
# Build the working dataframe with the project's prepare_data helper, with both the
# remove_anomalies and fix options switched on (their exact effect is defined in the
# helper itself, presumably in data.preparation -- confirm).
df = prepare_data(remove_anomalies=True, fix=True)
# df = prepare_extra_data()

In [None]:
# Build the pipelines from the config file selected in Config.data_config.
# Later cells index the result by key (e.g. 'val_viz').
pipelines = define_pipelines(Config.data_config)

In [None]:
# Tag every row of df with the index of the split in which it is used for validation.
# Fold ids are stored as integers, and rows that appear in no validation split keep
# the sentinel -1 so they cannot be mistaken for members of fold 0.
splits = get_splits(df, Config)
fold = np.zeros(len(df), dtype=int) - 1  # -1 = not part of any validation split
for fold_idx, (_, val_idx) in enumerate(splits):
    # val_idx is used as positional indices into the array, as in the original code.
    fold[val_idx] = fold_idx

df['fold'] = fold

In [None]:
from skimage import measure 

def compute_features(masks, boxes, name, cell_type, fold):
    """
    Computes shape descriptors for every instance of one image.

    Each mask is passed to skimage's measure.regionprops and the first returned
    region is used. Masks for which regionprops returns no region (e.g. a mask
    without any foreground pixel) are skipped instead of raising an IndexError;
    the "id" of the remaining instances is still their position in `masks`.

    Args:
        masks (iterable): One label image per instance, as accepted by regionprops.
        boxes (iterable): One box per instance, read as (x0, y0, x1, y1).
        name: Image identifier, copied into every row.
        cell_type: Cell type of the image, copied into every row.
        fold: Fold of the image, copied into every row.

    Returns:
        list of dicts: One feature dictionary per instance that has a region.
    """
    features = []
    for i, (mask, box) in enumerate(zip(masks, boxes)):
        regions = measure.regionprops(mask)
        if not regions:
            # Nothing to measure for this instance.
            continue

        props = regions[0]
        # Read the axis lengths once; they feed both their own columns and the ratio.
        major_axis_length = props.major_axis_length
        minor_axis_length = props.minor_axis_length
        x0, y0, x1, y1 = box[0], box[1], box[2], box[3]

        # Key order is kept as-is: later cells select columns by position.
        features.append({
            "name": name,
            "fold": fold,
            "cell_type": cell_type,
            "id": i,
            "x0": x0,
            "y0": y0,
            "x1": x1,
            "y1": y1,
            "w": x1 - x0,
            "h": y1 - y0,
            "solidity": props.solidity,
            "major_axis_length": major_axis_length,
            "minor_axis_length": minor_axis_length,
            # The small epsilon avoids a division by zero for degenerate regions.
            "axis_ratio": minor_axis_length / (major_axis_length + 1e-6),
            "extent": props.extent,
            "area": props.area,
        })
    return features

In [None]:
# Compute the per-instance shape features for every sample and gather them in one frame.
# Dataframe values are read by position (.iloc) so that they stay aligned with
# dataset[i] even if df's index is not 0..n-1 (e.g. after rows were filtered out).
# This assumes dataset[i] corresponds to the i-th row of df -- confirm in SartoriusDataset.
all_features = []
dataset = SartoriusDataset(df, pipelines['val_viz'], precompute_masks=False)

for i in tqdm(range(len(dataset))):
    data = dataset[i]

    # Masks are decoded from the annotations at ORIG_SIZE; boxes come from the dataset item.
    masks = np.array([rle_decode(enc, ORIG_SIZE) for enc in df['annotation'].iloc[i]])
    boxes = data['gt_bboxes']

    all_features.extend(
        compute_features(
            masks,
            boxes,
            df['id'].iloc[i],
            df['cell_type'].iloc[i],
            df['fold'].iloc[i],
        )
    )

features = pd.DataFrame(all_features)

In [None]:
# One figure per ratio-like feature, with one histogram row for each of the
# first three entries of CELL_TYPES.
decile_ticks = [0.1 * k for k in range(11)]

for col in ('solidity', 'axis_ratio'):
    plt.figure(figsize=(15, 15))
    for row in range(3):
        plt.subplot(3, 1, row + 1)
        cell_type = CELL_TYPES[row]
        per_type = features[features['cell_type'] == cell_type]
        sns.histplot(x=per_type[col])
        # plt.yscale('log')
        plt.title(f'{col} - {cell_type}', size=15)
        plt.xticks(decile_ticks)
    plt.show()

In [None]:
# Histogram of every feature column from position 8 onwards, coloured by cell type.
hue_by_cell_type = features['cell_type']
feature_columns = features.columns[8:]

for col in feature_columns:
    plt.figure(figsize=(15, 5))
    sns.histplot(x=features[col], hue=hue_by_cell_type)
    plt.title(col, size=15)
    plt.show()

In [None]:
# Per-cell-type minimum of every column; display the aggregated columns from position 7 on.
# NOTE(review): with the key order built in compute_features this should start at 'w' -- confirm.
dfg = features.groupby('cell_type').min()
dfg.iloc[:, 7:]

In [None]:
# Per-cell-type maximum of every column; display the aggregated columns from position 5 on.
# NOTE(review): the matching min cell starts at position 7, so the two tables do not
# show the same columns -- confirm this offset is intended.
dfg = features.groupby('cell_type').max()
dfg.iloc[:, 5:]

In [None]:
# Minimum of every column for each (cell_type, fold) pair; display the aggregated
# columns from position 6 on.
group_keys = ['cell_type', 'fold']
dfg = features.groupby(group_keys).min()
dfg.iloc[:, 6:]

In [None]:
# Maximum of every column for each (cell_type, fold) pair; display the aggregated
# columns from position 4 on.
# NOTE(review): the matching min cell starts at position 6 -- confirm this offset is intended.
group_keys = ['cell_type', 'fold']
dfg = features.groupby(group_keys).max()
dfg.iloc[:, 4:]

In [None]:
# Rebind df to a fresh prepare_data result (this time without remove_anomalies),
# ordered by sample_id with a clean 0..n-1 index, and rebuild the visualisation dataset on it.
df = (
    prepare_data(fix=True)
    .sort_values('sample_id')
    .reset_index(drop=True)
)

dataset = SartoriusDataset(df, pipelines['val_viz'], precompute_masks=False)

In [None]:
# Derive two grouping keys from sample_id: the text before its first '_' (plate)
# and the text before its first '-' (plate_well).
df['plate'] = df['sample_id'].map(lambda sid: sid.partition('_')[0])
df['plate_well'] = df['sample_id'].map(lambda sid: sid.partition('-')[0])

In [None]:
# Show the distinct values of the 'plate' column.
df['plate'].unique()

In [None]:
# Keep only the rows whose plate is "astros[cereb]" (re-indexed from 0) and build a
# visualisation dataset restricted to them.
on_selected_plate = df['plate'] == "astros[cereb]"
df_plot = df.loc[on_selected_plate].reset_index(drop=True)

dataset = SartoriusDataset(df_plot, pipelines['val_viz'], precompute_masks=False)

In [None]:
# Display at most the first ten samples of df_plot with their decoded masks.
n_preview = min(10, len(df_plot))

for idx in range(n_preview):
    data = dataset[idx]

    encodings = df_plot['annotation'][idx]
    masks = np.array([rle_decode(rle, ORIG_SIZE) for rle in encodings])

    # The boxes are fetched as well but are not passed to the plot below.
    img, boxes = data['img'], data['gt_bboxes']

    plt.figure(figsize=(15, 15))
    plot_sample(img, masks, plotly=False)
    plt.axis(False)
    plt.title(df_plot['sample_id'][idx])
    plt.show()

In [None]:
# NOTE(review): this cell repeats the preceding visualisation cell verbatim.
# Display at most the first ten samples of df_plot with their decoded masks.
for idx in range(min(len(df_plot), 10)):
    data = dataset[idx]

    masks = np.array(
        [rle_decode(rle, ORIG_SIZE) for rle in df_plot['annotation'][idx]]
    )

    img = data['img']
    boxes = data['gt_bboxes']  # fetched but not used by the plot

    plt.figure(figsize=(15, 15))
    plot_sample(img, masks, plotly=False)
    plt.axis(False)
    sample_title = df_plot['sample_id'][idx]
    plt.title(sample_title)
    plt.show()

## Model

In [None]:
from data.loader import define_loaders
from training.optim import define_optimizer

from model_zoo.models import define_model

In [None]:
# Build the model with the project's define_model helper from
# configs/config_maskrcnn.py, requesting the "resnet50" encoder.
model = define_model("configs/config_maskrcnn.py", encoder="resnet50")

In [None]:
# Rebuild the pipelines from configs/config_aug.py; this rebinds the
# `pipelines` variable created earlier in the notebook.
pipelines = define_pipelines("configs/config_aug.py")

In [None]:
# Data for a quick model check: a training dataset limited to the first row of df,
# a test dataset over the whole frame, and loaders with batch size 1 and no workers.
first_row_df = df.head(1)
train_dataset = SartoriusDataset(first_row_df, pipelines['train'], precompute_masks=False)
# train_dataset.sample_extra_data(0)

test_dataset = SartoriusDataset(df, pipelines['test'], precompute_masks=False)
# test_dataset = SartoriusDataset(df, pipelines['test_tta'], precompute_masks=False)

train_loader, val_loader = define_loaders(
    train_dataset,
    test_dataset,
    batch_size=1,
    val_bs=1,
    num_workers=0,
)

In [None]:
# Call the model on the first training batch with return_loss=True, then stop.
# The commented lines below are optional debugging prints (input statistics, GPU memory).
for batch in tqdm(train_loader):
    results = model(**batch, return_loss=True)
    
#     print(batch['img'].data[0].mean())

#     print(results)

#     print(batch['img'].data[0].size())    
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

    # Only one batch is needed for this check.
    break

In [None]:
# Call the model on every training batch with return_loss=True and print what it
# returns, followed by blank lines that separate successive batches.
batch_separator = "\n\n\n"

for batch in tqdm(train_loader):
    results = model(**batch, return_loss=True)
    print(results)

    # Optional debugging prints:
    # print(batch['img'].data[0].size())
    # print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

    print(batch_separator)

In [None]:
# Put the model in eval mode and, with gradient tracking disabled, call it on the
# first validation batch with return_loss=False and rescale=True.
model = model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader):
        results = model(**batch, return_loss=False, rescale=True)

        # Only one batch is needed for this check.
        break