In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import re
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import *
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold

In [None]:
from params import *
from utils.logger import *
from data.dataset import PEDatasetFt

from model_zoo.models_lvl2 import *

from utils.metric import rsna_metric
from training.losses import RSNAWLoss

## Data

In [None]:
def str_to_arr(l):
    """
    Parses the string form of a 1D label array (as stored in the cached csv) back to integers.

    The first and last characters (the enclosing brackets) are dropped, every '.' is
    removed (so "0." / "1." become "0" / "1") and the rest is split on whitespace.

    Args:
        l (str): Bracketed, whitespace-separated values, e.g. "[0. 1.\\n 0.]".

    Returns:
        np array: Parsed integer values. Empty if there is nothing between the brackets.
    """
    # str.split() without argument splits on any whitespace run (newlines included)
    # and never yields empty tokens, so "[]" gives an empty array instead of raising.
    tokens = l[1:-1].replace('.', '').split()
    return np.array(tokens).astype(int)

In [None]:
# Exam-level metadata: one row per (study, series) holding the exam-level labels and
# the per-image label array. It is built once from train.csv and cached as a csv.
try:
    df = pd.read_csv('../output/df_patient_level.csv')
    # The csv stores each label array as its string form; parse it back.
    df[IMG_TARGET] = df[IMG_TARGET].apply(str_to_arr)
except (FileNotFoundError, KeyError, ValueError):
    # Only a missing or unreadable cache triggers a rebuild. A bare `except:` would
    # also hide interrupts and programming errors behind a long recomputation.
    df = pd.read_csv(DATA_PATH + "train.csv")
    df = df.groupby(['StudyInstanceUID', 'SeriesInstanceUID'])[['SOPInstanceUID'] + EXAM_TARGETS + [IMG_TARGET]].agg(list).reset_index()

    # Reorder the image labels of each series to follow the sorted file names on disk.
    ordered_targets = []
    for study, series, names, tgt in tqdm(
        df[['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID', 'pe_present_on_image']].values
    ):
        imgs = sorted(os.listdir(IMG_PATH + f'{study}/{series}/'))
        # Second '_'-separated field of the file name, minus its last 4 characters.
        # NOTE(review): assumes names look like '<prefix>_<SOPInstanceUID>.<ext>' - confirm.
        ordered_names = [n.split('_')[1][:-4] for n in imgs]
        ordered_target = np.zeros(len(ordered_names))

        for name, t in zip(names, tgt):
            ordered_target[ordered_names.index(name)] = t

        ordered_targets.append(ordered_target)
    df[IMG_TARGET] = ordered_targets

    # Exam-level labels are repeated on every image row: keep the first one.
    for c in EXAM_TARGETS:
        df[c] = df[c].apply(lambda x: x[0])

    df['path'] = 'features_' + df['StudyInstanceUID'] + '_' + df['SeriesInstanceUID'] + '.npy'

    # str() of an array with more than 1000 elements is abbreviated with "...", which
    # str_to_arr would silently read back as a six-element array. Serialise every
    # array in full; the text is unchanged for arrays below that limit.
    df_cache = df.copy()
    df_cache[IMG_TARGET] = df_cache[IMG_TARGET].apply(
        lambda arr: np.array2string(arr, threshold=arr.size + 1)
    )
    df_cache.to_csv('../output/df_patient_level.csv', index=False)

In [None]:
# File name of the per-series predictions: "preds_<StudyInstanceUID>_<SeriesInstanceUID>.npy".
# Added after the cached load, so the column exists whether or not the cache was used.
df['path_preds'] = 'preds_' + df['StudyInstanceUID'] + '_' + df['SeriesInstanceUID'] + '.npy'

## Checks

In [None]:
# Dataset over every row of df, given the resnext2 and b3 feature folders.
# NOTE(review): the folder list is positional here but passed as `paths=` in train() -
# confirm the second positional parameter of PEDatasetFt is `paths`.
dataset = PEDatasetFt(df, [FEATURES_PATH + "resnext2/", FEATURES_PATH + "b3/"])

In [None]:
# preds = []
# for p in tqdm(df['path_preds']):
#     pred = np.load(FEATURES_PATH + "b3/" + p)
#     preds.append(pred)
# preds = np.array(preds)

In [None]:
# Out-of-fold arrays of an earlier run: exam-level predictions, image-level predictions, sizes.
pred_exams_oof, pred_imgs_oof, sizes_oof = (
    np.load(f'../logs2/2020-10-25/4/{name}_oof.npy')
    for name in ('pred_exams', 'pred_imgs', 'sizes')
)

In [None]:
# Constant baseline: every prediction is replaced by 0.5, keeping the loaded shapes.
pred_exams_oof = np.full(pred_exams_oof.shape, 0.5)
pred_imgs_oof = np.full(pred_imgs_oof.shape, 0.5)

In [None]:
# Metric of the current predictions against the targets held by `dataset`.
# Argument order: image targets, exam targets, image predictions, exam predictions, sizes.
rsna_metric(
    dataset.img_targets,
    dataset.exam_targets,
    pred_imgs_oof,
    pred_exams_oof,
    sizes_oof,
)

## Dataset

In [None]:
# Untrained model used for the sanity checks below.
# NOTE(review): 3584 is presumably the size of the concatenated resnext2 + b3 features
# (2048 + 1536) produced by `dataset` - confirm.
model = RNNModel(3584, use_msd=True)

In [None]:
# One dataset item (index 7): features, exam-level target, image-level target, size.
ft, y_exam, y_img, size = dataset[7]

In [None]:
# Forward pass on a batch of one; the model returns exam-level then image-level logits.
logits_exam, logits_img = model(ft.unsqueeze(0))

In [None]:
# Shapes of the model outputs next to those of the input features and targets.
logits_exam.shape, logits_img.shape, ft.shape, y_exam.shape, y_img.shape

In [None]:
# Loss on the single sample, computed without CUDA.
# Argument order: image targets, exam targets, image logits, exam logits, sizes.
RSNAWLoss(cuda=False)(y_img.unsqueeze(0), y_exam.unsqueeze(0), logits_img, logits_exam, size.unsqueeze(0))

In [None]:
# Metric on the same single sample, fed with sigmoid probabilities (numpy) rather than logits.
rsna_metric(
    y_img.unsqueeze(0).numpy(), 
    y_exam.unsqueeze(0).numpy(), 
    torch.sigmoid(logits_img).detach().numpy(), 
    torch.sigmoid(logits_exam).detach().numpy(), 
    size.unsqueeze(0).numpy(),
)

## Training

In [None]:
from training.train import *
from utils.torch_utils import save_model_weights

In [None]:
import numpy as np
import pandas as pd
from torchcontrib.optim import SWA
from sklearn.model_selection import KFold

from training.sampler import *

In [None]:
from time import time

In [None]:
from utils.metric import *

In [None]:
def fit(
    model,
    train_dataset,
    val_dataset,
    optimizer_name='adam',
    loss_name='bce',
    epochs=10,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    swa_first_epoch=10,
    verbose=1,
):
    """
    Trains a model on GPU and evaluates it on the validation data after every epoch.

    Args:
        model (torch model): Model to train. Returns exam-level and image-level logits.
        train_dataset (torch dataset): Dataset to train with.
        val_dataset (torch dataset): Dataset to validate with. Its max_len, img_targets
            and exam_targets attributes are read.
        optimizer_name (str, optional): Name given to define_optimizer. Defaults to 'adam'.
        loss_name (str, optional): Not used, RSNAWLoss is always applied. Defaults to 'bce'.
        epochs (int, optional): Number of epochs, at least 1. Defaults to 10.
        batch_size (int, optional): Training batch size. Defaults to 32.
        val_bs (int, optional): Validation batch size. Defaults to 32.
        warmup_prop (float, optional): Proportion of the steps used for warmup. Defaults to 0.1.
        lr (float, optional): Learning rate. Defaults to 1e-3.
        swa_first_epoch (int, optional): Epoch (1-indexed) of the first SWA snapshot.
            SWA is only used when it is lower than epochs. Defaults to 10.
        verbose (int, optional): Period in epochs to print metrics at, 0 to disable. Defaults to 1.

    Returns:
        np array: Exam-level predictions (after sigmoid) of the last epoch.
        np array: Image-level predictions (after sigmoid) of the last epoch.
        np array: Sizes yielded by the validation loader.
    """

    optimizer = define_optimizer(optimizer_name, model.parameters(), lr=lr)

    # Only the SWA wrapper has update_swa / swap_swa_sgd, so every SWA call below is
    # guarded by this flag. Without it, swa_first_epoch == epochs (the defaults) skipped
    # the wrapping but still called update_swa() on the last epoch and crashed.
    use_swa = swa_first_epoch < epochs
    if use_swa:
        optimizer = SWA(optimizer)

    loss_fct = RSNAWLoss()

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=8,
        shuffle=True,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )

    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = int(epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )

    for epoch in range(epochs):
        model.train()
        model.zero_grad()
        start_time = time()

        avg_loss = 0
        for x, y_exam, y_img, size in train_loader:
            pred_exam, pred_img = model(x.cuda())

            loss = loss_fct(y_img.cuda(), y_exam.cuda(), pred_img, pred_exam, size.cuda())
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            scheduler.step()

            # Drop the gradients instead of zeroing them in place.
            for param in model.parameters():
                param.grad = None

        # From swa_first_epoch on, take a snapshot and validate with the averaged weights.
        swa_active = use_swa and epoch + 1 >= swa_first_epoch
        if swa_active:
            optimizer.update_swa()
            optimizer.swap_swa_sgd()

        model.eval()
        avg_val_loss = 0.

        # Batches are collected in lists and concatenated once. Each list starts with an
        # empty float array so the result keeps its width and dtype with an empty loader.
        size_chunks = [np.empty((0))]
        exam_chunks = [np.empty((0, NUM_EXAM_TARGETS))]
        img_chunks = [np.empty((0, val_dataset.max_len))]

        with torch.no_grad():
            for x, y_exam, y_img, size in val_loader:
                pred_exam, pred_img = model(x.cuda())

                loss = loss_fct(y_img.cuda(), y_exam.cuda(), pred_img.detach(), pred_exam.detach(), size.cuda())
                avg_val_loss += loss.item() / len(val_loader)

                exam_chunks.append(torch.sigmoid(pred_exam).detach().cpu().numpy())
                img_chunks.append(torch.sigmoid(pred_img).detach().cpu().numpy())
                size_chunks.append(size.numpy())

        sizes = np.concatenate(size_chunks)
        pred_exams = np.concatenate(exam_chunks)
        pred_imgs = np.concatenate(img_chunks)

        score = rsna_metric(
            val_dataset.img_targets,
            val_dataset.exam_targets,
            pred_imgs,
            pred_exams,
            sizes,
        )

        # Restore the non-averaged weights to keep training, except after the last
        # epoch where the model is left with the SWA weights.
        if swa_active and epoch < epochs - 1:
            optimizer.swap_swa_sgd()

        elapsed_time = time() - start_time
        # `verbose and` keeps verbose=0 from raising ZeroDivisionError.
        if verbose and (epoch + 1) % verbose == 0:
            elapsed_time = elapsed_time * verbose
            current_lr = scheduler.get_last_lr()[0]
            print(
                f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={current_lr:.1e} \t t={elapsed_time:.0f}s  \t loss={avg_loss:.3f} \t ",
                end="",
            )
            print(
                f"val_loss={avg_val_loss:.3f}\t score={score:.4f}"
            )

    torch.cuda.empty_cache()

    return pred_exams, pred_imgs, sizes


In [None]:
from utils.torch_utils import seed_everything, count_parameters


def train(config, df_train, df_val, fold, log_folder=''):
    """
    Trains and validates a model on one fold.

    Args:
        config (Config): Parameters.
        df_train (pandas dataframe): Training metadata.
        df_val (pandas dataframe): Validation metadata.
        fold (int): Selected fold, used in the name of the saved weights.
        log_folder (str, optional): Folder to save the weights to. Defaults to ''.

    Returns:
        np array: Exam-level validation predictions.
        np array: Image-level validation predictions.
        np array: Sizes returned by fit for the validation data.
    """

    seed_everything(config.seed)

    model = RNNModel(
        ft_dim=config.ft_dim,
        lstm_dim=config.lstm_dim,
        dense_dim=config.dense_dim,
        logit_dim=config.logit_dim,
        use_msd=config.use_msd,
    ).cuda()

    model.zero_grad()

    train_dataset = PEDatasetFt(df_train, max_len=config.max_len, paths=config.ft_path)
    val_dataset = PEDatasetFt(df_val, max_len=config.max_len, paths=config.ft_path)

    n_parameters = count_parameters(model)
    print(f"    -> {len(train_dataset)} training images")
    print(f"    -> {len(val_dataset)} validation images")
    print(f"    -> {n_parameters} trainable parameters\n")

    pred_exams, pred_imgs, sizes = fit(
        model,
        train_dataset,
        val_dataset,
        optimizer_name=config.optimizer,
        loss_name=config.loss,
        epochs=config.epochs,
        batch_size=config.batch_size,
        val_bs=config.val_bs,
        lr=config.lr,
        warmup_prop=config.warmup_prop,
        swa_first_epoch=config.swa_first_epoch,
        # config.verbose used to be ignored. Configs without it keep fit's default.
        verbose=getattr(config, 'verbose', 1),
    )

    if config.save_weights:
        save_model_weights(
            model,
            f"{config.name}_{fold}.pt",
            cp_folder=log_folder,
        )

    return pred_exams, pred_imgs, sizes

In [None]:
def k_fold(config, df, log_folder=''):
    """
    Runs a k-fold cross validation over the rows of the metadata and gathers
    the out-of-fold predictions.

    Folds come from KFold without shuffling. Rows of the folds that are not in
    config.selected_folds keep zeros in the returned arrays.

    Args:
        config (Config): Parameters.
        df (pandas dataframe): Metadata.
        log_folder (str, optional): Folder forwarded to train. Defaults to ''.

    Returns:
        np array [len(df) x NUM_EXAM_TARGETS]: Out-of-fold exam-level predictions.
        np array [len(df) x config.max_len]: Out-of-fold image-level predictions.
        np array [len(df)]: Out-of-fold sizes.
    """
    n_rows = len(df)
    oof_exams = np.zeros((n_rows, NUM_EXAM_TARGETS))
    oof_imgs = np.zeros((n_rows, config.max_len))
    oof_sizes = np.zeros(n_rows)

    folds = list(KFold(n_splits=config.k).split(X=df))

    for fold, (train_rows, val_rows) in enumerate(folds):
        if fold not in config.selected_folds:
            continue

        print(f"\n-------------   Fold {fold + 1} / {config.k}  -------------\n")

        fold_preds = train(
            config,
            df.iloc[train_rows].copy(),
            df.iloc[val_rows].copy(),
            fold,
            log_folder=log_folder,
        )

        # train returns (exam predictions, image predictions, sizes) for the validation rows.
        oof_exams[val_rows], oof_imgs[val_rows], oof_sizes[val_rows] = fold_preds

    return oof_exams, oof_imgs, oof_sizes

In [None]:
class Config:
    """
    Parameters used for training.
    Read as class attributes by k_fold and train; the class is never instantiated here.
    """
    # General
    seed = 42  # given to seed_everything at the start of each fold
    verbose = 1
    save_weights = True  # save the weights of each fold as f"{name}_{fold}.pt"
    max_len = 400  # given to PEDatasetFt; also the width of the image-level oof array
    
    # Feature folders given to PEDatasetFt as `paths`
    ft_path = [
        FEATURES_PATH + "b3/", 
#         FEATURES_PATH + "resnext2/", 
    ]

    # k-fold
    k = 5  # number of splits
    selected_folds = [0, 1, 2, 3, 4]  # 0-indexed folds that are actually trained

    # Model
    # NOTE(review): ft_dim presumably has to match the features found in ft_path - confirm
    # when changing ft_path.
    ft_dim = 1536
    lstm_dim = 256
    dense_dim = 256
    logit_dim = 256
    use_msd = True
    
    # Training
    loss = "BCEWithLogitsLoss"  # forwarded to fit as loss_name, which fit does not read
    optimizer = "Adam"  # name given to define_optimizer
    
    batch_size = 32
    epochs = 10
    swa_first_epoch = 7  # first epoch (1-indexed) at which fit takes an SWA snapshot
    lr = 5e-3
    warmup_prop = 0.  # proportion of the training steps used for warmup
    val_bs = 32

    name = "rnn_3"  # prefix of the saved weight files

## Main

In [None]:
# Folder receiving the outputs of this run, as returned by prepare_log_folder for LOG_PATH_2.
log_folder = prepare_log_folder(LOG_PATH_2)
print(f'Logging results to {log_folder}')

In [None]:
# config_df = save_config(Config, log_folder + 'config.json')

In [None]:
# Set up the logger for the run folder (file name "logs.txt"), then run the cross validation.
create_logger(directory=log_folder, name="logs.txt")

# Out-of-fold exam-level predictions, image-level predictions and sizes, aligned with df rows.
pred_exams_oof, pred_imgs_oof, sizes_oof = k_fold(Config, df, log_folder)

In [None]:
# Save the out-of-fold arrays in the run folder. The paths are built by plain
# concatenation, so log_folder must end with a path separator.
np.save(log_folder + 'pred_exams_oof.npy', pred_exams_oof)
np.save(log_folder + 'pred_imgs_oof.npy', pred_imgs_oof)
np.save(log_folder + 'sizes_oof.npy', sizes_oof)

## Load results

In [None]:
from sklearn.metrics import *

In [None]:
# Run folder whose saved out-of-fold arrays are analysed below.
# NOTE: hard-coded, and it overrides the log_folder created in the Main section.
log_folder = "../logs2/2020-10-25/21/"

In [None]:
# Reload the out-of-fold arrays saved in the selected run folder.
pred_exams_oof, pred_imgs_oof, sizes_oof = (
    np.load(log_folder + f'{name}.npy')
    for name in ('pred_exams_oof', 'pred_imgs_oof', 'sizes_oof')
)
# np.save(log_folder + 'sizes_oof.npy', sizes_oof)

In [None]:
# Cross-validation metric of the reloaded out-of-fold predictions.
# NOTE(review): `dataset` is the one built in the Checks section; this assumes its targets
# follow the same row order as the oof arrays - confirm.
rsna_metric(
    dataset.img_targets,
    dataset.exam_targets,
    pred_imgs_oof,
    pred_exams_oof,
    sizes_oof,
)

In [None]:
# Exam-level targets; indexed below as y[:, i], one column per exam target.
y = dataset.exam_targets

In [None]:
# Candidate decision thresholds: 0.01, 0.02, ..., 0.89.
ts = [i/100 for i in range(1, 90)]

In [None]:
# One confusion matrix per exam-level target, each drawn at the threshold from `ts`
# that gives the best F1 on the out-of-fold predictions.
fig, axs = plt.subplots(3, 3, figsize=(15, 16))
flat_axs = axs.flatten()

for i in range(y.shape[1]):
    ax = flat_axs[i]

    # Keep the first threshold reaching the best F1; stay at 0.5 if F1 is 0 everywhere.
    best_f1 = 0
    t_ = 0.5
    for t in ts:
        # sklearn metrics expect (y_true, y_pred), in that order.
        f1 = f1_score(
            y[:, i],
            pred_exams_oof[:, i] > t,
        )
        if f1 > best_f1:
            best_f1 = f1
            t_ = t

    # NOTE(review): (predictions, targets, ..., fig=fig) is not the signature of sklearn's
    # plot_confusion_matrix (estimator, X, y_true) - presumably a project helper; confirm
    # it is not shadowed by the `from sklearn.metrics import *` of this section.
    plot_confusion_matrix(
        pred_exams_oof[:, i] > t_, 
        y[:, i], 
        cmap="Reds", 
        display_labels=['0', '1'], 
        fig=fig, ax=ax,
        normalize='true',
    )
    ax.title.set_text(f"{EXAM_TARGETS[i]}") # threshold={t_}")
    ax.title.set_size(15)
plt.show()