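**About**: This notebook prepares the training data. It indexes the extracted images, maps each frame back to its DICOM instance, attaches the image-level injury labels, saves the resulting tables, explores the targets, and sanity-checks the metric implementation.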

In [None]:
# Automatically reload imported modules before each cell runs, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.linear_model import *

# Widen pandas' text rendering and allow longer cell contents when frames are displayed.
for option_name, option_value in (('display.width', 500), ('max_colwidth', 100)):
    pd.set_option(option_name, option_value)

## Data

### Imgs

In [None]:
# Names of all entries in the image folder. os.listdir returns them in arbitrary order.
files = os.listdir('../input/imgs/')

In [None]:
# One row per image file. Naming the column at construction also works for an
# empty listing, where assigning a single name to a frame with no columns
# raises a length-mismatch error.
df = pd.DataFrame({'file': files})

In [None]:
# File names are split on '_' once; the first three pieces are patient, series and frame.
name_parts = df['file'].map(lambda name: name.split('_'))

df['patient'] = name_parts.map(lambda parts: parts[0])
df['series'] = name_parts.map(lambda parts: parts[1])
# Drop the last 4 characters of the third piece (presumably the file extension).
df['frame'] = name_parts.map(lambda parts: parts[2][:-4])

df['path'] = '../input/imgs/' + df['file']

### Tags

In [None]:
# DICOM tag table; the cells below use its `path` and `ImagePositionPatient` columns.
tags = pd.read_parquet("../input/train_dicom_tags.parquet")

In [None]:
def _z_from_position(position):
    """Return the last ', '-separated component of `position` as a float, ignoring its final character."""
    return float(position[:-1].rsplit(', ', 1)[-1])


# Assumes ImagePositionPatient is a string such as "[x, y, z]" — TODO confirm against the parquet file.
tags['z'] = tags['ImagePositionPatient'].map(_z_from_position)

In [None]:
# Only the file path and the z position are kept from here on.
tags = tags.loc[:, ["path", "z"]].copy()

In [None]:
# Assumes paths look like `<root>/<patient>/<series>/<instance><4-char suffix>` — TODO confirm.
path_parts = tags['path'].map(lambda p: p.split('/'))

tags['patient'] = path_parts.map(lambda parts: parts[1])
tags['series'] = path_parts.map(lambda parts: parts[2])
# Drop the last 4 characters of the file name (presumably the extension).
tags['instance'] = path_parts.map(lambda parts: parts[3][:-4])

In [None]:
# Order the slices by z within each (patient, series) and renumber the index from 0.
slice_order = ['patient', 'series', 'z']
tags = tags.sort_values(by=slice_order).reset_index(drop=True)

In [None]:
# 0-based position of each slice within its (patient, series), by increasing z.
# Only `z` is ranked, so the other columns are not ranked needlessly.
# `method='first'` gives tied z values distinct consecutive ranks; the default
# average rank (e.g. 1.5) would be truncated by `astype(int)` into duplicate
# frame indices.
# NOTE(review): ties are ordered by row position; confirm this matches how the
# image files were numbered.
z_rank = tags.groupby(['patient', 'series'])['z'].rank(method='first')
tags['frame'] = z_rank.astype(int) - 1

In [None]:
# Zero-pad to 4 digits; the frame ids become strings, like the ids parsed from the image file names.
tags['frame'] = tags['frame'].map('{:04d}'.format)

In [None]:
# Save the table (path, z, patient, series, instance, frame) without the index column.
tags.to_csv('../input/frame_mapping.csv', index=False)

In [None]:
# Preview the first rows of the frame mapping.
tags.head()

### Merge

In [None]:
# Attach the DICOM instance to every image. No `on=` is given, so the join uses
# the columns shared by both frames. With a left join, images without a matching
# tag row get NaN.
instance_lookup = tags[['patient', 'series', 'frame', 'instance']]
df = df.merge(instance_lookup, how="left")
df = df.sort_values(by=['patient', 'series', 'frame']).reset_index(drop=True)

In [None]:
# Missing values per column; a non-zero count for `instance` means some images found no match in `tags`.
df.isna().sum()

In [None]:
# Preview the merged image table.
df.head()

In [None]:
# Image-level injury annotations (presumably one row per annotated image and injury type — confirm).
image_level = pd.read_csv('../input/image_level_labels.csv')

In [None]:
# Rename the four columns positionally to the names used in `df`, then cast all
# values to str: the ids in `df` are strings parsed from file names and paths,
# so the merge keys must have the same type.
image_level.columns = ['patient', 'series', 'instance', 'injury_name']
image_level = image_level.astype(str)

In [None]:
# Collapse to one row per image, gathering all of its injury names into a list.
image_keys = ['patient', 'series', 'instance']
image_level = image_level.groupby(image_keys)['injury_name'].agg(list).reset_index()

In [None]:
# Show one grouped row to check the list-valued `injury_name` column.
image_level.head(1)

In [None]:
# Left join keeps every image; images without an annotation get NaN in `injury_name`.
dfm = pd.merge(df, image_level, how="left", on=['patient', 'series', 'instance'])

In [None]:
# Turn the per-image label lists into text (NaN -> '') and flag each injury type by plain substring search.
injury_text = dfm["injury_name"].fillna('').astype(str)
dfm["injury_name"] = injury_text
dfm["extravasation_injury"] = injury_text.str.contains("Active_Extravasation", regex=False).astype(np.uint8)
dfm["bowel_injury"] = injury_text.str.contains("Bowel", regex=False).astype(np.uint8)

In [None]:
# Keep the identifying columns, the two image-level targets and the image path, then save without the index.
output_columns = ['patient', 'series', 'instance', "frame", 'extravasation_injury', 'bowel_injury', 'path']
dfm = dfm.loc[:, output_columns]
dfm.to_csv('../input/df_images_train.csv', index=False)

In [None]:
# Preview the saved image-level table.
dfm.head()

### Target EDA

In [None]:
# Training labels; the cells below use bowel_injury, extravasation_injury, <organ>_low / <organ>_high and any_injury.
train = pd.read_csv('../input/train.csv')

In [None]:
# Class counts of the two binary targets, side by side on a log-scaled y axis.
binary_targets = ['bowel_injury', 'extravasation_injury']
fig, axes = plt.subplots(1, len(binary_targets), figsize=(10, 3))

for ax, target in zip(axes, binary_targets):
    sns.countplot(x=train[target], ax=ax)
    ax.set_yscale('log')
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['healthy', 'injured'])

plt.show()

In [None]:
# Build one column per organ as `<organ>_low + 2 * <organ>_high`, labelled
# healthy / low / high on the x axis, and plot its counts on a log scale.
# The new `kidney`, `liver` and `spleen` columns stay in `train`.
organs = ['kidney', 'liver', 'spleen']
fig, axes = plt.subplots(1, len(organs), figsize=(15, 3))

for ax, organ in zip(axes, organs):
    train[organ] = train[f'{organ}_low'] + 2 * train[f'{organ}_high']
    sns.countplot(x=train[organ], ax=ax)
    ax.set_yscale('log')
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(['healthy', 'low','high'])

plt.show()

In [None]:
# A row is healthy when none of the five targets reports an injury.
healthy = (
    (train['kidney'] == 0) &
    (train['liver'] == 0) &
    (train['spleen'] == 0) &
    (train['bowel_injury'] == 0) &
    (train['extravasation_injury'] == 0)
)

# Print the consistency check: a bare expression that is not the last line of a
# cell is never displayed, so a mismatch would go unnoticed.
n_mismatch = int((train['any_injury'] != ~healthy).sum())
print(f'any_injury consistent with the individual targets: {n_mismatch == 0} ({n_mismatch} mismatching rows)')

plt.figure(figsize=(5, 3))
sns.countplot(x=train['any_injury'])
plt.show()

In [None]:
def get_weight(row):
    """Placeholder for a per-row weight computation.

    A `def` without a body is a syntax error and stops the notebook on
    "Run All"; this stub keeps the cell runnable.

    NOTE(review): presumably meant to derive a sample weight from one row of
    `train`. The intended formula is not recorded here — implement it or
    remove this cell.

    Args:
        row: the row to compute a weight for (currently unused).

    Raises:
        NotImplementedError: always, until the weighting rule is written.
    """
    raise NotImplementedError("get_weight is not implemented yet")

In [None]:
# For every pair of targets, print how many rows are positive for each one and for both at once.
tgts = ['kidney', 'liver', 'spleen', 'bowel_injury', 'extravasation_injury']

# Short display name: the part before the first '_', cut to 6 characters.
short_name = {t: t.split("_")[0][:6] for t in tgts}
positive = {t: train[t] > 0 for t in tgts}

for idx in range(len(tgts)):
    t1 = tgts[idx]
    for t2 in tgts[idx + 1:]:
        both = positive[t1] & positive[t2]
        print(
            f'{short_name[t1]}\t:', positive[t1].sum(),
            f' \t{short_name[t2]}\t:', positive[t2].sum(),
            f' \t{short_name[t1]} & {short_name[t2]}\t:', both.sum()
        )

### Metric

In [None]:
from sklearn.metrics import log_loss
from util.metrics import *

In [None]:
# Three-class example: integer labels against one probability column per class.
y_true = [2, 0, 0, 1]
y_prob = [[.1, 0, .9], [.9, .1, 0], [.8, .2, 0], [.35, .65, 0]]
log_loss(y_true, y_prob)

In [None]:
from sklearn.metrics import log_loss

# Binary example with one probability column per class; `labels` states the two classes explicitly.
y_true = [0, 0, 0, 1]
y_prob = [[1, 0,], [.9, .1], [.8, .2], [.35, .65]]
log_loss(y_true, y_prob, labels=[0, 1])

In [None]:
from sklearn.metrics import log_loss

# Same binary labels, with the predictions given as a flat list of one value per sample.
y_true = [0, 0, 0, 1]
y_prob = [0, .1, .2, .65]
log_loss(y_true, y_prob, labels=[0, 1])

In [None]:
from sklearn.metrics import log_loss

# Three-class example again, this time with the targets written as one-hot rows.
y_true_onehot = [[0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 1, 0]]
y_prob = [[.1, 0, .9], [.9, .1, 0], [.8, .2, 0], [.35, .65, 0]]
log_loss(y_true_onehot, y_prob)

In [None]:
# Random stand-in predictions for 5 rows: two single-column arrays followed by three 3-column arrays.
# NOTE(review): presumably the two binary targets followed by the three organ targets,
# in the order rsna_loss expects — confirm against util.metrics.
# A seeded generator keeps the losses computed from these arrays identical across runs.
SEED = 42
rng = np.random.default_rng(SEED)
preds = [rng.random((5, width)) for width in (1, 1, 3, 3, 3)]

In [None]:
# Display the generated arrays.
preds

In [None]:
# Score the stand-in predictions against the first 5 rows of `train`.
# NOTE(review): rsna_loss is not defined in this notebook; presumably it comes from the wildcard import of util.metrics — confirm.
losses, avg_loss = rsna_loss(preds, train.head(5))

In [None]:
# Display both values returned by rsna_loss.
losses, avg_loss

In [None]:
# Save the labels, including the kidney / liver / spleen columns added in the EDA section, without the index.
train.to_csv('../input/df_train.csv', index=False)

In [None]:
# Preview the saved training table.
train.head()

Done!