In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from glob import glob
import random
import json
import pydicom
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

DICOM_TAGS_DF_PATH = '/kolos/m2/ct/data/rsna/df.pkl'
LABELS_PATH = '/kolos/storage/ct/data/rsna/stage_1_train.csv'

DISEASES = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']

NUM_FOLDS = 5

In [3]:
def load_dicom_tags():
    with open(DICOM_TAGS_DF_PATH, 'rb') as f:
        df = pickle.load(f)

    return df

def load_labels():
    labels = pd.read_csv(LABELS_PATH)
    labels[['SOPInstanceUID', 'Disease']] = labels.ID.str.rsplit("_", 1, expand=True)
    labels = labels[['SOPInstanceUID', 'Disease', 'Label']]
    labels = pd.pivot_table(labels, index="SOPInstanceUID", columns="Disease", values="Label")
    
    return labels


tags = load_dicom_tags()
labels = load_labels()

df = labels.merge(tags, on='SOPInstanceUID', how='outer')

In [4]:
dataset = df[['SOPInstanceUID', 'StudyInstanceUID', 'PatientID', 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'subset']]
dataset.head()

Unnamed: 0,SOPInstanceUID,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,subset
0,ID_000039fa0,ID_134d398b61,ID_eeaf99e7,0.0,0.0,0.0,0.0,0.0,0.0,train
1,ID_00005679d,ID_b5c26cda09,ID_18f2d431,0.0,0.0,0.0,0.0,0.0,0.0,train
2,ID_00008ce3c,ID_974735bf79,ID_ce8a3cd2,0.0,0.0,0.0,0.0,0.0,0.0,train
3,ID_0000950d7,ID_8881b1c4b1,ID_d278c67b,0.0,0.0,0.0,0.0,0.0,0.0,train
4,ID_0000aee4b,ID_9aad90e421,ID_ce5f0b6c,0.0,0.0,0.0,0.0,0.0,0.0,train


In [31]:
folds = pd.read_csv('/home/sp/repos/kaggle-rsna-2019/rsna19/data/csv/5fold3D.csv')

In [32]:
folds.head()

Unnamed: 0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path,fold
0,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00047d6503,4
1,1.0,0.0,0.0,0.0,0.0,1.0,rsna/train/ID_0004f7a877,0
2,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_0006600dd8,0
3,1.0,0.0,1.0,0.0,1.0,1.0,rsna/train/ID_000b852931,1
4,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00135fb9ff,4


In [38]:
folds['StudyInstanceUID'] = folds.path.apply(lambda x: x.split('/')[-1])

In [39]:
folds.head()

Unnamed: 0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path,fold,SOPInstanceUID,PatientID,StudyInstanceUID
0,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00047d6503,4,ID_00047d6503,ID_e0d2de32,ID_00047d6503
1,1.0,0.0,0.0,0.0,0.0,1.0,rsna/train/ID_0004f7a877,0,ID_0004f7a877,ID_8cd7ca78,ID_0004f7a877
2,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_0006600dd8,0,ID_0006600dd8,ID_3a12cfa6,ID_0006600dd8
3,1.0,0.0,1.0,0.0,1.0,1.0,rsna/train/ID_000b852931,1,ID_000b852931,ID_782db7a2,ID_000b852931
4,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00135fb9ff,4,ID_00135fb9ff,ID_0cddb2a2,ID_00135fb9ff


In [35]:
from tqdm import tqdm_notebook

patient_ids = []
for idx, row in tqdm_notebook(folds.iterrows(), total=len(folds)):
    tmp = dataset.loc[dataset['StudyInstanceUID'] == row.StudyInstanceUID]
    patient_ids.append(tmp.PatientID.values[0])

HBox(children=(IntProgress(value=0, max=19530), HTML(value='')))




In [36]:
folds['PatientID'] = patient_ids

In [23]:
len(dataset)

752803

In [28]:
len(folds)

674255

In [50]:
folds.head()

Unnamed: 0,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,fold,path
0,ID_00047d6503,ID_e0d2de32,0.0,0.0,0.0,0.0,0.0,0.0,4,rsna/train/ID_00047d6503
1,ID_0004f7a877,ID_8cd7ca78,1.0,0.0,0.0,0.0,0.0,1.0,0,rsna/train/ID_0004f7a877
2,ID_0006600dd8,ID_3a12cfa6,0.0,0.0,0.0,0.0,0.0,0.0,0,rsna/train/ID_0006600dd8
3,ID_000b852931,ID_782db7a2,1.0,0.0,1.0,0.0,1.0,1.0,1,rsna/train/ID_000b852931
4,ID_00135fb9ff,ID_0cddb2a2,0.0,0.0,0.0,0.0,0.0,0.0,4,rsna/train/ID_00135fb9ff


In [42]:
test_set =  df[df.subset == 'test']
test_set = test_set[['SOPInstanceUID', 'StudyInstanceUID', 'PatientID', 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'subset']]
test_set.head()

Unnamed: 0,SOPInstanceUID,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,subset
674258,ID_53e0c2d04,ID_a7f5df3dae,ID_05e3c061,,,,,,,test
674259,ID_ea32d6d6d,ID_4f5a968fa2,ID_f0230723,,,,,,,test
674260,ID_434cabb4c,ID_a2ef38eec3,ID_ebb62360,,,,,,,test
674261,ID_22b3b726f,ID_a75a691e6d,ID_ad9600e0,,,,,,,test
674262,ID_cc10b6f55,ID_1f1d324148,ID_2239aae9,,,,,,,test


In [72]:
import random

test_fold = []
for idx, row in tqdm_notebook(test_set.iterrows(), total=len(test_set)):
    out = folds.loc[folds['PatientID'] == row.PatientID]
    if len(out.values):
        test_fold.append(out.fold.values[0])
    else:
        test_fold.append(random.randint(0, 4))

HBox(children=(IntProgress(value=0, max=78545), HTML(value='')))




In [62]:
out

Unnamed: 0,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,fold,path
18581,ID_f3bc267899,ID_ecf5c9cf,1.0,0.0,0.0,0.0,0.0,1.0,3,rsna/train/ID_f3bc267899


In [69]:
out.PatientID.values[0]

'ID_ecf5c9cf'

In [73]:
test_set['fold'] = test_fold
test_set

Unnamed: 0,SOPInstanceUID,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,subset,fold
674258,ID_53e0c2d04,ID_a7f5df3dae,ID_05e3c061,,,,,,,test,4
674259,ID_ea32d6d6d,ID_4f5a968fa2,ID_f0230723,,,,,,,test,1
674260,ID_434cabb4c,ID_a2ef38eec3,ID_ebb62360,,,,,,,test,4
674261,ID_22b3b726f,ID_a75a691e6d,ID_ad9600e0,,,,,,,test,2
674262,ID_cc10b6f55,ID_1f1d324148,ID_2239aae9,,,,,,,test,1
...,...,...,...,...,...,...,...,...,...,...,...
752798,ID_3579e1d1f,ID_f8bbbfe48f,ID_5a1ef7fe,,,,,,,test,0
752799,ID_bd7aaac7e,ID_df881b36d2,ID_9025ebc5,,,,,,,test,0
752800,ID_8b23db3f4,ID_e6894b1c76,ID_145f96c6,,,,,,,test,2
752801,ID_367cdeab9,ID_d1eff44351,ID_da04514e,,,,,,,test,4


In [74]:
test_set.to_csv('fold_patients_test.csv', index=False)