In [1]:
import sys
sys.path.append("..")

import argparse
import functools
import pickle
from multiprocessing import Pool
import copy

import pydicom
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
np.seterr(over='ignore')

from utils import misc

# Names of the files actually present in the DICOM directory.
exist = os.listdir('/home/islab/kaggle-rsna-intracranial-hemorrhage/TotalData')
# Drop the extension with splitext: str.strip('.dcm') removes any of the
# characters '.', 'd', 'c', 'm' from BOTH ends, so an ID whose hex part ends
# in 'c' or 'd' would be silently truncated.
new_exist = [os.path.splitext(s)[0] for s in exist if s.endswith('.dcm')]

df_input = pd.read_csv('stage_2_train.csv')
# CSV IDs have the form '<prefix>_<hash>_<label>'.  Compare the part before the
# last '_' exactly against the files on disk instead of building one huge
# regex: a substring match can select unrelated IDs, and an empty directory
# would produce the pattern '' which matches every row.
df_input = df_input[df_input.ID.str.rsplit('_', n=1).str[0].isin(set(new_exist))]

def group_labels_by_id(df):
    """Collect the positive labels of every image listed in ``df``.

    Each ``ID`` value must consist of exactly three '_'-separated parts: the
    first two are joined back together to form the image key and the third is
    the label name.  Every image key gets an entry; a label name is recorded
    only for rows whose ``Label`` equals 1.

    Returns:
        dict mapping image key -> list of label names (possibly empty).
    """
    grouped = {}
    for record in tqdm(df.itertuples(), total=len(df)):
        prefix, image_hash, label_name = record.ID.split('_')
        # setdefault registers the image even when none of its labels is set.
        positives = grouped.setdefault('%s_%s' % (prefix, image_hash), [])
        if record.Label == 1:
            positives.append(label_name)
    return grouped

def remove_corrupted_images(ids):
    """Return a shallow copy of ``ids`` without the hard-coded bad image IDs.

    The input mapping is left untouched.  For every ID on the removal list a
    line is printed saying whether it was removed or was not present.
    """
    cleaned = ids.copy()

    for bad_id in ('ID_6431af929',):
        if bad_id in cleaned:
            del cleaned[bad_id]
            print('removed %s' % bad_id)
        else:
            print('%s not found' % bad_id)

    return cleaned

def create_record(item, dirname):
    """Read one DICOM file and build a flat record describing it.

    Args:
        item: ``(image_id, labels)`` pair, where ``labels`` is a list of label
            names for that image.
        dirname: directory containing ``<image_id>.dcm``.

    Returns:
        dict with ``ID``, the space-joined ``labels``, ``n_label``, whatever
        ``misc.get_dicom_raw`` extracts from the file, and max/min/mean/diff
        statistics of the raw pixels, of the image windowed with the file's
        own WindowCenter/WindowWidth ("doctor") and of the image windowed
        with center 40 / width 80 ("brain"), plus ``brain_ratio``.
    """
    image_id, labels = item

    path = '%s/%s.dcm' % (dirname, image_id)
    dicom = pydicom.dcmread(path)

    record = {
        'ID': image_id,
        'labels': ' '.join(labels),
        'n_label': len(labels),
    }
    record.update(misc.get_dicom_raw(dicom))

    raw = dicom.pixel_array
    slope = float(record['RescaleSlope'])
    intercept = float(record['RescaleIntercept'])
    center = misc.get_dicom_value(record['WindowCenter'])
    width = misc.get_dicom_value(record['WindowWidth'])
    bits = record['BitsStored']
    pixel = record['PixelRepresentation']

    image = misc.rescale_image(raw, slope, intercept, bits, pixel)
    doctor = misc.apply_window(image, center, width)
    brain = misc.apply_window(image, 40, 80)

    # Compute each extremum once instead of twice.
    raw_max, raw_min = raw.max(), raw.min()
    doctor_max, doctor_min = doctor.max(), doctor.min()
    brain_max, brain_min = brain.max(), brain.min()

    record.update({
        'raw_max': raw_max,
        'raw_min': raw_min,
        'raw_mean': raw.mean(),
        # Subtract native Python numbers: the difference of two fixed-width
        # integer scalars (e.g. int16) wraps around when the range exceeds
        # the dtype, and overflow warnings are silenced in this notebook.
        'raw_diff': raw_max.item() - raw_min.item(),
        'doctor_max': doctor_max,
        'doctor_min': doctor_min,
        'doctor_mean': doctor.mean(),
        'doctor_diff': doctor_max - doctor_min,
        'brain_max': brain_max,
        'brain_min': brain_min,
        'brain_mean': brain.mean(),
        'brain_diff': brain_max - brain_min,
        'brain_ratio': misc.get_windowed_ratio(image, 40, 80),
    })
    return record

def create_df(ids, dirname='/home/islab/kaggle-rsna-intracranial-hemorrhage/TotalData', processes=6):
    """Build one DataFrame row per image by reading the DICOM files in parallel.

    Args:
        ids: mapping of image ID -> list of label names.
        dirname: directory holding the ``<ID>.dcm`` files; defaults to the
            location this notebook was written against.
        processes: number of worker processes in the pool.

    Returns:
        DataFrame of the records produced by ``create_record``, sorted by
        ``ID`` with a fresh 0..n-1 index.  An empty ``ids`` yields an empty
        frame that only has the ``ID`` column.
    """
    print('making records...')
    if not ids:
        # No work to hand out; also avoids sort_values('ID') raising KeyError
        # on a frame that has no columns at all.
        return pd.DataFrame(columns=['ID'])
    with Pool(processes) as pool:
        # imap_unordered yields results as workers finish, so the order is
        # arbitrary here; the final sort makes the output deterministic.
        records = list(tqdm(
            iterable=pool.imap_unordered(
                functools.partial(create_record, dirname=dirname),
                ids.items()
            ),
            total=len(ids),
        ))
    return pd.DataFrame(records).sort_values('ID').reset_index(drop=True)

# Group the CSV rows into {image ID: positive labels}, drop the IDs on the
# removal list, then read every remaining DICOM into one DataFrame.
ids = remove_corrupted_images(group_labels_by_id(df_input))
output = create_df(ids)

100%|██████████| 36000/36000 [00:00<00:00, 455410.35it/s]

ID_6431af929 not found
making records...



100%|██████████| 6000/6000 [00:51<00:00, 116.73it/s]


In [2]:
# Display the per-image records returned by create_df.
output

Unnamed: 0,ID,labels,n_label,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient,ImagePositionPatient,Modality,...,raw_diff,doctor_max,doctor_min,doctor_mean,doctor_diff,brain_max,brain_min,brain_mean,brain_diff,brain_ratio
0,ID_000edbf38,epidural any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.942...","(-125.000000, -118.330185, 93.577164)",CT,...,4722,70.0,-10.0,9.970745,80.0,80.0,0.0,16.805882,80.0,0.316666
1,ID_0019445aa,subdural any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.987...","(-125.000000, -144.661041, 109.036705)",CT,...,4603,70.0,-10.0,3.558544,80.0,80.0,0.0,11.106380,80.0,0.240425
2,ID_001f8e12d,subarachnoid any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 1.000...","(-125.000, -144.700, 94.750)",CT,...,3628,115.0,-35.0,-15.581387,150.0,80.0,0.0,9.676018,80.0,0.070393
3,ID_001faa58f,intraparenchymal any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 1.000...","(-125.000, -144.200, 52.000)",CT,...,3778,115.0,-35.0,-8.703197,150.0,80.0,0.0,12.791885,80.0,0.274216
4,ID_002287eb9,subarachnoid any,2,16,12,512,11,"(1, 0, 0, 0, 1, 0)","(-114.5, 5.5, 113.199951)",CT,...,2524,76.0,-4.0,16.681355,80.0,80.0,0.0,19.880718,80.0,0.198517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,ID_ffa6e19a5,epidural any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.927...","(-152.899994, -88.397980, 99.923126)",CT,...,4694,70.0,-10.0,5.248508,80.0,80.0,0.0,12.635189,80.0,0.259125
5996,ID_ffc947808,subarachnoid any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.972...","(-125.000, -132.446, 66.461)",CT,...,3669,115.0,-35.0,-4.323048,150.0,80.0,0.0,15.376766,80.0,0.284801
5997,ID_ffd27f702,epidural any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.927...","(-125.000, -99.298, 57.885)",CT,...,4863,85.0,-15.0,4.719376,100.0,80.0,0.0,14.809498,80.0,0.197674
5998,ID_ffe17f20d,epidural any,2,16,12,512,11,"(1, 0, 0, 0, 0.939692621, -0.342020143)","(-125, 59.5384224, 235.841488)",CT,...,4079,80.0,0.0,12.124134,80.0,80.0,0.0,12.124134,80.0,0.232887


In [3]:
# Save the per-image records as returned by create_df, before any of the
# later column parsing / selection is applied.
with open('total_raw.pkl', 'wb') as f:
    pickle.dump(output, f)

In [4]:
import collections
from pprint import pprint

def show_distribution(dataset):
    """Pretty-print how often each label occurs in ``dataset.labels``.

    Every whitespace-separated token of a row's ``labels`` string is counted
    under its own name, rows whose ``labels`` string is empty are counted as
    'negative', and 'all' counts every row.
    """
    tally = collections.defaultdict(int)
    for label_text in dataset.labels:
        for name in label_text.split():
            tally[name] += 1
        if not label_text:
            tally['negative'] += 1
        tally['all'] += 1
    pprint(tally)

    
def parse_position(df):
    """Append the three components of ``ImagePositionPatient`` as columns.

    Each ``ImagePositionPatient`` value is expanded with ``pd.Series`` and the
    three resulting columns are named ``Position1``..``Position3``; they are
    concatenated to the right of the original columns.
    """
    coords = df['ImagePositionPatient'].apply(pd.Series)
    # Assigning three names fails loudly if a value did not have 3 components.
    coords.columns = ['Position1', 'Position2', 'Position3']
    return pd.concat([df, coords], axis=1)

def add_adjacent_labels(df):
    """Attach the labels of each image's neighbours within its study.

    Rows are ordered by ``PositionOrd`` and grouped by ``StudyInstanceUID``.
    Within a group, ``LeftLabel`` is the ``labels`` string of the previous row
    and ``RightLabel`` that of the next row; the first and last rows of a
    group get '' on the missing side.  The two new columns are merged back
    onto the sorted frame via ``ID``.
    """
    df = df.sort_values('PositionOrd')

    neighbours = []
    print('making adjacent labels...')
    for _, study in tqdm(df.groupby('StudyInstanceUID')):
        study_labels = list(study.labels)
        last = len(study_labels) - 1
        for pos, image_id in enumerate(study.ID):
            neighbours.append({
                'LeftLabel': study_labels[pos - 1] if pos > 0 else '',
                'RightLabel': study_labels[pos + 1] if pos < last else '',
                'ID': image_id,
            })
    return pd.merge(df, pd.DataFrame(neighbours), on='ID')

# Label distribution of the freshly built records.
show_distribution(output)

#output = output[output.brain_diff > 60]
# Expand ImagePositionPatient into the Position1..Position3 columns.
output = parse_position(output)

# Pass the window settings through misc.get_dicom_value, as create_record does
# before windowing.
output['WindowCenter'] = output.WindowCenter.apply(lambda x: misc.get_dicom_value(x))
output['WindowWidth'] = output.WindowWidth.apply(lambda x: misc.get_dicom_value(x))
# Relative position of a slice inside its series: rank of Position3 divided by
# the number of Position3 values sharing the same SeriesInstanceUID.
output['PositionOrd'] = output.groupby('SeriesInstanceUID')[['Position3']].rank() / output.groupby('SeriesInstanceUID')[['Position3']].transform('count')

# NOTE(review): PositionOrd is ranked per SeriesInstanceUID, but
# add_adjacent_labels groups by StudyInstanceUID - confirm every study holds a
# single series, otherwise neighbours may come from different series.
output = add_adjacent_labels(output)
# Keep only the columns carried forward into total.pkl.
output = output[['ID', 'labels', 'PatientID', 'WindowCenter', 'WindowWidth', 'RescaleIntercept', 'RescaleSlope', 'Position3', 'PositionOrd', 'LeftLabel', 'RightLabel', 'BitsStored', 'PixelRepresentation', 'brain_ratio', 'brain_diff']]

# Re-check the distribution: the merge on ID should not have lost any rows.
show_distribution(output)

defaultdict(<class 'int'>,
            {'all': 6000,
             'any': 5000,
             'epidural': 1000,
             'intraparenchymal': 1000,
             'intraventricular': 1000,
             'negative': 1000,
             'subarachnoid': 1000,
             'subdural': 1000})


  0%|          | 0/4155 [00:00<?, ?it/s]

making adjacent labels...


100%|██████████| 4155/4155 [00:01<00:00, 2861.78it/s]


defaultdict(<class 'int'>,
            {'all': 6000,
             'any': 5000,
             'epidural': 1000,
             'intraparenchymal': 1000,
             'intraventricular': 1000,
             'negative': 1000,
             'subarachnoid': 1000,
             'subdural': 1000})


In [5]:
# dic = {}
# for classes in os.listdir(f'/home/islab/kaggle-rsna-intracranial-hemorrhage/TrainingData/'):
#     for img in os.listdir(f'/home/islab/kaggle-rsna-intracranial-hemorrhage/TrainingData/{classes}/'):
#         dic[img[:-4]] = classes
        
# for tmp in output.loc[output['labels'] == '']['ID']:
#     output.loc[output['ID'] == tmp, 'labels'] = 'any'

In [6]:
import random

def _make_folds(df, n_fold, seed):
    """Assign every patient to one of ``n_fold`` folds, balancing rare labels.

    All rows of a patient share a fold.  For each patient the label with the
    smallest dataset-wide count among the patient's labels is picked ('' when
    the patient has no labels at all); the patient goes to a fold that so far
    holds the fewest occurrences of that label, ties being broken with
    ``random.choice`` after seeding the global ``random`` module with ``seed``.

    The final (fold, label) counts are pretty-printed.

    Returns:
        dict mapping PatientID -> fold index in ``range(n_fold)``.
    """
    # How often each label occurs over the whole frame.
    label_totals = collections.Counter(
        label
        for row_labels in df.labels.str.split()
        for label in row_labels
    )

    fold_usage = collections.Counter()
    assignment = {}

    random.seed(seed)
    by_patient = df.groupby('PatientID')
    print('making %d folds...' % n_fold)
    for patient_id, patient_rows in tqdm(by_patient, total=len(by_patient)):

        patient_labels = [
            label
            for label_text in patient_rows.labels
            for label in label_text.split()
        ] or ['']

        # min() returns the first minimal element, i.e. the first label with
        # the lowest overall frequency.
        rarest = min(patient_labels, key=lambda label: label_totals[label])
        usage = [fold_usage[(f, rarest)] for f in range(n_fold)]
        least = min(usage)
        candidates = [f for f, used in enumerate(usage) if used == least]
        chosen = random.choice(candidates)
        assignment[patient_id] = chosen

        for label in patient_labels:
            fold_usage[(chosen, label)] += 1

    pprint(fold_usage)

    return assignment

# With n_fold=1 the only available fold index is 0, so every patient maps to 0.
folds = _make_folds(output, 1, 42)

making 1 folds...


100%|██████████| 3808/3808 [00:05<00:00, 656.67it/s]

Counter({(0, 'any'): 5000,
         (0, 'intraparenchymal'): 1000,
         (0, 'subarachnoid'): 1000,
         (0, 'epidural'): 1000,
         (0, 'intraventricular'): 1000,
         (0, 'subdural'): 1000,
         (0, ''): 831})





In [7]:
# Distinct label strings present in the processed frame.
output['labels'].unique()

array(['epidural any', 'intraparenchymal any', '', 'subdural any',
       'subarachnoid any', 'intraventricular any'], dtype=object)

In [8]:
# Look each row's PatientID up in the patient -> fold mapping built by _make_folds.
output['fold'] = output.PatientID.map(folds)

In [9]:
# from sklearn.model_selection import train_test_split

# train, val = train_test_split(output, test_size=0.1, stratify=output['labels'])

In [10]:
# with open('train.pkl', 'wb') as f:
#     pickle.dump(train, f)
    
# with open('val.pkl', 'wb') as f:
#     pickle.dump(val, f)
    
# Save the processed frame (selected columns plus the 'fold' column).
with open('total.pkl', 'wb') as f:
    pickle.dump(output, f)