In [1]:
import sys
sys.path.append("..")

import argparse
import functools
import pickle
from multiprocessing import Pool
import copy

import pydicom
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
np.seterr(over='ignore')

from utils import misc

# Load the list of validation file names and keep only the CSV rows that
# belong to those images.
# NOTE(review): pickle.load can execute arbitrary code; only point this at a
# trusted file. The absolute path is machine-specific.
with open('/home/islab/kaggle-rsna-intracranial-hemorrhage/src/preprocess/val_label.pkl', 'rb') as f:
    exist = pickle.load(f)

# Remove only a trailing '.dcm' extension. str.strip('.dcm') would instead
# strip ANY of the characters '.', 'd', 'c', 'm' from both ends, truncating
# IDs whose last character happens to be 'c', 'd' or 'm'.
new_exist = [s[:-len('.dcm')] if s.endswith('.dcm') else s for s in exist]

df_input = pd.read_csv('stage_2_train.csv')
# Keep rows whose ID contains one of the image names (regex alternation).
df_input = df_input[df_input.ID.str.contains('|'.join(new_exist))]

def group_labels_by_id(df):
    """Collect the positive label names of every image.

    Each ``ID`` value in ``df`` is split on ``'_'`` into exactly three parts
    (any other number of parts raises ``ValueError`` from the unpacking).
    The first two parts, re-joined with ``'_'``, form the image key and the
    third part is the label name.

    Args:
        df: DataFrame with an ``ID`` and a ``Label`` column.

    Returns:
        dict mapping each image key to the list of label names whose
        ``Label`` equals 1. Images without any positive row map to ``[]``.
    """
    grouped = {}
    for record in tqdm(df.itertuples(), total=len(df)):
        head, code, kind = record.ID.split('_')
        image_key = '%s_%s' % (head, code)
        # Register the image even when this row is not a positive one.
        positives = grouped.setdefault(image_key, [])
        if record.Label == 1:
            positives.append(kind)
    return grouped

def remove_corrupted_images(ids, corrupted_ids=('ID_6431af929',)):
    """Return a shallow copy of ``ids`` without the listed image IDs.

    The input mapping is not modified. For every ID in ``corrupted_ids`` a
    line is printed saying whether it was removed or was not present.

    Args:
        ids: dict keyed by image ID.
        corrupted_ids: iterable of image IDs to drop. Defaults to the single
            ID that was previously hard-coded in the function body.

    Returns:
        A new dict with the listed IDs removed.
    """
    ids = ids.copy()

    for image_id in corrupted_ids:
        if image_id in ids:
            del ids[image_id]
            print('removed %s' % image_id)
        else:
            print('%s not found' % image_id)

    return ids

def create_record(item, dirname):
    """Build one flat record (header fields + pixel statistics) for an image.

    Args:
        item: ``(image_id, labels)`` pair, where ``labels`` is a sequence of
            label-name strings.
        dirname: directory that contains ``<image_id>.dcm``.

    Returns:
        dict with ``ID``, ``labels`` (space-joined), ``n_label``, whatever
        ``misc.get_dicom_raw`` returns for the file, max/min/mean/diff of the
        raw pixels and of two windowed views, plus ``brain_ratio``.
    """
    image_id, positive_labels = item

    dicom = pydicom.dcmread('%s/%s.dcm' % (dirname, image_id))

    record = dict(
        ID=image_id,
        labels=' '.join(positive_labels),
        n_label=len(positive_labels),
    )
    # presumably a dict of DICOM header values; it must provide the keys
    # read just below -- verify against utils.misc
    record.update(misc.get_dicom_raw(dicom))

    raw = dicom.pixel_array
    slope = float(record['RescaleSlope'])
    intercept = float(record['RescaleIntercept'])
    center = misc.get_dicom_value(record['WindowCenter'])
    width = misc.get_dicom_value(record['WindowWidth'])
    bits = record['BitsStored']
    pixel = record['PixelRepresentation']

    image = misc.rescale_image(raw, slope, intercept, bits, pixel)
    # "doctor" uses the window stored in the file, "brain" a fixed 40/80 one.
    doctor = misc.apply_window(image, center, width)
    brain = misc.apply_window(image, 40, 80)

    raw_hi, raw_lo = raw.max(), raw.min()
    doctor_hi, doctor_lo = doctor.max(), doctor.min()
    brain_hi, brain_lo = brain.max(), brain.min()

    record['raw_max'] = raw_hi
    record['raw_min'] = raw_lo
    record['raw_mean'] = raw.mean()
    record['raw_diff'] = raw_hi - raw_lo
    record['doctor_max'] = doctor_hi
    record['doctor_min'] = doctor_lo
    record['doctor_mean'] = doctor.mean()
    record['doctor_diff'] = doctor_hi - doctor_lo
    record['brain_max'] = brain_hi
    record['brain_min'] = brain_lo
    record['brain_mean'] = brain.mean()
    record['brain_diff'] = brain_hi - brain_lo
    record['brain_ratio'] = misc.get_windowed_ratio(image, 40, 80)
    return record

def create_df(ids, dirname='/data/rsna-intracranial-hemorrhage-detection/stage_2_train/', processes=6):
    """Create one record per image in parallel and return them as a DataFrame.

    Args:
        ids: dict mapping image ID to its list of labels; each ``(id, labels)``
            item is passed to ``create_record``.
        dirname: directory holding the ``.dcm`` files. Defaults to the path
            that was previously hard-coded in the body.
        processes: number of worker processes. Defaults to the previously
            hard-coded 6.

    Returns:
        DataFrame of all records, sorted by ``ID`` with a fresh 0..n-1 index.
        Sorting is needed because ``imap_unordered`` yields results in
        completion order.
    """
    print('making records...')
    worker = functools.partial(create_record, dirname=dirname)
    with Pool(processes) as pool:
        records = list(tqdm(
            iterable=pool.imap_unordered(worker, ids.items()),
            total=len(ids),
        ))
    return pd.DataFrame(records).sort_values('ID').reset_index(drop=True)

# Pipeline: CSV rows -> labels per image -> drop known-bad images -> records.
labels_by_id = group_labels_by_id(df_input)
ids = remove_corrupted_images(labels_by_id)
output = create_df(ids)

100%|██████████| 3222/3222 [00:00<00:00, 409727.66it/s]

ID_6431af929 not found
making records...



100%|██████████| 537/537 [00:04<00:00, 116.46it/s]


In [12]:
# Rows whose brain-window image is constant (max == min).
# NOTE(review): the stored output of this cell shows a `fold` column, which is
# only assigned in a later cell, so that output comes from an out-of-order run.
output.loc[output['brain_diff'].eq(0)]

Unnamed: 0,ID,labels,PatientID,WindowCenter,WindowWidth,RescaleIntercept,RescaleSlope,Position3,PositionOrd,LeftLabel,RightLabel,BitsStored,PixelRepresentation,brain_ratio,brain_diff,fold
111,ID_9f0dc2c71,,ID_3c47afea,36,80,-1024.0,1.0,911.700073,1.0,,,12,0,0.0,0.0,0


In [3]:
# with open('total_raw.pkl', 'wb') as f:
#     pickle.dump(output, f)

In [4]:
import collections
from pprint import pprint

def show_distribution(dataset):
    """Pretty-print how many rows carry each label.

    ``dataset.labels`` holds whitespace-separated label names per row. Each
    name is counted once per occurrence, rows whose ``labels`` string is
    empty are counted under ``'negative'``, and every row is counted under
    ``'all'``. Nothing is returned; the tally is printed with ``pprint``.
    """
    tally = collections.defaultdict(int)
    for record in dataset.itertuples():
        label_text = record.labels
        for name in label_text.split():
            tally[name] += 1
        if not label_text:
            tally['negative'] += 1
        tally['all'] += 1
    pprint(tally)

    
def parse_position(df):
    """Append ``Position1``..``Position3`` columns to ``df``.

    Every ``ImagePositionPatient`` entry is expanded into its own row of a
    new frame, whose three columns are then named ``Position1``,
    ``Position2`` and ``Position3`` (so each entry must hold three values).

    Returns:
        A new DataFrame: the original columns followed by the three
        position columns. ``df`` itself is left unchanged.
    """
    axis_names = ['Position1', 'Position2', 'Position3']
    coords = df.ImagePositionPatient.apply(pd.Series)
    coords.columns = axis_names
    return pd.concat([df, coords], axis=1)

def add_adjacent_labels(df):
    """Attach the labels of the neighbouring rows within each study.

    Rows are sorted by ``PositionOrd`` and grouped by ``StudyInstanceUID``.
    Within a group, ``LeftLabel`` is the ``labels`` value of the previous
    row and ``RightLabel`` that of the next row; the first and last rows of
    a group get ``''`` on the missing side.

    Returns:
        The sorted frame merged (on ``ID``) with the two new columns.
    """
    df = df.sort_values('PositionOrd')

    print('making adjacent labels...')
    neighbours = []
    for _, study in tqdm(df.groupby('StudyInstanceUID')):
        own = list(study.labels)
        # Shift the label list by one in each direction, padding with ''.
        before = [''] + own[:-1]
        after = own[1:] + ['']
        for image_id, left, right in zip(study.ID, before, after):
            neighbours.append({
                'LeftLabel': left,
                'RightLabel': right,
                'ID': image_id,
            })
    return pd.merge(df, pd.DataFrame(neighbours), on='ID')

# Label distribution before post-processing.
show_distribution(output)

output = parse_position(output)

# Convert both window columns with misc.get_dicom_value.
output['WindowCenter'] = output['WindowCenter'].apply(misc.get_dicom_value)
output['WindowWidth'] = output['WindowWidth'].apply(misc.get_dicom_value)

# Relative position inside a series: rank of Position3 divided by the number
# of Position3 values in that series.
position_by_series = output.groupby('SeriesInstanceUID')[['Position3']]
output['PositionOrd'] = position_by_series.rank() / position_by_series.transform('count')

output = add_adjacent_labels(output)

# Keep only the columns needed downstream.
kept_columns = [
    'ID', 'labels', 'PatientID',
    'WindowCenter', 'WindowWidth',
    'RescaleIntercept', 'RescaleSlope',
    'Position3', 'PositionOrd',
    'LeftLabel', 'RightLabel',
    'BitsStored', 'PixelRepresentation',
    'brain_ratio', 'brain_diff',
]
output = output[kept_columns]

# Same distribution again: the steps above must not change the label counts.
show_distribution(output)

 60%|██████    | 305/508 [00:00<00:00, 3046.02it/s]

defaultdict(<class 'int'>,
            {'all': 537,
             'any': 449,
             'epidural': 91,
             'intraparenchymal': 90,
             'intraventricular': 92,
             'negative': 88,
             'subarachnoid': 89,
             'subdural': 87})
making adjacent labels...


100%|██████████| 508/508 [00:00<00:00, 2999.87it/s]

defaultdict(<class 'int'>,
            {'all': 537,
             'any': 449,
             'epidural': 91,
             'intraparenchymal': 90,
             'intraventricular': 92,
             'negative': 88,
             'subarachnoid': 89,
             'subdural': 87})





In [5]:
# dic = {}
# for classes in os.listdir(f'/home/islab/kaggle-rsna-intracranial-hemorrhage/TrainingData/'):
#     for img in os.listdir(f'/home/islab/kaggle-rsna-intracranial-hemorrhage/TrainingData/{classes}/'):
#         dic[img[:-4]] = classes
        
# for tmp in output.loc[output['labels'] == '']['ID']:
#     output.loc[output['ID'] == tmp, 'labels'] = 'any'

In [6]:
import random

def _make_folds(df, n_fold, seed):

    counter_gt = collections.defaultdict(int)
    for labels in df.labels.str.split():
        for label in labels:
            counter_gt[label] += 1

    counter_folds = collections.Counter()

    folds = {}
    min_labels = {}
    random.seed(seed)
    groups = df.groupby('PatientID')
    print('making %d folds...' % n_fold)
    for patient_id, group in tqdm(groups, total=len(groups)):

        labels = []
        for row in group.itertuples():
            for label in row.labels.split():
                labels.append(label)
        if not labels:
            labels = ['']

        count_labels = [counter_gt[label] for label in labels]
        min_label = labels[np.argmin(count_labels)]
        count_folds = [(f, counter_folds[(f, min_label)]) for f in range(n_fold)]
        min_count = min([count for f,count in count_folds])
        fold = random.choice([f for f,count in count_folds if count == min_count])
        folds[patient_id] = fold

        for label in labels:
            counter_folds[(fold,label)] += 1

    pprint(counter_folds)

    return folds

# A single fold is requested, so every patient ends up in fold 0.
folds = _make_folds(output, n_fold=1, seed=42)

 27%|██▋       | 134/498 [00:00<00:00, 667.12it/s]

making 1 folds...


100%|██████████| 498/498 [00:00<00:00, 663.60it/s]

Counter({(0, 'any'): 449,
         (0, 'intraventricular'): 92,
         (0, 'epidural'): 91,
         (0, 'intraparenchymal'): 90,
         (0, 'subarachnoid'): 89,
         (0, 'subdural'): 87,
         (0, ''): 87})





In [7]:
# Distinct label strings present in the frame.
output.labels.unique()

array(['epidural any', 'intraparenchymal any', 'subarachnoid any',
       'intraventricular any', 'subdural any', ''], dtype=object)

In [8]:
# Every row inherits the fold assigned to its patient.
output['fold'] = output['PatientID'].map(folds)

In [9]:
# from sklearn.model_selection import train_test_split

# train, val = train_test_split(output, test_size=0.1, stratify=output['labels'])

In [11]:
# with open('train.pkl', 'wb') as f:
#     pickle.dump(train, f)
    
# Persist the processed frame (including the fold column) for later use.
with open('val.pkl', 'wb') as val_file:
    pickle.dump(output, val_file)
    
# with open('total.pkl', 'wb') as f:
#     pickle.dump(output, f)