In [1]:
import matplotlib.pyplot as plt
import pydicom
import pandas as pd
import cv2
import numpy as np

In [2]:
import sys
# Prepend the competition input directory to the module search path so that the
# `mask_functions` import below can be resolved (presumably the helper module
# ships in that directory — verify against the dataset layout).
sys.path.insert(0, '../../input/siim-acr-pneumothorax-segmentation')
import fastai  # NOTE(review): not referenced in the visible cells — confirm it is still needed
# RLE <-> mask helpers; only rle2mask is used in the visible cells.
from mask_functions import rle2mask, mask2rle

In [3]:
import os

# Root of the dataset; every other path in this cell is derived from it.
DATA_DIR = '../../input/pneumonothorax-data'

# Inputs: the DICOM image trees and the RLE annotation table.
TRAIN_DIR = os.path.join(DATA_DIR, 'dicom-images-train')
TEST_DIR = os.path.join(DATA_DIR, 'dicom-images-test')
ANNOT_PATH = os.path.join(DATA_DIR, 'train-rle.csv')

# Outputs for the training split: images, masks and the metadata table.
NEW_TRAIN_DIR = os.path.join(DATA_DIR, 'train', 'images')
NEW_ANNOT_DIR = os.path.join(DATA_DIR, 'train', 'masks')
NEW_TRAIN_METADATA_PATH = os.path.join(DATA_DIR, 'train', 'metadata.csv')

# Outputs for the test split: images and the metadata table.
NEW_TEST_DIR = os.path.join(DATA_DIR, 'test', 'images')
NEW_TEST_METADATA_PATH = os.path.join(DATA_DIR, 'test', 'metadata.csv')
def create_dir(direc):
    """Create directory ``direc``, including missing parents, if it is not there yet.

    Calling this on a directory that already exists is a no-op. Any other
    failure (no permission, read-only filesystem, a regular file already
    occupying the path) is raised as ``OSError`` instead of being silently
    ignored, so a later write into a directory that was never created does not
    fail far away from the real cause.

    Args:
        direc: Path of the directory to create.

    Raises:
        OSError: If the directory does not exist and cannot be created.
    """
    # exist_ok=True tolerates only the "already a directory" case.
    os.makedirs(direc, exist_ok=True)
# Make sure every output directory exists before anything is written into it.
for _out_dir in (NEW_TRAIN_DIR, NEW_TEST_DIR, NEW_ANNOT_DIR):
    create_dir(_out_dir)


In [4]:


def drop_unusable_cols(metadata):
    """Remove, in place, the columns of ``metadata`` that carry no usable signal.

    Empty strings are first turned into NaN and columns that are entirely NaN
    are dropped. After that, every column other than ``'ImageId'`` is dropped
    when it holds a single distinct value or when its number of distinct
    values equals the number of rows. Columns whose values cannot be counted
    (``nunique`` raises ``TypeError``, e.g. unhashable cells) are kept.

    Args:
        metadata: DataFrame to clean; it is modified in place.

    Returns:
        None.
    """
    metadata.replace('', np.nan, inplace=True)
    metadata.dropna(how='all', axis=1, inplace=True)

    n_rows = len(metadata)
    doomed = []
    for col in metadata.columns:
        if col == 'ImageId':
            continue
        try:
            distinct = metadata[col].nunique()
        except TypeError:
            # Values that cannot be hashed cannot be counted; leave the column alone.
            continue
        if distinct == 1 or distinct == n_rows:
            doomed.append(col)

    metadata.drop(columns=doomed, inplace=True)
def convert_datasets(img_dir, annot_path, img_save_dir, mask_save_dir, metadata_save_path):
    """Convert a tree of DICOM files into PNG images, PNG masks and a metadata CSV.

    Every file found under ``img_dir`` is read with ``pydicom.dcmread``. Its
    pixel array is written to ``<img_save_dir>/<image_id>.png`` where
    ``image_id`` is the file name without extension, and its header attributes
    are collected into a table. The set of collected attributes is fixed by the
    first file read: the dataset's trait names that start with an upper-case
    letter, except ``PixelData``.

    When ``annot_path`` is given, the CSV is loaded with ``ImageId`` as index
    and its first remaining column is taken as the RLE string. For every image
    that has at least one RLE entry other than ``-1``, all of its entries are
    decoded with ``rle2mask`` and merged into one mask written to
    ``<mask_save_dir>/<image_id>.png``. Images without an annotation row, or
    whose only entry is ``-1``, get no mask file.

    Args:
        img_dir: Root directory that is walked recursively for DICOM files.
        annot_path: Path to the RLE annotation CSV, or a falsy value to skip
            mask generation.
        img_save_dir: Existing directory that receives the PNG images.
        mask_save_dir: Existing directory that receives the PNG masks; only
            used when ``annot_path`` is given.
        metadata_save_path: Destination of the metadata CSV.

    Returns:
        The metadata DataFrame after ``drop_unusable_cols`` has been applied;
        the same table is written to ``metadata_save_path`` without its index.
    """
    annotation = None
    if annot_path:
        annotation = pd.read_csv(annot_path, index_col='ImageId')

    metadata = {}
    for root, _, fils in os.walk(img_dir):
        for fil in fils:
            # Join the file currently being visited; joining fils[0] would
            # re-process the first file of the directory for every entry.
            dicom_path = os.path.join(root, fil)
            image_id = os.path.splitext(os.path.basename(dicom_path))[0]
            dataset = pydicom.dcmread(dicom_path)
            pixels = dataset.pixel_array

            if not metadata:
                # The first file decides which header attributes are recorded.
                metadata = {k: [] for k in dataset.trait_names() if k[0].isupper() and k != 'PixelData'}
                metadata['ImageId'] = []

            cv2.imwrite(os.path.join(img_save_dir, image_id + '.png'), pixels)

            for name in metadata:
                if name != 'ImageId':
                    metadata[name].append(getattr(dataset, name))
            metadata['ImageId'].append(image_id)

            if annotation is None or image_id not in annotation.index:
                continue

            # Selecting with a list always yields a DataFrame, so an image with
            # one annotation row and an image with several are handled alike.
            # strip() removes the padding around the RLE text without assuming
            # exactly one leading character.
            rle_codes = [str(code).strip() for code in annotation.loc[[image_id]].iloc[:, 0]]
            rle_codes = [code for code in rle_codes if code != '-1']
            if not rle_codes:
                continue

            height, width = pixels.shape[0], pixels.shape[1]
            merged_mask = None
            for code in rle_codes:
                part = rle2mask(code, width, height).T
                # assumes rle2mask returns a non-negative array whose non-zero
                # entries mark the annotated region — TODO confirm; under that
                # assumption the element-wise maximum is the union of regions.
                merged_mask = part if merged_mask is None else np.maximum(merged_mask, part)
            cv2.imwrite(os.path.join(mask_save_dir, image_id + '.png'), merged_mask)

    metadata = pd.DataFrame(metadata)
    drop_unusable_cols(metadata)
    metadata.to_csv(metadata_save_path, index=False)
    return metadata


In [5]:
# Test split: no annotation file and no mask directory are passed.
test_metadata = convert_datasets(
    img_dir=TEST_DIR,
    annot_path=None,
    img_save_dir=NEW_TEST_DIR,
    mask_save_dir=None,
    metadata_save_path=NEW_TEST_METADATA_PATH,
)

In [6]:
# Training split: the annotation CSV and the mask output directory are passed as well.
train_metadata = convert_datasets(
    img_dir=TRAIN_DIR,
    annot_path=ANNOT_PATH,
    img_save_dir=NEW_TRAIN_DIR,
    mask_save_dir=NEW_ANNOT_DIR,
    metadata_save_path=NEW_TRAIN_METADATA_PATH,
)

In [7]:
# Preview the first rows of the metadata table returned for the test split.
test_metadata.head()

Unnamed: 0,PatientAge,PatientSex,PixelSpacing,SeriesDescription,ViewPosition,ImageId
0,81,M,"[0.14300000000000002, 0.14300000000000002]",view: PA,PA,1.2.276.0.7230010.3.1.4.8323329.5797.151787519...
1,26,M,"[0.14300000000000002, 0.14300000000000002]",view: PA,PA,1.2.276.0.7230010.3.1.4.8323329.5798.151787519...
2,58,M,"[0.168, 0.168]",view: AP,AP,1.2.276.0.7230010.3.1.4.8323329.5799.151787519...
3,51,F,"[0.19431099999999998, 0.19431099999999998]",view: PA,PA,1.2.276.0.7230010.3.1.4.8323329.580.1517875163...
4,68,F,"[0.171, 0.171]",view: AP,AP,1.2.276.0.7230010.3.1.4.8323329.5800.151787519...
