## Project structure 
---

In [1]:
# For dealing with files
import os
import shutil
import pandas as pd
import numpy as np
# For using regex expressions
import re

# For splitting the data
from sklearn.model_selection import train_test_split

# For loding img and reshaping
import pydicom as dicom
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
if os.path.exists('stage_1_test_images.zip'):
    os.remove('stage_1_test_images.zip')
if os.path.exists('stage_1_train_images.zip'):
    os.remove('stage_1_train_images.zip')
if os.path.exists('stage_1_train_images.zip'):
    os.remove('test.zip')
if os.path.exists('test'):
    shutil.rmtree('test')
if os.path.exists('train'):
    shutil.rmtree('train')

# Depending on your machine the following might take some seconds to run
!unzip -q all.zip
!unzip -q test1.zip
!unzip -q train.zip

if os.name == "nt": # windows
    !move test1 test
    !del test1.zip train.zip
else:
    !mv test1 test
    !rm test1.zip train.zip

'unzip' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.
Det går inte att hitta filen.
Could Not Find C:\Users\sandr\Documents\DnnProject\Project\test1.zip


## Segrigate data in class directories
---

In [19]:
proj_path = r'C:\Users\sandr\Documents\DnnProject\Project\stage_1_train_images'

# get a pandas data frame for training data
try:
    df = pd.read_csv('stage_1_detailed_class_info.csv')
except FileNotFoundError:
    print ('already have data frame')
# save where you currently are (for coming back afterwards)
old_path = os.getcwd()

# cd to train directory
try:
    os.chdir(proj_path)
except FileNotFoundError:
    print('already in Training set stage 1 directory')

# Making directories for data for healthy, Pneumonia and abnormal classes 
if not os.path.isdir('Pneumonia'):
    os.mkdir('Pneumonia')
    
if not os.path.isdir('Abnormal'):
    os.mkdir('Abnormal')
             
if not os.path.isdir('Healthy'):
    os.mkdir('Healthy')


# Separate all in the three catigories
for dirName, subdirList, fileList in os.walk(proj_path):
    for filename in fileList:
        if ".dcm" in filename.lower():  # check whether the file's DICOM
            #print(filename[:-4])
            #print(df.head())
            #print(df[df['patientId'] == filename[:-4] ]['class'].tolist()[0])
            if(df[df['patientId'] == filename[:-4] ]['class'].tolist()[0] == 'Normal' ):
                shutil.move(filename, os.path.join(os.getcwd(), 'Healthy', filename))
            elif (df[df['patientId'] == filename[:-4] ]['class'].tolist()[0] == 'No Lung Opacity / Not Normal' ):
                shutil.move(filename, os.path.join(os.getcwd(), 'Abnormal', filename))
            else:
                shutil.move(filename, os.path.join(os.getcwd(), 'Pneumonia', filename))

already have data frame


FileNotFoundError: [Errno 2] No such file or directory: '0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm'

## Make small training and validation sets
---

In [33]:
# Parameters for creating your sub set
proportion = 0.1
randome = False
size_hight, size_width = 256, 256 # Resizing the images
filename = 'small_training_set_'

#make sure you are in 'Training set stage 1' dir
try:
    os.chdir('Training set stage 1')
except FileNotFoundError:
    print('already in Training set stage 1 directory')

# Making directories for smal data set of the different classes
if not os.path.isdir('Pneumonia_small'):
    os.mkdir('Pneumonia_small')
    
if not os.path.isdir('Abnormal_small'):
    os.mkdir('Abnormal_small')
             
if not os.path.isdir('Healthy_small'):
    os.mkdir('Healthy_small')

list_pneumonia_img = os.listdir('Pneumonia') 
list_abnormal_img = os.listdir('Abnormal')
list_healthy_img = os.listdir('Healthy')
size_small_data_set = round(len(list_pneumonia_img)*proportion)
print('Filezise will be: %d kb', size_hight * round(size_small_data_set))

if(randome):
    sub_list_pneumonia_img = np.choice(list_pneumonia_img, [size_small_data_set], replace = False)
    sub_list_abnormal_img = np.choice(list_abnormal_img, [size_small_data_set], replace = False)
    sub_list_healthy_img = np.choice(list_healthy_img, [size_small_data_set], replace = False)
else:      
    sub_list_pneumonia_img = list_pneumonia_img[0:size_small_data_set]
    sub_list_abnormal_img = list_abnormal_img[0:size_small_data_set]
    sub_list_healthy_img = list_healthy_img[0:size_small_data_set]


small_training_set_pneumonia = np.zeros([size_small_data_set, size_hight, size_width], dtype = np.float32)
small_training_set_abnormal = np.zeros([size_small_data_set, size_hight, size_width], dtype = np.float32)
small_training_set_healthy = np.zeros([size_small_data_set, size_hight, size_width], dtype = np.float32)

if not os.path.isfile(os.path.join(os.getcwd(), 'Pneumonia_small', filename + 'pneumonia')):
    for f in sub_list_pneumonia_img:
        if ".dcm" in f.lower():  # check whether the file's DICOM
            # read the file
            ds = dicom.read_file(os.path.join(os.getcwd(), 'Pneumonia', f))
            # store the raw image data
            rezise = tf.image.resize_images(np.flipud(np.reshape(ds.pixel_array,[1024,1024,1])),
                                            [size_hight, size_width],
                                            align_corners=False,
                                            preserve_aspect_ratio=False)
            small_training_set_pneumonia[sub_list_pneumonia_img.index(f), :, :] = tf.Session().run(rezise[:,:,0])
    np.save(os.path.join(os.getcwd(), 'Pneumonia_small', filename + 'pneumonia'), small_training_set_pneumonia, allow_pickle = False)

if not os.path.isfile(os.path.join(os.getcwd(), 'Abnormal_small', filename + 'abnormal')):
    for f in sub_list_abnormal_img:
        if ".dcm" in f.lower():  # check whether the file's DICOM
            # read the file
            ds = dicom.read_file(os.path.join(os.getcwd(), 'Abnormal', f))
            # store the raw image data
            rezise = tf.image.resize_images(np.flipud(np.reshape(ds.pixel_array,[1024,1024,1])),
                                            [size_hight, size_width],
                                            align_corners=False,
                                            preserve_aspect_ratio=False)
            small_training_set_pneumonia[sub_list_abnormal_img.index(f), :, :] = tf.Session().run(rezise[:,:,0])
    np.save(os.path.join(os.getcwd(), 'Abnormal_small', filename + 'abnormal'), small_training_set_abnormal, allow_pickle = False)

if not os.path.isfile(os.path.join(os.getcwd(), 'Healthy_small', filename + 'healthy')):
    for f in sub_list_healthy_img:
        if ".dcm" in f.lower():  # check whether the file's DICOM
            # read the file
            ds = dicom.read_file(os.path.join(os.getcwd(), 'Healthy', f))
            # store the raw image data
            rezise = tf.image.resize_images(np.flipud(np.reshape(ds.pixel_array,[1024,1024,1])),
                                            [size_hight, size_width],
                                            align_corners=False,
                                            preserve_aspect_ratio=False)
            small_training_set_pneumonia[sub_list_healthy_img.index(f), :, :] = tf.Session().run(rezise[:,:,0])
    np.save(os.path.join(os.getcwd(), 'Healthy_small', filename + 'healthy'), small_training_set_healthy, allow_pickle = False)

already in Training set stage 1 directory
Filezise will be: %d kb 144896


## Rescaling lables

In [6]:
proj_path = r'C:\Users\sandr\Documents\DnnProject\Project'
try:
    os.chdir(proj_path)
except FileNotFoundError:
    print('already in Training set stage 1 directory')
# get a pandas data frame for training data
try:
    df = pd.read_csv('stage_1_train_labels.csv')
except FileNotFoundError:
    print ('already have data frame')

list_healthy = os.listdir(os.path.join(proj_path,'stage_1_train_images', 'Healthy'))
for i in list_healthy:
    list_healthy[list_healthy.index(i)] = i[:-4]
list_abnormal = os.listdir(os.path.join(proj_path,'stage_1_train_images', 'Abnormal'))
for i in list_abnormal:
    list_abnormal[list_abnormal.index(i)] = i[:-4]
list_pneumonia = os.listdir(os.path.join(proj_path,'stage_1_train_images', 'Pneumonia'))
for i in list_pneumonia:
    list_pneumonia[list_pneumonia.index(i)] = i[:-4]


is_in_healthy = df['patientId'].isin(list_healthy)
healthy_df = df[is_in_healthy]
print(is_in_healthy)
is_in_abnormal = df['patientId'].isin(list_abnormal) 
abnormal_df = df[is_in_abnormal]
is_in_pneumonia = df['patientId'].isin(list_pneumonia) 
pneumonia_df = df[is_in_pneumonia]

#if not os.path.isfile('train_small_label.csv'):
#    frames = [healthy_df, abnormal_df, pneumonia_df]
#    small_train_lable = pd.concat(frames)
#    small_train_lable.to_csv(os.path.join(proj_path, 'stage_1_train_images', 'train_small_label.csv'))



<class 'pandas.core.series.Series'>


In [None]:
C:\Users\sandr\Documents\DnnProject\Project\stage_1_train_images\Healthy