In [1]:
# !pip install pydicom

In [9]:
# Place kaggle token in root
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: /Users/pazuzzu/.kaggle: File exists


In [10]:
# Output location, set to the current working directory.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
OUTPUT_DIR = './'

In [11]:
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Prepare Data

In [12]:
# !rm -rdf data

In [13]:
# Download data: one Kaggle dataset per body-part class (chest, kidney, brain, knee).
# Each command fetches a <dataset-name>.zip archive; the recorded output shows
# them landing in the notebook's working directory.
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia
!kaggle datasets download -d nazmul0087/ct-kidney-dataset-normal-cyst-tumor-and-stone
!kaggle datasets download -d abdulkader90/brain-ct-hemorrhage-dataset
# Heart dataset is currently disabled (see the matching commented lines in later cells).
# !kaggle datasets download -d nikhilroxtomar/ct-heart-segmentation
!kaggle datasets download -d shashwatwork/knee-osteoarthritis-dataset-with-severity

Downloading chest-xray-pneumonia.zip to /Users/pazuzzu/Documents/X-ray_classifier_yahya
100%|█████████████████████████████████████▉| 2.29G/2.29G [01:42<00:00, 31.9MB/s]
100%|██████████████████████████████████████| 2.29G/2.29G [01:42<00:00, 24.0MB/s]
Downloading ct-kidney-dataset-normal-cyst-tumor-and-stone.zip to /Users/pazuzzu/Documents/X-ray_classifier_yahya
100%|█████████████████████████████████████▉| 1.51G/1.52G [01:18<00:00, 24.1MB/s]
100%|██████████████████████████████████████| 1.52G/1.52G [01:18<00:00, 20.8MB/s]
Downloading brain-ct-hemorrhage-dataset.zip to /Users/pazuzzu/Documents/X-ray_classifier_yahya
 98%|███████████████████████████████████████ | 126M/129M [00:06<00:00, 20.4MB/s]
100%|████████████████████████████████████████| 129M/129M [00:07<00:00, 19.2MB/s]
Downloading knee-osteoarthritis-dataset-with-severity.zip to /Users/pazuzzu/Documents/X-ray_classifier_yahya
 99%|███████████████████████████████████████▍| 201M/204M [00:11<00:00, 18.2MB/s]
100%|███████████████████████

In [14]:
# Create folders to move to classes to (mixes up training, val and test)
!mkdir -p data/chest
!mkdir -p data/kidney
!mkdir -p data/brain
# !mkdir -p data/heart
!mkdir -p data/knee

In [15]:
# Unzip in appropriate folder
!unzip chest-xray-pneumonia.zip                          -d data/chest  &> /dev/null
!unzip ct-kidney-dataset-normal-cyst-tumor-and-stone.zip -d data/kidney &> /dev/null
!unzip brain-ct-hemorrhage-dataset.zip                   -d data/brain  &> /dev/null
# !unzip ct-heart-segmentation.zip                         -d data/heart  &> /dev/null
!unzip knee-osteoarthritis-dataset-with-severity.zip     -d data/knee   &> /dev/null

In [16]:
# Clean up zips
!rm *zip

In [17]:
# Function to get only file paths, ignoring intermediate subdirectories
# Lower-case file extensions (without the dot) that count as images.
SUPPORTED_FORMATS = {'jpg', 'jpeg', 'dcm', 'png'}

def get_image_file_paths(folder):
  """Recursively collect the paths of supported image files under `folder`.

  Args:
    folder: directory to walk (all nested subdirectories are visited).

  Returns:
    A list of paths (each one `os.path.join(root, file_name)`) for every file
    whose extension, compared case-insensitively, is in SUPPORTED_FORMATS.
  """
  file_paths = []
  for root, _, files in tqdm(os.walk(folder)):
    for f in files:
      # splitext only reports a real extension, so a file simply named "png"
      # is not mistaken for an image; lower() lets ".JPG" / ".PNG" through.
      extension = os.path.splitext(f)[1][1:].lower()
      if extension in SUPPORTED_FORMATS:
        file_paths.append(os.path.join(root, f))
  return file_paths

In [18]:
# Function to remove folders and keep files
import shutil

def remove_subdirectories(source_folder):
  """Flatten `source_folder`: pull image files up to its top level, then delete its subdirectories.

  Every path returned by get_image_file_paths(source_folder) is moved directly
  into `source_folder`; afterwards each immediate subdirectory is removed with
  everything left inside it (i.e. files that were not moved are deleted).

  NOTE: files from different subdirectories that share a base name target the
  same destination; on POSIX the later move silently replaces the earlier file.

  Args:
    source_folder: directory to flatten in place.
  """
  # move files to source_folder
  source_file_paths = get_image_file_paths(source_folder)
  for source_file_path in tqdm(source_file_paths, desc='Moving files...'):
    # basename() handles the platform's path separator, unlike split('/')
    destination_file_path = os.path.join(source_folder, os.path.basename(source_file_path))
    # Files already sitting at the top level do not need to be moved
    if os.path.abspath(source_file_path) != os.path.abspath(destination_file_path):
      shutil.move(source_file_path, destination_file_path)

  # remove subdirectories
  # Only the immediate children need deleting (rmtree is recursive); listing
  # them up front avoids deleting directories that os.walk is still visiting.
  subdir_paths = [entry.path for entry in os.scandir(source_folder)
                  if entry.is_dir(follow_symlinks=False)]
  for subdir_path in tqdm(subdir_paths, desc='Removing subdirectories'):
    shutil.rmtree(subdir_path)

In [19]:
# Flatten every class folder: supported image files (jpg, jpeg, dcm, png) are
# moved to the top of the folder and all subdirectories are removed.
# 'data/heart' is currently disabled.
for class_folder in ('data/chest', 'data/kidney', 'data/brain', 'data/knee'):
  remove_subdirectories(class_folder)

32it [00:00, 429.02it/s]
Moving files...: 100%|██████████| 17568/17568 [00:04<00:00, 3607.46it/s]
Removing subdirectories: 1it [00:00, 19.41it/s]
7it [00:00, 168.82it/s]
Moving files...: 100%|██████████| 12446/12446 [00:02<00:00, 4695.07it/s]
Removing subdirectories: 1it [00:00, 32.62it/s]
50it [00:00, 1983.57it/s]
Moving files...: 100%|██████████| 6772/6772 [00:01<00:00, 5041.44it/s]
Removing subdirectories: 1it [00:00, 29.79it/s]
25it [00:00, 828.88it/s]
Moving files...: 100%|██████████| 9786/9786 [00:01<00:00, 5641.10it/s]
Removing subdirectories: 1it [00:00, 31.57it/s]


In [20]:
# Function to convert dcm to jpg
import pydicom
from PIL import Image

def dcm_to_jpg(dcm_image, output_image):
  """Read a DICOM file and save its pixel data as an 8-bit image.

  Negative pixel values are clipped to 0 and the result is scaled so that the
  largest value maps to 255.

  Args:
    dcm_image: path of the DICOM file, passed to pydicom.dcmread.
    output_image: destination path, passed to PIL's Image.save.
  """
  dataset = pydicom.dcmread(dcm_image)
  pixels = np.maximum(dataset.pixel_array.astype(float), 0)
  peak = pixels.max()
  # An image with no positive pixel would otherwise be divided by zero
  # (NaNs cast to uint8); keep it as an all-zero image instead.
  if peak > 0:
    pixels = (pixels / peak) * 255.0
  scaled_image = np.uint8(pixels)
  Image.fromarray(scaled_image).save(output_image)

def convert_dcms_to_jpgs(dcm_image_folder):
  """Convert each 'dcm' file directly inside `dcm_image_folder` to a '.jpg' and delete the original.

  Only entries whose text after the last dot is exactly 'dcm' are handled; the
  converted file keeps the same name with the extension swapped.
  """
  for file_name in tqdm(os.listdir(dcm_image_folder)):
    stem, _, extension = file_name.rpartition('.')
    if extension != 'dcm':
      continue
    dcm_path = os.path.join(dcm_image_folder, file_name)
    jpg_path = os.path.join(dcm_image_folder, stem + '.jpg')
    dcm_to_jpg(dcm_path, jpg_path)
    # The DICOM source is dropped once its JPEG has been written
    os.remove(dcm_path)

In [21]:
# Function to check for and remove corrupted files
def remove_corrupted(folder):
  """Delete empty or undecodable image files located directly inside `folder`.

  A file is removed when its size is 0 bytes or when Pillow raises an
  IOError/OSError while opening and fully loading it. Sub-directories are
  skipped. Non-empty '.dcm' files are skipped as well: Pillow cannot decode
  DICOM, so they are left for the separate dcm -> jpg conversion instead of
  being deleted as "corrupted".

  Prints how many of the initially listed entries were removed.

  Args:
    folder: directory whose direct children are checked.
  """
  file_names = os.listdir(folder)
  len_files, counter = len(file_names), 0
  for f in tqdm(file_names):
    file_path = os.path.join(folder, f)

    # Directories are not images, and os.remove could not delete them anyway
    if not os.path.isfile(file_path):
      continue

    # Check for empty file
    if os.path.getsize(file_path) == 0:
      counter += 1
      os.remove(file_path)
      # The file is gone: opening it below would fail and trigger a second
      # os.remove, which raises FileNotFoundError.
      continue

    # DICOM files are validated/converted elsewhere, not through Pillow
    if f.lower().endswith('.dcm'):
      continue

    # Check for error by opening files with pillow
    try:
      # The context manager closes the handle even when load() fails, so the
      # file is never still open when it gets removed.
      with Image.open(file_path) as img:
        img.load()
    except IOError:
      counter += 1
      os.remove(file_path)
  print(f'\nFiles removed {counter}/{len_files} from {folder} folder !\n')

In [22]:
# Remove empty / unreadable files from every class folder ('data/heart' is disabled)
for class_folder in ('data/chest', 'data/kidney', 'data/brain', 'data/knee'):
  remove_corrupted(class_folder)

100%|██████████| 11712/11712 [00:41<00:00, 279.43it/s]



Files removed 5856/11712 from data/chest folder !



100%|██████████| 12447/12447 [00:56<00:00, 219.53it/s]



Files removed 1/12447 from data/kidney folder !



100%|██████████| 6771/6771 [00:14<00:00, 478.30it/s]



Files removed 0/6771 from data/brain folder !



100%|██████████| 9786/9786 [00:13<00:00, 726.72it/s]


Files removed 0/9786 from data/knee folder !






In [23]:
# Convert any remaining .dcm files to .jpg in every class folder ('data/heart' is disabled).
# TODO: consider converting before the corruption check in the future.
for class_folder in ('data/chest', 'data/kidney', 'data/brain', 'data/knee'):
  convert_dcms_to_jpgs(class_folder)

100%|██████████| 5856/5856 [00:00<00:00, 3072534.93it/s]
100%|██████████| 12446/12446 [00:00<00:00, 3150981.32it/s]
100%|██████████| 6771/6771 [00:00<00:00, 3255345.30it/s]
100%|██████████| 9786/9786 [00:00<00:00, 3313859.11it/s]


In [24]:
# number of examples
def get_examples_number(x):
  """Return the number of entries os.listdir reports for folder `x` (files and sub-folders alike)."""
  return len(os.listdir(x))

# Entry count of each class folder at this point of the notebook, i.e. before
# the optional down sampling below ('heart' is disabled).
chest_base_size, kidney_base_size, brain_base_size, knee_base_size = (
    get_examples_number(class_folder)
    for class_folder in ('data/chest', 'data/kidney', 'data/brain', 'data/knee')
)
# heart_base_size = get_examples_number('data/heart')

In [25]:
# Display number of examples per class ('heart' is disabled)
for class_name, class_size in (
    ('chest', chest_base_size),
    ('kidney', kidney_base_size),
    ('brain', brain_base_size),
    ('knee', knee_base_size),
):
  print(f'\nThe {class_name} class has\t', class_size, '\texamples')


The chest class has	 5856 	examples

The kidney class has	 12446 	examples

The brain class has	 6771 	examples

The knee class has	 9786 	examples


## Down Sampling (optional; may be used later)

In [26]:
# Function to down sample other classes(OPTIONAL)
import random

def keep_n_examples(folder, m):
  """Randomly delete entries of `folder` until at most `m` of them remain.

  If the folder already holds `m` entries or fewer, nothing is deleted. (The
  previous loop never reached its stop condition in that case and wiped the
  whole folder.)

  Args:
    folder: directory whose direct entries are sampled; entries are expected
      to be files, since they are deleted with os.remove.
    m: number of entries to keep; negative values are treated as 0.
  """
  files = os.listdir(folder)
  surplus = len(files) - max(m, 0)
  if surplus <= 0:
    return
  # random.sample picks exactly `surplus` distinct victims
  for f in tqdm(random.sample(files, surplus)):
    os.remove(os.path.join(folder, f))

In [27]:
# Down sample the other classes to M examples each (OPTIONAL).
# M equals the chest class size printed above, so 'data/chest' is left untouched.
M = 5856
# keep_n_examples('data/chest', M)
for class_folder in ('data/kidney', 'data/brain', 'data/knee'):
  keep_n_examples(class_folder, M)

 53%|█████▎    | 6589/12446 [00:00<00:00, 7115.60it/s]
 13%|█▎        | 914/6771 [00:00<00:01, 5667.44it/s]
 40%|████      | 3929/9786 [00:00<00:00, 7888.80it/s]


In [28]:
# Display number of examples per class after down sampling ('heart' is disabled)
for class_name in ('chest', 'kidney', 'brain', 'knee'):
  print(f'\nThe {class_name} class has\t', get_examples_number(f'data/{class_name}'), '\texamples')


The chest class has	 5856 	examples

The kidney class has	 5856 	examples

The brain class has	 5856 	examples

The knee class has	 5856 	examples


In [29]:
# !cp -r data/ ./

## Split Data

In [None]:
# !cp -r ./data/ ./

In [30]:
import random

# Function to split data
def split_examples(source_dir, train_dir, validation_dir, train_split_rate, seed=None):
  """Randomly copy the files of `source_dir` into a training and a validation folder.

  The files are copied, not moved: `source_dir` is left untouched. The first
  int(train_split_rate * n) files of the shuffled listing go to `train_dir`,
  the rest to `validation_dir`. Sub-directories of `source_dir` are ignored.

  Existing content of the destination folders is not cleared. Re-running with
  a different shuffle therefore adds a second, different split on top of the
  first one, so the same image can end up in both folders. Pass `seed` (or
  empty the folders first) to keep re-runs consistent.

  Args:
    source_dir: folder containing the files to split.
    train_dir: destination for the training share (created if missing).
    validation_dir: destination for the validation share (created if missing).
    train_split_rate: fraction of files, between 0 and 1, copied to `train_dir`.
    seed: optional seed for a reproducible split; None (default) shuffles with
      the global `random` state, as before.

  Raises:
    ValueError: if `train_split_rate` is not within [0, 1].
  """
  if not 0 <= train_split_rate <= 1:
    raise ValueError(f'train_split_rate must be within [0, 1], got {train_split_rate}')

  # Sorted listing: os.listdir order is arbitrary, so sorting is what makes a
  # given seed produce the same split every time.
  file_names = sorted(f for f in os.listdir(source_dir)
                      if os.path.isfile(os.path.join(source_dir, f)))
  rng = random if seed is None else random.Random(seed)
  rng.shuffle(file_names)

  train_split_size = int(train_split_rate * len(file_names))
  splits = (
    (train_dir, file_names[:train_split_size], 'Moving training examples...'),
    (validation_dir, file_names[train_split_size:], 'Moving validation examples...'),
  )
  for destination_dir, split, description in splits:
    os.makedirs(destination_dir, exist_ok=True)
    for f in tqdm(split, desc=description):
      shutil.copy(os.path.join(source_dir, f),
                  os.path.join(destination_dir, f))

In [31]:
# Create train and validation dirs
!mkdir -p data/train/chest   data/validation/chest
!mkdir -p data/train/kidney  data/validation/kidney
!mkdir -p data/train/brain   data/validation/brain
!mkdir -p data/train/heart   data/validation/heart
!mkdir -p data/train/knee  data/validation/knee

In [32]:
# Split each class folder and copy its files into the train & validation directories
SPLIT = 0.8  # fraction of every class that goes to training

# 'heart' is disabled
for class_name in ('chest', 'kidney', 'brain', 'knee'):
  split_examples(f'data/{class_name}', f'data/train/{class_name}', f'data/validation/{class_name}', SPLIT)

Moving training examples...: 100%|██████████| 4684/4684 [00:05<00:00, 932.80it/s] 
Moving validation examples...: 100%|██████████| 1172/1172 [00:00<00:00, 1297.81it/s]
Moving training examples...: 100%|██████████| 4684/4684 [00:03<00:00, 1537.60it/s]
Moving validation examples...: 100%|██████████| 1172/1172 [00:00<00:00, 1577.62it/s]
Moving training examples...: 100%|██████████| 4684/4684 [00:01<00:00, 2696.47it/s]
Moving validation examples...: 100%|██████████| 1172/1172 [00:00<00:00, 2539.74it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'data/heart'

In [None]:
# Sanity check: report how many examples each class has in the train and
# validation folders, and which share of the split each side represents.

chest_train_size, chest_validation_size = get_examples_number('data/train/chest'), get_examples_number('data/validation/chest')
kidney_train_size, kidney_validation_size = get_examples_number('data/train/kidney'), get_examples_number('data/validation/kidney')
brain_train_size, brain_validation_size = get_examples_number('data/train/brain'), get_examples_number('data/validation/brain')
# heart_train_size, heart_validation_size = get_examples_number('data/train/heart'), get_examples_number('data/validation/heart')
knee_train_size, knee_validation_size = get_examples_number('data/train/knee'), get_examples_number('data/validation/knee')

for class_name, train_size, validation_size in (
    ('chest', chest_train_size, chest_validation_size),
    ('kidney', kidney_train_size, kidney_validation_size),
    ('brain', brain_train_size, brain_validation_size),
    ('knee', knee_train_size, knee_validation_size),
):
  # Percentages are relative to what is actually in the two split folders.
  # The *_base_size values were measured before the optional down sampling,
  # so dividing by them gives wrong shares for down-sampled classes.
  total_size = train_size + validation_size
  train_share = train_size / total_size if total_size else 0.0
  validation_share = validation_size / total_size if total_size else 0.0
  print(f'\nThe {class_name} class folder has\n\t', train_size, f'training examples ({train_share:.0%})\n\t', validation_size, f'validation examples ({validation_share:.0%})')

# Create Generators

In [None]:
# Initiate data generators
# TODO: uncomment the augmentation options below to enable them
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training generator: pixel values are multiplied by 1/255; augmentation is off for now.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    # --- augmentation options (disabled) ---
    # rotation_range=40,
    # height_shift_range=.2,
    # width_shift_range=.2,
    # horizontal_flip=True,     # relevant in x-ray
    # zoom_range=.2,
    # shear_range=.2,
    # fill_mode='nearest',
)

# Validation generator: same rescaling, never augmented.
validation_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

In [None]:
# Build the batch iterators from the class sub-folders of data/train and data/validation.
# (A validation split could instead be created automatically with the "validation_split" parameter.)
train_generator = train_datagen.flow_from_directory(
    'data/train',
    class_mode='categorical',
    batch_size=32,
    target_size=(150, 150),  # Change to 256 ?
)
validation_generator = validation_datagen.flow_from_directory(
    'data/validation',
    class_mode='categorical',
    batch_size=32,
    target_size=(150, 150),  # Change to 256 ?
)

Found 28087 images belonging to 5 classes.
Found 7026 images belonging to 5 classes.
