In [36]:
import os
import glob
import pandas as pd
import cv2
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from collections import Counter
from random import sample
import threading
from concurrent import futures

In [25]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(42)

# Dataset root (relative to the notebook) and the path of each split below it.
base_directory = '../chest-xray-pneumonia/chest_xray/chest_xray/'
train_dir, val_dir, test_dir = (
    os.path.join(base_directory, split_name)
    for split_name in ('train', 'val', 'test')
)

In [27]:
def prepare_dataset(data_dir):
    """Build a shuffled table of image paths and class labels for one split.

    Collects every ``*.jpeg`` file directly inside ``<data_dir>/NORMAL`` and
    ``<data_dir>/PNEUMONIA``, prints the two class counts, and returns the
    paths together with their labels.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``NORMAL`` and ``PNEUMONIA`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename`` (path string) and ``label`` (``'normal'`` or
        ``'pneumonia'``). Rows are shuffled with ``random_state=42`` and the
        index is reset to ``0..n-1``.
    """
    # glob yields paths in arbitrary, filesystem-dependent order. Sorting first
    # makes the fixed-seed shuffle below produce the same row order everywhere.
    normal_files = sorted(glob.glob(os.path.join(data_dir, 'NORMAL') + '/*.jpeg'))
    infected_files = sorted(glob.glob(os.path.join(data_dir, 'PNEUMONIA') + '/*.jpeg'))
    print(len(normal_files), len(infected_files))

    labelled = pd.DataFrame({
        'filename': normal_files + infected_files,
        'label': ['normal'] * len(normal_files) + ['pneumonia'] * len(infected_files),
    })
    # Shuffle so the two classes are interleaved rather than grouped.
    return labelled.sample(frac=1, random_state=42).reset_index(drop=True)

In [28]:
# Build the training table first: its length is needed to draw preview indices.
# (Drawing the indices before this assignment fails with NameError on a fresh kernel.)
training_data = prepare_dataset(train_dir)

# Pick 10 distinct row positions to preview. NumPy's generator is used so the
# np.random.seed(42) call in the configuration cell makes the selection repeatable.
random_index = np.random.choice(len(training_data), size=10, replace=False)
training_data.iloc[random_index]

1341 3875


Unnamed: 0,filename,label
1188,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
5094,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
277,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
3927,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
741,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
2355,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
3864,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
1772,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
1964,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
1736,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia


In [31]:
# Load the validation split. `random_index` was drawn from the training-set row
# range, so it is not reused here; the validation table is displayed directly.
validation_dataset = prepare_dataset(data_dir=val_dir)
validation_dataset

8 8


Unnamed: 0,filename,label
0,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
1,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
2,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
3,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
4,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
5,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
6,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
7,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
8,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
9,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia


In [40]:
#Obtaining image statistics in parallel
def get_image_shape(index,image,total_images):
    """Return the array shape of the image stored at path ``image``.

    Parameters
    ----------
    index : int
        Position of the image in the work list; used only for progress logging.
    image : str
        Path of the image file to read.
    total_images : int
        Total number of images being processed; used to log the final item.

    Returns
    -------
    tuple
        The ``shape`` of the array returned by ``cv2.imread(image)``.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot load the file (it returns ``None`` in that case).
    """
    # Log every 1000th image and the last one, tagged with the worker thread name.
    if index % 1000 == 0 or index == total_images - 1:
        print('{}: working on img num: {}'.format(threading.current_thread().name,index))

    pixels = cv2.imread(image)
    # cv2.imread signals failure by returning None rather than raising; without this
    # check the failure shows up as an AttributeError that does not name the file.
    if pixels is None:
        raise OSError('Could not read image: {}'.format(image))
    return pixels.shape

# One (index, path, total) record per training image.
training_images_input = [(index,image,len(training_data)) for index,image in enumerate(training_data['filename'])]
print('Starting the computation of image shape')

# The context manager shuts the pool down once all shapes are collected, so
# re-running this cell does not leave idle worker threads behind. `executor` is
# closed after this block; any later use needs a fresh pool.
with futures.ThreadPoolExecutor(max_workers=None) as executor:
    # zip(*records) splits the records into the three parallel argument sequences
    # (indices, paths, totals) that executor.map passes to get_image_shape.
    image_dims_computation = executor.map(get_image_shape, *zip(*training_images_input))
    # executor.map is lazy; materialise the results while the pool is still open.
    image_dimension_list = list(image_dims_computation)

# Per-axis statistics over all image shapes.
print('Min Dimensions:', np.min(image_dimension_list, axis=0))
print('Max Dimensions:', np.max(image_dimension_list, axis=0))
print('Mean Dimensions:', np.mean(image_dimension_list, axis=0))
print('Median Dimensions:', np.median(image_dimension_list, axis=0))

Starting the computation of image shape
ThreadPoolExecutor-3_0: working on img num: 0
ThreadPoolExecutor-3_37: working on img num: 1000
ThreadPoolExecutor-3_21: working on img num: 2000
ThreadPoolExecutor-3_12: working on img num: 3000
ThreadPoolExecutor-3_48: working on img num: 4000
ThreadPoolExecutor-3_22: working on img num: 5000
ThreadPoolExecutor-3_22: working on img num: 5215
Min Dimensions: [127 384   3]
Max Dimensions: [2663 2916    3]
Mean Dimensions: [ 968.07476994 1320.61081288    3.        ]
Median Dimensions: [ 888. 1284.    3.]
