In [1]:
# Import libraries for exploratory data analysis and model training
import numpy as np
import pandas as pd
from PIL import Image
from torchvision import transforms as T

In [2]:
from glob import glob

# Create a list 'files' of the 5573 downloaded images saved locally
xray_files = np.array(glob('chestXrays/all/*'))

In [3]:
# Check that file names have been stored correctly
xray_files

array(['chestXrays/all/person281_bacteria_1328.jpeg',
       'chestXrays/all/person294_bacteria_1380.jpeg',
       'chestXrays/all/person998_bacteria_2927.jpeg', ...,
       'chestXrays/all/IM-0312-0001.jpeg',
       'chestXrays/all/person260_bacteria_1223.jpeg',
       'chestXrays/all/person25_bacteria_115.jpeg'], dtype='<U45')

In [4]:
# Create a dataframe with file names so we can use 'apply()' later to transform images to tensors
xray_df = pd.DataFrame(data=xray_files, columns=['file_name'])

In [5]:
def get_shape(tensor):
    return tensor.shape[0]

In [6]:
# Ensure that the dataframe is correctly structured
xray_df.head()

Unnamed: 0,file_name
0,chestXrays/all/person281_bacteria_1328.jpeg
1,chestXrays/all/person294_bacteria_1380.jpeg
2,chestXrays/all/person998_bacteria_2927.jpeg
3,chestXrays/all/person59_bacteria_282.jpeg
4,chestXrays/all/person115_virus_218.jpeg


In [7]:
# Simple function to transform the images for processing
def to_tensor(img_url):
    transform = T.Compose([T.Resize(112), T.Resize((112,160)), T.ToTensor()])
    return transform(Image.open(img_url))

In [8]:
# Verify tensor shape across a few samples
for xray in xray_files[:5]:
    print(to_tensor(xray).shape)
    
to_tensor(xray_files[0]).shape[0]

torch.Size([1, 112, 160])
torch.Size([1, 112, 160])
torch.Size([1, 112, 160])
torch.Size([1, 112, 160])
torch.Size([1, 112, 160])


1

In [9]:
# Create a new column 'tensor' with the to_tensor() function applied to 'file_name'
xray_df['tensor'] = xray_df['file_name'].apply(to_tensor)

In [10]:
# Create a new function to get shape of tensors that can be applied to a dataframe column
def get_shape(tensor):
    return tensor.shape[0]

In [11]:
# Test the get_shape() function to ensure it works properly
get_shape(xray_df['tensor'][0])

1

In [12]:
xray_df['channels'] = xray_df['tensor'].apply(get_shape)

In [13]:
# Check to make sure all images are grayscale and have 1 channel
xray_df[xray_df['channels']>1]['file_name'].head()

Series([], Name: file_name, dtype: object)

In [14]:
import os

# Remove any color images with 3 channels

count = 0

for file in xray_df[xray_df['channels']>1]['file_name']:
    os.remove(file)
    count+=1

print('{} files removed.'.format(count))

0 files removed.


In [15]:
# Verify that the dataframe is correctly created
xray_df.head()

Unnamed: 0,file_name,tensor,channels
0,chestXrays/all/person281_bacteria_1328.jpeg,"[[[tensor(0.5804), tensor(0.5765), tensor(0.57...",1
1,chestXrays/all/person294_bacteria_1380.jpeg,"[[[tensor(0.6941), tensor(0.6824), tensor(0.64...",1
2,chestXrays/all/person998_bacteria_2927.jpeg,"[[[tensor(0.8745), tensor(0.8235), tensor(0.71...",1
3,chestXrays/all/person59_bacteria_282.jpeg,"[[[tensor(0.0471), tensor(0.0392), tensor(0.03...",1
4,chestXrays/all/person115_virus_218.jpeg,"[[[tensor(0.3529), tensor(0.4039), tensor(0.49...",1


In [16]:
# Verify that the shape of the tensors are as expected
xray_df['tensor'][1].shape

torch.Size([1, 112, 160])

In [17]:
# Calculate mean and standard deviation of image tensors
tensor_vals = []

for tensor in xray_df['tensor'][:4]:
    tensor_vals.append(tensor.mean())

img_mean = np.mean(tensor_vals)
img_std = np.std(tensor_vals)
    
print('mean: {}'.format(img_mean))
print('std: {}'.format(img_std))

mean: 0.4954160153865814
std: 0.05647209659218788
