In [1]:
import sys
import os, os.path

sys.path.append(os.path.join(os.getcwd() ,'/modules'))
root_path = "C:/git/Springboard-Public/Capstone Project 2/"
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    root_path = "/content/drive/My Drive/Capstone Project 2/"

print('Current Working Dir: ', os.getcwd())
print('Root Path: ', root_path)

# We need to set the working directory since we are using relative paths from various locations
if os.getcwd() != root_path:
  os.chdir(root_path)

Current Working Dir:  C:\git\Springboard-Public\Capstone Project 2\notebooks\Support Notebooks for Modules
Root Path:  C:/git/Springboard-Public/Capstone Project 2/


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import os, os.path
from PIL import Image
import copy

from modules.lib.ChextXRayImages import CleanMetaData, Dataset, Loaders

import torch.optim as optim
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor, ToPILImage
import torchvision.models as models

from torchsummary import summary

%matplotlib inline

### Get DataFrame from our Prep class

In [3]:
cleanMetaData = CleanMetaData()

In [4]:
df = cleanMetaData.getCleanDF()
df

Unnamed: 0_level_0,PatientID,StudyID,Age,Sex_Male,Sex_Unknown,Orientation_PA,Support Devices,Image_Path,Hierarchical_Path,Enlarged_Cardiomediastinum,...,Lung_Opacity,Lung_Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural_Effusion,Pleural_Other,Fracture
ImageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,68,0,0,0,1.0,data/raw/train/patient00001/study1/view1_front...,data/d0/d1/i0.jpg,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2,87,0,0,0,0.0,data/raw/train/patient00002/study2/view1_front...,data/d1/d2/i1.jpg,0,...,1,0,0,0,0,0,0,0,0,1
2,2,1,83,0,0,0,0.0,data/raw/train/patient00002/study1/view1_front...,data/d2/d2/i2.jpg,0,...,1,0,0,0,0,0,0,0,0,1
4,3,1,41,1,0,0,0.0,data/raw/train/patient00003/study1/view1_front...,data/d4/d3/i4.jpg,0,...,0,0,1,0,0,0,0,0,0,0
5,4,1,20,0,0,1,0.0,data/raw/train/patient00004/study1/view1_front...,data/d5/d4/i5.jpg,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,64537,2,59,1,0,0,0.0,data/raw/train/patient64537/study2/view1_front...,data/d9/d37/i223409.jpg,0,...,0,0,0,0,0,0,0,1,0,0
223410,64537,1,59,1,0,0,0.0,data/raw/train/patient64537/study1/view1_front...,data/d10/d37/i223410.jpg,0,...,0,0,0,0,0,0,0,0,0,0
223411,64538,1,0,0,0,0,0.0,data/raw/train/patient64538/study1/view1_front...,data/d11/d38/i223411.jpg,0,...,0,0,0,0,0,0,0,0,0,0
223412,64539,1,0,0,0,0,0.0,data/raw/train/patient64539/study1/view1_front...,data/d12/d39/i223412.jpg,0,...,1,0,0,0,0,1,0,0,0,0


### Build our own DataSet class

We need to do this so that the DS will hold the multiple labels

In the constructor, we walk every row in the DataFrame from above and hold all the label values in a list for each feature.

The index of these lists match the image.  The iterator returns key value pairs.  The first is the image tensor (converted to tensor via the transform).  The other keys map to the 12 features.

In [5]:
class ChestXRayDataset(torch.utils.data.Dataset):
    def __init__(self, df, transform=None):

        # initialize the arrays to store the ground truth labels and paths to the images
        self.data = []
        self.Enlarged_Cardiomediastinum = []
        self.Cardiomegaly = []
        self.Lung_Opacity = []
        self.Lung_Lesion = []
        self.Edema = []
        self.Consolidation = []
        self.Pneumonia = []
        self.Atelectasis = []
        self.Pneumothorax = []
        self.Pleural_Effusion = []
        self.Pleural_Other = []
        self.Fracture = []
        
        self.transform = transform
        self.to_tensor = ToTensor()
        self.to_pil = ToPILImage()

        # Load the image and lables for each row in the DataFrame
        for _, row in df.iterrows():
            self.data.append(row.Image_Path)
            self.Enlarged_Cardiomediastinum.append(row['Enlarged_Cardiomediastinum'])
            self.Cardiomegaly.append(row['Cardiomegaly'])
            self.Lung_Opacity.append(row['Lung_Opacity'])
            self.Lung_Lesion.append(row['Lung_Lesion'])
            self.Edema.append(row['Edema'])
            self.Consolidation.append(row['Consolidation'])
            self.Pneumonia.append(row['Pneumonia'])
            self.Atelectasis.append(row['Atelectasis'])
            self.Pneumothorax.append(row['Pneumothorax'])
            self.Pleural_Effusion.append(row['Pleural_Effusion'])
            self.Pleural_Other.append(row['Pleural_Other'])
            self.Fracture.append(row['Fracture'])
            
    def __len__(self):
        return len(self.data)
            
    def __getitem__(self, idx):
        # take the data sample by its index
        img_path = self.data[idx]

        # read image
        img = Image.open(img_path)

        # apply the image augmentations if needed
        if self.transform:
            img = self.transform(img)

        # return the image and all the associated labels
        result = {
            'img': img,
            'labels': {
                        'Enlarged_Cardiomediastinum': self.Enlarged_Cardiomediastinum[idx],
                        'Cardiomegaly': self.Cardiomegaly[idx],
                        'Lung_Opacity': self.Lung_Opacity[idx],
                        'Lung_Lesion': self.Lung_Lesion[idx],
                        'Edema': self.Edema[idx],
                        'Consolidation': self.Consolidation[idx],
                        'Pneumonia': self.Pneumonia[idx],
                        'Atelectasis': self.Atelectasis[idx],
                        'Pneumothorax': self.Pneumothorax[idx],
                        'Pleural_Effusion': self.Pleural_Effusion[idx],
                        'Pleural_Other': self.Pleural_Other[idx],
                        'Fracture': self.Fracture[idx]
                    }
        }
        return result            

#### The dataset class makes it easy to find counts

We will create a temp data set for now without the resize or Normalization transformations since we have yet to look at these.

In [6]:
transform = transforms.Compose([transforms.Grayscale(1),
                              transforms.ToTensor()])
tempDS = ChestXRayDataset(df, transform=transform)

print(f'  Negative: {np.sum([-i for i in tempDS.Pleural_Effusion if i == -1]):,}\n' +
      f'  Positive: {np.sum([i for i in tempDS.Pleural_Effusion if i == 1]):,}\n' + 
      f'No Finding: {np.sum([i for i in tempDS.Pleural_Effusion if i == 0]):,}')

  Negative: 0.0
  Positive: 44,734
No Finding: 0


#### Let's look at the shapes of the images

We will just sampe every 100th image

In [7]:
image_folder = cleanMetaData.imageFolderPath()
image_widths = []
image_heights = []
image_pixel_counts = []
counter=0

for dirpath, _, filenames in os.walk(image_folder):
    for path_image in filenames:
        counter+=1
        if counter % 100 == 0:
            image = os.path.join(dirpath, path_image)
            with Image.open(image) as img:
                width, heigth = img.size
                image_widths.append(width)
                image_heights.append(heigth)
                image_pixel_counts.append(width * heigth)

isdf = pd.DataFrame({'Width': image_widths, 'Height': image_heights, 'Pixels':image_pixel_counts})

In [8]:
display(isdf.sort_values('Width', ascending=False))
print(f'Min/Max Height = {isdf.Width.min()}/{isdf.Width.max()}\n \
        Min/Max Width = {isdf.Height.min()}/{isdf.Height.max()}\n \
        Min/Max Pixels = {isdf.Pixels.min()}/{isdf.Pixels.max()}')

Unnamed: 0,Width,Height,Pixels
1822,483,320,154560
2161,439,320,140480
1863,439,320,140480
293,426,320,136320
760,418,320,133760
...,...,...,...
253,320,369,118080
661,320,320,102400
660,320,320,102400
1330,320,333,106560


Min/Max Height = 320/483
         Min/Max Width = 320/461
         Min/Max Pixels = 102400/154560


### Looks like most images are around four hundred by 320. 

Some are protrait and some are landscape

So let's pick the resize transformation to be 320X320


In [9]:
hight,width = 320, 320
transform = transforms.Compose(
                              [transforms.Resize(size=(hight,width), interpolation=2),
                              transforms.Grayscale(1),
                              transforms.ToTensor()])
chestXRayDataset = Dataset(df, transform=transform)

### Now, we will build a data loader to evaluate the mean and SD of the pixels

We will randomly choose 2,500 images for this.  To do this, we will use the Subset object from PyTorch

In [10]:
dss = torch.utils.data.Subset(chestXRayDataset, np.random.choice(len(chestXRayDataset), 2500, replace=False))
loader = torch.utils.data.DataLoader(dss, batch_size=64)

In [11]:
pixel_mean = 0.5064167
pixel_sd = 0.16673872

if pixel_mean == 0:
    image_count = 0
    fst_moment = torch.empty(1)
    snd_moment = torch.empty(1)

    for batch in loader:
      images = batch['img']
      b, c, h, w = images.shape
      nb_pixels = b * h * w
      sum_ = torch.sum(images, dim=[0, 2, 3])
      sum_of_square = torch.sum(images ** 2, dim=[0, 2, 3])
      fst_moment = (image_count * fst_moment + sum_) / (image_count + nb_pixels)
      snd_moment = (image_count * snd_moment + sum_of_square) / (image_count + nb_pixels)

      image_count += nb_pixels

    pixel_mean = fst_moment.numpy()  
    pixel_sd = torch.sqrt((snd_moment - fst_moment) ** 2).numpy()

print(f'Mean Pixel Value: {pixel_mean}\nSD:{pixel_sd}')

Mean Pixel Value: 0.5064167
SD:0.16673872


### Now we have both the size and statistics for the images

So we can build the final transformation and dataset

In [12]:
transform = transforms.Compose(
                              [transforms.Resize(size=(hight,width), interpolation=2),
                              transforms.Grayscale(1),
                              transforms.ToTensor(),
                              transforms.Normalize((pixel_mean,), (pixel_sd,))])
chestXRayDataset = Dataset(df, transform=transform)

## Test Dataset Loaders classes

#### Get single loader for all images

In [13]:
loader = Loaders().getDataLoader()
print(f'Number of Batches: {len(loader):,}')
print(f'Number of Images: {len(loader) * 64:,}')

Number of Batches: 2,059
Number of Images: 131,776


#### Get single load for 500 images

In [14]:
batch_size=16
loader = Loaders().getDataLoader(batch_size=batch_size, n_random_rows=512)
print(f'Number of Batches: {len(loader):,}')
print(f'Number of Images: {len(loader) * batch_size:,}')

Number of Batches: 32
Number of Images: 512


#### Get train/validation loaders for 15,000 images

In [15]:
batch_size=1
val_percent=0.15
number_images = 15000
train_loader, val_loader = Loaders().getDataTrainValidateLoaders(batch_size=batch_size, 
                                                                        val_percent=val_percent, 
                                                                        n_random_rows=number_images)
print(f'Number of Training Batches: {len(train_loader):,}')
print(f'Number of Validation Batches: {len(val_loader):,}')
print(f'Number of Training Images: {len(train_loader) * batch_size:,}')
print(f'Number of Validation Images: {len(val_loader) * batch_size:,}')

Number of Training Batches: 12,729
Number of Validation Batches: 2,271
Number of Training Images: 12,729
Number of Validation Images: 2,271


In [16]:
metaData = CleanMetaData()
df = metaData.getCleanDF()
missing_images = []
missing_hierarchical_images = []
for _, row in df.iterrows():
    if not os.path.exists(row.Image_Path):
      missing_images.append(row.Image_Path)
    if not os.path.exists(row.Hierarchical_Path):
      missing_hierarchical_images.append(row.Hierarchical_Path)

In [17]:
len(missing_images), len(missing_hierarchical_images)

(0, 0)