In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Problem Statement:

Pneumonia is an infection in one or both lungs. Bacteria, viruses, and fungi cause it. The infection causes inflammation in the air sacs in your lungs, which are called alveoli.

To detect pneumonia, we need to detect inflammation of the lungs. In this project, you’re challenged to build an algorithm to detect a visual signal for pneumonia in medical images. Specifically, your algorithm needs to automatically locate lung opacities on chest radiographs.

**Business Domain Value:** Automating pneumonia screening in chest radiographs and providing affected-area details through bounding boxes. This can assist physicians in making better clinical decisions, or even replace human judgement in certain functional areas of healthcare (e.g., radiology).

**Project objective:** In this capstone project, the goal is to build a pneumonia detection system that locates the position of inflammation in an image.

In [None]:
pip install keras==2.2.5

In [None]:
pip install tensorflow==1.13.0rc1

In [None]:
#Importing all the standard libraries
#..... array/matrix operations and dataframe libraries
import numpy as np
import pandas as pd
import os
import sys
import random
import tensorflow as tf
#...........
#.......... Visualization libraries
import pydicom
from imgaug import augmenters as iaa
import pylab
import cv2
import matplotlib.pyplot as plt
import seaborn as sn
from skimage.transform import resize

#......
from sklearn.model_selection import train_test_split

# NN model building libraries
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
#...................................................

In [None]:
# Directory to save logs and trained model
ROOT_DIR = '/kaggle/working'

In [None]:
!git clone https://github.com/matterport/Mask_RCNN.git
os.chdir('Mask_RCNN')

In [None]:
# Import Mask RCNN
sys.path.append(os.path.join(ROOT_DIR, 'Mask_RCNN'))  # To find local version of the library
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

In [None]:
# setting path for each of the files
class_path='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv'
labels_path='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv'
Image_train_path='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/'

In [None]:
# File descriptions
#1. stage_2_detailed_class_info.csv- contains the information of target label
#2. stage_2_train_labels.csv- contains information on Target and bounding box
#3. stage_2_train_images- contains training images in dcm format

In [None]:
# Reading class file (first file) as dataframe and check few entries and shape
df_class=pd.read_csv(class_path)
print(df_class.head(10))
print(df_class.shape[0])


In [None]:
df_class['class'].value_counts()

In [None]:
#Observation:
# This file contains the patient ID and the respective class information.
# There are 30277 records
# There are three classes:
#    1. Lung Opacity- Patient having pneumonia,
#    2. Normal- Patient not having pneumonia and not having any other lung problem
#    3. No Lung Opacity/Not Normal- Patient not having pneumonia but having some other lung problem

In [None]:
df_class.info()

#Observation- There are no null values 

In [None]:
# checking the number of unique entries with respect to patient ID
print(df_class['patientId'].value_counts().shape[0],'patient cases')

In [None]:
# # Reading label file (second file) as dataframe and check few entries and shape
df_label=pd.read_csv(labels_path)
print(df_label.head())
print(df_label.shape)

# Observation
#1. There are 30277 label records (same as the class dataframe)
#2. There are 6 columns - patient ID (in the same order as in the class dataframe), bounding box co-ordinates, height and width, and Target label

In [None]:
# Combine the label table and the class table side by side into one dataframe.
# The class table's own patientId column is dropped so the ID appears only once.
# This is a positional (row-by-row) join: it assumes both CSVs list their rows in
# the same order. No de-duplication happens here.
# `columns=` / `axis=` are passed by keyword because the positional form of these
# arguments is deprecated and rejected by pandas 2.x.
df = pd.concat([df_label, df_class.drop(columns='patientId')], axis=1)
print(df.shape)
print(df.head())

In [None]:
# Classes and Targets based on Patient count
df.groupby(['class','Target']).size().reset_index(name='patient_numbers')

In [None]:
print('Number of duplicate entries accross rows:\n', df[df.duplicated()].count())
print('Number of duplicate Patient Id entries :\n', df[df.duplicated(subset='patientId')].count())
print('Number of unique Patient Id entries: \n', df['patientId'].nunique())
print('Count of various classes: \n',df.groupby('class')['patientId'].nunique())

In [None]:
# Observation
#1. All the Normal and No Lung Opacity / Not Normal patients are grouped under Target label 0 (no pneumonia)
#2. Data imbalance- there are ~30% pneumonia records and the remaining ~70% are no pneumonia
#3. There are no duplicates across rows
#4. Checking for duplicate patientIds, there are 26684 unique Patient Ids

#--------------------------------------- Exploring training images data -------------------

In [None]:
# checking the image file format and the total number of images
image_path='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/'
print(os.listdir(image_path)[0])
import glob
print(len(list(glob.iglob("/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/*.dcm", recursive=True))))

#Observations:
# All the images are in dcm format.
# These image files are saved in the Digital Imaging and Communications in Medicine (DICOM) image format,
# which stores a medical image, such as a CT scan or ultrasound.
# There are 26684 images in total, which matches the number of unique patient IDs. It seems there is no missing image file.

In [None]:
# Checking the sample image file for the dataframe row at index 3, which is a Normal case
print(df.iloc[3])
patientId = df['patientId'][3]
image_path_1='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' %patientId
dcm_data=pydicom.read_file(image_path_1)
print(dcm_data)

#Observations:
# The dcm file contains metadata about the patient (this sample has no pneumonia):
#             name, ID, age, sex, body part examined, view position, and the pixel data of the image

In [None]:
#size of image
dcm_data.pixel_array.shape

In [None]:
# Original DICOM image size: 1024 x 1024
ORIG_SIZE = 1024

#observations:
#1. All the 26684 images have same size of 1024 X 1024

In [None]:
#Plotting the image 
plt.figure(figsize=(12,10))
plt.subplot(121)
plt.title('Pateint- Normal case class')
plt.imshow(dcm_data.pixel_array)
plt.subplot(122)
plt.title('Pateint- Normal case class')
plt.imshow(dcm_data.pixel_array,cmap=plt.cm.gist_gray)

In [None]:
#... Sample No Lung Opacity / Not Normal case ----------------
print(df.iloc[0])
patientId = df['patientId'][0]
image_path_1='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' %patientId
dcm_data=pydicom.read_file(image_path_1)
print(dcm_data)

#Plotting the image 
plt.figure(figsize=(12,10))
plt.subplot(121)
plt.title('Pateint- No Lung Opacity / Not Normal case')
plt.imshow(dcm_data.pixel_array)
plt.subplot(122)
plt.title('Pateint- No Lung Opacity / Not Normal case')
plt.imshow(dcm_data.pixel_array,cmap=plt.cm.gist_gray)

In [None]:
# Lets us plot one Patient with pnemonia (Target = 1)
print(df.iloc[4])
patientId = df['patientId'][4]
image_path_1='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' %patientId
dcm_data=pydicom.read_file(image_path_1)
print(dcm_data)
#Plotting the image 
plt.figure(figsize=(12,10))
plt.subplot(121)
plt.title('Pateint- With pneumonia class')
plt.imshow(dcm_data.pixel_array)
plt.subplot(122)
plt.title('Pateint- With pneumonia class')
plt.imshow(dcm_data.pixel_array,cmap=plt.cm.gist_gray)

In [None]:
# Function to show to a sample image with overlayed bounding box 
def showImage(row):
    """
    Render one patient's radiograph on the current matplotlib axes.

    `row` must provide 'patientId' and 'Target'. When Target equals 1 the row is
    also handed to `drawbox`, which reads 'x', 'y', 'width' and 'height' from it.
    The DICOM pixel array is replicated into three channels before any box is drawn.
    """
    dicom_file = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/{0}.dcm".format(row['patientId'])
    pixels = pydicom.read_file(dicom_file).pixel_array

    # single channel -> three identical channels
    canvas = np.stack([pixels, pixels, pixels], axis=2)

    has_box = row['Target'] == 1
    if has_box:
        canvas = drawbox(image=canvas, row=row)

    plt.imshow(canvas, cmap=plt.cm.gist_gray)
    
    
def drawbox(image, row):
    """
    Paint the outline of the row's bounding box onto `image` in a random colour.

    `image` is modified in place and also returned. `row` supplies 'x', 'y',
    'width' and 'height'; each value is truncated to an int. Every edge is
    6 pixels thick and extends down/right from its coordinate.
    """
    thickness = 6
    rgb = np.floor(np.random.rand(3) * 256).astype('int')

    # --- Box corners in pixel coordinates
    left, top = int(row['x']), int(row['y'])
    bottom = top + int(row['height'])
    right = left + int(row['width'])

    # --- Horizontal edges (top, bottom), then vertical edges (left, right)
    for edge_y in (top, bottom):
        image[edge_y:edge_y + thickness, left:right] = rgb
    for edge_x in (left, right):
        image[top:bottom, edge_x:edge_x + thickness] = rgb

    return image

In [None]:
# Take the dataframe row at position 4 as a plain dict keyed by column name,
# which is the form showImage() indexes into. Only this one row is converted,
# rather than transposing and dict-ifying the whole table to pick one entry.
patient = df.iloc[4].to_dict()
print("Path : stage_2_train_images/{0}.dcm".format(patient['patientId']))
print("Target : {0}".format(patient['Target']))

plt.figure(figsize=(7,7))
plt.title("Sample Patient - Lung Opacity")
showImage(patient)

#----------------------------------------- Data generation for training, Model Building and training model-----

In [None]:
# Build `parsed`: one entry per patient ID holding the DICOM file path, the Target
# label of the patient's first row, and the list of bounding boxes.
def extract_boxes(row):
    """Return the row's box as [y, x, height, width]."""
    return [row['y'], row['x'], row['height'], row['width']]

parsed = {}
for n, row in df.iterrows():
    pid = row['patientId']
    # first time a patient is seen: create its entry; otherwise reuse the existing one
    entry = parsed.setdefault(pid, {
        'dicom': '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' % pid,
        'label': row['Target'],
        'boxes': [],
    })
    # boxes are only recorded for patients whose stored label is 1
    if entry['label'] == 1:
        entry['boxes'].append(extract_boxes(row))

In [None]:
len(parsed)

In [None]:
parsed[df['patientId'][7]]

In [None]:
#............ function to show the image with bounding box(es) for any given parsed patient entry
def draw(data):
    """
    Display one parsed patient entry, outlining each of its bounding boxes.

    `data` is a dict with a 'dicom' file path and a 'boxes' list; every box is
    passed to `overlay_box` with a freshly drawn random colour and a stroke of 6.
    """
    pixels = pydicom.read_file(data['dicom']).pixel_array

    # grayscale -> three identical channels so the boxes can be coloured
    canvas = np.stack((pixels, pixels, pixels), axis=2)

    for bounding_box in data['boxes']:
        colour = np.floor(np.random.rand(3) * 256).astype('int')
        canvas = overlay_box(im=canvas, box=bounding_box, rgb=colour, stroke=6)

    pylab.imshow(canvas, cmap=pylab.cm.gist_gray)
    pylab.axis('off')

def overlay_box(im, box, rgb, stroke=1):
    """
    Draw the outline of a single box on `im` in place and return `im`.

    `box` is (y, x, height, width); each value is truncated to an int.
    `rgb` is the value written to the outline pixels and `stroke` is the
    edge thickness in pixels.
    """
    top, left, box_h, box_w = (int(value) for value in box)
    bottom, right = top + box_h, left + box_w

    im[top:top + stroke, left:right] = rgb          # top edge
    im[bottom:bottom + stroke, left:right] = rgb    # bottom edge
    im[top:bottom, left:left + stroke] = rgb        # left edge
    im[top:bottom, right:right + stroke] = rgb      # right edge

    return im

In [None]:
# Overlaying the bounding box on the image for a sample pneumonia case
#Plotting the image 
print(df.iloc[4])
patientId = df['patientId'][4]
draw(parsed[patientId])

In [None]:
import csv
# Collect, per file name (first CSV column), the list of pneumonia box locations
pneumonia_locations = {}
with open(labels_path, mode='r') as infile:
    reader = csv.reader(infile)
    # drop the header line
    next(reader, None)
    for rows in reader:
        filename, location, pneumonia = rows[0], rows[1:5], rows[5]
        # only rows flagged '1' in the last column carry a location
        if pneumonia != '1':
            continue
        # values arrive as text: parse as float, then truncate to int
        location = [int(float(i)) for i in location]
        # start a new list for an unseen file name, otherwise extend the existing one
        pneumonia_locations.setdefault(filename, []).append(location)

In [None]:
len(pneumonia_locations)

In [None]:
# load filenames (note: no shuffling happens in this cell)
folder = Image_train_path
filenames = os.listdir(folder)

In [None]:
train_image='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/'
test_image='/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_test_images/'

In [None]:
def get_dicom_fps(dicom_dir):
    """
    Return the paths of all ``*.dcm`` files directly inside `dicom_dir`.

    The result is de-duplicated and sorted, so the order is identical on every
    run. A list built straight from a set has an arbitrary order, which would make
    any later slicing or seeded shuffling of the result irreproducible.

    The pattern is built with os.path.join so a trailing separator on `dicom_dir`
    is handled the same way as in paths built elsewhere with os.path.join.
    """
    dicom_fps = glob.glob(os.path.join(dicom_dir, '*.dcm'))
    return sorted(set(dicom_fps))

def parse_dataset(dicom_dir, anns):
    """
    Pair every DICOM file in `dicom_dir` with its annotation rows.

    Returns (image_fps, image_annotations): the list of file paths from
    `get_dicom_fps`, and a dict mapping each path to the list of rows of `anns`
    whose 'patientId' names that file. Files without rows keep an empty list.
    A row whose file is not among the discovered paths raises KeyError.
    """
    image_fps = get_dicom_fps(dicom_dir)
    image_annotations = dict((path, []) for path in image_fps)
    for _, annotation in anns.iterrows():
        key = os.path.join(dicom_dir, annotation['patientId'] + '.dcm')
        image_annotations[key].append(annotation)
    return image_fps, image_annotations

In [None]:
class DetectorConfig(Config):
    """Configuration for training pneumonia detection on the RSNA pneumonia dataset.
    Overrides values in the base Config class.
    """
    
    # Give the configuration a recognizable name  
    NAME = 'pneumonia'
    
    # Train on 1 GPU with 1 image per GPU.
    # presumably the base Config derives the batch size as GPU_COUNT * IMAGES_PER_GPU,
    # which would make it 1 here — TODO confirm against mrcnn.config.Config
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1 
    
    # Backbone network identifier handed to the library
    BACKBONE = 'resnet50'
    
    NUM_CLASSES = 2  # background + 1 pneumonia classes
    
    # NOTE(review): the source DICOMs are treated as 1024 x 1024 elsewhere in this
    # notebook (ORIG_SIZE); presumably the library resizes inputs to these
    # dimensions — confirm
    IMAGE_MIN_DIM = 256
    IMAGE_MAX_DIM = 256
    #RPN_ANCHOR_SCALES = (32, 64, 128, 256)
    TRAIN_ROIS_PER_IMAGE = 32
    MAX_GT_INSTANCES = 3
    DETECTION_MAX_INSTANCES = 3
    DETECTION_MIN_CONFIDENCE = 0.9
    DETECTION_NMS_THRESHOLD = 0.1

    # NOTE(review): if one step consumes one image, 10 steps per epoch covers only
    # a small part of the training subset each epoch — confirm this is intended
    STEPS_PER_EPOCH = 10
    
config = DetectorConfig()
config.display()

In [None]:
class DetectorDataset(utils.Dataset):
    """Dataset class for training pneumonia detection on the RSNA pneumonia dataset.

    Registers one class ('Lung Opacity', id 1, source 'pneumonia') and one image
    entry per DICOM path. Each entry carries the image's annotation rows and the
    original height/width used to size its masks.
    """

    def __init__(self, image_fps, image_annotations, orig_height, orig_width):
        """
        image_fps: iterable of DICOM file paths to register.
        image_annotations: mapping from file path to its list of annotation rows.
        orig_height, orig_width: pixel dimensions used for every mask.
        """
        # Call the base initialiser with its defaults; it takes no dataset argument.
        super().__init__()
        
        # Add classes
        self.add_class('pneumonia', 1, 'Lung Opacity')
   
        # add images 
        for i, fp in enumerate(image_fps):
            annotations = image_annotations[fp]
            self.add_image('pneumonia', image_id=i, path=fp, 
                           annotations=annotations, orig_height=orig_height, orig_width=orig_width)
            
    def image_reference(self, image_id):
        """Return the file path registered for `image_id`."""
        info = self.image_info[image_id]
        return info['path']

    def load_image(self, image_id):
        """Read the DICOM for `image_id` and return its pixels as a 3-channel array."""
        info = self.image_info[image_id]
        fp = info['path']
        ds = pydicom.read_file(fp)
        image = ds.pixel_array
        # If grayscale. Convert to RGB for consistency.
        if len(image.shape) != 3 or image.shape[2] != 3:
            image = np.stack((image,) * 3, -1)
        return image

    def load_mask(self, image_id):
        """
        Build the instance masks for `image_id`.

        Returns (mask, class_ids): a boolean array of shape
        (orig_height, orig_width, N) and an int32 array of length N, where N is
        the number of annotation rows (or 1 when there are none). A row with
        Target == 1 yields a filled rectangle and class id 1; any other row
        leaves its mask empty and its class id 0.
        """
        info = self.image_info[image_id]
        annotations = info['annotations']
        count = len(annotations)
        if count == 0:
            mask = np.zeros((info['orig_height'], info['orig_width'], 1), dtype=np.uint8)
            class_ids = np.zeros((1,), dtype=np.int32)
        else:
            mask = np.zeros((info['orig_height'], info['orig_width'], count), dtype=np.uint8)
            class_ids = np.zeros((count,), dtype=np.int32)
            for i, a in enumerate(annotations):
                if a['Target'] == 1:
                    x = int(a['x'])
                    y = int(a['y'])
                    w = int(a['width'])
                    h = int(a['height'])
                    # draw on a contiguous copy of the channel, then write it back
                    mask_instance = mask[:, :, i].copy()
                    cv2.rectangle(mask_instance, (x, y), (x+w, y+h), 255, -1)
                    mask[:, :, i] = mask_instance
                    class_ids[i] = 1
        # The builtin `bool` is used because the `np.bool` alias is deprecated and
        # absent from recent NumPy releases; the resulting dtype is the same.
        return mask.astype(bool), class_ids.astype(np.int32)

In [None]:
image_fps, image_annotations = parse_dataset(train_image, anns=df_label)

In [None]:
ds = pydicom.read_file(image_fps[0]) # read dicom image from filepath 
image = ds.pixel_array # get image array

In [None]:
######################################################################
# Modify this line to use more or fewer images for training/validation. 
# To use all images, do: image_fps_list = sorted(image_fps)
# The paths are sorted BEFORE slicing so that the chosen subset, and the starting
# order fed to the seeded shuffle below, are the same on every run.
# (A bare `sorted(...)` call whose result is discarded does not reorder the list.)
image_fps_list = sorted(image_fps)[:1000]
#####################################################################

# split dataset into training vs. validation dataset 
# split ratio is set to 0.9 vs. 0.1 (train vs. validation, respectively)
random.seed(42)
random.shuffle(image_fps_list)

validation_split = 0.1
split_index = int((1 - validation_split) * len(image_fps_list))

image_fps_train = image_fps_list[:split_index]
image_fps_val = image_fps_list[split_index:]

print(len(image_fps_train), len(image_fps_val))

In [None]:
# prepare the training dataset
dataset_train = DetectorDataset(image_fps_train, image_annotations, ORIG_SIZE, ORIG_SIZE)
dataset_train.prepare()

In [None]:
# Show annotation(s) for a DICOM image 
test_fp = random.choice(image_fps_train)
image_annotations[test_fp]

In [None]:
# prepare the validation dataset
dataset_val = DetectorDataset(image_fps_val, image_annotations, ORIG_SIZE, ORIG_SIZE)
dataset_val.prepare()

In [None]:
model = modellib.MaskRCNN(mode='training', config=config, model_dir=ROOT_DIR)

In [None]:
# Image augmentation 
augmentation = iaa.SomeOf((0, 3), [
    iaa.Fliplr(0.5),
    iaa.Flipud(0.5),
    iaa.OneOf([iaa.Affine(rotate=90),
               iaa.Affine(rotate=180),
               iaa.Affine(rotate=270)],
             ),
    iaa.Affine(
        scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
        translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
        rotate=(-25, 25),
        shear=(-8, 8)
    ),
    iaa.Multiply((0.9, 1.1))
  
])

In [None]:
NUM_EPOCHS = 10

# Train Mask-RCNN Model 
import warnings 
warnings.filterwarnings("ignore")
model.train(dataset_train, dataset_val, 
            learning_rate=config.LEARNING_RATE, 
            epochs=NUM_EPOCHS, 
            layers='all',
            augmentation=augmentation)