### Import libraries

In [1]:
# Tensorflow and tf.keras
import os
import pandas as pd
import seaborn as sn

import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, MaxPooling2D, Flatten, Conv2D, Dropout
from keras.models import Sequential

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Show the installed TensorFlow version (useful when reproducing the run)
print(tf.__version__)

2.6.0


In [2]:
# Image loading and augmentation utilities

import keras
from keras.preprocessing.image import ImageDataGenerator, load_img
import warnings
#from keras.layers import MaxPooling2D, Flatten, Conv2D


## Pneumonia Detection

### Chest X-ray images

In [3]:
# Locate the chest X-ray dataset root.
# The generators further down read from the nested "chest_xray/chest_xray/"
# folder, while the flat "chest_xray/" layout raised FileNotFoundError in a
# recorded run, so both layouts are probed and the first usable one wins.

def find_dataset_root(candidates, default):
    """Return the first directory in ``candidates`` that contains a ``train`` folder.

    Args:
        candidates: Iterable of directory paths to probe, in priority order.
        default: Path returned when no candidate contains ``train``.

    Returns:
        The matching candidate path, or ``default`` if none matches.
    """
    for root in candidates:
        if os.path.isdir(os.path.join(root, "train")):
            return root
    return default


DATADIR = find_dataset_root(
    ("chest_xray/chest_xray/", "chest_xray/"),
    default="chest_xray/",
)

## Data Wrangling and Exploratory Data Analysis (EDA)

### Define the paths to each dataset directory

In [4]:
# Build the path of every dataset split (train / val / test) under DATADIR
train_dir, val_dir, test_dir = (
    os.path.join(DATADIR, split_name) for split_name in ('train', 'val', 'test')
)

In [5]:
# Paths of the PNEUMONIA class folder inside each split
pneumonia_train_dir, pneumonia_val_dir, pneumonia_test_dir = (
    os.path.join(split_dir, 'PNEUMONIA')
    for split_dir in (train_dir, val_dir, test_dir)
)

In [6]:
# Paths of the NORMAL class folder inside each split
normal_train_dir, normal_val_dir, normal_test_dir = (
    os.path.join(split_dir, 'NORMAL')
    for split_dir in (train_dir, val_dir, test_dir)
)

In [7]:
# Count the entries in the PNEUMONIA folder of each split.
# NOTE(review): os.listdir counts every directory entry, so any non-image
# file in these folders would be included — confirm the folders hold images only.
pneumonia_train_images, pneumonia_val_images, pneumonia_test_images = (
    len(os.listdir(class_dir))
    for class_dir in (pneumonia_train_dir, pneumonia_val_dir, pneumonia_test_dir)
)

FileNotFoundError: [Errno 2] No such file or directory: 'chest_xray/train/PNEUMONIA'

In [None]:
# Count the entries in the NORMAL folder of each split.
# NOTE(review): os.listdir counts every directory entry, so any non-image
# file in these folders would be included — confirm the folders hold images only.
normal_train_images, normal_val_images, normal_test_images = (
    len(os.listdir(class_dir))
    for class_dir in (normal_train_dir, normal_val_dir, normal_test_dir)
)

### Number of images per class in each split

In [None]:
# Print the image count per class for every split, plus the split total.
# Each entry is (split label, pneumonia count, normal count); driving the
# output from one table keeps the labels spelled and formatted consistently.
split_counts = [
    ('training', pneumonia_train_images, normal_train_images),
    ('validation', pneumonia_val_images, normal_val_images),
    ('test', pneumonia_test_images, normal_test_images),
]
separator = '*' * 40

print(separator)
for split_label, pneumonia_count, normal_count in split_counts:
    print(f'Total {split_label} images: {pneumonia_count + normal_count}')
    print(f'Pneumonia: {pneumonia_count}')
    print(f'Normal: {normal_count}')
    print(separator)

In [None]:
# Overall number of images per class, added up over train, val and test
pneumonia = pneumonia_train_images + pneumonia_val_images + pneumonia_test_images
normal = normal_train_images + normal_val_images + normal_test_images

# Report both totals, one per line
print(
    f"The total number of pneumonia infected dataset is: {pneumonia} ",
    f"The total number of non infected dataset is: {normal}",
    sep="\n",
)

In [None]:
# Bar chart of the per-class totals computed over train, val and test combined.
# The seaborn style must be set before drawing; set afterwards it does not
# apply to the axes that were already created.
sn.set_style('darkgrid')
sn.barplot(x=["NORMAL", 'PNEUMONIA'], y=[normal, pneumonia])
# `normal` and `pneumonia` are totals across all splits, not training-only counts
plt.title("Dataset Class Distribution (train + val + test)", size=20)
plt.ylabel("Number of images")
plt.show()

In [None]:
# Per-split image counts for both classes, one row per class
category = ["PNEUMONIA", "NORMAL"]

df = pd.DataFrame(
    {
        'train': [pneumonia_train_images, normal_train_images],
        'test': [pneumonia_test_images, normal_test_images],
        'val': [pneumonia_val_images, normal_val_images],
    },
    index=category,
)

# Draw on an explicitly created Axes: DataFrame.plot opens its own figure
# unless `ax` is given, which left the 8x8 figure empty and ignored its size.
fig, ax = plt.subplots(figsize=(8, 8))
df.plot(kind='bar', rot=45, width=0.7, ax=ax)
ax.set_yscale('log')
ax.legend(loc='best')
ax.set_title('Normal vs. Pneumonia Datasets')
plt.show()

df.head()

### Preprocessing

In [None]:
# Augmentation settings used for the training images only:
# pixel rescaling plus random shear, zoom and horizontal flips
augmentation_options = {
    'rescale': 1./255,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
}
train_datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Evaluation images are only rescaled, never augmented
test_datagen = ImageDataGenerator(rescale=1.0/255)

# Settings shared by both directory iterators
flow_options = dict(target_size=(64, 64), batch_size=64, class_mode='binary')

# NOTE(review): these literal paths point at the nested "chest_xray/chest_xray/"
# folder and do not go through DATADIR / train_dir / test_dir — confirm which root is correct.
train_generator = train_datagen.flow_from_directory(
    'chest_xray/chest_xray/train', **flow_options)

# NOTE(review): this generator reads the test folder, not the val folder — confirm this is intended.
validation_generator = test_datagen.flow_from_directory(
    'chest_xray/chest_xray/test', **flow_options)

### Show some images after data augmentation

In [None]:
# Pull one augmented batch from the training generator
images_batch, label_batch = next(iter(train_generator))


def show_batch(images_batch, label_batch):
    """Display up to 25 images of a batch in a 5x5 grid, titled by class.

    Args:
        images_batch: Indexable, sized batch of images accepted by ``plt.imshow``.
        label_batch: Indexable batch of labels; a truthy label is titled
            "PNEUMONIA", a falsy one "NORMAL".
    """
    plt.figure(figsize=(10, 10))
    # Never index past the end of a batch that holds fewer than 25 images
    n_shown = min(25, len(images_batch))
    for i in range(n_shown):
        plt.subplot(5, 5, i + 1)
        plt.imshow(images_batch[i])
        plt.title("PNEUMONIA" if label_batch[i] else "NORMAL")
        # Hide the axes on every tile, not only on the NORMAL ones
        plt.axis('off')
    plt.tight_layout()


show_batch(images_batch, label_batch)

### Summary

Images of lungs infected with pneumonia show more irregular white regions than images of normal lungs. However, a few lung images are hard to classify with the naked eye, which makes image classification difficult. In addition, the imbalance between the two classes raises concerns about bias and model overfitting.
