<a href="https://colab.research.google.com/github/SharoonSharif/Pneumonia-Detection-using-Chest-X-Ray/blob/main/Pneumonia_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of '<target directory>:<URL-encoded download URL>' entries.
# Each entry is split on ':' and the URL part is decoded with urllib's unquote().
# NOTE(review): the URL embeds X-Goog-Date=20241009T230829Z and X-Goog-Expires=259200,
# so it presumably stops working once that window has passed - regenerate it if the
# download reports a failure.
DATA_SOURCE_MAPPING = 'chest-xray-pneumonia:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F17810%2F23812%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241009%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241009T230829Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7f0fa52c658ba3b11af8477fee6ba23708250aefe5c70a12d7263f418c64b16271def111632bc4326e28305b16c1294367fa0b405db11ab7297fe488a750d9cc62007e0685954f1f7f1a3a2764fd8088c5086ddae7d63cc85cb143909b2e9bdc3e6f8e5b515a2ba4399eb6266a72c8e32f961cb3bff9d74b74d95ed5e59bfe05c276fc8a4d5890878fcff3e22f6fd90ec23a177aa9318ec3d0cf975efd8b4a8087caa6ef111251b457a79adc277f716b9a3529e6ed787460c9fffd4e89d8b88740068ec8f915ba170e373aab6f55a803e49daa209752391f8536a90bec8da9bbf26dc77b1a471faafdbacf212c79b790313b3b32ee5478eece1ead7465d1b086'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded url>' entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a URL that is not fully percent-encoded
    # (and therefore still contains ':') does not break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header is optional; without it the size is
            # unknown and only the running byte count can be shown.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Importing Dependencies</div>

In [None]:
import os
import cv2
import keras
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from sklearn.utils import class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout

# Silence every warning for the rest of the session to keep cell output short.
warnings.filterwarnings(action='ignore')

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Dataset Description</div>


<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">
<p>The dataset is organized into 3 folders (train, test, val) and contains subfolders for each image category (Pneumonia/Normal). There are 5,863 X-Ray images (JPEG) and 2 categories (Pneumonia/Normal).

Chest X-ray images (anterior-posterior) were selected from retrospective cohorts of pediatric patients of one to five years old from Guangzhou Women and Children’s Medical Center, Guangzhou. All chest X-ray imaging was performed as part of patients’ routine clinical care.

For the analysis of chest x-ray images, all chest radiographs were initially screened for quality control by removing all low quality or unreadable scans. The diagnoses for the images were then graded by two expert physicians before being cleared for training the AI system. In order to account for any grading errors, the evaluation set was also checked by a third expert.
</p>
</div>

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Loading Dataset</div>

In [None]:
# Every image is resized to img_size x img_size pixels when loaded.
img_size = 128

# Image categories (also the sub-folder names). The position in this list is
# the numeric class id: PNEUMONIA -> 0, NORMAL -> 1.
labels = ['PNEUMONIA', 'NORMAL']

def loading_training_data(data_dir):
    """Load every readable image below ``data_dir`` as a resized grayscale array.

    ``data_dir`` must contain one sub-directory per entry of the module-level
    ``labels`` list. Each image is read in grayscale and resized to
    ``img_size`` x ``img_size``.

    Args:
        data_dir: Path of the directory holding the per-class sub-directories.

    Returns:
        A tuple ``(images, class_ids)`` of NumPy arrays, where ``class_ids[i]``
        is the index in ``labels`` of the folder ``images[i]`` came from.
    """
    data = []
    labels_list = []

    for class_num, label in enumerate(labels):
        path = os.path.join(data_dir, label)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any later seeded split) stable between runs.
        for img in sorted(os.listdir(path)):
            img_arr = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
            if img_arr is None:
                # cv2.imread returns None for files it cannot decode (hidden
                # or non-image files); cv2.resize would raise on them.
                continue
            data.append(cv2.resize(img_arr, (img_size, img_size)))
            labels_list.append(class_num)

    return np.array(data), np.array(labels_list)

# Read the training and test folders into (images, labels) array pairs.
_train_dir = '/kaggle/input/chest-xray-pneumonia/chest_xray/train'
_test_dir = '/kaggle/input/chest-xray-pneumonia/chest_xray/test'

train_data, train_labels = loading_training_data(_train_dir)
test_data, test_labels = loading_training_data(_test_dir)

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Visualizing Data</div>

In [None]:
# Show eight distinct, randomly chosen training images in a 2 x 4 grid.
random_indices = np.random.choice(len(train_data), 8, replace=False)
plt.figure(figsize=(15, 6))

for position, sample_idx in enumerate(random_indices, start=1):
    # Class id 0 is pneumonia; anything else is titled as normal.
    if train_labels[sample_idx] == 0:
        sample_title = 'Pneumonia'
    else:
        sample_title = 'Normal'

    plt.subplot(2, 4, position)
    plt.imshow(train_data[sample_idx], cmap='magma')
    plt.title(sample_title)
    plt.axis('off')

plt.suptitle("Pneumonia Sample Images", size=18)
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of how many training images belong to each class.
labels_df = pd.DataFrame({"Labels": train_labels})

# Two shades taken from a seven-step light palette built on the notebook's blue.
colors = sns.light_palette("#76B1ED", n_colors=7)
bar_palette = [colors[3], colors[6]]

plt.figure(figsize=(6, 3))
sns.countplot(data=labels_df, x='Labels', palette=bar_palette)
# Replace the numeric class ids on the x axis with readable names.
plt.xticks(ticks=[0, 1], labels=['Pneumonia', 'Normal'])
plt.show()

<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">The training data is imbalanced, with fewer examples for Normal compared to Pneumonia. To address this issue and increase the number of training examples, we will employ data augmentation techniques.</div>

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Data Preprocessing</div>

In [None]:
# Divide every pixel by 255 so intensities fall in the [0, 1] range.
X_train, X_test = (np.array(images) / 255 for images in (train_data, test_data))

<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">Grayscale normalization is applied to minimize the impact of illumination variations. Also, the CNN converges more quickly on data scaled to the range [0..1] compared to the original range of [0..255].</div>

In [None]:
import numpy as np

img_size = 128

# Labels as NumPy arrays.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Add an explicit channel axis (N, 128, 128, 1), then copy that single channel
# three times along the last axis to obtain RGB-shaped input (N, 128, 128, 3).
X_train = np.repeat(X_train.reshape(-1, img_size, img_size, 1), 3, axis=-1)
X_test = np.repeat(X_test.reshape(-1, img_size, img_size, 1), 3, axis=-1)

# Both shapes should read (num_samples, 128, 128, 3).
print(X_train.shape)
print(X_test.shape)


<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">Reshaping the grayscale images to 128x128x1 ensures they conform to a standard size, and the added single-channel dimension allows the CNN to process grayscale images properly. Converting the grayscale images to RGB by duplicating the single channel three times ensures compatibility with pre-trained models, like EfficientNetB4, that expect three-channel (RGB) inputs. Finally, converting the labels to NumPy arrays ensures they are in a format suitable for model training.</div>

In [None]:
# Fraction of the training data held out for validation.
val_size = 0.2

# Seeded split into training and validation subsets.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=val_size,
    random_state=21,
)

# Report the shape of each resulting array.
for _caption, _array in (
    ("Training data shape:", X_train_split),
    ("Validation data shape:", X_val_split),
    ("Training labels shape:", y_train_split),
    ("Validation labels shape:", y_val_split),
):
    print(_caption, _array.shape)

<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">Explanation: The train_test_split function is used to divide the dataset into training and validation sets. The validation set size is set to 20% of the original data. This split ensures the model can be trained on 80% of the data while evaluating its performance on the remaining 20%. The random_state=21 guarantees that the split is reproducible across different runs, providing consistency when evaluating the model's performance.</div>

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Data Augmentation</div>

In [None]:
# Performing Data Augmentation
# Random geometric transforms applied on the fly to each batch the generator
# yields: rotation, zoom, horizontal/vertical shift, horizontal flip and shear.
# Pixels exposed by a transform are filled from the nearest valid pixel.
data_generator = ImageDataGenerator(
    rotation_range=30,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    shear_range=0.2,
    fill_mode='nearest',
)

# data_generator.fit(...) is intentionally not called: it only computes dataset
# statistics for featurewise_center, featurewise_std_normalization and
# zca_whitening. None of those options is enabled here, so the call would just
# copy the whole training array without changing the generator.

<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">Performing data augmentation by applying transformations like rotation, zoom, shifts, and flips. This helps increase the diversity of training data, which improves the model's robustness and generalization by reducing overfitting.</div>

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;"> EfficientNetB4</div>

In [None]:
# Define the input shape
input_shape = (128, 128, 3)  # RGB-shaped input (grayscale replicated over 3 channels)

# Load EfficientNetB4 with pre-trained ImageNet weights and without its top
# classification layers.
efficientnet_base = EfficientNetB4(input_shape=input_shape, include_top=False, weights='imagenet')
# Earlier name of this variable, kept so existing references keep working.
mobilenet_base = efficientnet_base

# The Keras EfficientNet models contain their own input preprocessing and
# expect raw pixel values in [0, 255]. The images in this notebook were divided
# by 255 beforehand, so scale them back up before the pre-trained backbone.
inputs = Input(shape=input_shape)
x = tf.keras.layers.Rescaling(255.0)(inputs)
x = efficientnet_base(x)

# Global Average Pooling collapses each feature map to a single value
x = GlobalAveragePooling2D()(x)
x = Dropout(0.7)(x)
x = Dense(96, activation='relu', kernel_regularizer=l2(0.05))(x)
x = Dropout(0.7)(x)
# Final Dense layer for binary classification: one sigmoid probability
output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.05))(x)

# Create the final model
model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Show model summary
model.summary()

<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">EfficientNetB4 is loaded with pre-trained ImageNet weights to leverage transfer learning, ensuring the model starts with knowledge from large-scale datasets. Global Average Pooling collapses each feature map to a single value, which reduces dimensionality and the number of trainable parameters and thereby helps limit overfitting. Dropout layers with a rate of 0.7 are added to prevent overfitting by randomly disabling nodes during training. The Dense layers include L2 regularization to add further regularization, controlling model complexity. The final Dense layer uses a sigmoid activation for binary classification, outputting a probability. The model is compiled with Adam optimizer and binary cross-entropy loss, ideal for binary classification tasks.</div>

In [None]:
# Balanced class weights, computed on the labels the model is actually trained
# on, so the less frequent class contributes more to the loss.
_train_classes = np.unique(y_train_split)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=_train_classes,
    y=y_train_split,
)
# Map each class id to its weight explicitly instead of relying on the ids
# being 0..k-1 in order.
class_weights = {int(cls): float(weight) for cls, weight in zip(_train_classes, class_weights)}

# Augmentation belongs on the training stream: the generator yields randomly
# transformed training batches, while validation runs on the untouched
# hold-out images so its metrics are comparable between epochs.
# The batch size is set on the generator because fit() does not accept
# batch_size together with generator input.
model.fit(
    data_generator.flow(X_train_split, y_train_split, batch_size=64),
    validation_data=(X_val_split, y_val_split),
    epochs=15,
    class_weight=class_weights,
)


<div style="background-color: #76B1ED; color: #0a0a85; padding: 10px; border-radius: 10px;font-family: 'Arial', sans-serif; font-size: 15px; margin: 5px;">The class_weight function computes the class weights to balance the training process when dealing with imbalanced datasets, ensuring that the minority class is given more importance. The model is then trained using the fit function, with the class weights passed to ensure balanced learning. The data_generator.flow is used to feed the validation data in batches, and batch_size=64 helps manage memory usage. The training runs for 15 epochs, and the validation data is used to monitor the model's performance after each epoch.</div>

# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Training & Validation Metrics Visualization</div>

In [None]:
# Per-epoch metrics recorded by the last model.fit call.
history = model.history.history

train_acc = history['accuracy']
train_loss = history['loss']
val_acc = history['val_accuracy']
val_loss = history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(train_acc) + 1)


def _plot_metric_panel(axis, train_values, val_values, train_color, val_color,
                       train_label, val_label, title, ylabel):
    """Draw one training-vs-validation curve pair on the given axes."""
    axis.plot(epochs, train_values, 'o-', color=train_color, label=train_label, markersize=8)
    axis.plot(epochs, val_values, 's--', color=val_color, label=val_label, markersize=8)
    axis.set_title(title, fontsize=16)
    axis.set_xlabel('Epochs', fontsize=14)
    axis.set_ylabel(ylabel, fontsize=14)
    axis.legend()
    axis.grid(True)


# Two side-by-side panels: accuracy on the left, loss on the right.
fig, ax = plt.subplots(1, 2, figsize=(18, 6))

_plot_metric_panel(ax[0], train_acc, val_acc, 'darkgreen', 'darkred',
                   'Training Accuracy', 'Validation Accuracy',
                   'Training vs. Validation Accuracy', 'Accuracy')
_plot_metric_panel(ax[1], train_loss, val_loss, 'darkblue', 'orange',
                   'Training Loss', 'Validation Loss',
                   'Training vs. Validation Loss', 'Loss')

# Display the plots
plt.tight_layout()
plt.show()


# <div style="background-color: #76B1ED; color: white; padding: 10px; border-radius: 10px;text-align: center;font-family: 'Arial', sans-serif; font-size: 30px; margin: 5px; font-weight:bold;">Model Performance on Testing Data</div>

In [None]:
# Evaluate on the test set; the result holds the loss first, then the accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation[0], evaluation[1]

divider = "==" * 20
print(divider)
print(f"Accuracy - {test_accuracy*100}%")
print(f"Loss - {test_loss}")
print(divider)

In [None]:
# Predict on the test images and flatten the model output into a 1-D array
# holding one sigmoid score per image.
predictions = model.predict(X_test).reshape(-1)

In [None]:
# Pick eight distinct test images at random.
random_indices = np.random.choice(len(X_test), 8, replace=False)

# One figure holding a 2 x 4 grid of panels.
plt.figure(figsize=(15, 5))

for position, sample_idx in enumerate(random_indices, start=1):
    # Round the sigmoid score to the nearest class id for the title.
    predicted_class = round(predictions[sample_idx])
    actual_class = y_test[sample_idx]

    plt.subplot(2, 4, position)
    plt.imshow(X_test[sample_idx].reshape(128, 128,3), cmap='magma', interpolation='none')
    plt.title(f"Predicted: {predicted_class}   Actual: {actual_class}", fontsize=10)
    # Hide axis ticks and frame.
    plt.axis('off')

# Figure-level title, then tighten the layout so panels do not overlap.
plt.suptitle("Sample Test Images with Predictions", size=18)
plt.tight_layout()

plt.show()