<a href="https://colab.research.google.com/github/Sdlkh/Pneumonia-Project/blob/main/Chest_X_Ray_Images_(Pneumonia).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#Install Kaggle CLI
!pip install -q kaggle

In [None]:
#Upload your kaggle.json
from google.colab import files

# files.upload() returns a mapping of file name -> uploaded bytes. Binding it to a
# name (instead of leaving the call as the last expression of the cell) keeps the
# raw contents of kaggle.json -- the API key -- out of the saved cell output.
uploaded = files.upload()

# Report file names only; never echo the uploaded contents.
print("Uploaded files:", list(uploaded))
if "kaggle.json" not in uploaded:
    print("WARNING: no file named 'kaggle.json' was uploaded; the next cell copies it from the working directory.")

In [29]:
#Put the key in the right place (and secure it)
# Create the per-user ~/.kaggle directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# NOTE(review): presumably this is where the Kaggle CLI looks for credentials -- confirm.
!cp kaggle.json ~/.kaggle/
# Restrict the key file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#Download the Chest X-Ray dataset from Kaggle
# Uses the Kaggle CLI to fetch the paultimothymooney/chest-xray-pneumonia dataset.
# NOTE(review): the unzip cell reads ./chest-xray-pneumonia.zip -- confirm the CLI
# writes the archive under that name in the working directory.
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

In [31]:
#Unzip the dataset
import zipfile, os

# Source archive and the folder it is unpacked into.
zip_path, extract_to = "chest-xray-pneumonia.zip", "chest_xray"

# Open the archive read-only (the default mode) and unpack every member.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_to)

In [None]:
#Set base_dir and verify folder structure
import os

# The archive unpacks with one extra nesting level: chest_xray/chest_xray/...
base_dir = "chest_xray/chest_xray"

base_found = os.path.exists(base_dir)
print("Exists base_dir?", base_found)
print("Subfolders in base_dir:", os.listdir(base_dir))

# Each of the three expected splits is reported with its contents, or flagged as missing.
for split in ("train", "val", "test"):
    path = os.path.join(base_dir, split)
    split_found = os.path.exists(path)
    if split_found:
        split_contents = os.listdir(path)
    else:
        split_contents = "NOT FOUND"
    print(split, "->", split_found, ":", split_contents)

In [None]:
#Class counts (NORMAL vs PNEUMONIA) per split
import matplotlib.pyplot as plt

def count_images(folder, extensions=(".png", ".jpg", ".jpeg", ".bmp", ".ppm", ".tif", ".tiff")):
    """Count the image files of each class inside one dataset split.

    Only regular files whose name ends with one of ``extensions`` (compared
    case-insensitively) are counted, so stray entries such as ``.DS_Store`` or
    nested folders no longer inflate the totals.

    Args:
        folder: Path of a split directory holding ``NORMAL`` and ``PNEUMONIA``
            subfolders.
        extensions: Filename suffixes treated as images. The default is meant to
            mirror the formats the Keras directory loader accepts -- verify if
            the two counts must match exactly.

    Returns:
        A ``(normal_count, pneumonia_count)`` tuple.

    Raises:
        FileNotFoundError: If either class subfolder does not exist (raised by
            ``os.listdir``).
    """
    suffixes = tuple(ext.lower() for ext in extensions)

    def _count(class_name):
        # Number of image files directly inside <folder>/<class_name>.
        class_dir = os.path.join(folder, class_name)
        return sum(
            1
            for entry in os.listdir(class_dir)
            if entry.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(class_dir, entry))
        )

    return _count("NORMAL"), _count("PNEUMONIA")

# Resolve the three split folders underneath base_dir.
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ("train", "val", "test")
)

# (NORMAL, PNEUMONIA) tallies for every split.
train_normal, train_pneu = count_images(train_dir)
val_normal, val_pneu = count_images(val_dir)
test_normal, test_pneu = count_images(test_dir)

# Tags are padded to equal width so the three printed rows line up.
for tag, n_normal, n_pneu in (
    ("Train", train_normal, train_pneu),
    ("Val  ", val_normal, val_pneu),
    ("Test ", test_normal, test_pneu),
):
    print(f"{tag}: NORMAL={n_normal}, PNEUMONIA={n_pneu}")

# Grouped bar chart: one group per class, one bar per split.
labels = ["NORMAL", "PNEUMONIA"]
splits = ["Train", "Validation", "Test"]
counts = [
    [train_normal, train_pneu],
    [val_normal, val_pneu],
    [test_normal, test_pneu],
]

bar_w = 0.25
x = range(len(labels))

fig, ax = plt.subplots(figsize=(10, 5))
for offset, (split, split_counts) in enumerate(zip(splits, counts)):
    # Each split's bars are shifted right by one bar width per split index.
    ax.bar([pos + offset * bar_w for pos in x], split_counts, width=bar_w, label=split)

# With three bars per group, the middle bar sits at pos + bar_w, so ticks go there.
ax.set_xticks([pos + bar_w for pos in x])
ax.set_xticklabels(labels)
ax.set_ylabel("Number of Images")
ax.set_title("Class Distribution by Split")
ax.legend()
plt.show()

In [None]:
#Visualize random sample X-rays from each class
import matplotlib.image as mpimg
import random

def show_samples(folder, label, n=3):
    """Display up to ``n`` randomly chosen images of one class side by side.

    Directory entries that are not image files (e.g. ``.DS_Store``) are skipped
    before sampling, so ``imread`` is never asked to decode them. If the class
    folder has no images, or ``n`` is not positive, a message is printed and no
    figure is drawn.

    Args:
        folder: Split directory containing one subfolder per class.
        label: Name of the class subfolder; also used as each panel's title.
        n: Maximum number of images to show.

    Raises:
        FileNotFoundError: If ``folder/label`` does not exist (from ``os.listdir``).
    """
    image_exts = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff")
    cls_dir = os.path.join(folder, label)

    # Sorted so the candidate pool does not depend on os.listdir's arbitrary order.
    candidates = sorted(
        name for name in os.listdir(cls_dir) if name.lower().endswith(image_exts)
    )
    samples = random.sample(candidates, max(0, min(n, len(candidates))))
    if not samples:
        print(f"No images to display in {cls_dir}")
        return

    # squeeze=False keeps `axes` two-dimensional even when only one image is shown.
    fig, axes = plt.subplots(1, len(samples), figsize=(12, 3), squeeze=False)
    for ax, fname in zip(axes[0], samples):
        ax.imshow(mpimg.imread(os.path.join(cls_dir, fname)), cmap="gray")
        ax.axis("off")
        ax.set_title(label)
    plt.show()

# Preview three random TRAIN images for each class.
for class_label in ("NORMAL", "PNEUMONIA"):
    show_samples(train_dir, class_label, n=3)

In [35]:
#Create ImageDataGenerators
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Random transforms applied to training images only.
augmentation_options = dict(
    rotation_range=20,        # random rotation, up to 20 degrees
    width_shift_range=0.1,    # horizontal shift
    height_shift_range=0.1,   # vertical shift
    shear_range=0.1,          # shear
    zoom_range=0.2,           # zoom in/out
    horizontal_flip=True,     # random left-right mirror
    fill_mode='nearest',      # how pixels exposed by a transform are filled
)

# Training: scale pixels by 1/255, then augment.
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Validation and test: scaling only, no augmentation.
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
#Load images from directories
IMG_SIZE = (150, 150)
BATCH_SIZE = 32

# Options shared by all three directory iterators.
flow_options = dict(target_size=IMG_SIZE, batch_size=BATCH_SIZE, class_mode='binary')

train_gen = train_datagen.flow_from_directory(train_dir, **flow_options)
val_gen = val_datagen.flow_from_directory(val_dir, **flow_options)

# shuffle=False keeps the test files in a fixed order, which the evaluation
# cells rely on when comparing predictions with test_gen.classes.
test_gen = test_datagen.flow_from_directory(test_dir, shuffle=False, **flow_options)

In [None]:
#Visualize augmented images (sanity check)
import numpy as np

# Draw a single augmented batch from the training iterator.
x_batch, y_batch = next(train_gen)

# Show the first six images of the batch in a 2x3 grid, titled by label.
fig, axes = plt.subplots(2, 3, figsize=(10, 5))
for panel, ax in enumerate(axes.ravel()):
    ax.imshow(x_batch[panel])
    ax.set_title("NORMAL" if y_batch[panel] != 1 else "PNEUMONIA")
    ax.axis("off")
fig.suptitle("Augmented Training Samples")
plt.show()

In [None]:
#Basic CNN (from scratch)
import tensorflow as tf
from tensorflow.keras import layers, models

model_cnn = models.Sequential()

# Three convolutional blocks (32 -> 64 -> 128 filters), each a 3x3 ReLU
# convolution followed by 2x2 max pooling.
for depth, n_filters in enumerate((32, 64, 128)):
    # Only the very first layer declares the input shape.
    first_layer_args = {"input_shape": (150, 150, 3)} if depth == 0 else {}
    model_cnn.add(layers.Conv2D(n_filters, (3, 3), activation='relu', **first_layer_args))
    model_cnn.add(layers.MaxPooling2D(2, 2))

# Classifier head: flatten -> dense -> dropout -> single sigmoid unit.
classifier_head = (
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),                    # regularisation against overfitting
    layers.Dense(1, activation='sigmoid'),  # binary output
)
for head_layer in classifier_head:
    model_cnn.add(head_layer)

model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_cnn.summary()

In [None]:
#Transfer Learning (VGG16 Example)
from tensorflow.keras.applications import VGG16

# Load VGG16 with ImageNet weights, without its original classifier layers.
base_model = VGG16(weights="imagenet", include_top=False, input_shape=(150,150,3))

# Freeze every base layer so training does not overwrite the pretrained weights.
for layer in base_model.layers:
    layer.trainable = False

# Custom binary classifier stacked on the convolutional base.
x = layers.Flatten()(base_model.output)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation='sigmoid')(x)

# `Model` was never imported in this notebook (NameError on a fresh kernel);
# the functional Model class is reached through the already-imported `models` module.
model_vgg16 = models.Model(inputs=base_model.input, outputs=output)

# NOTE(review): the data generators scale pixels by 1/255, whereas the ImageNet
# VGG16 weights are normally paired with vgg16.preprocess_input -- confirm intended.
model_vgg16.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_vgg16.summary()

In [None]:
#Training
# Number of passes over the training data.
EPOCHS = 10

# Fit the from-scratch CNN; the returned History object is read later when
# plotting the accuracy and loss curves.
fit_options = {"validation_data": val_gen, "epochs": EPOCHS}
history = model_cnn.fit(train_gen, **fit_options)

In [None]:
#Evaluate on test set
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_metrics = model_cnn.evaluate(test_gen)
test_loss, test_acc = test_metrics
print(f"✅ Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}")

In [None]:
#Predictions for classification report
from sklearn.metrics import classification_report, confusion_matrix

# Model outputs for the test iterator (sigmoid probabilities).
y_pred_probs = model_cnn.predict(test_gen)

# Threshold at 0.5 to obtain hard 0/1 predictions as a flat int32 array.
above_threshold = y_pred_probs > 0.5
y_pred = above_threshold.astype("int32").flatten()

# Ground-truth class indices as recorded by the test iterator.
y_true = test_gen.classes

# Per-class precision / recall / F1.
report_class_names = ["NORMAL", "PNEUMONIA"]
print("📊 Classification Report:")
print(classification_report(y_true, y_pred, target_names=report_class_names))

In [None]:
# Confusion Matrix
import seaborn as sns

cm = confusion_matrix(y_true, y_pred)

# Annotated heatmap: rows are true classes, columns are predicted classes.
tick_names = ["NORMAL", "PNEUMONIA"]
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Training vs Validation Curves
# Per-epoch metrics recorded by fit().
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs_range = range(len(acc))

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy.
ax_acc.plot(epochs_range, acc, label='Train Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Val Accuracy')
ax_acc.legend()
ax_acc.set_title('Training vs Validation Accuracy')

# Right panel: loss.
ax_loss.plot(epochs_range, loss, label='Train Loss')
ax_loss.plot(epochs_range, val_loss, label='Val Loss')
ax_loss.legend()
ax_loss.set_title('Training vs Validation Loss')

plt.show()