In [None]:
## Imports
This section imports all necessary libraries for the project, including TensorFlow for the CNN, scikit-learn for traditional ML models, and other utilities.

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from PIL import Image
import tensorflow as tf
from tensorflow.keras import layers, models
import os

print("Libraries imported successfully!")

In [None]:
## Data Preparation
This section loads the chest X-ray dataset, preprocesses the images (rescaling and resizing), and splits the data into training, validation, and test sets. A smaller subset is also created for initial experimentation with traditional ML models.

In [None]:
# Define paths
base_dir = 'C:/ChestXRay/'
train_dir = os.path.join(base_dir, 'Train')
test_dir = os.path.join(base_dir, 'Test')

# ImageDataGenerator for preprocessing
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.15
)

# Training generator
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
    subset='training',
    shuffle=True
)

# Validation generator
val_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
    subset='validation',
    shuffle=True
)

# Test generator
test_generator = train_datagen.flow_from_directory(
    test_dir,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
    shuffle=False
)

# Get the full subset
def get_subset(generator, num_samples=250):
    X, y = [], []
    for _ in range((num_samples // generator.batch_size) + 1):
        images, labels = next(generator)
        X.extend(images)
        y.extend(labels)
        if len(X) >= num_samples:
            break
    return np.array(X[:num_samples]), np.array(y[:num_samples])

# Load the full subsets
X_train_subset, y_train_subset = get_subset(train_generator, 250)
X_val, y_val = get_subset(val_generator, 100)
X_test, y_test = get_subset(test_generator, 584)

print(f"Training subset: {X_train_subset.shape}, Labels: {y_train_subset.shape}")
print(f"Validation set: {X_val.shape}, Labels: {y_val.shape}")
print(f"Test set: {X_test.shape}, Labels: {y_test.shape}")

In [None]:
## Class Distribution
This section checks the class distribution to understand the imbalance between NORMAL and PNEUMONIA classes in the dataset.

In [None]:
# Full training set distribution
train_classes = train_generator.classes
print("Full training set distribution:")
print(f"NORMAL (0): {np.sum(train_classes == 0)}, PNEUMONIA (1): {np.sum(train_classes == 1)}")

# Subset distribution
print("250-image subset distribution:")
print(f"NORMAL (0): {np.sum(y_train_subset == 0)}, PNEUMONIA (1): {np.sum(y_train_subset == 1)}")

# Test set distribution
print("Test set distribution:")
print(f"NORMAL (0): {np.sum(y_test == 0)}, PNEUMONIA (1): {np.sum(y_test == 1)}")

In [None]:
## Traditional ML Models (SVM, Random Forest, KNN, Logistic Regression)
This section trains four traditional ML models on a 250-image subset, with a focus on the SVM model. The SVM is selected as the best traditional model due to its perfect recall for PNEUMONIA, which is critical for medical applications.

In [None]:
# Flatten images for traditional ML
X_train_flat = X_train_subset.reshape(X_train_subset.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Class weights
class_weights = {0: 1.0, 1: 5.0}

# Define models (renamed to ml_models to avoid conflict)
ml_models = {
    'SVM': SVC(class_weight=class_weights, probability=True),
    'Random Forest': RandomForestClassifier(class_weight=class_weights),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(class_weight=class_weights, max_iter=1000)
}

# Train and evaluate
best_recall = 0
best_model_name = None
for name, model in ml_models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_flat, y_train_subset)
    y_pred = model.predict(X_test_flat)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
    
    if rec > best_recall:
        best_recall = rec
        best_model_name = name
        joblib.dump(model, f'best_model_{name}.pkl')

print(f"Best model based on recall: {best_model_name} with recall {best_recall:.4f}")

In [None]:
## CNN on 250-Image Subset
This section trains a basic CNN on a 250-image subset to test its performance before scaling to the full dataset.

In [None]:
# Define CNN model
cnn_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile with class weights
cnn_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Recall()])

# Train CNN
history = cnn_model.fit(
    X_train_subset, y_train_subset,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights
)

# Evaluate on test set
test_loss, test_acc, test_recall = cnn_model.evaluate(X_test, y_test)
print(f"CNN - Test Accuracy: {test_acc:.4f}, Test Recall: {test_recall:.4f}")

# Save if best recall
if test_recall > best_recall:
    cnn_model.save('best_cnn_model.h5')
    print("CNN saved as best model based on recall")
   

In [None]:
## Full CNN on 4469 Images
This section trains the CNN on the full training set (4469 images) to improve performance.

In [None]:
# Create new generators for the full training set
full_train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
    subset='training',
    shuffle=True
)

full_val_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
    subset='validation',
    shuffle=True
)

# Define a new CNN model
full_cnn_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
full_cnn_model.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy', tf.keras.metrics.Recall()])

# Train on the full training set
history_full = full_cnn_model.fit(
    full_train_generator,
    epochs=10,
    validation_data=full_val_generator,
    class_weight=class_weights
)

# Evaluate on the test set
test_loss_full, test_acc_full, test_recall_full = full_cnn_model.evaluate(test_generator)
print(f"Full CNN - Test Accuracy: {test_acc_full:.4f}, Test Recall: {test_recall_full:.4f}")

# Save if best recall
if test_recall_full > best_recall:
    full_cnn_model.save('best_full_cnn_model.h5')
    print("Full CNN saved as best model based on recall")

In [None]:
## Adjust Threshold for Full CNN
This section adjusts the prediction threshold of the full CNN to maximize recall for PNEUMONIA.

In [None]:
# Get predictions with probabilities
y_pred_prob = full_cnn_model.predict(test_generator)

# Adjust threshold to 0.3 (instead of 0.5) to favor PNEUMONIA predictions
threshold = 0.3
y_pred_adjusted = (y_pred_prob > threshold).astype(int)

# Calculate metrics with adjusted threshold
acc_adjusted = accuracy_score(y_test, y_pred_adjusted)
rec_adjusted = recall_score(y_test, y_pred_adjusted)
prec_adjusted = precision_score(y_test, y_pred_adjusted)
f1_adjusted = f1_score(y_test, y_pred_adjusted)

print(f"Full CNN (Adjusted Threshold {threshold}):")
print(f"Accuracy: {acc_adjusted:.4f}, Precision: {prec_adjusted:.4f}, Recall: {rec_adjusted:.4f}, F1: {f1_adjusted:.4f}")

In [None]:
## Save the Full CNN Model
This section saves the full CNN model for future use.

In [None]:
# Get predictions with probabilities
y_pred_prob = full_cnn_model.predict(test_generator)

# Adjust threshold to 0.3 (instead of 0.5) to favor PNEUMONIA predictions
threshold = 0.3
y_pred_adjusted = (y_pred_prob > threshold).astype(int)

# Calculate metrics with adjusted threshold
acc_adjusted = accuracy_score(y_test, y_pred_adjusted)
rec_adjusted = recall_score(y_test, y_pred_adjusted)
prec_adjusted = precision_score(y_test, y_pred_adjusted)
f1_adjusted = f1_score(y_test, y_pred_adjusted)

print(f"Full CNN (Adjusted Threshold {threshold}):")
print(f"Accuracy: {acc_adjusted:.4f}, Precision: {prec_adjusted:.4f}, Recall: {rec_adjusted:.4f}, F1: {f1_adjusted:.4f}")