In [1]:

import numpy as np # linear algebra
import os
from tqdm import tqdm
from tensorflow import keras
import cv2
from sklearn.utils import shuffle
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import tensorflow
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Model
import matplotlib.pyplot as plt

In [2]:
labels = ['NORMAL', 'PNEUMONIA']
image_size = 150
data_dirs = ["train", "val", "test"]

In [3]:
# Initialize lists for data
image_data = []
label_data = []

In [4]:
print("Pre-processing data\n")

# Load and process images from each directory
base_path = "chest_xray"  # Adjusted to match your directory structure
for data_dir in data_dirs:
    for label in labels:
        data_path = os.path.join(base_path, data_dir, label)
        
        # Check if directory exists
        if not os.path.exists(data_path):
            print(f"Warning: Directory not found: {data_path}")
            continue
            
        # Process each image in the directory
        for image_file in tqdm(os.listdir(data_path), desc=f"Processing {data_dir}/{label}"):
            try:
                # Construct full image path
                image_path = os.path.join(data_path, image_file)
                
                # Read and resize image
                image = cv2.imread(image_path)
                if image is None:
                    print(f"Warning: Could not load image: {image_path}")
                    continue
                    
                image = cv2.resize(image, (image_size, image_size))
                
                # Append data
                image_data.append(image)
                label_data.append(label)
            except Exception as e:
                print(f"Error processing {image_path}: {str(e)}")

# Convert to numpy arrays
image_data = np.array(image_data)
label_data = np.array(label_data)

# Shuffle the data
image_data, label_data = shuffle(image_data, label_data, random_state=42)

# Split into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    image_data, 
    label_data, 
    test_size=0.2, 
    random_state=42
)

Pre-processing data



Processing train/NORMAL: 100%|█████████████████████████████████████████████████████| 1341/1341 [00:38<00:00, 35.24it/s]
Processing train/PNEUMONIA: 100%|█████████████████████████████████████████████████| 3875/3875 [00:31<00:00, 121.60it/s]
Processing val/NORMAL: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 55.14it/s]
Processing val/PNEUMONIA: 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 121.71it/s]
Processing test/NORMAL: 100%|████████████████████████████████████████████████████████| 234/234 [00:05<00:00, 46.69it/s]
Processing test/PNEUMONIA: 100%|████████████████████████████████████████████████████| 390/390 [00:03<00:00, 106.65it/s]


In [5]:
# Convert labels to categorical format
def convert_labels(label_list, label_mapping):
    return to_categorical([label_mapping.index(label) for label in label_list])

# Convert train and test labels
Y_train = convert_labels(Y_train, labels)
Y_test = convert_labels(Y_test, labels)

print(f"\nData preprocessing complete:")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Image shape: {X_train[0].shape}")


Data preprocessing complete:
Training samples: 4684
Testing samples: 1172
Image shape: (150, 150, 3)


In [6]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow import keras

# Redefine the model to ensure correctness
vgg = VGG16(input_shape=(150, 150, 3), weights='imagenet', include_top=False)
for layer in vgg.layers:
    layer.trainable = False
x = Flatten()(vgg.output)
prediction = Dense(2, activation='softmax')(x)
modelvgg = Model(inputs=vgg.input, outputs=prediction)


In [7]:

# Compile the model
modelvgg.compile(
    optimizer='adam',
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=[
        keras.metrics.AUC(name='auc'),
        keras.metrics.Accuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='roc_auc', curve='ROC')
    ]
)



In [12]:

# Train the model
history = modelvgg.fit(
    X_train, Y_train,
    epochs=10,
    batch_size=10,
    validation_data=(X_test, Y_test),
    callbacks=[callback]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Evaluate the model on the test set
test_loss, test_auc, test_accuracy, test_precision, test_recall = modelvgg.evaluate(X_test, Y_test)

# Print the results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test AUC: {test_auc:.4f}")  # AUC (this includes ROC AUC)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")


Test Loss: 1.1309
Test AUC: 0.8404
Test Accuracy: 0.9676
Test Precision: 0.9676
Test Recall: 0.9699


In [None]:
Test Loss: 1.1309
Test AUC: 0.8404
Test Accuracy: 0.9676
Test Precision: 0.9676
Test Recall: 0.9699

In [15]:
# Save the trained model
modelvgg.save("modelvgg.h5")

In [None]:
from tensroflow.keras.models import load_model

In [None]:
import matplotlib.pyplot as plt

# Function to plot metrics
def plot_metric(history, metric, title):
    plt.plot(history.history[metric], label=f'Train {metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric}')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(metric.capitalize())
    plt.legend()
    plt.grid(True)
    plt.show()

# Plot all metrics
metrics = ['auc', 'accuracy', 'precision', 'recall', 'roc_auc', 'loss']
titles = [
    'AUC vs Epochs', 'Accuracy vs Epochs', 'Precision vs Epochs',
    'Recall vs Epochs', 'ROC AUC vs Epochs', 'Loss vs Epochs'
]

for metric, title in zip(metrics, titles):
    plot_metric(history, metric, title)

In [16]:
# Store VGG16 results
vgg16_results = {
    'Test Loss': test_loss,
    'Test AUC': test_auc,
    'Test Accuracy': test_accuracy,
    'Test Precision': test_precision,
    'Test Recall': test_recall,
}