In [11]:

import numpy as np # linear algebra
import os
from tqdm import tqdm
from tensorflow import keras
import cv2
from sklearn.utils import shuffle
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import tensorflow
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Model
import matplotlib.pyplot as plt

In [5]:
# Class names. They double as the sub-folder names read by the loader, and a
# name's position in this list is the integer class index used for one-hot
# encoding.
labels = [
    "NORMAL",
    "PNEUMONIA",
]

# Every image is resized to image_size x image_size pixels.
image_size = 150

# Dataset sub-directories that are scanned for images.
data_dirs = [
    "train",
    "val",
    "test",
]

In [6]:
# Empty accumulators: one for the image arrays, one for the matching class names.
image_data, label_data = [], []

In [7]:
print("Pre-processing data\n")

# Root folder that contains one sub-folder per entry in data_dirs, each of
# which contains one sub-folder per class name in labels.
base_path = "chest_xray"

# Accumulate into fresh local lists so that re-running this cell always starts
# from a clean slate. (image_data / label_data are rebound to ndarrays at the
# end of the cell, so appending to them directly fails on a second run.)
loaded_images = []
loaded_labels = []

for data_dir in data_dirs:
    for label in labels:
        data_path = os.path.join(base_path, data_dir, label)

        # isdir (not exists) so that a stray regular file is skipped as well
        # instead of crashing os.listdir below.
        if not os.path.isdir(data_path):
            print(f"Warning: Directory not found: {data_path}")
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded shuffle/split below reproducible from run to run.
        for image_file in tqdm(sorted(os.listdir(data_path)), desc=f"Processing {data_dir}/{label}"):
            # Built outside the try block so the error message can always use it.
            image_path = os.path.join(data_path, image_file)

            try:
                # cv2.imread returns None for files it cannot decode.
                image = cv2.imread(image_path)
                if image is None:
                    print(f"Warning: Could not load image: {image_path}")
                    continue

                image = cv2.resize(image, (image_size, image_size))
            except Exception as e:
                # Best effort: report the file and keep going with the rest.
                print(f"Error processing {image_path}: {str(e)}")
                continue

            loaded_images.append(image)
            loaded_labels.append(label)

# Fail early with a clear message rather than deep inside train_test_split.
if not loaded_images:
    raise ValueError(f"No images could be loaded from '{base_path}'; check the dataset location.")

# Convert to numpy arrays
image_data = np.array(loaded_images)
label_data = np.array(loaded_labels)

# Shuffle images and labels together (seeded)
image_data, label_data = shuffle(image_data, label_data, random_state=42)

# Hold out 20% of all loaded images. Stratifying keeps the class ratio the
# same in both parts (it requires at least two images per class).
X_train, X_test, Y_train, Y_test = train_test_split(
    image_data,
    label_data,
    test_size=0.2,
    random_state=42,
    stratify=label_data,
)

Pre-processing data



Processing train/NORMAL: 100%|█████████████████████████████████████████████████████| 1341/1341 [00:32<00:00, 41.29it/s]
Processing train/PNEUMONIA: 100%|█████████████████████████████████████████████████| 3875/3875 [00:32<00:00, 118.78it/s]
Processing val/NORMAL: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 63.74it/s]
Processing val/PNEUMONIA: 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 128.75it/s]
Processing test/NORMAL: 100%|████████████████████████████████████████████████████████| 234/234 [00:04<00:00, 56.38it/s]
Processing test/PNEUMONIA: 100%|████████████████████████████████████████████████████| 390/390 [00:02<00:00, 137.26it/s]


In [8]:
# Convert labels to categorical format
def convert_labels(label_list, label_mapping):
    """One-hot encode a sequence of class names.

    The width of the encoding is always ``len(label_mapping)``, so the result
    has the same number of columns even when ``label_list`` happens to contain
    only some of the classes (or is empty).

    Args:
        label_list: Iterable of class names; each must occur in ``label_mapping``.
        label_mapping: Sequence of class names; a name's position is its class index.

    Returns:
        A float32 ndarray of shape ``(len(label_list), len(label_mapping))``.

    Raises:
        ValueError: If a label is not present in ``label_mapping``.
    """
    # Explicit integer dtype keeps an empty input usable as an index array.
    class_ids = np.asarray(
        [label_mapping.index(label) for label in label_list], dtype=int
    )
    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(len(label_mapping), dtype="float32")[class_ids]

# One-hot encode both label splits against the shared class list.
# Note: this rebinds Y_train / Y_test, so the cell is meant to run once per load.
Y_train, Y_test = convert_labels(Y_train, labels), convert_labels(Y_test, labels)

# Summarise what came out of preprocessing.
print(f"\nData preprocessing complete:")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Image shape: {X_train[0].shape}")


Data preprocessing complete:
Training samples: 4684
Testing samples: 1172
Image shape: (150, 150, 3)


In [12]:
# ImageNet-pretrained VGG16 convolutional base without its dense classifier.
# The input size follows image_size so it cannot drift from the loader.
# NOTE(review): Keras documents that VGG16 expects inputs passed through
# keras.applications.vgg16.preprocess_input; no such step is visible in this
# notebook - confirm whether feeding raw pixel values is intended.
vgg = VGG16(
    input_shape=(image_size, image_size, 3),
    weights='imagenet',
    include_top=False,
)

# Freeze the pretrained layers; only the new head below is trained.
for layer in vgg.layers:
    layer.trainable = False

# Classification head: flatten the feature maps, then one softmax unit per class.
x = Flatten()(vgg.output)
prediction = Dense(len(labels), activation='softmax')(x)



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [13]:
# Full network: frozen VGG16 base feeding the softmax head.
modelvgg = Model(inputs=vgg.input, outputs=prediction)
modelvgg.summary()

# Adam optimiser with categorical cross-entropy (labels are one-hot);
# AUC is tracked under the metric name 'auc'.
modelvgg.compile(
    optimizer='adam',
    loss=tensorflow.losses.CategoricalCrossentropy(),
    metrics=[keras.metrics.AUC(name='auc')],
)

# Stop once val_loss has not improved for 8 consecutive epochs and roll the
# model back to the weights of its best epoch.
callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True,
)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 150, 150, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 150, 150, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 150, 150, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 75, 75, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 75, 75, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 75, 75, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 37, 37, 128)       0     

In [14]:
# Train the classification head and keep the per-epoch metrics in `history`.
# NOTE(review): the held-out split is also used as validation data here, so it
# drives early stopping - confirm a separate evaluation set is not needed.
history = modelvgg.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=10,
    validation_data=(X_test, Y_test),
    callbacks=[callback],  # Keras documents this argument as a list of callbacks
)


Epoch 1/10

KeyboardInterrupt: 