In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/FYP/chest_xray_for_SMOTE>.zip -d /content/extracted_files

In [3]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


In [4]:
# Define paths to dataset
base_dir = '/content/extracted_files/chest_xray_for_SMOTE'  # Adjust the path as necessary
train_dir = os.path.join(base_dir, '/content/extracted_files/chest_xray_for_SMOTE/train')
test_dir = os.path.join(base_dir, '/content/extracted_files/chest_xray_for_SMOTE/test')


In [5]:
# Define the data generator for loading images without augmentation
train_datagen = ImageDataGenerator(rescale=1.0/255, validation_split=0.2)  # 20% for validation


In [6]:
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),  # Resize images to 150x150
    batch_size=32,
    class_mode='binary',
    subset='training'
)

Found 4187 images belonging to 2 classes.


In [7]:
validation_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='validation'
)

Found 1045 images belonging to 2 classes.


In [8]:
# Test data generator
test_datagen = ImageDataGenerator(rescale=1.0/255)

test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    shuffle=False  # Don't shuffle test data
)

Found 624 images belonging to 2 classes.


In [9]:
print("Training set class distribution:", train_generator.classes.sum())
print("Validation set class distribution:", validation_generator.classes.sum())
print("Test set class distribution:", test_generator.classes.sum())


Training set class distribution: 3107
Validation set class distribution: 776
Test set class distribution: 390


In [11]:
# Build a simple CNN model for feature extraction
base_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
])
base_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
base_model.summary()


In [12]:
# Function to extract features from images
def extract_features(generator):
    features = []
    labels = []

    for batch_x, batch_y in generator:
        features_batch = base_model.predict(batch_x)
        features.extend(features_batch)
        labels.extend(batch_y)

        # Break after one epoch (since the flow_from_directory generates data indefinitely)
        if len(features) >= len(generator.filenames):
            break

    return np.array(features), np.array(labels)


In [13]:
# Extract features from training data
X_train, y_train = extract_features(train_generator)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 593ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 570ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [14]:
# Apply SMOTE to balance the classes
smote = SMOTE(sampling_strategy='auto')  # Auto means balance the classes
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Original data shape: {X_train.shape}, Resampled data shape: {X_train_resampled.shape}")




Original data shape: (4187, 82944), Resampled data shape: (6214, 82944)


In [15]:
# Define the final model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_resampled.shape[1],)),  # Fully connected layer
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Set the optimizer with a specific learning rate
learning_rate = 0.001  # Adjust this value as needed
optimizer = Adam(learning_rate=learning_rate)


In [17]:
# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [18]:
# Train the model
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_train, y_train),  # Validation on original data (without SMOTE)
    epochs=10,  # Adjust epochs if necessary
    batch_size=32
)


Epoch 1/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 234ms/step - accuracy: 0.6374 - loss: 0.9739 - val_accuracy: 0.9018 - val_loss: 0.2948
Epoch 2/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 226ms/step - accuracy: 0.7918 - loss: 0.4082 - val_accuracy: 0.8916 - val_loss: 0.3415
Epoch 3/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 229ms/step - accuracy: 0.8159 - loss: 0.3623 - val_accuracy: 0.9632 - val_loss: 0.1807
Epoch 4/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 257ms/step - accuracy: 0.8237 - loss: 0.3421 - val_accuracy: 0.9618 - val_loss: 0.1292
Epoch 5/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 232ms/step - accuracy: 0.7784 - loss: 0.3740 - val_accuracy: 0.9484 - val_loss: 0.2003
Epoch 6/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 228ms/step - accuracy: 0.7249 - loss: 0.4234 - val_accuracy: 0.9608 - val_loss: 0.1479
Epoch 7/10

In [19]:
# Extract features from test data
X_test, y_test = extract_features(test_generator)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 776ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 289ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [20]:
# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.5306 - loss: 0.7376
Test Accuracy: 74.84%


In [21]:
# Generate predictions
predictions = (model.predict(X_test) > 0.5).astype("int32")


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step


In [22]:
# Confusion Matrix and Classification Report
print("Confusion Matrix")
print(confusion_matrix(y_test, predictions))


Confusion Matrix
[[ 81 153]
 [  4 386]]


In [23]:
print("Classification Report")
print(classification_report(y_test, predictions, target_names=test_generator.class_indices.keys()))


Classification Report
              precision    recall  f1-score   support

      NORMAL       0.95      0.35      0.51       234
   PNEUMONIA       0.72      0.99      0.83       390

    accuracy                           0.75       624
   macro avg       0.83      0.67      0.67       624
weighted avg       0.80      0.75      0.71       624

