How to Reproduce Our Results

1. Install required libraries:
   pip install tensorflow pandas matplotlib seaborn scikit-learn

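2. Place the dataset inside the ./ML_P/new/ folder, then edit the hard-coded paths in the notebook (original_data_dir, augmented_data_dir, train_dir, valid_dir, test_dir) so they point to your own ML_P location. The dataset must be structured as: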
   - new/
     - Rose/
     - notRose/

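3. Run all cells of Group4_project_code.ipynb in order, from top to bottom (each cell relies on variables created by the previous ones).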


4. The final results (precision, recall, F1-score, accuracy, confusion matrices, ROC curves, and loss/accuracy curves) will be displayed in the notebook output.

# Import Libraries

In [None]:
import os
import shutil
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, save_img
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Data augmentation

In [None]:
# Source dataset folder, and the folder that will hold the originals plus
# their augmented copies.
original_data_dir = 'C:\\Users\\reems\\Desktop\\ML_P\\new'
augmented_data_dir = 'C:\\Users\\reems\\Desktop\\ML_P\\new_augmented'

# Start every run from a fresh copy of the originals: remove output left over
# from an earlier run, then duplicate the whole source tree.
if os.path.exists(augmented_data_dir):
    shutil.rmtree(augmented_data_dir)
shutil.copytree(original_data_dir, augmented_data_dir)

# Random transforms applied when generating augmented images.
augmentation_settings = {
    'rotation_range': 10,
    'width_shift_range': 0.05,
    'height_shift_range': 0.05,
    'zoom_range': 0.05,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_settings)

# Generate augmented images: for every class folder, pick a fraction of its
# images at random and write ONE randomly transformed copy of each into the
# matching folder under augmented_data_dir.
#
# NOTE(review): augmentation happens before the train/valid/test split (the
# split is later run on augmented_data_dir), so an augmented copy and the
# image it was derived from can end up in different splits. Consider
# augmenting only the training split to avoid leakage into validation/test.

# Fraction of each class that gets an augmented copy. 1.0 means every image
# is augmented once, i.e. each class doubles in size (not a 50% increase).
augment_fraction = 1.0

for class_name in os.listdir(original_data_dir):
    class_dir = os.path.join(original_data_dir, class_name)
    # Only sub-folders are classes; skip stray files sitting next to them
    # (os.listdir on a file would raise).
    if not os.path.isdir(class_dir):
        continue

    save_dir = os.path.join(augmented_data_dir, class_name)
    os.makedirs(save_dir, exist_ok=True)

    images = os.listdir(class_dir)
    num_original = len(images)
    num_to_generate = int(num_original * augment_fraction)

    selected_images = random.sample(images, num_to_generate)  # random subset, no repeats

    for idx, img_name in enumerate(selected_images):
        img_path = os.path.join(class_dir, img_name)
        img = load_img(img_path)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # add a leading batch axis of size 1

        # A new iterator is created per image, so without a distinguishing
        # prefix the saved file names differ only by the short random suffix
        # Keras appends, and two images can silently overwrite each other.
        # Putting idx in the prefix makes every name unique within the class.
        gen = datagen.flow(
            x,
            batch_size=1,
            save_to_dir=save_dir,
            save_prefix=f'aug_{idx}',
            save_format='jpeg',
        )

        next(gen)  # drawing one batch is what writes the augmented file

# Report how many files each class folder holds before and after augmentation.
for heading, root_dir in (
    ("number of image before augmentation:", original_data_dir),
    ("\nnumber of image after augmentation:", augmented_data_dir),
):
    print(heading)
    for class_name in os.listdir(root_dir):
        class_dir = os.path.join(root_dir, class_name)
        # Count only inside class sub-folders; plain files are ignored.
        if not os.path.isdir(class_dir):
            continue
        num_images = len(os.listdir(class_dir))
        print(f"{class_name}: {num_images} image")

# Split the data

In [None]:

# Destination folders for the three data splits.
train_dir = 'C:\\Users\\reems\\Desktop\\ML_P\\train'
valid_dir = 'C:\\Users\\reems\\Desktop\\ML_P\\valid'
test_dir  = 'C:\\Users\\reems\\Desktop\\ML_P\\test'

# Recreate each split folder empty so files from a previous run cannot linger.
for dir_path in (train_dir, valid_dir, test_dir):
    already_there = os.path.exists(dir_path)
    if already_there:
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

# split
def split_data(original_data_dir, train_dir, valid_dir, test_dir, test_size=0.15, valid_size=0.15):
    """Copy every class folder's files into train/validation/test folders.

    Each class sub-folder of ``original_data_dir`` is split independently, so
    the requested proportions hold per class. Files are copied, not moved.

    Args:
        original_data_dir: Folder containing one sub-folder per class.
        train_dir: Destination root for the training split.
        valid_dir: Destination root for the validation split.
        test_dir: Destination root for the test split.
        test_size: Fraction of each class assigned to the test split.
        valid_size: Fraction of each class assigned to the validation split.
    """
    # Share of each class that is held out of training (validation + test).
    holdout_size = valid_size + test_size

    for class_name in sorted(os.listdir(original_data_dir)):
        class_path = os.path.join(original_data_dir, class_name)
        # Only sub-folders are classes; skip stray files in the dataset root.
        if not os.path.isdir(class_path):
            continue

        # os.listdir returns names in arbitrary order. Sorting fixes the input
        # order so that random_state=42 yields the same split on every run
        # and on every machine.
        images = sorted(os.listdir(class_path))

        # First cut: training vs. everything held out.
        train_imgs, temp_imgs = train_test_split(images, test_size=holdout_size, random_state=42)
        # Second cut: divide the held-out part into validation and test.
        # test_size is re-expressed as a fraction of the held-out part.
        valid_imgs, test_imgs = train_test_split(temp_imgs, test_size=test_size / holdout_size, random_state=42)

        # Copy each subset into <split root>/<class name>/.
        for split_imgs, split_dir in zip([train_imgs, valid_imgs, test_imgs], [train_dir, valid_dir, test_dir]):
            class_split_dir = os.path.join(split_dir, class_name)
            os.makedirs(class_split_dir, exist_ok=True)
            for img in split_imgs:
                shutil.copy(os.path.join(class_path, img), os.path.join(class_split_dir, img))

# Split the augmented dataset into the three folders.
split_data(augmented_data_dir, train_dir, valid_dir, test_dir)

# Print the number of images per class for each split.
split_roots = {'Train': train_dir, 'Validation': valid_dir, 'Test': test_dir}
for name, dir_path in split_roots.items():
    print(f"\n{name} set:")
    for class_name in os.listdir(dir_path):
        class_path = os.path.join(dir_path, class_name)
        image_count = len(os.listdir(class_path))
        print(f"{class_name}: {image_count} images")


# Setting ImageDataGenerator

In [None]:

# One generator configuration for all splits: every image goes through the
# VGG16 preprocess_input function; no augmentation is applied here.
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Arguments common to the three directory iterators.
# NOTE(review): with class_mode='binary' the 0/1 labels come from the class
# sub-folder names; check train_generator.class_indices to confirm which
# class is treated as the positive (1) label.
shared_flow_args = {'target_size': (224, 224), 'class_mode': 'binary'}

# Training data: smaller batches, reshuffled.
train_generator = datagen.flow_from_directory(
    train_dir, batch_size=32, shuffle=True, **shared_flow_args
)

# Validation data: fixed order so predictions line up with .classes.
val_generator = datagen.flow_from_directory(
    valid_dir, batch_size=64, shuffle=False, **shared_flow_args
)

# Test data: fixed order for the same reason.
test_generator = datagen.flow_from_directory(
    test_dir, batch_size=64, shuffle=False, **shared_flow_args
)


# Model design + Train

In [None]:
# Build the CNN: three identical-shaped convolution stages followed by a
# small dense classifier with a single sigmoid output.

# (number of filters, L2 factor) for each convolution stage, in order.
conv_specs = [(16, 0.001), (32, 0.001), (64, 0.005)]

stack = []
for position, (filters, weight_decay) in enumerate(conv_specs):
    # Only the very first layer declares the input shape.
    first_layer_args = {'input_shape': (224, 224, 3)} if position == 0 else {}
    stack.extend([
        Conv2D(filters, (3,3), activation='relu', kernel_regularizer=l2(weight_decay), **first_layer_args),
        BatchNormalization(),
        MaxPooling2D(2,2),
        Dropout(0.3),
    ])

stack.extend([
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.005)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

model = Sequential(stack)

# Adam with an explicit learning rate.
optimizer = Adam(learning_rate=0.0001)

# Binary cross-entropy loss; the metric order here determines the order of
# the values returned by model.evaluate (after the loss).
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', AUC(), Precision(), Recall()],
)

# Stop once validation loss has not improved for 5 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train for at most 100 epochs, validating after each one.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=100,
    callbacks=[early_stop],
)

# Evaluate on the validation set. The values come back as the loss followed
# by the metrics in the order they were passed to model.compile.
val_loss, val_accuracy, val_auc, val_precision, val_recall = model.evaluate(val_generator)

# Ground-truth labels, in the generators' (unshuffled) file order.
val_labels = val_generator.classes
test_labels = test_generator.classes

# Predicted probabilities for the validation and test sets. Each set is
# predicted exactly once; the original cell ran the validation prediction
# twice and discarded the first result.
val_predictions_prob = model.predict(val_generator, verbose=1)
test_predictions_prob = model.predict(test_generator, verbose=1)

# Convert probabilities into class labels (0 or 1): strictly above the
# threshold counts as class 1.
threshold = 0.53
val_predictions = (val_predictions_prob > threshold).astype(int)
test_predictions = (test_predictions_prob > threshold).astype(int)

# Model Evaluation

In [None]:
# ====== Evaluation Metrics ======
# Precision, recall, F1 and accuracy for the validation and test sets, shown
# side by side as a heatmap table.
#
# The probabilities computed once in the training cell (val_predictions_prob /
# test_predictions_prob) are reused instead of running model.predict again,
# and the shared `threshold` variable replaces the repeated literal 0.53 so
# the two cells cannot drift apart.

# Thresholded predictions, flattened to 1-D so their shape matches the labels.
val_predictions = (val_predictions_prob > threshold).astype(int).ravel()
test_predictions = (test_predictions_prob > threshold).astype(int).ravel()

# Validation-set metrics
precision_val = precision_score(val_labels, val_predictions)
recall_val = recall_score(val_labels, val_predictions)
f1_val = f1_score(val_labels, val_predictions)
accuracy_val = accuracy_score(val_labels, val_predictions)

# Metrics table for validation set
metrics_df_val = pd.DataFrame({
    "Metric": ["Precision", "Recall", "F1-Score", "Accuracy"],
    "Validation Value": [precision_val, recall_val, f1_val, accuracy_val]
})

# Test-set metrics
precision_test = precision_score(test_labels, test_predictions)
recall_test = recall_score(test_labels, test_predictions)
f1_test = f1_score(test_labels, test_predictions)
accuracy_test = accuracy_score(test_labels, test_predictions)

# Metrics table for test set
metrics_df_test = pd.DataFrame({
    "Metric": ["Precision", "Recall", "F1-Score", "Accuracy"],
    "Test Value": [precision_test, recall_test, f1_test, accuracy_test]
})

# Join the two tables on the metric name: one row per metric, one column per set.
metrics_df = pd.merge(metrics_df_val, metrics_df_test, on="Metric")

# Draw the table as an annotated heatmap (sets as rows, metrics as columns).
plt.figure(figsize=(8, 3))
sns.set(font_scale=1.2)
sns.heatmap(metrics_df.set_index("Metric").T, annot=True, fmt=".2f", cmap="Blues", cbar=False, linewidths=1)
plt.title("Validation vs Test Set Evaluation Metrics")
plt.show()

print("\n")
# ====== Confusion matrix ======
# Confusion matrices for the validation and test sets, drawn side by side.
#
# NOTE(review): "Positive" means label 1, i.e. whichever class the generator
# mapped to index 1 - check val_generator.class_indices to see which folder
# that is.

cm_rows = ["Actual Negative", "Actual Positive"]
cm_cols = ["Predicted Negative", "Predicted Positive"]

# labels=[0, 1] pins the matrix to 2x2 even if a split (or the predictions)
# contain only one class; without it the 4-way unpacking below would fail.
conf_matrix_val = confusion_matrix(val_labels, val_predictions, labels=[0, 1])
TN_val, FP_val, FN_val, TP_val = conf_matrix_val.ravel()

# Rows are actual classes and columns are predicted classes, which is the
# layout confusion_matrix already returns.
conf_matrix_df_val = pd.DataFrame(conf_matrix_val, index=cm_rows, columns=cm_cols)

# Same for the test set
conf_matrix_test = confusion_matrix(test_labels, test_predictions, labels=[0, 1])
TN_test, FP_test, FN_test, TP_test = conf_matrix_test.ravel()
conf_matrix_df_test = pd.DataFrame(conf_matrix_test, index=cm_rows, columns=cm_cols)

# Draw both matrices in one figure: validation on the left, test on the right.
plt.figure(figsize=(12, 6))
for slot, (cm_table, cm_title) in enumerate(
    [
        (conf_matrix_df_val, "Validation Set Confusion Matrix"),
        (conf_matrix_df_test, "Test Set Confusion Matrix"),
    ],
    start=1,
):
    plt.subplot(1, 2, slot)
    sns.heatmap(cm_table, annot=True, fmt="d", cmap="Blues", cbar=False, linewidths=1)
    plt.title(cm_title)

plt.tight_layout()
plt.show()

# ====== ROC Curve ======
# ROC curves (with their AUC) for the validation and test sets.
#
# The probabilities already computed in the training cell are reused here;
# the original cell ran model.predict over both sets again just to get the
# same scores. They are flattened to 1-D to match the label arrays.

# ROC curve for validation set
fpr_val, tpr_val, thresholds_val = roc_curve(val_labels, val_predictions_prob.ravel())
roc_auc_val = auc(fpr_val, tpr_val)

# ROC curve for test set
fpr_test, tpr_test, thresholds_test = roc_curve(test_labels, test_predictions_prob.ravel())
roc_auc_test = auc(fpr_test, tpr_test)

# Draw both curves in one figure: validation on the left, test on the right.
plt.figure(figsize=(8, 4))
for slot, (fpr, tpr, roc_auc, roc_title) in enumerate(
    [
        (fpr_val, tpr_val, roc_auc_val, 'ROC Curve for Validation Set'),
        (fpr_test, tpr_test, roc_auc_test, 'ROC Curve for Test Set'),
    ],
    start=1,
):
    plt.subplot(1, 2, slot)
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(roc_title)
    plt.legend(loc='lower right')
    plt.grid(True)

plt.tight_layout()
plt.show()

# loss and accuracy curve

In [None]:
# Evaluate the trained model on the test set; only loss and accuracy are kept.
test_loss, test_accuracy, _, _, _ = model.evaluate(test_generator)

# The test values are single numbers; repeat them once per trained epoch so
# they can be drawn as horizontal reference lines on the curves.
test_losses = [test_loss] * len(history.history['loss'])
test_accuracies = [test_accuracy] * len(history.history['accuracy'])

# One panel per quantity:
# (training key, validation key, test line, label, training colour, validation colour)
panels = [
    ('loss', 'val_loss', test_losses, 'Loss', 'red', 'orange'),
    ('accuracy', 'val_accuracy', test_accuracies, 'Accuracy', 'green', 'blue'),
]

plt.figure(figsize=(12, 5))

for slot, (train_key, val_key, test_series, label, train_color, val_color) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label=f'Training {label}', color=train_color)
    plt.plot(history.history[val_key], label=f'Validation {label}', color=val_color)
    plt.plot(test_series, label=f'Test {label}', color='purple', linestyle='--')
    plt.title(f'{label} Curve')
    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()
