In [None]:

import kagglehub
# Fetch the 'andrewmvd/leukemia-classification' dataset through kagglehub and
# keep whatever dataset_download returns under a path-named variable.
andrewmvd_leukemia_classification_path = kagglehub.dataset_download(
    'andrewmvd/leukemia-classification'
)

print('Data source import complete.')


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras import regularizers

# VGG16
from keras.applications.vgg16 import VGG16

from PIL import Image

In [None]:

def create_df(dataset):
    """Build a DataFrame of file paths and class labels found under ``dataset``.

    The directory tree is walked recursively. Every file that sits in a
    directory whose *name* is exactly ``all`` is labelled ``'all'``; every
    other file is labelled ``'hem'``.

    Directories and files are visited in sorted order, so the row order (and
    therefore any seeded split of the result) does not depend on the order in
    which the filesystem happens to list entries.

    Args:
        dataset: Root directory to scan.

    Returns:
        pandas.DataFrame with the columns ``'Image Path'`` and ``'Label'``,
        one row per file found.
    """
    image_paths, labels = [], []

    for dirpath, dirnames, filenames in os.walk(dataset):
        # Sorting dirnames in place also fixes the order in which os.walk
        # descends into the subdirectories.
        dirnames.sort()

        # Compare the directory name rather than the last three characters of
        # the path, so e.g. ".../small" is not mistaken for the 'all' class.
        folder = os.path.basename(os.path.normpath(dirpath))
        label = 'all' if folder == 'all' else 'hem'

        for filename in sorted(filenames):
            image_paths.append(os.path.join(dirpath, filename))
            labels.append(label)

    return pd.DataFrame({'Image Path': image_paths, 'Label': labels})


# Index every file below the training directory together with its class label.
train_dir = "/kaggle/input/leukemia-classification/C-NMC_Leukemia/training_data"
df = create_df(train_dir)

# Stratified 70 / 15 / 15 split: take 70% for training first, then cut the
# remainder in half for validation and test. Both calls share the same seed.
train_df, remaining_df = train_test_split(
    df,
    train_size=0.7,
    shuffle=True,
    random_state=31,
    stratify=df['Label'],
)
valid_df, test_df = train_test_split(
    remaining_df,
    train_size=0.5,
    shuffle=True,
    random_state=31,
    stratify=remaining_df['Label'],
)

print(f"Number of training samples: {len(train_df.index)}")
print(f"Number of test samples: {len(test_df.index)}")
print(f"Number of validation samples: {len(valid_df.index)}")

In [None]:
def show_history_plot(history):
    """Draw the accuracy and loss learning curves side by side.

    Args:
        history: Mapping holding one value per epoch under each of the keys
            ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``.
    """
    n_epochs = len(history['accuracy'])
    epochs = range(1, n_epochs + 1)  # epochs are numbered from 1 on the x axis

    plt.figure(figsize=(15, 5))

    # (subplot position, history key, display name) for the left and right panel.
    panels = ((1, 'accuracy', 'Accuracy'), (2, 'loss', 'Loss'))
    for position, key, pretty in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, history[key], 'b', label=f'Training {key}', marker='o')
        plt.plot(epochs, history[f'val_{key}'], 'c', label=f'Validation {key}', marker='o')
        plt.title(f'Training and Validation {pretty}', fontsize=14)
        plt.xlabel('Epochs', fontsize=12)
        plt.ylabel(pretty, fontsize=12)
        plt.legend()
        plt.grid(True)

    # Tidy the spacing between the two panels, then render.
    plt.tight_layout()
    plt.show()

In [None]:
def show_conf_matrix(model):
    """Plot the confusion matrix of ``model`` on the notebook-level ``test_gen``.

    Args:
        model: Object whose ``predict`` returns one row of per-class scores
            for every sample yielded by ``test_gen``.
    """
    test_gen.reset()  # Reset the generator to be sure it's at the start of the dataset

    # Keras iterators report their number of batches through len(). The
    # previous ``n // batch_size + 1`` asks for one batch too many whenever n
    # is an exact multiple of the batch size, which can leave more predictions
    # than true labels.
    y_pred = model.predict(test_gen, steps=len(test_gen), verbose=0)

    # Class names ordered by their integer index, so tick i is named after class i.
    label_dict = test_gen.class_indices
    classes = sorted(label_dict, key=label_dict.get)

    # Convert predictions to labels
    pred_labels = np.argmax(y_pred, axis=1)
    y_true = test_gen.classes

    # Passing ``labels`` keeps the matrix shape in step with the tick labels
    # even if a class happens to be absent from both y_true and pred_labels.
    cm = metrics.confusion_matrix(y_true, pred_labels, labels=list(range(len(classes))))
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)

    # Create the sized figure first and draw into it: calling plt.figure()
    # after plotting opens a second, empty figure instead of resizing this one.
    fig, ax = plt.subplots(figsize=(7, 7))
    cm_display.plot(ax=ax, cmap=plt.cm.Blues, colorbar=False)
    ax.set_title('Confusion Matrix', fontsize=16)
    plt.show()

In [None]:
def evaluation_matrix(model):
    """Print a loss/accuracy table for the train, validation and test generators.

    Uses the notebook-level ``train_gen``, ``valid_gen`` and ``test_gen``.

    Args:
        model: Object whose ``evaluate`` returns a sequence with the loss at
            index 0 and the accuracy at index 1.
    """
    splits = (("Train", train_gen), ("Validation", valid_gen), ("Test", test_gen))

    rows = []
    for split_name, generator in splits:
        # Score every split over all of its own batches. Reusing a step count
        # derived from the test set would score "Train" on only a fraction of
        # the training data and drop the final partial batch of every split.
        score = model.evaluate(generator, steps=len(generator), verbose=0)
        # Floats are formatted to five decimal places.
        rows.append("{:<12} {:<10.5f} {:<10.5f}".format(split_name, score[0], score[1]))

    header = "{:<12} {:<10} {:<10}".format("", "Loss", "Accuracy")
    separator = '-' * len(header)

    table = '\n'.join([header, separator, *rows])
    print(table)

In [None]:
# Draw three example images per class. The seed (same value as the one used
# for the train/valid/test split) keeps the preview identical across runs.
hem_img = train_df[train_df['Label'] == 'hem'].sample(3, random_state=31)
all_img = train_df[train_df['Label'] == 'all'].sample(3, random_state=31)
sampled_df = pd.concat([hem_img, all_img])

# Create a figure with subplots to show the images in
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))

# Concatenation order puts the 'hem' samples on the top row and the 'all'
# samples on the bottom row.
for i, (_index, row) in enumerate(sampled_df.iterrows()):
    img = mpimg.imread(row['Image Path'])
    ax = axes[i // 3, i % 3]
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(f"Label: {row['Label']}")

plt.show()

In [None]:
batch_size = 40

# Only the training stream is augmented (random horizontal flips); the
# validation and test streams use a generator with no augmentation.
train_data_generator = ImageDataGenerator(horizontal_flip=True)
valid_data_generator = ImageDataGenerator()

train_gen = train_data_generator.flow_from_dataframe(
    train_df, x_col='Image Path', y_col='Label', target_size=(224, 224),
    class_mode='categorical', color_mode='rgb', shuffle=True, batch_size=batch_size)

valid_gen = valid_data_generator.flow_from_dataframe(
    valid_df, x_col='Image Path', y_col='Label', target_size=(224, 224),
    class_mode='categorical', color_mode='rgb', shuffle=True, batch_size=batch_size)

# shuffle=False keeps predictions aligned with test_gen.classes.
test_gen = valid_data_generator.flow_from_dataframe(
    test_df, x_col='Image Path', y_col='Label', target_size=(224, 224),
    class_mode='categorical', color_mode='rgb', shuffle=False, batch_size=batch_size)

# Number of batches needed to cover each split exactly once (len() of a Keras
# iterator). The training step count must come from the *training* generator;
# deriving it from test_gen limits every epoch to a test-set-sized slice of
# the training data.
train_steps = len(train_gen)
validation_steps = len(valid_gen)

In [None]:
# Instantiate base model with ImageNet weights. The base is frozen below, so it
# has to start from pre-trained weights: with weights=None the frozen layers
# would stay at their random initialisation and only the small head could learn.
# (Loading 'imagenet' weights downloads them on first use.)
img_shape = (224, 224, 3)
VGG16_base_model = VGG16(weights='imagenet', input_shape=img_shape, include_top=False, pooling=None)

# Freeze base model to keep pre-trained weights
VGG16_base_model.trainable = False

# NOTE(review): the ImageNet VGG16 weights were trained on inputs passed through
# keras.applications.vgg16.preprocess_input; the generators feed raw pixel
# values. Consider adding that preprocessing - confirm against the data pipeline.

# Add custom top layers for the two-class problem:
# global max pooling -> dropout -> 2-way softmax.
last_layer = VGG16_base_model.get_layer('block5_pool')
last_output = last_layer.output
x = keras.layers.GlobalMaxPooling2D()(last_output)
x = keras.layers.Dropout(0.5)(x)
# softmax, not sigmoid: the labels are one-hot (class_mode='categorical') and the
# loss is categorical_crossentropy, which expects one probability distribution
# over the classes. This also matches the EfficientNet head in this notebook.
x = keras.layers.Dense(2, activation='softmax')(x)

# Create new model
VGG16_model = tf.keras.Model(VGG16_base_model.input, x, name="VGG16_model")

VGG16_model.compile(Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

VGG16_model.summary()

In [None]:
epochs = 2

# Take the step counts straight from the generators being iterated, so every
# epoch covers the whole training set and validation covers the whole
# validation set. No batch_size is passed: the generators already yield
# batches, and Keras documents that batch_size must not be specified for
# generator input.
history_VGG16 = VGG16_model.fit(
    train_gen,
    steps_per_epoch=len(train_gen),
    validation_data=valid_gen,
    validation_steps=len(valid_gen),
    epochs=epochs,
    verbose=1
)

In [None]:
# Plot the per-epoch accuracy and loss curves recorded while fitting VGG16_model.
show_history_plot(history_VGG16.history)

In [None]:
# Print the loss/accuracy table of VGG16_model for the train, validation and test generators.
evaluation_matrix(VGG16_model)

In [None]:
# Plot the confusion matrix of VGG16_model's predictions on the test generator.
show_conf_matrix(VGG16_model)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, cohen_kappa_score, recall_score, precision_score, f1_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on the test data: one row of class probabilities per image.
# No batch_size is passed because the generator already yields batches.
test_gen.reset()
y_pred_probs = VGG16_model.predict(test_gen)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = test_gen.classes

training_accuracy = history_VGG16.history['accuracy'][-1]  # Last epoch training accuracy
validation_accuracy = history_VGG16.history['val_accuracy'][-1]

# Test Accuracy
test_accuracy = np.mean(y_pred == y_true)

# The generator assigns the integer class indices itself (alphanumeric order of
# the label strings, i.e. 'all' -> 0 and 'hem' -> 1), so read the mapping back
# instead of assuming index 1 is the diseased class.
class_indices = test_gen.class_indices
class_names = sorted(class_indices, key=class_indices.get)
# 'all' (acute lymphoblastic leukemia) is the diseased class, 'hem' the healthy one.
display_names = {'all': 'Diseased', 'hem': 'Non_Diseased'}
diseased_label = class_indices['all']

# Precision, Recall, F1 Score for the diseased class
precision = precision_score(y_true, y_pred, pos_label=diseased_label)
recall = recall_score(y_true, y_pred, pos_label=diseased_label)
f1 = f1_score(y_true, y_pred, pos_label=diseased_label)

# Classification Report, with each row named after the class it really describes
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=[display_names[name] for name in class_names],
)
print("Classification Report:\n", report)


# Print Metrics
print(f"Training Accuracy: {training_accuracy:.4f}")
print(f"Validation Accuracy: {validation_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Precision (Diseased): {precision:.4f}")
print(f"Recall (Diseased): {recall:.4f}")
print(f"F1 Score (Diseased): {f1:.4f}")


In [None]:
# EfficientNetB3 backbone without its classification head. pooling='max' is
# requested from the backbone and no pre-trained weights are loaded (weights=None).
img_shape = (224, 224, 3)
EN3_base_model = tf.keras.applications.efficientnet.EfficientNetB3(
    input_shape=img_shape,
    include_top=False,
    weights=None,
    pooling='max',
)

# Stack the classification head on top of the backbone, one layer at a time:
# batch norm -> 256-unit ReLU layer -> dropout -> 2-way softmax.
EN3_model = Sequential()
EN3_model.add(EN3_base_model)
EN3_model.add(BatchNormalization())
EN3_model.add(Dense(256, activation='relu'))
EN3_model.add(Dropout(0.45))
EN3_model.add(Dense(2, activation='softmax'))

# Same optimiser, loss and metric as the VGG16 model in this notebook.
EN3_model.compile(
    optimizer=Adamax(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

EN3_model.summary()

In [None]:
epochs = 2

# Train on the training generator and validate on the validation generator.
# No explicit step counts are given (validation_steps=None).
history_EN3 = EN3_model.fit(
    train_gen,
    validation_data=valid_gen,
    validation_steps=None,
    epochs=epochs,
    shuffle=False,
    verbose=1,
)

In [None]:
# Plot the per-epoch accuracy and loss curves recorded while fitting EN3_model.
show_history_plot(history_EN3.history)

In [None]:
# Print the loss/accuracy table of EN3_model for the train, validation and test generators.
evaluation_matrix(EN3_model)

In [None]:
# Plot the confusion matrix of EN3_model's predictions on the test generator.
show_conf_matrix(EN3_model)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, cohen_kappa_score, recall_score, precision_score, f1_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on the test data: one row of class probabilities per image.
# No batch_size is passed because the generator already yields batches.
test_gen.reset()
y_pred_probs = EN3_model.predict(test_gen)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = test_gen.classes


training_accuracy = history_EN3.history['accuracy'][-1]  # Last epoch training accuracy
validation_accuracy = history_EN3.history['val_accuracy'][-1]

# Test Accuracy
test_accuracy = np.mean(y_pred == y_true)

# The generator assigns the integer class indices itself (alphanumeric order of
# the label strings, i.e. 'all' -> 0 and 'hem' -> 1), so read the mapping back
# instead of assuming index 1 is the diseased class.
class_indices = test_gen.class_indices
class_names = sorted(class_indices, key=class_indices.get)
# 'all' (acute lymphoblastic leukemia) is the diseased class, 'hem' the healthy one.
display_names = {'all': 'Diseased', 'hem': 'Non_Diseased'}
diseased_label = class_indices['all']

# Precision, Recall, F1 Score for the diseased class
precision = precision_score(y_true, y_pred, pos_label=diseased_label)
recall = recall_score(y_true, y_pred, pos_label=diseased_label)
f1 = f1_score(y_true, y_pred, pos_label=diseased_label)

# Classification Report, with each row named after the class it really describes
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=[display_names[name] for name in class_names],
)
print("Classification Report:\n", report)


# Print Metrics
print(f"Training Accuracy: {training_accuracy:.4f}")
print(f"Validation Accuracy: {validation_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Precision (Diseased): {precision:.4f}")
print(f"Recall (Diseased): {recall:.4f}")
print(f"F1 Score (Diseased): {f1:.4f}")
