In [None]:
import os
import shutil
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50, EfficientNetB0
from tensorflow.keras.utils import image_dataset_from_directory

In [None]:
# Parameters
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 10

DATA_ROOT = "Oral Cancer Prediction"
DATA_PATH = os.path.join(DATA_ROOT, "assets", "dataset")
BROKEN_PATH = os.path.join(DATA_ROOT, "assets", "broken")
CLASS_NAMES = ["cancer", "normal"]

In [None]:
def screen_non_jpg():
    for class_name in CLASS_NAMES:
        class_path = os.path.join(DATA_PATH, class_name)

        for img_name in os.listdir(class_path):
            img_path = os.path.join(class_path, img_name)
            try:
                img = tf.io.read_file(img_path)
                # This will crash if the image is non-decodable by tf
                img = tf.io.decode_jpeg(img)
            except Exception as e:
                print(f"[BROKEN] {img_path}: {e}")
                os.makedirs(BROKEN_PATH, exist_ok=True)
                shutil.move(img_path, os.path.join(BROKEN_PATH, img_name))


# Clean corrupt images
screen_non_jpg()

In [None]:
# Load training and validation data
train_ds = image_dataset_from_directory(
    DATA_PATH,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

val_ds = image_dataset_from_directory(
    DATA_PATH,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

Found 719 files belonging to 2 classes.
Using 576 files for training.
Found 719 files belonging to 2 classes.
Using 143 files for validation.


In [None]:
print(train_ds.class_names)
print(val_ds.class_names)

['cancer', 'normal']
['cancer', 'normal']


In [None]:
# Counting number of images in each class
class_counts = {}
for class_name in CLASS_NAMES:
    class_path = os.path.join(DATA_PATH, class_name)
    class_counts[class_name] = len(os.listdir(class_path))
print(f"{class_counts=}")

class_counts={'cancer': 188, 'normal': 531}


In [None]:
# Prefetch for performance
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Data augmentation
data_augmentation = models.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
    ]
)

In [None]:
# Functional model
inputs = layers.Input(shape=(224, 224, 3))
x = data_augmentation(inputs)
x = layers.Rescaling(1.0 / 255)(x)

# Load pretrained base
base_model = ResNet50(weights="imagenet", include_top=False)
base_model.trainable = False
x = base_model(x, training=False)

In [None]:
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inputs, outputs)

In [None]:
# Compile
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

# Train
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 8s/step - accuracy: 0.6559 - auc: 0.4998 - loss: 0.6877 - val_accuracy: 0.7133 - val_auc: 0.3955 - val_loss: 0.6266
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 7s/step - accuracy: 0.7469 - auc: 0.4715 - loss: 0.5983 - val_accuracy: 0.7133 - val_auc: 0.5890 - val_loss: 0.5981
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 7s/step - accuracy: 0.7423 - auc: 0.4818 - loss: 0.5898 - val_accuracy: 0.7133 - val_auc: 0.6692 - val_loss: 0.5923
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 7s/step - accuracy: 0.7495 - auc: 0.5476 - loss: 0.5725 - val_accuracy: 0.7133 - val_auc: 0.6828 - val_loss: 0.5907
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 8s/step - accuracy: 0.7518 - auc: 0.5354 - loss: 0.5801 - val_accuracy: 0.7133 - val_auc: 0.6894 - val_loss: 0.5944
Epoch 6/10
[1m18/18[0m [32m━━━━━

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 5s/step - accuracy: 0.7504 - auc: 0.6527 - loss: 0.5538
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4s/step - accuracy: 0.7096 - auc: 0.6713 - loss: 0.5975
train_acc: 0.7448 - loss: 0.5601 - auc: 0.6563
  val_acc: 0.7133 - loss: 0.5904 - auc: 0.7174


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

## 2nd Iteration

attempting to lower the losses by adding weights to class according to their file count.


In [None]:
class_indices = {name: idx for idx, name in enumerate(CLASS_NAMES)}
print(class_indices)
# Convert to numpy for sklearn
y = []
for class_name, count in class_counts.items():
    y.extend([class_indices[class_name]] * count)

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

print(class_weight_dict)

{'cancer': 0, 'normal': 1}
{0: np.float64(1.9122340425531914), 1: np.float64(0.6770244821092278)}


In [None]:
# Train
history = model.fit(
    train_ds, validation_data=val_ds, epochs=EPOCHS, class_weight=class_weight_dict
)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 7s/step - accuracy: 0.6107 - auc: 0.5152 - loss: 0.7385 - val_accuracy: 0.2867 - val_auc: 0.7270 - val_loss: 0.7248
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 9s/step - accuracy: 0.4173 - auc: 0.4971 - loss: 0.6868 - val_accuracy: 0.7133 - val_auc: 0.7377 - val_loss: 0.6605
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 9s/step - accuracy: 0.5682 - auc: 0.5139 - loss: 0.6837 - val_accuracy: 0.6783 - val_auc: 0.7104 - val_loss: 0.6877
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 8s/step - accuracy: 0.5395 - auc: 0.5499 - loss: 0.6805 - val_accuracy: 0.7203 - val_auc: 0.7230 - val_loss: 0.6669
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 8s/step - accuracy: 0.7382 - auc: 0.6091 - loss: 0.6754 - val_accuracy: 0.7133 - val_auc: 0.7286 - val_loss: 0.6616
Epoch 6/10
[1m18/18[0m [32m━━━━━

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 5s/step - accuracy: 0.7328 - auc: 0.6556 - loss: 0.6730
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4s/step - accuracy: 0.6915 - auc: 0.6784 - loss: 0.6781
train_acc: 0.7378 - loss: 0.6734 - auc: 0.6600
  val_acc: 0.7413 - loss: 0.6749 - auc: 0.7243


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

## 3rd Iteration

Attempt to lower the losses by not freezing base model.


In [None]:
# After first training run:
base_model.trainable = True

# Re-compile with lower learning rate
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

# Continue training
history_finetune = model.fit(
    train_ds, validation_data=val_ds, epochs=EPOCHS, class_weight=class_weight_dict
)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m557s[0m 28s/step - accuracy: 0.4748 - auc_1: 0.5471 - loss: 0.6872 - val_accuracy: 0.7133 - val_auc_1: 0.7096 - val_loss: 0.6672
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 25s/step - accuracy: 0.6015 - auc_1: 0.7114 - loss: 0.6128 - val_accuracy: 0.7133 - val_auc_1: 0.6777 - val_loss: 0.6732
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 26s/step - accuracy: 0.7623 - auc_1: 0.8418 - loss: 0.5356 - val_accuracy: 0.7133 - val_auc_1: 0.6572 - val_loss: 0.6297
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 25s/step - accuracy: 0.7824 - auc_1: 0.8832 - loss: 0.4863 - val_accuracy: 0.7133 - val_auc_1: 0.6570 - val_loss: 0.6009
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 26s/step - accuracy: 0.8019 - auc_1: 0.9237 - loss: 0.4365 - val_accuracy: 0.7133 - val_auc_1: 0.6674 - val_loss: 0.5928
Epoch 6/10

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 5s/step - accuracy: 0.7504 - auc_1: 0.5681 - loss: 0.6511
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5s/step - accuracy: 0.7096 - auc_1: 0.6151 - loss: 0.7276
train_acc: 0.7448 - loss: 0.6599 - auc: 0.6050
  val_acc: 0.7133 - loss: 0.7163 - auc: 0.6521


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# EfficientNetB0


In [None]:
# Function model
inputs = tf.keras.Input(shape=(*IMG_SIZE, 3))
x = data_augmentation(inputs)
x = layers.Rescaling(1.0 / 255)(x)

In [None]:
# Load pretrained model
base_model = EfficientNetB0(
    weights="imagenet", include_top=False, input_shape=(*IMG_SIZE, 3)
)
base_model.trainable = False  # Freeze initial training

x = base_model(inputs, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)  # Binary classification
model = tf.keras.Model(inputs, outputs)

In [None]:
# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

# Trian
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 3s/step - accuracy: 0.6288 - auc_2: 0.5885 - loss: 0.6550 - val_accuracy: 0.5524 - val_auc_2: 0.3775 - val_loss: 0.6948
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 3s/step - accuracy: 0.6567 - auc_2: 0.4951 - loss: 0.6476 - val_accuracy: 0.6573 - val_auc_2: 0.4048 - val_loss: 0.6676
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - accuracy: 0.6904 - auc_2: 0.5280 - loss: 0.6156 - val_accuracy: 0.6923 - val_auc_2: 0.4295 - val_loss: 0.6499
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3s/step - accuracy: 0.7229 - auc_2: 0.5488 - loss: 0.5881 - val_accuracy: 0.6923 - val_auc_2: 0.4592 - val_loss: 0.6363
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 3s/step - accuracy: 0.7343 - auc_2: 0.5809 - loss: 0.5747 - val_accuracy: 0.6923 - val_auc_2: 0.4903 - val_loss: 0.6239
Epoch 6/10
[1m18/18

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 2s/step - accuracy: 0.7803 - auc_2: 0.7699 - loss: 0.4862
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7042 - auc_2: 0.6372 - loss: 0.5776
train_acc: 0.7743 - loss: 0.4951 - auc: 0.7670
  val_acc: 0.7063 - loss: 0.5729 - auc: 0.6453


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))

[1m4/5[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m2s[0m 2s/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2nd Try with EfficientNetB0

using weighted class


In [None]:
class_indices = {name: idx for idx, name in enumerate(CLASS_NAMES)}
print(f"{class_indices=}")

# Convert to numpy for sklearn
y = []
for class_name, count in class_counts.items():
    y.extend([class_indices[class_name]] * count)

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

print(class_weight_dict)

class_indices={'cancer': 0, 'normal': 1}
{0: np.float64(1.9122340425531914), 1: np.float64(0.6770244821092278)}


In [None]:
# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

# Trian
history = model.fit(
    train_ds, validation_data=val_ds, epochs=EPOCHS, class_weight=class_weight_dict
)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 3s/step - accuracy: 0.7939 - auc_3: 0.7254 - loss: 0.6636 - val_accuracy: 0.7343 - val_auc_3: 0.6685 - val_loss: 0.5634
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 3s/step - accuracy: 0.7713 - auc_3: 0.7550 - loss: 0.6248 - val_accuracy: 0.7203 - val_auc_3: 0.6918 - val_loss: 0.5638
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - accuracy: 0.8200 - auc_3: 0.8140 - loss: 0.5628 - val_accuracy: 0.7133 - val_auc_3: 0.7111 - val_loss: 0.5682
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3s/step - accuracy: 0.7418 - auc_3: 0.8033 - loss: 0.5541 - val_accuracy: 0.6993 - val_auc_3: 0.7263 - val_loss: 0.5707
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3s/step - accuracy: 0.7678 - auc_3: 0.8033 - loss: 0.5511 - val_accuracy: 0.6783 - val_auc_3: 0.7400 - val_loss: 0.5728
Epoch 6/10
[1m18/18

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2s/step - accuracy: 0.7809 - auc_3: 0.8627 - loss: 0.5036
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.6974 - auc_3: 0.7683 - loss: 0.5613
train_acc: 0.7795 - loss: 0.5020 - auc: 0.8655
  val_acc: 0.7133 - loss: 0.5534 - auc: 0.7849


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Using method from Kaggle

used ResNet50 & confution_matrix to deal with imbalance in data


In [None]:
# Load training and validation data
train_ds = image_dataset_from_directory(
    DATA_PATH,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
)

val_ds = image_dataset_from_directory(
    DATA_PATH,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
)

Found 719 files belonging to 2 classes.
Using 576 files for training.
Found 719 files belonging to 2 classes.
Using 143 files for validation.


In [None]:
# Counting number of images in each class
class_counts = {}
for class_name in CLASS_NAMES:
    class_path = os.path.join(DATA_PATH, class_name)
    class_counts[class_name] = len(os.listdir(class_path))
print(f"{class_counts=}")

class_counts={'cancer': 188, 'normal': 531}


In [None]:
# Labling image file paths
class_indices = {name: idx for idx, name in enumerate(CLASS_NAMES)}
print(class_indices)

img_paths = list()
labels = list()

for class_name in CLASS_NAMES:
    class_path = os.path.join(DATA_PATH, class_name)
    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)
        img_paths.append(img_path)
        labels.append(class_indices[class_name])

# Converting to NumPy arrays
img_paths = np.array(img_paths)
labels = np.array(labels)

{'cancer': 0, 'normal': 1}


In [None]:
minor_class = "cancer" if class_counts["cancer"] < class_counts["normal"] else "normal"
major_class = "normal" if minor_class == "cancer" else "cancer"

minor_idx = np.where(labels == class_indices[minor_class])[0]
major_idx = np.where(labels == class_indices[major_class])[0]

In [None]:
# Oversample minority class
minor_oversampled = resample(
    minor_idx, replace=True, n_samples=len(major_idx), random_state=42
)
resampled_idx = np.hstack([major_idx, minor_oversampled])
np.random.shuffle(resampled_idx)

# New paths & labels shuffled
resampled_paths = img_paths[resampled_idx]
resampled_labels = labels[resampled_idx]

In [None]:
# Making new tf dataset
def load_and_preprocess_image(path, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMG_SIZE)
    return image, label


ds = tf.data.Dataset.from_tensor_slices((resampled_paths, resampled_labels))
ds = ds.map(lambda x, y: load_and_preprocess_image(x, y))
ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Data augmentation
data_augmentation = models.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
        layers.Rescaling(1.0 / 255),
    ]
)

In [None]:
# Functional model
inputs = layers.Input(shape=(*IMG_SIZE, 3))
x = data_augmentation(inputs)

# Load pretrained base
base_model = ResNet50(weights="imagenet", include_top=False)
base_model.trainable = False
x = base_model(x, training=False)

x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inputs, outputs)

In [None]:
# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC()],
)

# Train
history = model.fit(ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 7s/step - accuracy: 0.4995 - auc_4: 0.4936 - loss: 0.8206 - val_accuracy: 0.3986 - val_auc_4: 0.6584 - val_loss: 0.7005
Epoch 2/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 7s/step - accuracy: 0.5015 - auc_4: 0.5194 - loss: 0.7171 - val_accuracy: 0.2867 - val_auc_4: 0.6746 - val_loss: 0.7230
Epoch 3/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 7s/step - accuracy: 0.5101 - auc_4: 0.4915 - loss: 0.7240 - val_accuracy: 0.4406 - val_auc_4: 0.6854 - val_loss: 0.6991
Epoch 4/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 7s/step - accuracy: 0.5287 - auc_4: 0.5349 - loss: 0.7041 - val_accuracy: 0.4266 - val_auc_4: 0.6933 - val_loss: 0.7008
Epoch 5/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 7s/step - accuracy: 0.4860 - auc_4: 0.4805 - loss: 0.7198 - val_accuracy: 0.4685 - val_auc_4: 0.6905 - val_loss: 0.6988
Epoch 6/10
[1m

In [None]:
# Evaluate model
train_loss, train_acc, train_auc = model.evaluate(train_ds)
val_loss, val_acc, val_auc = model.evaluate(val_ds)

print(f"train_acc: {train_acc:.4f} - loss: {train_loss:.4f} - auc: {train_auc:.4f}")
print(f"  val_acc: {val_acc:.4f} - loss: {val_loss:.4f} - auc: {val_auc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 6s/step - accuracy: 0.4837 - auc_4: 0.6318 - loss: 0.7033
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5s/step - accuracy: 0.4951 - auc_4: 0.6569 - loss: 0.7076
train_acc: 0.4826 - loss: 0.7005 - auc: 0.6404
  val_acc: 0.5035 - loss: 0.6982 - auc: 0.7089


In [None]:
# Get True and Predicted results
y_true = np.concatenate([y.numpy() for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Generate Classification report
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES, digits=4))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5s/step
              precision    recall  f1-score   support

      cancer     0.2867    1.0000    0.4457        41
      normal     0.0000    0.0000    0.0000       102

    accuracy                         0.2867       143
   macro avg     0.1434    0.5000    0.2228       143
weighted avg     0.0822    0.2867    0.1278       143



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
