### **Imports**

In [None]:
import os
from pathlib import Path
import random, shutil
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Show which entries are present under the Kaggle input folder.
attached_inputs = os.listdir("/kaggle/input")
print(attached_inputs)


['busi-dataset']


In [None]:

# Destination root for the processed copy of the dataset, laid out as
# <split>/<class>/ for the three splits and the two classes.
OUT_ROOT = Path("/kaggle/working/busi_processed")

# Start from an empty tree. Files left behind by an earlier run (for example
# one made with a different seed or split ratio) would otherwise stay on disk
# and could place the same image in more than one split.
if OUT_ROOT.exists():
    shutil.rmtree(OUT_ROOT)

for split in ["train", "val", "test"]:
    for cls in ["benign", "malignant"]:
        (OUT_ROOT / split / cls).mkdir(parents=True, exist_ok=True)

print("Created:", OUT_ROOT)


Created: /kaggle/working/busi_processed


In [None]:

# Inspect the attached dataset: print its top-level entries, then every
# directory at any depth whose name is benign / malignant / normal
# (compared case-insensitively).
ROOT = Path("/kaggle/input/busi-dataset")
CLASS_FOLDER_NAMES = ("benign", "malignant", "normal")

print("Listing top-level:")
for entry in ROOT.iterdir():
    print("-", entry)

print("\nSearching for class folders...")
class_dirs = (
    candidate
    for candidate in ROOT.rglob("*")
    if candidate.is_dir() and candidate.name.lower() in CLASS_FOLDER_NAMES
)
for class_dir in class_dirs:
    print("Found:", class_dir)


Listing top-level:
- /kaggle/input/busi-dataset/Dataset BUSI

Searching for class folders...
Found: /kaggle/input/busi-dataset/Dataset BUSI/benign
Found: /kaggle/input/busi-dataset/Dataset BUSI/normal
Found: /kaggle/input/busi-dataset/Dataset BUSI/malignant


In [None]:


# Seed Python's `random` module; it drives the shuffle that decides the split.
SEED = 42
random.seed(SEED)

# Where the processed train/val/test copy is written.
OUT_ROOT = Path("/kaggle/working/busi_processed")

# Source folders of the two classes used by this notebook.
benign_dir = Path("/kaggle/input/busi-dataset/Dataset BUSI/benign")
malignant_dir = Path("/kaggle/input/busi-dataset/Dataset BUSI/malignant")

def collect_non_mask_images(class_dir: Path):
    """Return the sorted image files under ``class_dir``, skipping mask files.

    The folder is searched recursively. An entry is kept when its extension is
    .png, .jpg or .jpeg (case-insensitive), its file name does not contain
    "mask" (case-insensitive; this drops segmentation masks such as
    ``*_mask.png``), and it is a regular file.

    Args:
        class_dir: Folder holding the images of one class.

    Returns:
        A sorted list of ``Path`` objects; empty when nothing matches.
    """
    exts = {".png", ".jpg", ".jpeg"}
    return sorted(
        p
        for p in class_dir.rglob("*")
        if p.suffix.lower() in exts
        and "mask" not in p.name.lower()
        # rglob also yields directories; one named like "x.png" would
        # otherwise be returned and later break the file copy.
        and p.is_file()
    )

# Gather the non-mask images of each class and report how many were found.
benign_paths, malignant_paths = (
    collect_non_mask_images(folder) for folder in (benign_dir, malignant_dir)
)

print("Benign (non-mask):", len(benign_paths))
print("Malignant (non-mask):", len(malignant_paths))

def split_copy(paths, cls):
    """Shuffle ``paths`` and copy them into the train/val/test folders of ``cls``.

    The paths are shuffled with the module-level ``random`` generator (seed it
    beforehand for a repeatable split) and cut into 70% train, 15% val and the
    remainder test. Each file is copied to ``OUT_ROOT/<split>/<cls>/<file name>``.
    The destination folders are created when missing, so the function does not
    depend on another cell having prepared them.

    Args:
        paths: List of image paths belonging to one class. The caller's list
            is left unmodified.
        cls: Class label; used as the destination sub-folder name and as the
            ``label`` value of the returned rows.

    Returns:
        One dict per copied file with the keys ``filepath`` (destination path
        as a string), ``label`` and ``split``.
    """
    paths = paths[:]  # shuffle a copy, not the caller's list
    random.shuffle(paths)
    n = len(paths)
    n_train = int(0.70 * n)
    n_val = int(0.15 * n)

    mapping = (
        [("train", p) for p in paths[:n_train]] +
        [("val", p) for p in paths[n_train:n_train+n_val]] +
        [("test", p) for p in paths[n_train+n_val:]]
    )

    # Make sure every destination folder exists before copying.
    for split in ("train", "val", "test"):
        (OUT_ROOT / split / cls).mkdir(parents=True, exist_ok=True)

    rows = []
    for split, src in mapping:
        # NOTE: the destination uses only the file name, so two sources with
        # the same name in different sub-folders would overwrite each other.
        dst = OUT_ROOT / split / cls / src.name
        shutil.copy2(src, dst)
        rows.append({"filepath": str(dst), "label": cls, "split": split})
    return rows

# Split both classes (benign first, then malignant), collect one row per
# copied file in a single table and save that table next to the data.
split_rows = split_copy(benign_paths, "benign")
split_rows += split_copy(malignant_paths, "malignant")

df = pd.DataFrame(split_rows)
df.to_csv("/kaggle/working/splits.csv", index=False)

split_counts = df.groupby(["split","label"]).size()
print(split_counts)
print("Done. Your processed dataset is ready in:", OUT_ROOT)


Benign (non-mask): 437
Malignant (non-mask): 210
split  label    
test   benign        67
       malignant     32
train  benign       305
       malignant    147
val    benign        65
       malignant     31
dtype: int64
Done. Your processed dataset is ready in: /kaggle/working/busi_processed


In [None]:

DATA_ROOT = "/kaggle/working/busi_processed"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Arguments shared by all three splits: labels are inferred from the folder
# names and returned in binary mode; images are resized to IMG_SIZE and
# grouped into batches of BATCH_SIZE.
shared_loader_args = dict(
    labels="inferred",
    label_mode="binary",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
)
load_split = tf.keras.utils.image_dataset_from_directory

# Only the training split is shuffled (with a fixed seed); val and test keep
# the loader's order.
train_ds = load_split(f"{DATA_ROOT}/train", shuffle=True, seed=42, **shared_loader_args)
val_ds = load_split(f"{DATA_ROOT}/val", shuffle=False, **shared_loader_args)
test_ds = load_split(f"{DATA_ROOT}/test", shuffle=False, **shared_loader_args)

print("Class names:", train_ds.class_names)


Found 452 files belonging to 2 classes.
Found 96 files belonging to 2 classes.
Found 99 files belonging to 2 classes.
Class names: ['benign', 'malignant']


In [11]:

AUTOTUNE = tf.data.AUTOTUNE

# cache() replays the elements of the first pass unchanged, so on its own it
# would freeze the batch order produced by the loader's shuffle for every
# epoch. Shuffling after the cache yields a new order each epoch. The elements
# at this point are whole batches, so this reorders batches; it does not
# re-mix images across batches.
train_ds = train_ds.cache().shuffle(1000, seed=SEED).prefetch(AUTOTUNE)
# Validation and test data keep a fixed order; they are only cached and prefetched.
val_ds   = val_ds.cache().prefetch(AUTOTUNE)
test_ds  = test_ds.cache().prefetch(AUTOTUNE)



In [12]:
# Count the training images per class from the split table instead of copying
# numbers from an earlier output, so the weights stay correct when the seed,
# the split ratio or the data changes.
train_label_counts = df.loc[df["split"] == "train", "label"].value_counts()
n_benign_train = int(train_label_counts.get("benign", 0))
n_malignant_train = int(train_label_counts.get("malignant", 0))
if n_benign_train == 0 or n_malignant_train == 0:
    raise ValueError("Both classes need at least one training image to compute class weights.")
total = n_benign_train + n_malignant_train

# Inverse-frequency weights: total / (2 * class count), so the rarer class
# gets the larger weight.
class_weight = {
    0: total / (2.0 * n_benign_train),     # benign
    1: total / (2.0 * n_malignant_train),  # malignant
}
print(class_weight)


{0: 0.740983606557377, 1: 1.5374149659863945}


In [None]:
# Report the GPUs TensorFlow can see.
visible_gpus = tf.config.list_physical_devices("GPU")
print(visible_gpus)


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [None]:

IMG_SIZE = (224, 224)

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and the random augmentations are repeatable when the
# notebook is re-run from a fresh kernel.
tf.keras.utils.set_random_seed(SEED)

# Augmentation: random horizontal flip, small rotation and small zoom.
data_aug = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.05),
    tf.keras.layers.RandomZoom(0.1),
])

# Pretrained CNN backbone (ImageNet weights, without its classification top)
base = tf.keras.applications.EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_shape=IMG_SIZE + (3,),
)
base.trainable = False  # train head first

# Pipeline: augmentation -> EfficientNet preprocessing -> frozen backbone ->
# global average pooling -> dropout -> one sigmoid unit.
inputs = tf.keras.Input(shape=IMG_SIZE + (3,))
x = data_aug(inputs)
x = tf.keras.applications.efficientnet.preprocess_input(x)
x = base(x, training=False)  # the backbone is always called in inference mode
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs, outputs)

# Binary cross-entropy matches the single sigmoid output; the metric names set
# here ("auc", ...) are what the callbacks monitor as "val_auc".
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="acc"),
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ],
)

model.summary()


In [None]:

# Stage 1: train with the backbone frozen.
# Stop when validation AUC has not improved for 5 epochs and go back to the
# best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=5,
    restore_best_weights=True,
)
# Save the model to disk whenever validation AUC reaches a new best.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/kaggle/working/best_model.keras",
    monitor="val_auc",
    mode="max",
    save_best_only=True,
)
callbacks = [early_stop, checkpoint]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    class_weight=class_weight,
    callbacks=callbacks,
)



Epoch 1/15


E0000 00:00:1768888457.200965      55 meta_optimizer.cc:967] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/functional_3_1/efficientnetb0_1/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
I0000 00:00:1768888460.271500     127 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 418ms/step - acc: 0.5355 - auc: 0.5410 - loss: 0.7179 - precision: 0.3774 - recall: 0.5986 - val_acc: 0.5208 - val_auc: 0.6467 - val_loss: 0.6802 - val_precision: 0.3729 - val_recall: 0.7097
Epoch 2/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 131ms/step - acc: 0.6326 - auc: 0.7150 - loss: 0.6320 - precision: 0.4709 - recall: 0.6817 - val_acc: 0.6562 - val_auc: 0.7382 - val_loss: 0.6244 - val_precision: 0.4792 - val_recall: 0.7419
Epoch 3/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 128ms/step - acc: 0.7230 - auc: 0.8304 - loss: 0.5547 - precision: 0.5708 - recall: 0.7402 - val_acc: 0.6250 - val_auc: 0.7591 - val_loss: 0.6156 - val_precision: 0.4528 - val_recall: 0.7742
Epoch 4/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 129ms/step - acc: 0.7093 - auc: 0.8305 - loss: 0.5373 - precision: 0.5481 - recall: 0.8093 - val_acc: 0.6458 - val_auc: 0.7739 - val_loss

In [None]:

# Compare the in-memory model with the checkpoint saved during training,
# both evaluated on the test split.
curr_metrics = model.evaluate(test_ds, verbose=0, return_dict=True)

best = tf.keras.models.load_model("/kaggle/working/best_model.keras")
best_metrics = best.evaluate(test_ds, verbose=0, return_dict=True)

for tag, result in (("CURRENT:", curr_metrics), ("BEST:", best_metrics)):
    print(tag, result)


CURRENT: {'acc': 0.7878788113594055, 'auc': 0.8647387623786926, 'loss': 0.4619137942790985, 'precision': 0.6341463327407837, 'recall': 0.8125}
BEST: {'acc': 0.7878788113594055, 'auc': 0.8647387623786926, 'loss': 0.4619137942790985, 'precision': 0.6341463327407837, 'recall': 0.8125}


In [19]:
# Stage 2: make the backbone trainable again, but keep its first
# `fine_tune_at` layers frozen so only the layers after them are updated.
fine_tune_at = 200
base.trainable = True
for position, backbone_layer in enumerate(base.layers):
    if position < fine_tune_at:
        backbone_layer.trainable = False

# Recompile with a much smaller learning rate (1e-5 instead of 1e-3) and the
# same loss and metrics as before.
finetune_metrics = [
    tf.keras.metrics.BinaryAccuracy(name="acc"),
    tf.keras.metrics.AUC(name="auc"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="binary_crossentropy",
    metrics=finetune_metrics,
)

# Same monitoring as stage 1, with a shorter patience and a separate
# checkpoint file.
finetune_early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=3,
    restore_best_weights=True,
)
finetune_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/kaggle/working/best_model_finetuned.keras",
    monitor="val_auc",
    mode="max",
    save_best_only=True,
)
callbacks = [finetune_early_stop, finetune_checkpoint]

history_ft = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    class_weight=class_weight,
    callbacks=callbacks,
)


Epoch 1/10


E0000 00:00:1768888930.963300      55 meta_optimizer.cc:967] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/functional_3_1/efficientnetb0_1/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 372ms/step - acc: 0.8022 - auc: 0.8758 - loss: 0.5018 - precision: 0.7331 - recall: 0.6617 - val_acc: 0.7083 - val_auc: 0.8127 - val_loss: 0.5324 - val_precision: 0.5333 - val_recall: 0.7742
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 105ms/step - acc: 0.8308 - auc: 0.8758 - loss: 0.4981 - precision: 0.7612 - recall: 0.7302 - val_acc: 0.7083 - val_auc: 0.8114 - val_loss: 0.5291 - val_precision: 0.5333 - val_recall: 0.7742
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 198ms/step - acc: 0.8148 - auc: 0.8890 - loss: 0.4678 - precision: 0.7397 - recall: 0.7059 - val_acc: 0.7188 - val_auc: 0.8132 - val_loss: 0.5251 - val_precision: 0.5455 - val_recall: 0.7742
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 158ms/step - acc: 0.7885 - auc: 0.8717 - loss: 0.4881 - precision: 0.6853 - recall: 0.7071 - val_acc: 0.7083 - val_auc: 0.8149 - val_loss

In [None]:

# Load the best fine-tuned checkpoint and show its metrics on the test split.
best_ft = tf.keras.models.load_model("/kaggle/working/best_model_finetuned.keras")
test_metrics_ft = best_ft.evaluate(test_ds, return_dict=True)
test_metrics_ft


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - acc: 0.7892 - auc: 0.5319 - loss: 0.3749 - precision: 0.3645 - recall: 0.5099            


{'acc': 0.7777777910232544,
 'auc': 0.8838618993759155,
 'loss': 0.4221147894859314,
 'precision': 0.6136363744735718,
 'recall': 0.84375}

In [None]:


# Load the fine-tuned checkpoint used for the threshold analysis below.
finetuned_model_path = "/kaggle/working/best_model_finetuned.keras"
best_ft = tf.keras.models.load_model(finetuned_model_path)

def get_probs_and_labels(model, ds):
    """Run ``model`` over ``ds`` and return the true labels and predicted scores.

    Args:
        model: Object with a ``predict_on_batch(x)`` method returning one score
            per sample (a Keras model in this notebook).
        ds: Iterable of ``(x_batch, y_batch)`` pairs where ``y_batch`` has a
            ``.numpy()`` method.

    Returns:
        ``(y_true, y_prob)``: two flat arrays of equal length. ``y_true`` is
        cast to int (0=benign, 1=malignant, based on class_names order);
        ``y_prob`` holds the model outputs. Both are empty when ``ds`` yields
        no batches.
    """
    y_true = []
    y_prob = []
    for x_batch, y_batch in ds:
        y_true.append(y_batch.numpy().reshape(-1))
        # predict_on_batch handles one batch per call; Model.predict is meant
        # for whole datasets and adds per-call overhead when used in a loop.
        y_prob.append(np.asarray(model.predict_on_batch(x_batch)).reshape(-1))

    if not y_true:
        # np.concatenate raises on an empty list; return empty arrays instead.
        return np.empty(0, dtype=int), np.empty(0, dtype=np.float32)

    y_true = np.concatenate(y_true).astype(int)   # 0=benign, 1=malignant (based on class_names order)
    y_prob = np.concatenate(y_prob)
    return y_true, y_prob

# Score the validation split with the fine-tuned model and sanity-check the result.
y_val, p_val = get_probs_and_labels(best_ft, val_ds)

n_val_samples = len(y_val)
n_val_malignant = int(y_val.sum())
print("val samples:", n_val_samples, " positive(malignant):", n_val_malignant)

p_lowest, p_highest = float(p_val.min()), float(p_val.max())
print("prob range:", p_lowest, "to", p_highest)


val samples: 96  positive(malignant): 31
prob range: 0.004393306095153093 to 0.9001309275627136


In [None]:

def metrics_at_threshold(y_true, y_prob, thr):
    """Compute confusion counts and derived metrics at one decision threshold.

    A sample is predicted positive when its score is ``>= thr``.

    Args:
        y_true: NumPy array of labels; 1 is counted as positive, 0 as negative.
        y_prob: NumPy array of scores, same length as ``y_true``.
        thr: Decision threshold.

    Returns:
        ``(acc, precision, recall, f1, tp, fp, fn, tn)``. Any ratio whose
        denominator is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Guarded division: 0.0 instead of ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    predicted_pos = y_prob >= thr
    actual_pos = y_true == 1
    actual_neg = y_true == 0

    tp = int((actual_pos & predicted_pos).sum())
    fp = int((actual_neg & predicted_pos).sum())
    fn = int((actual_pos & ~predicted_pos).sum())
    tn = int((actual_neg & ~predicted_pos).sum())

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2*precision*recall, precision+recall)
    acc = _ratio(tp + tn, tp + tn + fp + fn)
    return acc, precision, recall, f1, tp, fp, fn, tn

# Sweep decision thresholds 0.05, 0.10, ..., 0.95 on the validation scores and
# tabulate the metrics at each one, ordered by recall and then F1 (both descending).
thresholds = np.round(np.arange(0.05, 0.96, 0.05), 2)

metric_names = ("acc", "precision", "recall", "f1", "tp", "fp", "fn", "tn")
rows = [
    {"thr": t, **dict(zip(metric_names, metrics_at_threshold(y_val, p_val, t)))}
    for t in thresholds
]

df_thr = pd.DataFrame(rows).sort_values(["recall", "f1"], ascending=False)
df_thr


Unnamed: 0,thr,acc,precision,recall,f1,tp,fp,fn,tn
1,0.1,0.46875,0.378049,1.0,0.548673,31,51,0,14
0,0.05,0.427083,0.360465,1.0,0.529915,31,55,0,10
5,0.3,0.645833,0.47619,0.967742,0.638298,30,33,1,32
4,0.25,0.604167,0.447761,0.967742,0.612245,30,37,1,28
3,0.2,0.572917,0.428571,0.967742,0.594059,30,40,1,25
2,0.15,0.510417,0.394737,0.967742,0.560748,30,46,1,19
6,0.35,0.625,0.45614,0.83871,0.590909,26,31,5,34
7,0.4,0.65625,0.480769,0.806452,0.60241,25,27,6,38
8,0.45,0.65625,0.479167,0.741935,0.582278,23,25,8,40
10,0.55,0.729167,0.564103,0.709677,0.628571,22,17,9,48


In [23]:
# Choose the operating threshold: among thresholds whose validation recall
# reaches TARGET_RECALL take the one with the highest precision (ties broken
# by F1); if none qualifies, fall back to the best F1 overall.
TARGET_RECALL = 0.90
meets_target = df_thr["recall"] >= TARGET_RECALL
candidates = df_thr[meets_target].sort_values(["precision", "f1"], ascending=False)

if candidates.empty:
    best_row = df_thr.sort_values("f1", ascending=False).iloc[0]
    reason = "no threshold hit target recall; picked best F1"
else:
    best_row = candidates.iloc[0]
    reason = f"picked highest precision with recall >= {TARGET_RECALL}"

best_threshold = float(best_row["thr"])
print("Best threshold:", best_threshold, "|", reason)
print(best_row)


Best threshold: 0.3 | picked highest precision with recall >= 0.9
thr           0.300000
acc           0.645833
precision     0.476190
recall        0.967742
f1            0.638298
tp           30.000000
fp           33.000000
fn            1.000000
tn           32.000000
Name: 5, dtype: float64


In [24]:
# Apply the threshold chosen on the validation split to the test split.
t = best_threshold
y_test, p_test = get_probs_and_labels(best_ft, test_ds)

report_keys = ("acc", "precision", "recall", "f1", "tp", "fp", "fn", "tn")
test_report = {"threshold": t}
test_report.update(zip(report_keys, metrics_at_threshold(y_test, p_test, t)))
print(test_report)


{'threshold': 0.3, 'acc': 0.7474747474747475, 'precision': 0.5636363636363636, 'recall': 0.96875, 'f1': 0.7126436781609196, 'tp': 31, 'fp': 24, 'fn': 1, 'tn': 43}


![Breast Cancer Classifier](ss.png)


<img src="ss.png" alt="Breast Cancer Classifier" width="300" />

