In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/breast-histopathology-images")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/breast-histopathology-images


In [3]:
import os
INPUT_DATASET = '/kaggle/input/breast-histopathology-images'
BASE_PATH = '/kaggle/working/idc_dataset'
TRAIN_PATH = os.path.sep.join([BASE_PATH, "training"])
VAL_PATH = os.path.sep.join([BASE_PATH, "validation"])
TEST_PATH = os.path.sep.join([BASE_PATH, "testing"])
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1

In [4]:
from imutils import paths
import random, shutil, os
originalPaths = list(paths.list_images(INPUT_DATASET))
random.seed(7)
random.shuffle(originalPaths)
index = int(len(originalPaths) * TRAIN_SPLIT)
trainPaths = originalPaths[:index]
testPaths = originalPaths[index:]
index = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:index]
trainPaths = trainPaths[index:]
datasets = [
    ("training", trainPaths, TRAIN_PATH),
    ("validation", valPaths, VAL_PATH),
    ("testing", testPaths, TEST_PATH)
]
for (setType, originalPaths, baseOutput) in datasets:
    print(f'Building {setType} set')
    if not os.path.exists(baseOutput):
        print(f'Building directory {baseOutput}')
        os.makedirs(baseOutput)
    for path in originalPaths:
        filename = path.split(os.path.sep)[-1]
        label = filename[-5:-4]
        labelPath = os.path.sep.join([baseOutput, label])
        if not os.path.exists(labelPath):
            print(f'Building directory {labelPath}')
            os.makedirs(labelPath)
        newPath = os.path.sep.join([labelPath, filename])
        shutil.copy2(path, newPath)

Building training set
Building directory /kaggle/working/idc_dataset/training
Building directory /kaggle/working/idc_dataset/training/0
Building directory /kaggle/working/idc_dataset/training/1
Building validation set
Building directory /kaggle/working/idc_dataset/validation
Building directory /kaggle/working/idc_dataset/validation/0
Building directory /kaggle/working/idc_dataset/validation/1
Building testing set
Building directory /kaggle/working/idc_dataset/testing
Building directory /kaggle/working/idc_dataset/testing/0
Building directory /kaggle/working/idc_dataset/testing/1


In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
class CancerNet:
    @staticmethod
    def build(width,height,depth,classes):
        model = tf.keras.models.Sequential()
        channelDim=-1
        if K.image_data_format()=="channels_first":
            shape = (depth,height,width)
            channelDim=1
        else:
            shape = (height, width, depth)
        model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=shape))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))

        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))

        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(units=256, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.Dropout(0.5))

        model.add(tf.keras.layers.Dense(units=classes, activation='softmax'))

        return model

In [6]:
import matplotlib
matplotlib.use("Agg")
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_datagen = ImageDataGenerator(rescale = 1./255,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True)
training_set = train_datagen.flow_from_directory(VAL_PATH,
                                                target_size = (64, 64),
                                                batch_size = 32,
                                                class_mode = 'binary')

Found 42620 images belonging to 2 classes.


In [7]:
len(training_set)

1332

In [8]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import os

NUM_EPOCHS=4; INIT_LR=1e-2; BS=32

trainPaths = list(paths.list_images(TRAIN_PATH))
lenTrain=len(trainPaths)
lenVal=len(list(paths.list_images(VAL_PATH)))
lenTest=len(list(paths.list_images(TEST_PATH)))

trainLabels=[int(p.split(os.path.sep)[-2]) for p in trainPaths]
trainLabels=to_categorical(trainLabels)
classTotals=trainLabels.sum(axis=0)
classWeight=classTotals.max()/classTotals

trainAug = ImageDataGenerator(
    rescale=1/255.0,
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.05,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest')

valAug=ImageDataGenerator(rescale=1 / 255.0)
testAug=ImageDataGenerator(rescale=1 / 255.0)

trainGen = trainAug.flow_from_directory(
    TRAIN_PATH,
    class_mode="categorical",
    target_size=(48,48),
    color_mode="rgb",
    shuffle=False,
    batch_size=BS)
valGen = valAug.flow_from_directory(
    VAL_PATH,
    class_mode="categorical",
    target_size=(48,48),
    color_mode="rgb",
    shuffle=False,
    batch_size=BS)
testGen = testAug.flow_from_directory(
    TEST_PATH,
    class_mode="categorical",
    target_size=(48,48),
    color_mode="rgb",
    shuffle=False,
    batch_size=BS)

Found 255789 images belonging to 2 classes.
Found 42620 images belonging to 2 classes.
Found 99904 images belonging to 2 classes.


In [None]:
model=CancerNet.build(width=48,height=48,depth=3,classes=2)
model.compile(loss="binary_crossentropy",optimizer='adam', metrics=["accuracy"])

M=model.fit(x=trainGen, validation_data = valGen, epochs = 20)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2116s[0m 264ms/step - accuracy: 0.6929 - loss: 0.6445 - val_accuracy: 0.7108 - val_loss: 0.6028
Epoch 2/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2004s[0m 251ms/step - accuracy: 0.7142 - loss: 0.6056 - val_accuracy: 0.7108 - val_loss: 0.5999
Epoch 3/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2008s[0m 251ms/step - accuracy: 0.7165 - loss: 0.6006 - val_accuracy: 0.7108 - val_loss: 0.6024
Epoch 4/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2011s[0m 252ms/step - accuracy: 0.7057 - loss: 0.6077 - val_accuracy: 0.7108 - val_loss: 0.6075
Epoch 5/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2016s[0m 252ms/step - accuracy: 0.7130 - loss: 0.6017 - val_accuracy: 0.7108 - val_loss: 0.6024
Epoch 6/10
[1m7994/7994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2005s[0m 251ms/step - accuracy: 0.7160 - loss: 0.5991 - val_accuracy: 0.7108 - val

In [12]:
print("Now evaluating the model")
testGen.reset()
pred_indices=model.predict(testGen,steps=(lenTest//BS)+1)

pred_indices=np.argmax(pred_indices,axis=1)

print(classification_report(testGen.classes, pred_indices, target_names=testGen.class_indices.keys()))

cm=confusion_matrix(testGen.classes, pred_indices)
total=sum(sum(cm))
accuracy=(cm[0,0]+cm[1,1])/total
specificity=cm[1,1]/(cm[1,0]+cm[1,1])
sensitivity=cm[0,0]/(cm[0,0]+cm[0,1])
print(cm)
print(f'Accuracy: {accuracy}')
print(f'Specifity: {specificity}')
print(f'Sensitivity: {sensitivity}')

N = len(M.history["loss"])
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0,N), M.history["loss"], label="train_loss")
plt.plot(np.arange(0,N), M.history["val_loss"], label="val_loss")
plt.plot(np.arange(0,N), M.history["accuracy"], label="train_acc")
plt.plot(np.arange(0,N), M.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy on the IDC Dataset")
plt.xlabel('Epoch #')
plt.ylabel('Loss/Accuracy')
plt.legend(loc='lower left')
plt.savefig('plot.png')

Now evaluating the model
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 58ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.72      1.00      0.83     71586
           1       0.00      0.00      0.00     28318

    accuracy                           0.72     99904
   macro avg       0.36      0.50      0.42     99904
weighted avg       0.51      0.72      0.60     99904

[[71586     0]
 [28318     0]]
Accuracy: 0.7165478859705318
Specifity: 0.0
Sensitivity: 1.0
