<a href="https://colab.research.google.com/github/Tavo826/DataScience/blob/main/BreastCancerDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Se pretende entrenar un clasificador utilizando una red neuronal convolucional llamada CancerNet, la cual se alimentará con imágenes de IDC (Invasive Ductal Carcinoma). El dataset contiene 277524 parches de tamaño 50x50 extraídos de 162 imágenes. De estos 198738 dan negativo y 78786 dan positivo.

Los nombres de los archivos son de la forma
$$8863\_idx5\_x451\_y1451\_class0$$
donde 8863_idx5 es el ID del paciente, 451 y 1451 son las coordenadas x, y del recorte (parche) dentro de la imagen original, y 0 es la etiqueta de la clase (0 denota ausencia de IDC)

# Configuration

In [None]:
import os
# Make the project folder the current working directory.
# NOTE(review): the path follows the Colab "/content/drive" layout, so this
# presumably requires Google Drive to be mounted first - confirm.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Data Science/Clasificación de cáncer de seno')

# Folder holding the original image patches.
INPUT_DATASET = '/content/drive/MyDrive/Colab Notebooks/Data Science/Clasificación de cáncer de seno/IDC_dataset'

# Folder (inside the dataset folder) that receives the split copies, plus one
# sub-folder per split.
BASE_PATH = INPUT_DATASET + '/idc'
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, subset)
    for subset in ('training', 'validation', 'testing')
)

# Fraction of all images used for training, and fraction of that training
# portion that is set aside for validation.
TRAIN_SPLIT, VAL_SPLIT = 0.8, 0.1

# Build Dataset

In [None]:
from imutils import paths
import random, shutil, os

# Collect the source images. BASE_PATH lives inside INPUT_DATASET, so copies
# produced by a previous run must be excluded or they would be treated as new
# source images and leak between splits. Sorting makes the seeded shuffle
# reproducible regardless of the order the filesystem lists files in.
splitRoot = BASE_PATH + os.path.sep
originalPaths = sorted(p for p in paths.list_images(INPUT_DATASET)
                       if not p.startswith(splitRoot))
random.seed(7)
random.shuffle(originalPaths)

# The first TRAIN_SPLIT fraction is used for training, the rest for testing.
index = int(len(originalPaths) * TRAIN_SPLIT)
trainPaths = originalPaths[:index]
testPaths = originalPaths[index:]

# The validation set is carved out of the training portion.
index = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:index]
trainPaths = trainPaths[index:]

datasets = [('training', trainPaths, TRAIN_PATH),
            ('validation', valPaths, VAL_PATH),
            ('testing', testPaths, TEST_PATH)]

# A dedicated loop variable keeps `originalPaths` intact after the loop.
for (setType, setPaths, basePath) in datasets:

  print(f'Building {setType} set')

  os.makedirs(basePath, exist_ok=True)

  for path in setPaths:
    file = os.path.basename(path)
    # File names end in "...class<label>.<ext>": the label is the last
    # character before the extension, whatever the extension length.
    label = os.path.splitext(file)[0][-1]

    # One sub-folder per class label inside each split folder.
    labelPath = os.path.join(basePath, label)
    os.makedirs(labelPath, exist_ok=True)

    # Copy the image into <split>/<label>/; files that are already there are
    # skipped so the cell can be re-run (or resumed) cheaply.
    newPath = os.path.join(labelPath, file)
    if not os.path.exists(newPath):
      shutil.copy2(path, newPath)

Building training set
Building validation set
Building testing set


## Cancernet

La CNN tendrá la siguiente estructura:

* 3x3 CONV filters
* Se apilan estos filtros unos sobre otros
* Se realiza max-pooling
* Se utiliza depthwise separable convolution (más eficiente y se usa menos memoria)

In [None]:
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import SeparableConv2D, MaxPooling2D
from keras.layers.core import Activation, Flatten, Dropout, Dense
from keras import backend as K

class CancerNet:
  """Builder for the CancerNet convolutional classifier."""

  @staticmethod
  def build(width, height, depth, classes):
    """Assemble the (uncompiled) Sequential CancerNet model.

    Args:
      width: width of the input images.
      height: height of the input images.
      depth: number of channels of the input images.
      classes: number of units of the final softmax layer.

    Returns:
      The Sequential model with all layers added.
    """
    net = Sequential()

    # Input shape and batch-normalisation axis follow the backend's
    # image data format.
    if K.image_data_format() == 'channels_first':
      inputShape, bnAxis = (depth, height, width), 1
    else:
      inputShape, bnAxis = (height, width, depth), -1

    # Each stage is (filters, number of stacked separable convolutions) and
    # ends with max-pooling and dropout. Only the very first convolution
    # receives the input shape.
    stages = ((32, 1), (64, 2), (128, 3))
    firstLayerArgs = {'input_shape': inputShape}
    for filters, repeats in stages:
      for _ in range(repeats):
        net.add(SeparableConv2D(filters, (3,3), padding='same', **firstLayerArgs))
        firstLayerArgs = {}
        net.add(Activation('relu'))
        net.add(BatchNormalization(axis=bnAxis))
      net.add(MaxPooling2D(pool_size=(2,2)))
      net.add(Dropout(0.25))

    # Fully connected head.
    for layer in (Flatten(),
                  Dense(256),
                  Activation('relu'),
                  BatchNormalization(),
                  Dropout(0.5)):
      net.add(layer)

    # Softmax output with one unit per class.
    net.add(Dense(classes))
    net.add(Activation('softmax'))

    return net

## train_model

In [None]:
import matplotlib
matplotlib.use('Agg')

from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler
from keras.optimizers import Adagrad
from keras.utils import np_utils
from sklearn.metrics import classification_report, confusion_matrix

import numpy as np
import matplotlib.pyplot as plt

# Training hyperparameters: number of epochs, initial learning rate and
# batch size.
NUM_EPOCHS = 40
INIT_LR = 1e-2
BS = 32

# Number of images found in each split folder.
trainPaths = list(paths.list_images(TRAIN_PATH))
lenTrain = len(trainPaths)
lenVal = len(list(paths.list_images(VAL_PATH)))
lenTest = len(list(paths.list_images(TEST_PATH)))

# The label of every training image is the name of its parent folder.
trainLabels = [int(p.split(os.path.sep)[-2]) for p in trainPaths]
trainLabels = np_utils.to_categorical(trainLabels)
classTotals = trainLabels.sum(axis=0)

# Weight each class by (largest class count / its own count). Keras'
# `class_weight` argument must be a {class_index: weight} dict, so the weights
# are stored in that form rather than as a NumPy array.
maxTotal = classTotals.max()
classWeight = {i: float(maxTotal / count)
               for i, count in enumerate(classTotals)}

# Training-time augmentation: pixel values are rescaled by 1/255 and the
# images are randomly rotated, zoomed, shifted, sheared and flipped, which
# acts as a regulariser and helps the model generalise.
augParams = dict(rotation_range = 20,
                 zoom_range = 0.05,
                 width_shift_range = 0.1,
                 height_shift_range = 0.1,
                 shear_range = 0.05,
                 horizontal_flip = True,
                 vertical_flip = True,
                 fill_mode = 'nearest')
trainAug = ImageDataGenerator(rescale = 1/255.0, **augParams)

# Validation and test images are only rescaled.
valAug = ImageDataGenerator(rescale = 1/255.0)

# Batch generators for the training, validation and test folders. They share
# every setting except the folder and whether batches are shuffled.
flowParams = dict(class_mode = 'categorical',
                  target_size = (48,48),
                  color_mode = 'rgb',
                  batch_size = BS)

trainGen = trainAug.flow_from_directory(TRAIN_PATH, shuffle = True, **flowParams)
valGen = valAug.flow_from_directory(VAL_PATH, shuffle = False, **flowParams)
testGen = valAug.flow_from_directory(TEST_PATH, shuffle = False, **flowParams)

# Build the network for 48x48 three-channel inputs and two output classes.
model = CancerNet.build(width=48, height=48, depth=3, classes=2)

# Adagrad optimiser whose decay is the initial learning rate divided by the
# number of epochs.
opt = Adagrad(lr=INIT_LR, decay=INIT_LR/NUM_EPOCHS)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. Steps count whole batches only; class weighting is left
# disabled (class_weight = classWeight).
fitParams = dict(steps_per_epoch = lenTrain // BS,
                 validation_data = valGen,
                 validation_steps = lenVal // BS,
                 epochs = NUM_EPOCHS)
M = model.fit(trainGen, **fitParams)

print('Evaluando el modelo')
testGen.reset()

# Exactly one pass over the test set: ceil(lenTest / BS) batches. Running more
# steps makes the generator wrap around and yields more predictions than
# there are labels in testGen.classes.
testSteps = (lenTest + BS - 1) // BS
pred_indices = model.predict_generator(testGen, steps=testSteps)

# Index of the most probable class for every test image.
pred_indices = np.argmax(pred_indices, axis=1)

print(classification_report(testGen.classes,
                            pred_indices,
                            target_names=list(testGen.class_indices.keys())))

# Rows of the confusion matrix are true classes, columns are predictions.
# `labels` pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(testGen.classes, pred_indices, labels=[0, 1])
total = cm.sum()
# Correct predictions (the diagonal) over all predictions.
accuracy = (cm[0,0] + cm[1,1]) / total
# Class 1 is the positive (IDC) class: sensitivity is the fraction of true
# class-1 images predicted as 1, specificity the fraction of true class-0
# images predicted as 0.
sensitivity = cm[1,1] / (cm[1,0] + cm[1,1])
specificity = cm[0,0] / (cm[0,0] + cm[0,1])
print(cm)
print(f'Accuracy: {accuracy}')
print(f'Specificity: {specificity}')
print(f'Sensitivity: {sensitivity}')

# Plot the training/validation loss and accuracy for every epoch.
N = NUM_EPOCHS
epochs = np.arange(0, N)

# Depending on the Keras version, the accuracy metric is recorded in the
# history as 'accuracy' or as 'acc'; use whichever key is present.
accKey = 'accuracy' if 'accuracy' in M.history else 'acc'

plt.style.use('ggplot')
plt.figure()
plt.plot(epochs, M.history['loss'], label='train_loss')
plt.plot(epochs, M.history['val_loss'], label='val_loss')
plt.plot(epochs, M.history[accKey], label='train_acc')
plt.plot(epochs, M.history['val_' + accKey], label='val_acc')
plt.title('Pérdida en entrenamiento y exactitud')
plt.xlabel('Época ')
plt.ylabel('Loss/Accuracy')
plt.legend(loc='lower left')
plt.show()

Found 24755 images belonging to 2 classes.
Found 0 images belonging to 2 classes.
Found 0 images belonging to 2 classes.
Epoch 1/40

UnknownError: ignored