# Mammography - Breast Cancer

## Prediction of Breast Cancer



#### University of Aveiro

### Contents
* [1. Packages](#id1)
* [2. Helper Functions](#id2)
* [3. Reading .csv Data](#id3)
    - [3.1. BCDR Dataset 1](#id3.1)
    - [3.2. BCDR Dataset 2](#id3.2)
    - [3.3. BCDR Dataset 3](#id3.3)
    - [3.4. Image Visualization](#id3.4)
    - [3.5. Selecting Data](#id3.5)
* [4. Classification Models](#id4)
    - [4.1. Kooi et al. Network architecture](#id4.1)
    - [4.2. Transfer Learning - VGG16](#id4.2)

    


### References
* CBIS_DDSM: https://wiki.cancerimagingarchive.net/display/Public/CBIS-DDSM
* NumPy Lib: https://numpy.org/
* Pandas Lib: https://pandas.pydata.org/
* Scikit-Learn Lib: https://scikit-learn.org/stable/
* TensorFlow: https://www.tensorflow.org/
* Keras: https://keras.io/
* Pydicom: https://pydicom.github.io/
* Unbalanced Samples: https://medium.com/strands-tech-corner/unbalanced-datasets-what-to-do-144e0552d9cd
* Oversampling: https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18
* Kooi et al., “Large scale deep learning for computer aided detection of mammographic lesions”, Medical Image Analysis, 2016. http://dx.doi.org/10.1016/j.media.2016.07.007
* Keras CNN Dog or Cat Classification: https://www.kaggle.com/uysimty/keras-cnn-dog-or-cat-classification
* Breast cancer classification with Keras and Deep Learning: https://www.pyimagesearch.com/2019/02/18/breast-cancer-classification-with-keras-and-deep-learning/

# 1. Packages <a class="anchor" id="id1"></a>

In [None]:
import numpy as np
import imageio as io
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

#tensorflow related
import tensorflow as tf
print("TF version:", tf.__version__)
import pydicom as dicom
from tensorflow import keras
from keras import callbacks

#.py files
import generate_data as generator
import preprocessing as prepr
import models as models

# 2. Helper Functions <a class="anchor" id="id2"></a>

In [None]:
def image_classifications(features_dataframe,images_dataframe):
    """
    Look up the classification of every image through its patient id.

    A ``patient_id -> classification`` table is built from
    ``features_dataframe``. When a patient id appears on several feature
    rows, the classification of the first such row is kept. The table is
    then applied, in order, to every row of ``images_dataframe``.

    Parameters
    ----------
    features_dataframe : pandas.DataFrame
        Must provide the columns 'patient_id' and 'classification'.
    images_dataframe : pandas.DataFrame
        Must provide the column 'patient_id'.

    Returns
    -------
    list
        One classification per row of ``images_dataframe``, in row order.

    Raises
    ------
    KeyError
        If an image references a patient id that is absent from
        ``features_dataframe``.
    """
    patient_id_classification_dict = {}
    # Iterate both columns in lockstep (positionally) so the result does not
    # depend on the dataframe index being the default 0..n-1 range.
    feature_rows = zip(features_dataframe['patient_id'],
                       features_dataframe['classification'])
    for patient_id, classification in feature_rows:
        # setdefault keeps the first classification seen for a patient.
        patient_id_classification_dict.setdefault(patient_id, classification)

    return [patient_id_classification_dict[patient_id]
            for patient_id in images_dataframe['patient_id']]

def fix_path(path_list):
    """
    Remove a stray leading space from the second component of each path.

    Every path is split on '/'. If the second component starts with a space,
    exactly one leading space is dropped from it; all other components are
    left untouched. Paths without a '/' (or with an empty second component)
    are returned unchanged instead of raising IndexError.

    Parameters
    ----------
    path_list : iterable of str
        Paths using '/' as separator.

    Returns
    -------
    list of str
        The corrected paths, in the same order as the input.
    """
    new_paths = []
    for path in path_list:
        parts = path.split('/')
        # startswith is safe on an empty component, unlike indexing [0].
        if len(parts) > 1 and parts[1].startswith(' '):
            parts[1] = parts[1][1:]
        new_paths.append('/'.join(parts))

    return new_paths

def dataframe_by_view(dataframe,view):
    """
    Build a dataframe with only the images taken from a given view.

    A row is kept when ``view`` occurs anywhere in its path (plain substring
    test), so ``view`` should be a marker that does not appear elsewhere in
    the path. This notebook calls it with 'CC' and 'O'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'paths' and 'labels'.
    view : str
        Substring identifying the view inside the image path.

    Returns
    -------
    pandas.DataFrame
        Columns 'paths' and 'labels' for the matching rows, in their
        original order, with a fresh 0..n-1 index.
    """
    view_paths = []
    view_labels = []
    # Pair each path with its label positionally; a separate counter used as
    # an index label would pick the wrong label on a non-default index.
    for path, label in zip(dataframe['paths'], dataframe['labels']):
        if view in path:
            view_paths.append(path)
            view_labels.append(label)

    return pd.DataFrame({'paths':view_paths,'labels':view_labels})

def evaluation(model,data_test,data_predicted):
    """
    Print a classification report and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    model : object
        Not used by this function; kept so existing callers keep working.
    data_test : array-like
        True labels, encoded as 0.0 (Benign) / 1.0 (Malign).
    data_predicted : array-like
        Predicted labels, using the same encoding.

    Returns
    -------
    None
        The report is printed and the heatmap is drawn on a new figure.
    """
    # This function relies on seaborn, which the file never imports at the
    # top level; import it here so the function works on its own.
    import seaborn as sns

    labels = [0.0, 1.0]
    classes = ['Benign','Malign']
    # `labels` and `target_names` are keyword-only in current scikit-learn.
    report = classification_report(
        data_test, data_predicted, labels=labels, target_names=classes)

    classes1 = ['Benign\n(Predicted)','Malign\n(Predicted)']
    classes2 = ['Benign\n(Real)','Malign\n(Real)']
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    # Fixing `labels` keeps the matrix 2x2 even when one class is absent,
    # so it always matches the row/column names below.
    c_matrix = confusion_matrix(data_test, data_predicted, labels=labels)
    # Optional row normalisation (disabled):
    #c_matrix = c_matrix.astype('float') / c_matrix.sum(axis = 1)[:, np.newaxis]
    c_train = pd.DataFrame(c_matrix, index = classes2, columns = classes1)

    plt.figure(figsize = (16,4))
    sns.heatmap(c_train, annot = True, cmap = cmap, square = True, cbar = False,fmt = '.2f', annot_kws = {"size": 20})
    print("Report: \n",report)

# 3. Reading .csv Data <a class="anchor" id="id3"></a>

### 3.1. BCDR Dataset 1 <a class="anchor" id="id3.1"></a>

In [None]:
# Load the BCDR-D01 image table and its companion features table.
bcdr1 = pd.read_csv('BCDR-D01_dataset/bcdr_d01_img.csv')
# NOTE(review): bcdr1_labels is not referenced again in the visible cells.
bcdr1_labels = pd.read_csv('BCDR-D01_dataset/bcdr_d01_features.csv')
# Print (rows, columns) of the image table.
print('Suspicious Dataset Nº 1: ',bcdr1.shape)
#bcdr1.head(10)

### 3.2. BCDR Dataset 2 <a class="anchor" id="id3.2"></a>

In [None]:
# Load the BCDR-D02 image table and its companion features table.
# NOTE(review): the cells that would consume bcdr2 are commented out below,
# so this dataset is currently loaded but not used for training.
bcdr2 = pd.read_csv('BCDR-D02_dataset/bcdr_d02_img.csv')
bcdr2_labels = pd.read_csv('BCDR-D02_dataset/bcdr_d02_features.csv')
# Print (rows, columns) of the image table.
print('Suspicious Dataset Nº 2: ',bcdr2.shape)
#bcdr2.head(10)

### 3.3. BCDR Dataset 3 <a class="anchor" id="id3.3"></a>

In [None]:
# Load the BCDR-DN01 image table; its images are labelled 'Normal' below.
bcdrN = pd.read_csv('BCDR-DN01_dataset/bcdr_dn01_img.csv')
# Print (rows, columns) of the image table.
print('Normal Dataset: ',bcdrN.shape)
#bcdrN.head(10)

## Convert TIF to TIFF

In [None]:
# Prefix every image filename with its dataset folder, clean the stray
# space in the second path component, then hand the paths to the converter.
normal_broken_paths = [
    'BCDR-DN01_dataset/'+image_filename
    for image_filename in bcdrN['image_filename']
]
normal_paths_tif = fix_path(normal_broken_paths)

normal_paths = generator.convert(normal_paths_tif)

In [None]:
# Prefix every image filename with its dataset folder, clean the stray
# space in the second path component, then hand the paths to the converter.
suspicious_broken_paths_1 = [
    'BCDR-D01_dataset/'+image_filename
    for image_filename in bcdr1['image_filename']
]
suspicious_paths_1_tif = fix_path(suspicious_broken_paths_1)

suspicious_paths_1 = generator.convert(suspicious_paths_1_tif)

In [None]:
#suspicious_broken_paths_2= []
#for path in bcdr2['image_filename']:
#    suspicious_broken_paths_2.append('BCDR-D02_dataset/'+path)
#suspicious_paths_2_tif = fix_path(suspicious_broken_paths_2)

#suspicious_paths_2 = generator.convert(suspicious_paths_2_tif)

## Building Normal vs Suspicious Dataframes

In [None]:
# One row per converted normal image; the scalar 'Normal' label is
# broadcast to every row.
bcdr_normal = pd.DataFrame({'paths':normal_paths,'labels':'Normal'})

# Split by view using a substring match on the path ('CC' / 'O').
bcdr_normal_cc = dataframe_by_view(dataframe=bcdr_normal,view='CC')
bcdr_normal_mlo = dataframe_by_view(bcdr_normal,'O')

In [None]:
#suspicious_dfs = [pd.DataFrame({'paths':suspicious_paths_1,'labels':'Suspicious'}),pd.DataFrame({'paths':suspicious_paths_2,'labels':'Suspicious'})]

#bcdr_suspicious = pd.concat(suspicious_dfs,ignore_index=True)

In [None]:
# One row per converted BCDR-D01 image; the scalar 'Suspicious' label is
# broadcast to every row.
bcdr_suspicious = pd.DataFrame({'paths':suspicious_paths_1,'labels':'Suspicious'})

# Split by view using a substring match on the path ('CC' / 'O').
bcdr_suspicious_cc = dataframe_by_view(bcdr_suspicious,'CC')
bcdr_suspicious_mlo = dataframe_by_view(bcdr_suspicious,'O')

#### Checking dataset balance

In [None]:
# Row counts for the normal class: all images, then per view.
print('Normal Images: ',bcdr_normal.shape[0])
print('Normal CC: ',bcdr_normal_cc.shape[0])
print('Normal MLO: ',bcdr_normal_mlo.shape[0])

In [None]:
# Row counts for the suspicious class: all images, then per view.
print('Suspicious Images: ',bcdr_suspicious.shape[0])
print('Suspicious CC: ',bcdr_suspicious_cc.shape[0])
print('Suspicious MLO: ',bcdr_suspicious_mlo.shape[0])

In [None]:
# Combined size of both classes.
print('Total Images: ',bcdr_normal.shape[0]+bcdr_suspicious.shape[0])

## Building Training vs Validation Dataframes

In [None]:
# Fraction of all images (normal + suspicious) reserved for validation.
validation_split=0.2
# Use the named fraction instead of repeating the literal, so changing
# validation_split actually changes the split.
validation_images = round(validation_split*(bcdr_normal.shape[0]+bcdr_suspicious.shape[0]))
print('Validation Images: ',validation_images)

In [None]:
# Each of the four (class, view) groups contributes its first
# round(validation_images/4) rows to validation. A slice longer than a group
# is silently truncated, so the final validation size can be smaller.
# NOTE(review): rows are taken in their existing order, without shuffling;
# confirm that this ordering is acceptable for the split.
validation_df_cc = pd.concat([
    bcdr_suspicious_cc[:round((validation_images/4))],
    bcdr_normal_cc[:round((validation_images/4))]],
    ignore_index=True
    )

validation_df_mlo = pd.concat([
    bcdr_suspicious_mlo[:round((validation_images/4))],
    bcdr_normal_mlo[:round((validation_images/4))]],
    ignore_index=True    )

In [None]:
# Training data is the remainder of each (class, view) group: every row
# after the first round(validation_images/4), i.e. the complement of the
# slices used for the validation dataframes.
training_df_cc = pd.concat([
    bcdr_suspicious_cc[round(validation_images/4):],
    bcdr_normal_cc[round(validation_images/4):]],ignore_index=True)

training_df_mlo = pd.concat([
    bcdr_suspicious_mlo[round(validation_images/4):],
    bcdr_normal_mlo[round(validation_images/4):]],ignore_index=True)

## Batching Data

In [None]:
# Image dimensions and batch size passed to generator.generate and to the
# model builders below.
# NOTE(review): which of the two sizes is height and which is width depends
# on generate_data / models — confirm there.
target_size_1 = 2500
target_size_2 = 2000
batch_size = 16

In [None]:
# One batch generator per (split, view) pair, all with the same target size
# and batch size.
training_generator_cc = generator.generate(training_df_cc,target_size_1,target_size_2,batch_size)
training_generator_mlo = generator.generate(training_df_mlo,target_size_1,target_size_2,batch_size)
validation_generator_cc = generator.generate(validation_df_cc,target_size_1,target_size_2,batch_size)
validation_generator_mlo = generator.generate(validation_df_mlo,target_size_1,target_size_2,batch_size)

## Image Visualization

TIFF Image

In [None]:
# Show the first converted normal image as read from disk, and its shape.
tiff_image =plt.imread(normal_paths[0])
plt.imshow(np.asarray(tiff_image))
print(tiff_image.shape)

Generated Image

In [None]:
# `training_generator` is never defined in this notebook; preview from the CC
# training generator instead.
# Presumably [0] is the first batch, [0] its inputs and [0] the first image —
# confirm against generate_data.generate.
generated_image= training_generator_cc[0][0][0]
# Display the first channel in grayscale.
plt.imshow(np.asarray(generated_image)[:,:,0],cmap='gray')
print(generated_image.shape)

## Classification with Sequential Model

In [None]:
# Build the sequential model for the configured image size.
model = models.create_sequential_model(target_size_1,target_size_2)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Training configuration handed to models.train_model for the sequential model.
optimizer='adam'
loss_function='categorical_crossentropy'
# AUC is a metric class and must be instantiated; passing the class itself
# makes Keras call it like a metric function.
metrics=[keras.metrics.categorical_accuracy,keras.metrics.AUC()]
epochs= 30
steps_per_epoch= 20
fit_callbacks = [
    callbacks.EarlyStopping(
        # The accuracy metric above is logged as 'categorical_accuracy' (the
        # evaluation plots read 'val_categorical_accuracy'), so 'val_accuracy'
        # would never be found and early stopping would never trigger.
        monitor= 'val_categorical_accuracy',
        mode= 'max',
        min_delta= 0.01,
        patience= 8, 
        restore_best_weights= True),
    # Appends one row of metrics per epoch to the log file.
    callbacks.CSVLogger(
        filename= 'BCDR Training Parameters',
        separator=",", 
        append=True)
]

In [None]:
# Write a diagram of the model (with tensor shapes) to sequential_model.png.
keras.utils.plot_model(model,to_file='sequential_model.png',show_shapes=True)

In [None]:
# Train on the CC-view training generator with the configuration above.
# NOTE(review): no validation generator is passed here, yet the evaluation
# cells read 'val_*' history keys — confirm that models.train_model supplies
# validation data itself.
trained_model = models.train_model(model,optimizer,loss_function,metrics,epochs,steps_per_epoch,training_generator_cc,fit_callbacks)

## Classification with Functional Model

In [None]:
# Build the functional model; this rebinds `model`, replacing the
# sequential model created earlier.
model = models.create_functional_model(target_size_1,target_size_2)

In [None]:
# Inspect the first item produced by the CC training generator.
training_generator_cc[0]

In [None]:
# Print the layer-by-layer summary of the functional model.
model.summary()

In [None]:
# Training configuration for the functional model (low learning rate Adam).
optimizer=keras.optimizers.Adam(learning_rate=0.00001)
loss_function='categorical_crossentropy'
# AUC is a metric class and must be instantiated; passing the class itself
# makes Keras call it like a metric function.
metrics=[keras.metrics.categorical_accuracy,keras.metrics.AUC()]
epochs= 30
steps_per_epoch= 20
fit_callbacks = [
    callbacks.EarlyStopping(
        # The accuracy metric above is logged as 'categorical_accuracy' (the
        # evaluation plots read 'val_categorical_accuracy'), so 'val_accuracy'
        # would never be found and early stopping would never trigger.
        monitor= 'val_categorical_accuracy',
        mode= 'max',
        min_delta= 0.01,
        patience= 8, 
        restore_best_weights= True),
    # Appends one row of metrics per epoch to the log file.
    callbacks.CSVLogger(
        filename= 'BCDR Training Parameters',
        separator=",", 
        append=True)
]

In [None]:
# Write a diagram of the model (with tensor shapes) to functional_model.png.
keras.utils.plot_model(model,show_shapes=True,to_file='functional_model.png')

In [None]:
# Keras documents `metrics` as a list; pass the metric name in a list rather
# than as a bare string.
model.compile(optimizer=optimizer,loss=loss_function,metrics=['AUC'])

In [None]:
# The original `y` argument was garbled (unterminated string literal, which
# is a SyntaxError). It is reconstructed here as one target per view.
# NOTE(review): 'CC_output' / 'MLO_output' are presumed output-layer names —
# confirm against models.create_functional_model.
# NOTE(review): the [0][0][0] / [0][1][0] indexing appears to select a single
# sample rather than a batch — confirm the model receives the expected
# batch dimension.
model.fit(
    x={'CC_Input':training_generator_cc[0][0][0],'MLO_Input':training_generator_mlo[0][0][0]},
    y={'CC_output':training_generator_cc[0][1][0],'MLO_output':training_generator_mlo[0][1][0]},
    epochs=20)

In [None]:
# Shape of the array that is fed to the 'CC_Input' input above.
training_generator_cc[0][0][0].shape

#### Evaluation

In [None]:

%matplotlib inline

# One figure per tracked quantity, each showing the training curve and the
# matching 'val_'-prefixed validation curve from the training history.
for axis_label, history_key in (('Loss', 'loss'),
                                ('Categorical Accuracy', 'categorical_accuracy')):
    plt.figure()
    plt.xlabel('Epochs')
    plt.ylabel(axis_label)
    plt.plot(trained_model.history[history_key])
    plt.plot(trained_model.history['val_' + history_key])
    plt.legend(['Training', 'Validation'])

In [None]:
# `validation_generator` is never defined in this notebook; use the CC
# validation generator, matching the CC generator used for training above.
# NOTE(review): `model` is whichever model was created last — confirm it is
# the one intended for evaluation and that it accepts a single CC input.
y_predicted = model.predict(validation_generator_cc)

In [None]:
# Report the predicted class and its confidence for every image.
# NOTE(review): assumes column 0 is 'Normal' and column 1 is 'Suspicious' —
# confirm against the class indices used by the generator.
for n, predictions in enumerate(y_predicted, start=1):
    if predictions[0] > 0.5:
        prediction = 'Normal'
        score = predictions[0]
    else:
        prediction = 'Suspicious'
        score = predictions[1]
    # Scale first, then round to a whole percent: rounding before scaling
    # (float("0.57")*100) prints values such as 56.99999999999999.
    confidence = float(round(float(score) * 100))
    print('Image {} predicted as {} with {} confidence'.format(n,prediction,confidence))