In [None]:
import tensorflow as tf

# Sanity check: an empty list here means TensorFlow does not see any GPU.
tf.config.list_physical_devices(device_type='GPU')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from matplotlib import pyplot
%matplotlib inline
import seaborn as sns
import os
from os.path import isfile
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import utils
import itertools
import shutil
np.random.seed(42)
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
# Record the TensorFlow and bundled Keras versions used for this run.
print(tf.__version__, tf.keras.__version__, sep="\n")

In [None]:
# NOTE: a TF1-style session with capped GPU memory (allow_growth=True,
# per_process_gpu_memory_fraction=0.5) used to be configured here; it is
# currently disabled.

# Remember and show the directory the notebook is running from.
current_dir = os.getcwd()
print(f"{current_dir}")

In [None]:

# Root folder of the dataset. Each patient folder is expected to contain a
# '0' sub-folder (collected into neg_data) and a '1' sub-folder (collected
# into pos_data).
patient_dir = "/kaggle/input/breast-histopathology-images/"


def _list_files(folder):
    """Return the full paths of the regular files directly inside `folder`.

    Entries are sorted by name so the result does not depend on the order
    in which the filesystem happens to enumerate them.

    Raises:
        OSError: if `folder` does not exist or cannot be read.
    """
    candidates = (os.path.join(folder, fname) for fname in sorted(os.listdir(folder)))
    return [path for path in candidates if os.path.isfile(path)]


pos_data = []
neg_data = []

# Sorted so the seeded train/val/test split further down is reproducible.
patient_list = sorted(os.listdir(patient_dir))

for patient in patient_list:
    try:
        # List both class folders before touching the result lists, so a
        # patient with a missing/unreadable folder contributes nothing.
        neg_files = _list_files(os.path.join(patient_dir, str(patient), '0'))
        pos_files = _list_files(os.path.join(patient_dir, str(patient), '1'))
    except OSError as err:
        # Only filesystem problems are tolerated; anything else is a real
        # bug and should surface instead of being silently swallowed.
        print(f"{patient} cannot be added properly ({err})")
        continue

    neg_data.extend(neg_files)
    pos_data.extend(pos_files)

print(f"Total: {len(neg_data) + len(pos_data)}")
print(f"Negative Data: {len(neg_data)}")
print(f"Positive Data: {len(pos_data)}")

In [None]:
###### Mostly, taken from the other author's notebook. ######

# Build one dataframe with a single 'image_id' column holding the full path
# of every image: positives first, then negatives.
df_data_1 = pd.DataFrame(pos_data, columns=['image_id'])
df_data_0 = pd.DataFrame(neg_data, columns=['image_id'])

# ignore_index=True gives every row a unique label; without it both halves
# keep their own 0..n-1 index and the combined frame has duplicate labels.
df_data = pd.concat([df_data_1, df_data_0], ignore_index=True)

In [None]:
# Quick look at the first rows to confirm the paths were collected.
df_data.head(n=5)

In [None]:
###### Mostly, taken from the other author's notebook. ######

# Define Helper Functions

# Each file name has this format:
# '14211_idx5_x2401_y1301_class1.png'

prefix = "/kaggle/input/breast-histopathology-images/"
def extract_patient_id(x):
    """Return the patient id encoded in an image path.

    `x` is expected to start with `prefix`; the id is the first path
    component that follows it (everything up to the next '/').
    """
    relative_path = x[len(prefix):]
    patient_id, _, _ = relative_path.partition('/')
    return patient_id

def extract_target(x):
    """Return the class label ('0' or '1') encoded in an image file name.

    File names look like '14211_idx5_x2401_y1301_class1.png'. Only the file
    name itself is parsed (not the whole path), so underscores in parent
    directory names cannot shift the position of the 'classN' token.

    Args:
        x: path (or bare file name) of an image.

    Returns:
        The label as a string, e.g. '0' or '1'.

    Raises:
        ValueError: if the file name does not end in a '_class<label>' token.
    """
    file_name = os.path.basename(x)
    stem, _extension = os.path.splitext(file_name)
    # The label token is always the last underscore-separated field.
    class_token = stem.rsplit('_', 1)[-1]
    marker = 'class'
    if not class_token.startswith(marker) or len(class_token) == len(marker):
        raise ValueError(f"cannot extract class label from file name: {file_name!r}")

    target = class_token[len(marker):]

    return target

# Derive the per-image metadata columns from the path stored in 'image_id':
# 'patient_id' comes from the folder name, 'target' from the file name.
image_paths = df_data['image_id']
df_data['patient_id'] = image_paths.map(extract_patient_id)
df_data['target'] = image_paths.map(extract_target)

df_data.head()

In [None]:
# Class distribution of the whole dataset: negatives heavily outnumber positives.
ax = sns.countplot(x=df_data['target'])
ax.set_title("Class Distributions")

In [None]:
from sklearn.model_selection import train_test_split

# Share of the full dataset held out for testing / validation.
test_fraction = 0.03603
val_fraction = 0.0036025

y = df_data['target']

# Hold out the test set first ...
df_remaining, df_test = train_test_split(
    df_data, test_size=test_fraction, random_state=42, stratify=y)

# ... then draw the validation set from what is left. Sampling both sets
# independently from the full dataframe (as was done before) lets the same
# image land in both validation and test, which contaminates the evaluation.
# An absolute row count keeps the validation set the size it would have had
# as a fraction of the full dataset.
num_val = int(np.ceil(val_fraction * len(df_data)))
_, df_val = train_test_split(
    df_remaining, test_size=num_val, random_state=42, stratify=df_remaining['target'])

# NOTE(review): the split is per image, not per patient, so patches from one
# patient can appear in several splits - confirm this is acceptable.

print('Valid:', df_val.shape[0])
print('Test:', df_test.shape[0])

In [None]:
# Class balance of the validation split.
ax = sns.countplot(x=df_val['target'])
ax.set_title('Valid Class Distributions')

In [None]:
# Class balance of the test split.
ax = sns.countplot(x=df_test['target'])
ax.set_title('Test Class Distributions')

In [None]:
# Dictionaries keyed by image path: membership tests against them are much
# faster than scanning the validation / test dataframes for every row.
val_dict = df_val.set_index('image_id').transpose().to_dict('list')
test_dict = df_test.set_index('image_id').transpose().to_dict('list')

def identify_train_val_and_test_rows(x):
    """Return 'val', 'test' or 'train' for the image path `x`.

    Validation membership is checked first, then test; every path found in
    neither lookup is treated as a training image.
    """
    image_path = str(x)
    if image_path in val_dict:
        return 'val'
    if image_path in test_dict:
        return 'test'
    return 'train'

# Tag every row of the full dataframe with the split it belongs to.
df_data['train_or_val_or_test'] = df_data['image_id'].apply(identify_train_val_and_test_rows)

# The training set is whatever was not held out.
is_train_row = df_data['train_or_val_or_test'] == 'train'
df_train = df_data[is_train_row]

print('Train:', len(df_train))
print('Valid:', len(df_val))
print('Test:', len(df_test))
df_data.head()

In [None]:
# Class balance of the training split before augmentation.
ax = sns.countplot(x=df_train['target'])
ax.set_title('Train Class Distributions')

In [None]:
# Keep an untouched copy of the training rows; the augmentation step below
# works from this snapshot.
og_train = df_train.copy()

# Index the full dataframe by image path.
df_data = df_data.set_index('image_id')
df_data.head()

# Augmentation

In [None]:
# Imported from tensorflow.keras (not the standalone keras package) to match
# the model and callbacks used in the rest of the notebook.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

image_size = 100

# Temporary output location for the augmented positive ('1') images.
aug_dir = '/kaggle/working/aug_dir'
img_dir = os.path.join(aug_dir, 'img_dir', '1')

# Start from an empty folder: without this, every re-run of the cell adds
# another full set of augmented images on top of the previous ones.
shutil.rmtree(img_dir, ignore_errors=True)
os.makedirs(img_dir, exist_ok=True)

# Training rows per class; only the positive class is augmented.
df_train_c1 = og_train.loc[og_train['target'] == '1']
neg_images = og_train.loc[og_train['target'] == '0']

# Random rotations, shifts and zooms applied to each source image.
datagen = ImageDataGenerator(
    rotation_range=180,
    width_shift_range=0.125,
    height_shift_range=0.125,
    zoom_range=[1.2, 1.325],
    fill_mode='reflect'
)

batch_size = 25
# Every batch drawn from this iterator is also written to img_dir.
# `save_format` was previously misspelled ('save_foramt'); the unknown
# keyword was silently ignored and the default format was used instead.
aug_datagen = datagen.flow_from_dataframe(dataframe=df_train_c1,
                                          x_col='image_id',
                                          y_col='target',
                                          target_size=(image_size, image_size),
                                          batch_size=batch_size,
                                          save_to_dir=img_dir,
                                          save_format='jpg'
)

# Generate enough augmented positives to bring the positive class up to the
# size of the negative class.
num_aug_images_wanted = len(neg_images)  # total number of images wanted in each class
num_files = len(df_train_c1)             # positives that already exist
# Clamp at zero so an already balanced (or positive-heavy) set generates nothing.
num_batches = max(0, int(np.ceil((num_aug_images_wanted - num_files) / batch_size)))

for _ in range(num_batches):
    next(aug_datagen)

print("finished generation")

In [None]:
# Size of the original (pre-augmentation) training set, overall and per class.
train_positive = og_train.loc[og_train['target'] == '1']
train_negative = og_train.loc[og_train['target'] == '0']

print('Train Total:', len(train_negative) + len(train_positive))
print('Train Positive:', len(train_positive))
print('Train Negative:', len(train_negative))

In [None]:
# Folder the augmentation step wrote its images to.
aug_class_dir = "/kaggle/working/aug_dir/img_dir/1"

# Sorted so the dataframe does not depend on filesystem enumeration order.
aug_images_list = sorted(os.listdir(aug_class_dir))

# Describe the augmented images with the same columns as the training frame.
# 'image_id' stays a regular column because the data generators read it via
# x_col='image_id'.
aug_images_df = pd.DataFrame({
    'image_id': [os.path.join(aug_class_dir, fname) for fname in aug_images_list],
    'target': '1',
    'train_or_val_or_test': 'train',
    'patient_id': '0',  # placeholder: augmented images are not tied to a patient
})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
final_df_train = pd.concat([og_train, aug_images_df], ignore_index=True)

In [None]:
# Preview the rows describing the augmented images.
aug_images_df.head(n=5)

In [None]:
# Number of training rows after adding the augmented images.
print(final_df_train.shape[0])

In [None]:
# Class balance of the training set after augmentation.
ax = sns.countplot(x=final_df_train['target'])
ax.set_title('Train Class Distributions')

In [None]:
num_train_samples = len(final_df_train)
num_val_samples = len(df_val)

train_batch_size = 500
val_batch_size = 500

# Batches needed to see every sample once per epoch (the last batch may be
# partial). Cast to int: np.ceil returns a float, but step counts are whole
# numbers.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [None]:
# Pixel preprocessing shared by all three splits.
preprocess_fn = keras.applications.inception_v3.preprocess_input

# Training images get random horizontal / vertical flips on top of the
# preprocessing.
datagen = ImageDataGenerator(preprocessing_function=preprocess_fn,
                             horizontal_flip=True,
                             vertical_flip=True)

# Validation and test images are only preprocessed. Random flips here would
# make the reported metrics change from run to run for the same weights.
eval_datagen = ImageDataGenerator(preprocessing_function=preprocess_fn)

train_batches_bench = datagen.flow_from_dataframe(final_df_train,
                                                  target_size=(image_size, image_size),
                                                  batch_size=train_batch_size,
                                                  x_col='image_id',
                                                  y_col='target')

valid_batches_bench = eval_datagen.flow_from_dataframe(df_val,
                                                       target_size=(image_size, image_size),
                                                       batch_size=val_batch_size,
                                                       x_col='image_id',
                                                       y_col='target')

# shuffle=False keeps predictions aligned with test_batches_bench.classes.
test_batches_bench = eval_datagen.flow_from_dataframe(df_test,
                                                      target_size=(image_size, image_size),
                                                      batch_size=1,
                                                      shuffle=False,
                                                      x_col='image_id',
                                                      y_col='target')

In [None]:
from tensorflow.keras.applications.vgg16 import VGG16

# VGG16 convolutional base:
#   * include_top=False - drop the built-in classifier; a custom head is
#     attached in the next cell.
#   * weights=None      - randomly initialised, i.e. no pretrained weights
#     are loaded.  NOTE(review): an earlier comment here said imagenet
#     weights were wanted - confirm which is intended.
#   * input_shape       - square RGB images of side `image_size`.
vgg = VGG16(
    include_top=False,
    weights=None,
    input_shape=(image_size, image_size, 3),
)

In [None]:
# Classification head: flatten the convolutional features and map them to
# two softmax outputs (one per class).
flatten_layer = tf.keras.layers.Flatten()
softmax_layer = tf.keras.layers.Dense(2, activation='softmax')

x = flatten_layer(vgg.output)
prediction = softmax_layer(x)

# Full network: VGG base followed by the head defined above.
model = tf.keras.Model(inputs=vgg.input, outputs=prediction)
model.summary()

In [None]:
# TODO: this cell still needs to be checked - decide exactly which data
# should be fed in. The intent is to use the ImageDataGenerator pipeline
# without any augmentation (for now) and to compare against the paper.
from tensorflow.keras.callbacks import ModelCheckpoint

# Write a checkpoint whenever the monitored quantity improves
# (save_best_only=True); these weights are loaded again for evaluation.
checkpoint_path = 'breast_histopathology_baseline_AUG_origin_binarySGD.hdf5'
model_checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)

# Equal weights for now; raise the weight of class 1 to make the model more
# sensitive to the positive class if necessary.
class_weights = {
    0: 1.0,  # negative
    1: 1.0,  # positive
}

# NOTE: an earlier comment here cited
# https://www.sciencedirect.com/science/article/pii/S2214785321013316
# in favour of RMSProp, but the model is actually compiled with Adam.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

fit_options = dict(
    steps_per_epoch=train_steps,
    class_weight=class_weights,
    validation_data=valid_batches_bench,
    validation_steps=val_steps,
    callbacks=[model_checkpoint],
    epochs=24,
    verbose=2,
)
history = model.fit(train_batches_bench, **fit_options)

In [None]:
# Per-epoch accuracy recorded by model.fit.
training_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Take the x-axis from the recorded history instead of hard-coding the
# number of epochs, so the plot keeps working if `epochs` changes or
# training stops early.
epoch_count = range(1, len(training_acc) + 1)

plt.figure()
plt.plot(epoch_count, training_acc, 'r--')
plt.plot(epoch_count, val_acc, 'b-')
plt.legend(['Training Acc', 'Validation Acc'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.savefig("../vgg19_baseline_model_AUG_origin_binarySGD.png")
plt.show()

In [None]:
# Restore the checkpoint written during training before evaluating.
best_weights_path = 'breast_histopathology_baseline_AUG_origin_binarySGD.hdf5'
model.load_weights(best_weights_path)

In [None]:
# Test Predictions
# model.predict accepts the generator directly (predict_generator is
# deprecated). The step count comes from the iterator itself so it always
# matches the number of batches it actually holds, and therefore the length
# of test_batches_bench.classes used for scoring.
preds = model.predict(test_batches_bench, steps=len(test_batches_bench), verbose=1)

In [None]:
# Accuracy using the baseline model
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

# Index of the highest predicted probability = predicted class per test image.
y_pred = np.argmax(preds, axis=1)

# True class indices, in the same order as the (unshuffled) test generator.
y_true = test_batches_bench.classes

# Both scores are fractions in [0, 1]. Balanced accuracy is the average of
# the proportion correct of each class individually.
accuracy = accuracy_score(y_true, y_pred)
balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

# Scale to percent before appending '%'; the old output printed the raw
# fraction with a percent sign (e.g. '0.873%').
metric_lines = [
    f'Accuracy: {accuracy * 100:.1f}%',
    f'Balanced Accuracy: {balanced_accuracy * 100:.1f}%',
]
for line in metric_lines:
    print(line)

# 'w' starts a fresh results file; later cells append to it.
with open("../testOutput_AUG_origin_binarySGD.txt", "w") as report_file:
    report_file.write('\n'.join(metric_lines) + '\n')

In [None]:
# Heat-map of the confusion matrix for the baseline model.
from sklearn.metrics import confusion_matrix

class_names = ['negative', 'positive']
conf_matrix = confusion_matrix(y_true, y_pred)

plt.imshow(conf_matrix, interpolation='nearest', cmap='viridis')
plt.title('Confusion matrix')
plt.colorbar()

# One tick per class on both axes.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig("../vgg19-baseline-model-confusion-matrix-AUG_origin_binarySGD.png")

In [None]:
# Raw confusion-matrix counts, appended to the results file.
confusionmatrix = confusion_matrix(y_true, y_pred)

# `with` closes the file even if the write fails.
with open("../testOutput_AUG_origin_binarySGD.txt", "a") as report_file:
    report_file.write(f'Confusion matrix: {confusionmatrix}\n')

confusionmatrix

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set.
report = classification_report(y_true=y_true, y_pred=y_pred, target_names=['negative', 'positive'])

# `with` closes the file even if the write fails.
with open("../testOutput_AUG_origin_binarySGD.txt", "a") as report_file:
    report_file.write(f'REPORT:\n {report}\n')

print(report)

In [None]:
# Show a few validation images together with their true labels.
# Dedicated names are used so the batch does not overwrite `x` / `y` from
# earlier cells.
sample_images, sample_labels = next(valid_batches_bench)

num_samples_to_show = 9

# Slicing starts at the first image and copes with batches that hold fewer
# than `num_samples_to_show` images.
for image, label in zip(sample_images[:num_samples_to_show],
                        sample_labels[:num_samples_to_show]):
    # Labels are one-hot encoded: position 0 = negative, position 1 = positive.
    if np.argmax(label, axis=0) == 1:
        print('IDC positive')
    else:
        print('IDC negative')

    # The generator's preprocessing (inception_v3.preprocess_input) scales
    # pixels to [-1, 1]; map them back to [0, 1], the range imshow expects
    # for float images.
    plt.imshow(np.clip((image + 1.0) / 2.0, 0.0, 1.0))
    plt.show()