In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip uninstall pandas-profiling
# !pip install pandas-profiling[notebook,html]

In [None]:
import pandas as pd # used for data analysis => library for manipulating Dataframes and Series (advanced data holding structures)
import numpy as np #library for manipulating arrays contains array data and basic operations such as sorting, indexing)
import sklearn #widely used machine learning library
from pandas_profiling import ProfileReport #one-stop-shop tool for data summarization and visualization (aka Exploratory Data Analysis or EDA) https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html
import matplotlib.pyplot as plt #Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python. 
import seaborn as sns # 

# **Problem Statement**

- Breast cancer is one of the most common causes of death in women worldwide. Moreover, it is the most common form of cancer in women. Clearly, breast cancer is a global issue. Research tells us that early detection and early treatment are vital because they greatly increase survival rates.

- The tumors that cause breast cancer can be subdivided into malignant (cancerous) and benign (noncancerous) types, based on a variety of cell characteristics. Therefore, being able to differentiate the two types of tumors is key to early detection. Many health care operations leave their patients waiting for long periods of time before getting a result (which is often inaccurate). This flaw is why breast cancer is one of the most common causes of death in women worldwide.

- I have set up a pathway for a different option, one that could drastically increase survival rates. I created a model that can provide automatic breast cancer predictions. I believe this method is an improvement because it can give patients an early diagnosis with precision and accuracy.

# **Dataset**

* **Histopathologic-Cancer-Detection**

In [None]:
# Load the training labels table from the competition input folder.
data = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

# Print the (rows, columns) shape followed by a separator line.
print(data.shape, '-' * 125, sep='\n')
# The first five rows are rendered as the cell output.
data.head()

In [None]:
# Random peek at five rows; seeded so the preview is identical on every
# Restart & Run All.
data.sample(5, random_state=42)

# **Exploratory Data Analysis**

In [None]:
# Summary statistics of `data`, rendered as the cell output.
data.describe()

# **Data Cleaning**

In [None]:
# Remove the `id` column so `data_trim` keeps every column except that one.
data_trim = data.drop(columns=['id'])

In [None]:
# Show the shape before and after dropping the column, then preview the
# trimmed frame.
for frame in (data, data_trim):
    print(frame.shape)
data_trim.head()

# **Feature Engineering/Selection**

In [None]:
# Split the trimmed frame into the target (y) and the predictors (X).
y = data_trim['label']
# Seeded samples keep the printed previews reproducible between runs.
print(y.sample(5, random_state=42))

X = data_trim.drop(columns='label')
print(X.sample(5, random_state=42))

# Only the last expression of a cell is displayed, so the type has to be
# printed explicitly to be visible.
print(type(y))
X.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# The same fixed window of entries is printed before and after encoding.
preview_rows = slice(100, 110)

print("Before encoding: ")
print(y[preview_rows])

# Encode the class values as integers; `y` is rebound to the encoder output.
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)

print("\nAfter encoding: ")
print(y[preview_rows])

# **Keras**

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, model_from_json
from keras.layers import Conv2D, MaxPooling2D, MaxPool2D, Dropout, Activation, Flatten, Dense
from keras.optimizers import Adam
from keras import backend as k
import tensorflow as tf

In [None]:
import warnings

# Install a filter that ignores DeprecationWarning messages.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# **Data Preprocessing**

In [None]:
# Image dimensions used as the generators' target size and the model input size.
img_width = 224
img_height = 224
img_size = (img_width, img_height)

# Root folder of the competition data; the training and test generators both
# point at the same location.
train_data_dir = test_data_dir = '../input/histopathologic-cancer-detection'

# Training hyper-parameters.
batch_size = 32  # usually a power of 2: 8, 16, 32, 64, 128
epochs = 1

In [None]:
# Settings shared by both generators: multiply pixel values by 1/255 and
# reserve 20% of the images for validation.
shared_datagen_kwargs = dict(rescale=1./255, validation_split=0.2)

# Training generator: the shared settings plus random augmentation
# (shear, zoom, horizontal mirroring, rotation).
# NOTE(review): Keras interprets rotation_range in degrees, so 0.1 is close to
# no rotation at all — confirm this is the intended amount.
train_datagen = ImageDataGenerator(
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    rotation_range=0.1,
    **shared_datagen_kwargs,
)

# Validation generator: rescaling only. No augmentation is applied so the
# validation images stay representative of real-world data.
valid_datagen = ImageDataGenerator(**shared_datagen_kwargs)

In [None]:
# The class labels live in train_labels.csv (loaded as `data`), not in the
# folder structure. flow_from_directory would derive the classes from the
# sub-folder names under `train_data_dir`, so the labels are supplied through
# a dataframe instead.
# Assumes every image is stored as "<id>.tif" inside "<train_data_dir>/train"
# — TODO confirm against the dataset layout.
labels_df = pd.DataFrame({
    'filename': data['id'].astype(str) + '.tif',
    # class_mode='binary' expects the label column to hold strings.
    'label': data['label'].astype(str),
})
train_image_dir = os.path.join(train_data_dir, 'train')

# Arguments common to the training and the validation flow.
flow_kwargs = dict(
    dataframe=labels_df,
    directory=train_image_dir,
    x_col='filename',
    y_col='label',
    seed=42,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='binary',
)

# Training flow: the 'training' share of the split, shuffled and augmented.
train_generator = train_datagen.flow_from_dataframe(subset='training',
                                                    shuffle=True,
                                                    **flow_kwargs)

# Validation flow: the 'validation' share of the same split, rescaled only.
validation_generator = valid_datagen.flow_from_dataframe(subset='validation',
                                                         **flow_kwargs)

In [None]:
# Put the 3 colour channels first or last, depending on what the Keras
# backend reports as its image data format.
channels_first = k.image_data_format() == "channels_first"
input_shape = (3, img_width, img_height) if channels_first else (img_width, img_height, 3)

input_shape

# **Convolutional Net**

In [None]:
from tensorflow.keras.optimizers import RMSprop

# Small CNN: three Conv2D + MaxPooling2D stages (16, 32, 64 filters), then a
# dense head ending in a single sigmoid unit for binary classification.
model = tf.keras.models.Sequential([
    # First Convolution
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=input_shape),
    tf.keras.layers.MaxPooling2D(2, 2),
    # Second Convolution
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    # Third Convolution
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    # Flatten
    tf.keras.layers.Flatten(),
    # Dense layer
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# The model is built from tf.keras layers, so the optimizer is taken from
# tf.keras as well. `learning_rate` replaces the deprecated `lr` keyword.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

In [None]:
# Print the layer-by-layer summary of the CNN defined above.
model.summary()

In [None]:
# Train the CNN on the training flow, passing the validation flow as
# validation data.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=epochs,
)

# Persist the architecture on its own as JSON.
model_json = model.to_json()
with open("model.json", "w") as arch_file:
    arch_file.write(model_json)

# Persist the weights on their own (HDF5).
model.save_weights("model.h5")
print("Saved model to disk")

# Persist architecture and weights together in a single file.
model.save("full_model.h5")
print("Saved full model to disk")

In [None]:
# `history.history` is a dict; list the names of the series it recorded.
print(history.history.keys())

# Accuracy per epoch: the training curve first, then the validation curve.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: the training curve first, then the validation curve.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# **Evaluate Model Performance**

In [None]:
# Rebuild the network from the saved JSON architecture. The `with` block
# closes the file even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [None]:
from keras.models import load_model  # loads a model that was saved to a single .h5 file
# Load the architecture and weights written earlier by model.save("full_model.h5").
full_model = load_model('full_model.h5')
# Print the layer-by-layer summary of the reloaded model.
full_model.summary()

In [None]:
# Loss and accuracy on the validation flow. `Model.evaluate` accepts
# generators directly; `evaluate_generator` is deprecated.
model.evaluate(validation_generator)

In [None]:
# Test-time generator: pixel values are rescaled by 1/255, nothing else.
test_datagen = ImageDataGenerator(rescale=1./255)

# Stream batches from `test_data_dir`. No `seed` or `shuffle` argument is
# passed, so the iterator's defaults apply.
# NOTE(review): `test_data_dir` is the same path as `train_data_dir`, and
# flow_from_directory takes class labels from sub-folder names — confirm the
# folder layout yields the intended labels.
test_generator = test_datagen.flow_from_directory(test_data_dir,  
                                              target_size=img_size,
                                              batch_size = batch_size,
                                              class_mode='binary')

In [None]:
# Loss and accuracy on the test flow. `Model.evaluate` accepts generators
# directly; `evaluate_generator` is deprecated.
model.evaluate(test_generator)

# **Transfer Learning**

In [None]:
from tensorflow.keras.optimizers import RMSprop

# Options for the stock NASNetLarge classifier: ImageNet weights, the original
# 1000-class top included, no custom input tensor, shape or pooling.
nasnet_options = dict(
    input_shape=None,
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    pooling=None,
    classes=1000,
)
# NOTE(review): the model is not bound to a name, so it is only shown as the
# cell output and cannot be reused by later cells.
tf.keras.applications.NASNetLarge(**nasnet_options)

In [None]:
from keras.applications.resnet import ResNet50
from keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3

# The original line ended in a trailing comma, which is a SyntaxError.
from keras.models import Sequential, Model

# Load VGG16 with ImageNet weights and without its classification top, so it
# can serve as a convolutional feature extractor.
vgg_pre_trained_model = VGG16(input_shape = input_shape,
                                include_top = False,
                                weights = 'imagenet')

# Alternative bases, currently disabled:
# inception_pre_trained_model = InceptionV3(input_shape = input_shape,
#                                 include_top = False,
#                                 weights = 'imagenet')

# resnet_pre_trained_model = ResNet50(input_shape = input_shape,
#                                 include_top = False,
#                                 weights = 'imagenet')

# Freeze every layer of the base so its weights are not updated in training.
# The loop iterates `vgg_pre_trained_model`; the original referenced an
# undefined name, `pre_trained_model`.
for layer in vgg_pre_trained_model.layers:
    layer.trainable = False

vgg_pre_trained_model.summary()

In [None]:
def build_my_model(model_name):
    """Stack a dense binary-classification head on top of a pre-trained base.

    Args:
        model_name: The pre-trained convolutional base to build on. Despite
            the parameter name, this is a model object, not a string.

    Returns:
        A new, uncompiled ``Sequential`` model made of the base, a ``Flatten``
        layer, three ``Dense`` + ``Dropout(0.5)`` stages (1024, 512 and 256
        units) and a single sigmoid output unit.
    """
    new_model = Sequential()
    # Use the base that was passed in, rather than always falling back to the
    # global VGG16 model regardless of the argument.
    new_model.add(model_name)
    # Custom classifier head.
    new_model.add(Flatten())
    for units in (1024, 512, 256):
        new_model.add(Dense(units, activation='relu'))
        new_model.add(Dropout(0.5))
    new_model.add(Dense(1, activation='sigmoid'))

    return new_model
# building vggnet model
my_vgg_model = build_my_model(vgg_pre_trained_model)

# The ResNet variant is not built here: `resnet_pre_trained_model` is never
# created (its ResNet50(...) definition is commented out), so referencing it
# raised a NameError. To enable it, define the base first and then run:
#     my_resnet_model = build_my_model(resnet_pre_trained_model)

my_vgg_model.summary()

# Model used by the training and evaluation cells that follow.
my_model = my_vgg_model

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

## Set our optimizer, loss function, and learning rate
# `learning_rate` replaces the deprecated `lr` keyword.
optimizer = Adam(learning_rate=1e-3) # RMSprop(learning_rate=1e-4)
loss = 'binary_crossentropy'
metrics = ['accuracy']

# File the best weights are written to.
weight_path="{}_my_model.bestv.hdf5".format('class')

# Save the weights (not the full model) whenever the monitored validation
# loss improves.
checkpoint = ModelCheckpoint(weight_path,
                             monitor= 'val_loss',
                             verbose=1,
                             save_best_only=True,
                             mode= 'auto',
                             save_weights_only = True)

# Stop training if the validation loss has not improved for 10 epochs.
early = EarlyStopping(monitor= 'val_loss',
                      mode= 'auto',
                      patience=10)

callbacks_list = [checkpoint, early]

In [None]:
my_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# my_model.load_weights(weight_path)
# Train the transfer-learning model and keep the per-epoch metrics in
# `history`. The epoch count comes from the `epochs` config variable, the
# same one the first CNN uses, instead of a separate hard-coded literal.
history = my_model.fit(train_generator,
                       validation_data = validation_generator,
                       epochs = epochs,
                       callbacks = callbacks_list)

In [None]:
# `history.history` is a dict; list the names of the series it recorded.
print(history.history.keys())

# Accuracy per epoch: the training curve first, then the validation curve.
accuracy_curves = [history.history[key] for key in ('accuracy', 'val_accuracy')]
for curve in accuracy_curves:
    plt.plot(curve)
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: the training curve first, then the validation curve.
loss_curves = [history.history[key] for key in ('loss', 'val_loss')]
for curve in loss_curves:
    plt.plot(curve)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# **Evaluate Performance on Test Set**

In [None]:
# Test-time generator: pixel values are rescaled by 1/255, nothing else.
test_datagen = ImageDataGenerator(rescale=1./255)

# Stream test batches from `test_data_dir`.
# shuffle=False keeps the batches in the same order as
# `test_generator.classes`, so predictions made from this generator line up
# one-to-one with the true labels. The default is shuffle=True.
test_generator = test_datagen.flow_from_directory(test_data_dir,
                                              target_size=img_size,
                                              batch_size = batch_size,
                                              class_mode='binary',
                                              shuffle=False)

# Loss and accuracy of the transfer-learning model on the test flow.
my_model.evaluate(test_generator)

In [None]:
# test_data = []
# test_labels = []
# batch_index = 0

# while batch_index <= test_generator.batch_index:
#     data = next(test_generator)
#     test_data.append(data[0])
#     test_labels.append(data[1])
#     batch_index = batch_index + 1

# test_data_array = np.asarray(test_data)
# test_labels_array = np.asarray(test_labels)
# y_true = test_labels_array

# test_data_array.shape

In [None]:
# True class indices as stored on the generator.
# NOTE(review): these only line up with the predictions below when
# `test_generator` was created with shuffle=False — confirm.
y_true = test_generator.classes

# Get prediction probabilities from model
probabilities = my_model.predict(test_generator)
# Threshold the probabilities at 0.5, then flatten the (N, 1) boolean array
# into a 1-D integer vector so it has the same shape and dtype as `y_true`.
y_pred = (probabilities > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

# Per-class precision, recall and F1 score of the predictions.
report_text = classification_report(y_true=y_true, y_pred=y_pred)
print(report_text)

In [None]:
# Fraction of samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(accuracy)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
print(conf_mat)

In [None]:
# Display the confusion matrix as a Seaborn heatmap.
# Rows and columns are labelled with the generator's own class names, ordered
# by class index so they match the row/column order of `conf_mat`. The
# previous 'M'/'B' labels and "Dogs and Cats" title came from other projects.
class_names = sorted(test_generator.class_indices,
                     key=test_generator.class_indices.get)

# Transform to a DataFrame for easier plotting.
cm_df = pd.DataFrame(conf_mat,
                     index = class_names,
                     columns = class_names)

plt.figure(figsize=(8,6))
# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Histopathologic Cancer Detection Confusion Matrix\n CNN')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# **Display Results**