# Histopathologic Cancer Detection
## Identify metastatic tissue in histopathologic scans of lymph node sections

# About the images

#### There are 220,025 training images and 57,456 test images.
#### The images are 96x96 pixels and are full color.

# Import Packages

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

import zipfile 

# Working Directory

In [None]:
# Root folder of the competition data; the listing below shows what it contains.
working_dir = '../input/histopathologic-cancer-detection'
os.listdir(working_dir)

# Label as per csv file

#### 0 = no tumor tissue
#### 1 = has tumor tissue

# Number of images in the train and test folder

In [None]:
# Count the files in the train and test image folders of the dataset.
# The folders are resolved from working_dir so the dataset location is
# spelled out in one place only.
for split in ('train', 'test'):
    n_images = len(os.listdir(os.path.join(working_dir, split)))
    print(f'Number of images in {split} set', n_images)

# Load Training DataFrame

In [None]:
# Load the training labels into a DataFrame named 'train'.
# All columns are read as strings: 'label' is later used as y_col with
# class_mode='categorical', which expects string class names.
train = pd.read_csv(os.path.join(working_dir, 'train_labels.csv'), dtype='str')

# Print the shape of the resulting DataFrame.
print('Training set size', train.shape)

In [None]:
# Preview the first ten rows of the label table.
train.head(n=10)

In [None]:
# The ids in the csv file lack the '.tif' extension of the image files; add it.
# The endswith() guard makes the cell safe to re-run: without it a second
# execution would produce '<id>.tif.tif' and break every later file lookup.
train['id'] = train['id'].map(
    lambda image_id: image_id if str(image_id).endswith('.tif') else f'{image_id}.tif'
)
train.head()

# Label Distribution

In [None]:
# Absolute number of images per class.
train['label'].value_counts()

In [None]:
# Class distribution as a percentage of all rows, rounded to two decimals.
train['label'].value_counts().div(len(train)).to_frame().mul(100).round(2)

In [None]:
# Keep a fixed-size random subset of the labelled images;
# random_state makes the draw reproducible.
sample_size = 160_000
train = train.sample(n=sample_size, random_state=1)

# View Sample of Images

In [None]:
# Display a random 4x4 grid (16 images) of training images with their labels.
grid_side = 4  # images per row and per column

# drop=True discards the shuffled original index instead of keeping it
# as an extra 'index' column.
sample = train.sample(n=grid_side * grid_side).reset_index(drop=True)

plt.figure(figsize=(6, 6))  # overall figure size, in inches

for position, row in enumerate(sample.itertuples(index=False), start=1):
    img = mpimg.imread(os.path.join(working_dir, 'train', row.id))

    plt.subplot(grid_side, grid_side, position)  # subplot positions are 1-based
    plt.imshow(img)
    plt.text(0, -5, f'Class {row.label}', color='k')  # label just above the image
    plt.axis('off')

plt.tight_layout()
plt.show()

# Data generator

In [None]:
# Hold out 20% of 'train' as a validation set. Stratifying on the label keeps
# the class proportions the same in both parts; random_state fixes the split.
train_df, valid_df = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    stratify=train['label'],
)

print(train_df.shape, valid_df.shape, sep='\n')

In [None]:
# One generator for the training set and one for the validation set.
# Both only rescale pixel values by 1/255; no augmentation is configured.
pixel_scale = 1 / 255

train_datagen = ImageDataGenerator(rescale=pixel_scale)
valid_datagen = ImageDataGenerator(rescale=pixel_scale)

In [None]:
BATCH_SIZE = 64

# Settings shared by both loaders: images are read from the train folder
# (file name in 'id', class in 'label'), resized to 64x64 and served in
# shuffled batches with one-hot ('categorical') labels.
# NOTE(review): shuffle=True is kept for the validation loader as before;
# consider shuffle=False there if predictions are later matched to
# valid_loader.classes.
flow_kwargs = dict(
    directory='../input/histopathologic-cancer-detection/train/',
    x_col='id',
    y_col='label',
    batch_size=BATCH_SIZE,
    seed=1,
    shuffle=True,
    class_mode='categorical',
    target_size=(64, 64),
)

train_loader = train_datagen.flow_from_dataframe(dataframe=train_df, **flow_kwargs)

# Use the dedicated validation generator (previously train_datagen was reused
# here, leaving valid_datagen unused). Both are configured identically today,
# but any augmentation later added to train_datagen must not reach validation.
valid_loader = valid_datagen.flow_from_dataframe(dataframe=valid_df, **flow_kwargs)

In [None]:
# Number of batches each loader yields per epoch (len() of the iterator).
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

print('Number of batches in the training set:', TR_STEPS)
print('Number of batches in the validation set:', VA_STEPS)

# Build network

In [None]:
# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(1)
tf.random.set_seed(1)


def _conv_stage(filters, **first_conv_kwargs):
    """Return two 3x3 'valid' ReLU convolutions, 2x2 max-pooling, dropout and batch norm.

    Extra keyword arguments (e.g. input_shape) go to the first convolution only.
    """
    return [
        Conv2D(filters=filters, kernel_size=(3,3), padding='valid', activation='relu', **first_conv_kwargs),
        Conv2D(filters=filters, kernel_size=(3,3), padding='valid', activation='relu'),
        MaxPooling2D(2,2),
        Dropout(0.25),
        BatchNormalization(),
    ]


# Two convolutional stages (32 then 64 filters) on 64x64 RGB input.
cnn_layers = [
    *_conv_stage(32, input_shape=(64,64,3)),
    *_conv_stage(64),
    Flatten(),
]

# Dense head: 128 -> 64 -> 32 units, each followed by dropout.
for units in (128, 64, 32):
    cnn_layers += [Dense(units, activation='relu'), Dropout(0.25)]

# Final batch norm and a 2-way softmax matching the one-hot labels.
cnn_layers += [BatchNormalization(), Dense(2, activation='softmax')]

cnn_model = Sequential(cnn_layers)

cnn_model.summary()

# Train network

In [None]:
# Define an optimizer with an explicit learning rate, then compile the model.
# (tensorflow is already imported as tf at the top of the file.)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# The AUC metric is named explicitly: Keras otherwise auto-numbers metric
# instances ('auc', 'auc_1', ...), so re-running this cell would change the
# history keys and break the history['auc'] / history['val_auc'] lookups
# used when plotting the training curves.
cnn_model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

In [None]:
%%time 

# Train for 30 epochs, running the full training and validation loaders each
# epoch; h1 holds the returned training history object.
h1 = cnn_model.fit(
    train_loader,
    epochs=30,
    steps_per_epoch=TR_STEPS,
    validation_data=valid_loader,
    validation_steps=VA_STEPS,
    verbose=1,
)

# Training Curves

In [None]:
# Keep the history dictionary of the fit() run above; the printed keys are the
# names used to index it when plotting the training curves.
history = h1.history
print(history.keys())

In [None]:
# Plot training vs. validation loss, accuracy and AUC side by side.
epoch_range = range(1, len(history['loss'])+1)

# (training key, validation key, axis label / panel title)
panels = [
    ('loss', 'val_loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('auc', 'val_auc', 'AUC'),
]

plt.figure(figsize=[14,4])
for position, (train_key, valid_key, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(epoch_range, history[train_key], label='Training')
    plt.plot(epoch_range, history[valid_key], label='Validation')
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.title(title)
    plt.legend()
plt.tight_layout()
plt.show()

# Save model and history

In [None]:
# Save the trained model and the training history dictionary to files.
cnn_model.save('cancer_model_v03.h5')

# The context manager guarantees the pickle file is flushed and closed
# (the bare open() used before never closed its handle).
with open('cancer_history_v03.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)