# The GitHub repository with all the code is [here](https://github.com/Sonya-Shultz/DLcourse)

In [11]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from matplotlib import image as mpimg
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras.layers import * 
from tensorflow.keras.preprocessing.image import ImageDataGenerator

**Read all data from train_labels.csv and turn each id into a file name.**

In [12]:
# Load the training labels; dtype=str keeps every column as text.
labels_csv = '../input/histopathologic-cancer-detection/train_labels.csv'
train_data_full = pd.read_csv(labels_csv, dtype=str)
print("Shape of dataset: ",train_data_full.shape)

# Append the image extension so each id becomes the matching file name.
train_data_full['id'] = train_data_full['id'] + '.tif'
train_data_full.head()

**For some statistics, let's look at the label distribution.**

In [13]:
# Share of each label among all rows, shown as a single-row table.
label_counts = train_data_full['label'].value_counts()
label_share = label_counts / len(train_data_full)
label_share.to_frame().sort_index().T

**Let's look at some example images from the dataset:**

In [14]:
print("Dataset image example")
h_path = '../input/histopathologic-cancer-detection/train'
# A fixed random_state shows the same 16 tiles on every run; drop=True avoids
# carrying the old row index along as an unused column.
sample = train_data_full.sample(n=16, random_state=1).reset_index(drop=True)

plt.figure(figsize=(6,6))

# Draw the sampled tiles on a 4x4 grid, each annotated with its label.
for i, row in sample.iterrows():
    # Build the path from h_path instead of repeating the directory literal.
    img = mpimg.imread(os.path.join(h_path, row['id']))
    label = row['label']
    plt.subplot(4,4,i+1)
    plt.imshow(img)
    plt.text(0,-5,f'label {label}', color='k')

    plt.axis('off')

plt.tight_layout()
plt.show()


# Prepare data for training and validation.

Also create data loaders for training and validation.

In [15]:
# Divide into training and validation parts; stratifying on the label keeps
# the class ratio the same in both splits.
train_data, valid_data = train_test_split(
    train_data_full,
    test_size=0.2,
    random_state=1,
    stratify=train_data_full.label,
)

# Data generators: both only rescale pixel values by 1/255.
train_data_gen = ImageDataGenerator(rescale=1/255)
validation_data_gen = ImageDataGenerator(rescale=1/255)

BATCH_SIZE = 64

# Settings shared by the training and validation loaders.
loader_options = dict(
    directory = h_path,
    x_col = 'id',
    y_col = 'label',
    batch_size = BATCH_SIZE,
    seed = 1,
    class_mode = 'categorical',
    target_size = (96,96),
)

# Training batches are reshuffled every epoch.
train_loader = train_data_gen.flow_from_dataframe(
    dataframe = train_data,
    shuffle = True,
    **loader_options
)

# The validation loader uses its own generator (it was previously built from
# train_data_gen, leaving validation_data_gen unused) and keeps a fixed order,
# since shuffling serves no purpose when the data is only evaluated.
valid_loader = validation_data_gen.flow_from_dataframe(
    dataframe = valid_data,
    shuffle = False,
    **loader_options
)

# **Now build CNN.**
(The log level is set because of warnings from Kaggle.)

**Input shape** is 96px x 96px x 3(rgb)

**Use 3 blocks like:**
* two conv2d layers, 
* max pooling (2x2), 
* dropout and batch normalization. 
* Activation function is relu, padding - same.

*Then flatten layer.*

**Next, for classification:**
* Dense 64 layer + dropout (activation - relu)
* Dense 8 layer + dropout (activation - relu)
* Batch normalization
* Output layer - dense 2 with sigmoid activation (good for binary classification)

In [16]:
# Build CNN
# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(1)
tf.random.set_seed(1)

# Raise TensorFlow's native log threshold to quiet warnings on Kaggle.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 


def conv_block(filters, **first_conv_kwargs):
    """Return the layers of one convolutional block, in order.

    A block is two 3x3 'same'-padded ReLU convolutions with `filters`
    channels, a 2x2 max pooling, dropout (0.5) and batch normalization.
    Extra keyword arguments (e.g. `input_shape`) go to the first convolution.
    """
    return [
        Conv2D(filters, (3,3), activation='relu', padding='same', **first_conv_kwargs),
        Conv2D(filters, (3,3), activation='relu', padding='same'),
        MaxPooling2D(2,2),
        Dropout(0.5),
        BatchNormalization(),
    ]


# Feature extractor: three blocks with 64, 64 and 128 filters.
feature_layers = (
    conv_block(64, input_shape=(96,96,3))
    + conv_block(64)
    + conv_block(128)
)

# Classification head on top of the flattened feature maps.
# NOTE(review): the 2-unit output uses sigmoid while the model is compiled
# with categorical_crossentropy; softmax is the conventional pairing. Left
# unchanged here to preserve the trained behaviour - confirm before changing.
classifier_layers = [
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(2, activation='sigmoid'),
]

cnn = Sequential(feature_layers + classifier_layers)

cnn.summary()

# Set optimizer parameters for the first training

In [17]:
# Adam with the initial learning rate; track accuracy and AUC next to the loss.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
cnn.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

# Start training for 40 epochs

In [18]:
%%time 

h1 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = len(train_loader), 
    epochs = 40,
    validation_data = valid_loader, 
    validation_steps = len(valid_loader), 
    verbose = 1
)

**See which metrics we can show on a plot**

In [19]:
# Keep a private copy of the per-epoch curves: later cells append to
# `history` in place, and aliasing h1.history would silently rewrite the
# record of the first run.
history = {name: list(values) for name, values in h1.history.items()}
print(history.keys())

# Function which shows us some training results

In [23]:
def show_res(hist=None):
    """Plot training vs. validation loss, accuracy and AUC per epoch.

    Parameters
    ----------
    hist : dict, optional
        Maps metric name -> list of per-epoch values, with a matching
        ``'val_' + name`` entry for each plotted metric. When omitted, the
        notebook-level ``history`` dict is used.

    Raises
    ------
    KeyError
        If no AUC entry (``'auc'`` or ``'auc_<n>'``) is present.
    """
    if hist is None:
        hist = history

    # Keras names an unnamed AUC metric 'auc', 'auc_1', ... depending on how
    # many were created in the session, so look the key up instead of
    # hard-coding it.
    auc_key = next(
        (k for k in hist if k == 'auc' or k.startswith('auc_')),
        None,
    )
    if auc_key is None:
        raise KeyError("no AUC metric ('auc' or 'auc_<n>') found in history")

    panels = [('loss', 'Loss'), ('accuracy', 'Accuracy'), (auc_key, 'AUC')]
    epoch_range = range(1, len(hist['loss'])+1)

    plt.figure(figsize=[14,4])
    for position, (key, title) in enumerate(panels, start=1):
        plt.subplot(1,3,position)
        plt.plot(epoch_range, hist[key], label='Training')
        plt.plot(epoch_range, hist['val_' + key], label='Validation')
        plt.xlabel('Epoch'); plt.ylabel(title); plt.title(title)
        plt.legend()
    plt.tight_layout()
    plt.show()
    
# Plot the curves currently stored in the notebook-level `history` dict.
show_res()

# Second training
**As we can see, there was a "problem" with the validation set.**

The learning rate is probably too high, so we lower it to a more typical value.

To avoid overfitting the model, let's train for 20 epochs.

In [25]:
# Lower the optimizer's learning rate for the second training stage.
STAGE2_LEARNING_RATE = 1e-4
keras.backend.set_value(cnn.optimizer.learning_rate, STAGE2_LEARNING_RATE)

In [26]:
%%time 

h2 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = len(train_loader), 
    epochs = 20,
    validation_data = valid_loader, 
    validation_steps = len(valid_loader), 
    verbose = 1
)

In [27]:
# Append the second stage's per-epoch values to the running curves, then
# redraw. The lists are extended in place, so re-running this cell appends
# the same values again.
for metric_name, curve in history.items():
    curve.extend(h2.history[metric_name])
show_res()

# Third training

**We see that the previous training gave us a much lower loss and higher accuracy.**

So, let's train it once more with the same settings, but for 10 epochs.

In [28]:
# Third stage uses the same learning rate as the second; it is re-applied
# here so the cell states its own setting.
STAGE3_LEARNING_RATE = 1e-4
keras.backend.set_value(cnn.optimizer.learning_rate, STAGE3_LEARNING_RATE)

In [29]:
%%time 

h3 = cnn.fit(
    x = train_loader, 
    steps_per_epoch = len(train_loader), 
    epochs = 10,
    validation_data = valid_loader, 
    validation_steps = len(valid_loader), 
    verbose = 1
)

In [30]:
# Append the third stage's per-epoch values to the running curves, then
# redraw. The lists are extended in place, so re-running this cell appends
# the same values again.
for metric_name, curve in history.items():
    curve.extend(h3.history[metric_name])
show_res()

**So, as we can see, the accuracy is pretty good (around 0.95 on the validation set).**

So, let's stop on this.

# Submission
**First, we need to read all the data from the test folder**

And look at a sample of it.

*The data is not shuffled because we are going to predict with the model, not train it.*

In [32]:
# The sample submission lists every test id; read it as the test index.
sample_submission_path = '../input/histopathologic-cancer-detection/sample_submission.csv'
test_data = pd.read_csv(sample_submission_path)
print('Test data has ', test_data.shape, ' size.')

# Add a column holding the image file name for each id.
test_data['file_n'] = test_data['id'] + '.tif'
test_data.head()

In [33]:
BATCH_SIZE = 64

# Same 1/255 rescaling as used for the training images.
test_data_gen = ImageDataGenerator(rescale=1/255)

# No labels (class_mode=None) and no shuffling, so predictions come back in
# the same order as the rows of test_data.
test_dir = "../input/histopathologic-cancer-detection/test"
test_loader = test_data_gen.flow_from_dataframe(
    dataframe=test_data,
    directory=test_dir,
    x_col='file_n',
    class_mode=None,
    target_size=(96,96),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

**Start prediction and see size of result**

In [34]:
# Run the trained network over every test batch and report the output shape.
test_end = cnn.predict(x=test_loader)
print(test_end.shape)

In [35]:
# Start from the competition's sample submission so ids and columns match.
submission_template_path = '../input/histopathologic-cancer-detection/sample_submission.csv'
submission = pd.read_csv(submission_template_path)
submission.head()

**Let's fit our answer to the competition submission format**

In [36]:
# Use the second output column (index 1) of the predictions as the label.
positive_scores = test_end[:, 1]
submission['label'] = positive_scores
submission.head()

In [38]:
# Write the submission file with a header row and without the row index.
submission.to_csv('submission.csv', index=False, header=True)