# CNV-espresso training procedure

In [None]:
from __future__ import print_function
import os
import re
import copy
import random
import datetime
import timeit

import PIL
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn

import tensorflow as tf
from tensorflow import keras
import keras.preprocessing
from keras.models import Sequential, Model
from keras.utils import to_categorical
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras import backend

import function_dl as func_dl
import function as func

%load_ext autoreload
%autoreload 2

## Variables

In [None]:
# Expose only the GPU with index 1 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Visible GPUs:", physical_devices)

# Allocate GPU memory on demand rather than reserving it all up front.
# A ConfigProto that is never passed to a session has no effect in TF2,
# so memory growth is enabled explicitly on every visible device.
for gpu_device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Raised when the device was already initialised (e.g. this cell is re-run).
        print("Could not enable memory growth for", gpu_device, ":", err)

# Kept so that any TF1-style session code that expects `config` still finds it.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

In [None]:
# Project root; the input lists and the trained model all live in <project_dir>/train/.
project_dir = '/path/to/project'
output_model_dir = '{}/train/'.format(project_dir)

# Training hyper-parameters
batch_size, epochs = 32, 20

# One list file per CNV category (true/false deletions and duplications)
true_del_file, true_dup_file, false_del_file, false_dup_file = (
    project_dir + '/train/' + list_name
    for list_name in ('true_del.list', 'true_dup.list', 'false_del.list', 'false_dup.list')
)

In [None]:
# Images are loaded at 224 x 224, matching the input shape given to MobileNet below.
img_width = img_height = 224
# Random seed passed to the train/validation/test splits.
seed = 2021

## Importing data

### File path

In [None]:
## For rare CNVs
# Each list file is read as a tab-separated table whose first row is the header.
true_del_df, false_del_df, true_dup_df, false_dup_df = (
    pd.read_csv(list_file, sep='\t', header=0)
    for list_file in (true_del_file, false_del_file, true_dup_file, false_dup_file)
)

# The 'img_path' column of each table is what gets passed to the image loader.
(true_del_images_path_list,
 false_del_images_path_list,
 true_dup_images_path_list,
 false_dup_images_path_list) = (
    info_df['img_path']
    for info_df in (true_del_df, false_del_df, true_dup_df, false_dup_df)
)

In [None]:
# Report how many image paths were read for each category.
print("The shape of each type:")
for caption, path_list in (
    ("True  DEL:", true_del_images_path_list),
    ("True  DUP:", true_dup_images_path_list),
    ("False DEL:", false_del_images_path_list),
    ("False DUP:", false_dup_images_path_list),
):
    print(caption, path_list.shape)

### Loading images

In [None]:
# Entire CNV: load the images of every category through the project helper.
# NOTE(review): loadImgs presumably returns one array per list, shaped
# (n_images, height, width, channels) — confirm in function_dl.
true_del_img_np = func_dl.loadImgs(true_del_images_path_list, img_width, img_height)
false_del_img_np = func_dl.loadImgs(false_del_images_path_list, img_width, img_height)
true_dup_img_np = func_dl.loadImgs(true_dup_images_path_list, img_width, img_height)
false_dup_img_np = func_dl.loadImgs(false_dup_images_path_list, img_width, img_height)

# Print every shape explicitly: a notebook only displays the last bare
# expression of a cell, so the other three would be silently dropped.
for category, img_np in (
    ("True  DEL", true_del_img_np),
    ("False DEL", false_del_img_np),
    ("True  DUP", true_dup_img_np),
    ("False DUP", false_dup_img_np),
):
    print(category, "images:", img_np.shape)

### Generate labels

In [None]:
# Three classes: 0 = true deletion, 1 = false call (deletion or duplication),
# 2 = true duplication. One label per loaded image.
true_del_label = [0] * len(true_del_img_np)
false_del_label = [1] * len(false_del_img_np)

true_dup_label = [2] * len(true_dup_img_np)
false_dup_label = [1] * len(false_dup_img_np)

# Sanity check: first five labels, then the size of each label list.
label_lists = (true_del_label, false_del_label, true_dup_label, false_dup_label)
print(*(labels[:5] for labels in label_lists))
print(*(len(labels) for labels in label_lists))

### Combine data

In [None]:
# Stack the four CNV info tables in the same order as the images and labels
# (true DEL, false DEL, true DUP, false DUP) and renumber the rows from 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combined_cnv_info_df = pd.concat(
    [true_del_df, false_del_df, true_dup_df, false_dup_df],
    ignore_index=True,
)

In [None]:
# Stack the image arrays along the first axis, in the same category order as
# the labels and the info table (true DEL, false DEL, true DUP, false DUP).
img_arrays_in_order = (true_del_img_np, false_del_img_np, true_dup_img_np, false_dup_img_np)
combined_img = np.vstack(img_arrays_in_order)

In [None]:
# Concatenate the label lists in the same category order as the images.
combined_label = [*true_del_label, *false_del_label, *true_dup_label, *false_dup_label]
len(combined_label)

## Backup or restore data

### Backup

In [None]:
## Backup
# Save the combined info table, image array and labels so that later runs can
# skip image loading and start from the restore step.
backup_path = project_dir + '/train/data_backup/'
os.makedirs(backup_path, exist_ok=True)

project_name = 'TBD'
n_label_classes = len(np.unique(combined_label))

# The info table and image array are named after project_name because that is
# the prefix the restore step reads. index=False keeps the row index out of the
# CSV, so the restored table has no extra 'Unnamed: 0' column.
combined_cnv_info_df.to_csv(backup_path + project_name + '_info.csv', index=False)
np.save(backup_path + project_name + '_img', combined_img)  # np.save appends '.npy'
# The label file keeps its 'rare_cnv_label_<n>classes' name, which the restore step expects.
np.save(backup_path + 'rare_cnv_label_' + str(n_label_classes) + 'classes', combined_label)

### Restore

In [None]:
# Reload a previously saved dataset instead of rebuilding it from the image files.
# NOTE(review): these file names must match what the backup step wrote — confirm before restoring.
project_name, nClasses = 'TBD', 3
backup_path = '{}/train/data_backup/'.format(project_dir)
backup_prefix = backup_path + project_name

combined_img = np.load(backup_prefix + '_img.npy')
combined_label = np.load('{}rare_cnv_label_{}classes.npy'.format(backup_path, nClasses))
combined_cnv_info_df = pd.read_csv(backup_prefix + '_info.csv')

In [None]:
# Summarise the dataset that is currently in memory.
print("Project: '%s' dataset loaded."%project_name)
print("  -- Shape of image array: ", combined_img.shape)
print("  -- Shape of label      : ", len(combined_label))
try:
    print("  -- Shape of CNV info   : ", combined_cnv_info_df.shape)
except NameError:
    # The info table is optional here; catch only the "not defined" case
    # instead of hiding every possible error behind a bare except.
    print("  -- CNV info table is not available (combined_cnv_info_df is undefined).")

## Normalization

In [None]:
# Per-image shape: everything after the leading sample axis of the image array.
input_shape = tuple(combined_img.shape[1:])
nRows, nCols, nDims = input_shape
print("The shape of input tensor:",input_shape)

In [None]:
# Convert the pixels to float32, then divide by 255 in place
# (values land in [0, 1] assuming 8-bit pixel data).
combined_img = combined_img.astype('float32')
np.divide(combined_img, 255, out=combined_img)

# One-hot encode the integer class labels for categorical cross-entropy.
combined_label_one_hot = to_categorical(combined_label)

The number of output classes and their labels:

In [None]:
# Distinct label values present in the data and how many there are.
classes = np.unique(combined_label)
nClasses = len(classes)
for caption, value in (('Total number of outputs : ', nClasses),
                       ('Output classes : ', classes)):
    print(caption, value)
print("3 classes label: 0-True deletion; 1-Diploid (False del & False dup); 2-True duplication")

In [None]:
# Let's randomly check one CNV image.
# randrange excludes the upper bound; randint(0, len(...)) could return
# len(...) itself and raise IndexError on the lookups below.
item = random.randrange(len(combined_label))
print("Label:", combined_label[item])
func_dl.showImg(combined_img[item])
# First ten pixels of row 100, as a quick look at the scaled values.
print(combined_img[item][100][0:10])

## Train the convolutional neural networks

### Split dataset into training (60%), validation (20%) and test (20%) sets

In [None]:
## Split images, one-hot labels and the CNV info table with the same shuffle,
## so the three stay aligned row by row.
# Step 1: hold out 20% of all samples as the test set.
train_img, test_img, train_label, test_label, train_cnv_info_df, test_cnv_info_df = train_test_split(
    combined_img,
    combined_label_one_hot,
    combined_cnv_info_df,
    test_size=0.2,
    shuffle=True,
    random_state=seed)

# Step 2: take 25% of the remainder as validation (0.25 * 0.8 = 0.2 of the
# total), leaving 60% of all samples for training.
train_img, val_img, train_label, val_label, train_cnv_info_df, val_cnv_info_df = train_test_split(
    train_img,
    train_label,
    train_cnv_info_df,
    test_size=0.25,
    shuffle=True,
    random_state=seed)

# Print both summaries: a notebook only displays the last bare expression of
# a cell, so the image shapes would otherwise be dropped.
print("Images (all, train, val, test):",
      combined_img.shape, train_img.shape, val_img.shape, test_img.shape)
print("Labels (all, train, val, test):",
      combined_label_one_hot.shape, train_label.shape, val_label.shape, test_label.shape)

## CNN (Transfer learning and fine-tuning)

### Using the pretrained MobileNet v1 architecture
- First, we keep all the weights of the base model frozen and train only the newly added classification layers.

In [None]:
model_name = 'MobileNet_v1_fine_tuning'

# MobileNet backbone with ImageNet weights, without its ImageNet classifier head.
base_model = tf.keras.applications.MobileNet(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# Stage 1: freeze the backbone so only the new classification head is trained.
base_model.trainable = False

inputs = keras.Input(shape=(224, 224, 3))
# training=False runs the backbone in inference mode for this call.
features = base_model(inputs, training=False)

# Average each feature map over its spatial dimensions: one vector per image.
pooled = keras.layers.GlobalAveragePooling2D()(features)
# Softmax classifier with one output unit per class (nClasses units).
outputs = keras.layers.Dense(nClasses, activation='softmax')(pooled)
model = keras.Model(inputs, outputs)
model.summary()

In [None]:
# Stage 1 compile: Adam with its default settings and categorical
# cross-entropy on the one-hot labels; accuracy plus the project's
# F1/precision/recall functions are tracked as metrics.
stage1_optimizer = keras.optimizers.Adam()
stage1_metrics = ['accuracy', func_dl.f1_m, func_dl.precision_m, func_dl.recall_m]
model.compile(
    optimizer=stage1_optimizer,
    loss='categorical_crossentropy',
    metrics=stage1_metrics,
)

In [None]:
print("Training by MobileNet_v1 model ...")

# <output_model_dir><project_name>_<model_name>_<nClasses>classes.h5
model_file = "".join(
    [output_model_dir, project_name, "_", model_name, "_", str(nClasses), "classes.h5"]
)

# Stop once the validation loss has not improved for 3 epochs; separately keep
# on disk the weights with the best validation accuracy seen so far.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
mc = ModelCheckpoint(model_file, monitor='val_accuracy', mode='max',
                     save_best_only=True, verbose=1)

fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(val_img, val_label),
    callbacks=[es, mc],
)
history = model.fit(train_img, train_label, **fit_options)

print("\n")
# NOTE(review): this evaluates the weights currently in memory (last epoch run),
# which may differ from the checkpoint written to model_file — confirm intent.
loss, accuracy, f1_score, precision, recall = model.evaluate(test_img, test_label)

In [None]:
# Diagnostics for the frozen-backbone stage, all via project helpers:
# training curves, confusion matrix and ROC curve on the test set.
func_dl.draw_loss_accuracy_curves(history, project_name)
func_dl.confusion_matrix(model, test_img, test_label, nClasses)
# Store the AUC value as `roc_auc`; unpacking into `auc` would shadow the
# `auc` function imported from sklearn.metrics.
fpr, tpr, thresholds, roc_auc = func_dl.pred_roc_data(model, test_img, test_label)
func_dl.draw_single_roc_curve(tpr, fpr, roc_auc)

### Fine-tuning
- Second, once the model has converged on our training data, we unfreeze all or part of the base model and retrain the whole model end-to-end with a very low learning rate.

In [None]:
print("Fine tuning by MobileNet_v1 model ...")
# Same checkpoint path as the first stage: <project>_<model>_<nClasses>classes.h5
model_file = "{}{}_{}_{}classes.h5".format(output_model_dir, project_name, model_name, nClasses)

# Stage 2: make the backbone weights trainable; the model is recompiled in the next cell.
base_model.trainable = True
model.summary()

In [None]:
# Recompile for fine-tuning with a small learning rate (1e-5); loss and
# metrics are the same as in the first stage.
fine_tune_optimizer = keras.optimizers.Adam(1e-5)
fine_tune_metrics = ['accuracy', func_dl.f1_m, func_dl.precision_m, func_dl.recall_m]
model.compile(
    optimizer=fine_tune_optimizer,
    loss='categorical_crossentropy',
    metrics=fine_tune_metrics,
)

In [None]:
# Fresh callbacks for the fine-tuning run: stop after 3 epochs without a
# val_loss improvement, and checkpoint the best val_accuracy to model_file.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
mc = ModelCheckpoint(
    model_file,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
fine_tune_callbacks = [es, mc]

history = model.fit(
    train_img,
    train_label,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(val_img, val_label),
    callbacks=fine_tune_callbacks,
)
print("\n")
# Test-set metrics, in the order of the compiled metrics list after the loss.
loss, accuracy, f1_score, precision, recall = model.evaluate(test_img, test_label)

In [None]:
# Diagnostics for the fine-tuned model, all via project helpers:
# training curves, confusion matrix and ROC curve on the test set.
func_dl.draw_loss_accuracy_curves(history, project_name)
func_dl.confusion_matrix(model, test_img, test_label, nClasses)
# Store the AUC value as `roc_auc`; unpacking into `auc` would shadow the
# `auc` function imported from sklearn.metrics.
fpr, tpr, thresholds, roc_auc = func_dl.pred_roc_data(model, test_img, test_label)
func_dl.draw_single_roc_curve(tpr, fpr, roc_auc)

In [None]:
# Final status: call the project's date/time helper, then point to the saved model.
func.showDateTime()
done_message = "[Done]. Please check the trained model at"
print(done_message, model_file)