# Imports

In [None]:
#Packages
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from statistics import mean
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras import Model
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization, Add

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve, average_precision_score

from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Data loading

## Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem; force_remount=True asks for a
# fresh mount even if one is already active.
drive.mount('/content/drive', force_remount=True)
# Base folder for every dataset/model path in this notebook. Paths are built by
# plain string concatenation, so the trailing slash is required.
root_dir = "/content/drive/My Drive/"

Mounted at /content/drive


## Load Dataset 

In [None]:
# Feature matrix: load the stored frequency vectors and drop columns 512..1023
# in a single expression.
# NOTE(review): why exactly this column range is removed is not recorded here
# (presumably one feature group) — TODO document.
freq_vectors = np.delete(
    np.load(root_dir+"Data/hotspots/kmers/freqvectors_hotspots-5k-polys-500chunk_with_reversed.npy"),
    slice(512, 1024),
    axis=1,
)
print(freq_vectors.shape)

# Target vector, one label per row of `freq_vectors`.
# NOTE(review): the feature file is named "5k-polys" while the label file is
# named "3k-list" — confirm both were generated from the same sample set.
labels = np.load(root_dir+"Data/hotspots/kmers/labels_hotspots-3k-list-500chunk_with_reversed.npy")

(154336, 542)


In [None]:
#freq_vectors = np.load(root_dir+"Data/hotspots/fasta/hotspots-5k-1polys.npy")
#labels = np.load(root_dir+"Data/hotspots/fasta/labels_hotspots-5k-1polys.npy")

In [None]:
# Sanity check: show one feature vector (row 3) as loaded, before any scaling.
print(freq_vectors[3])

[29.  3.  3.  3.  4.  1.  0.  0.  0.  1.  1.  3.  1.  0.  0.  3.  1.  3.
  3.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  2.  1.  1.  0.  0.  0.
  2.  2.  0.  1.  2.  1.  1.  1.  1.  1.  3.  0.  0.  1.  0.  0.  0.  2.
  0.  0.  0.  0.  3.  1.  3.  0.  0.  4.  3.  1.  0.  0.  3.  2.  0.  0.
  2.  0.  5.  1.  0.  0.  0.  0.  1.  0.  3.  2.  0.  0.  2.  0.  1.  0.
  0.  1.  1.  2.  1.  2.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  2.  1.  2.  0.  1.  1.  3.  2.  2.  1.  0.
  1.  1.  1.  0.  0.  2.  2.  1.  0.  1.  2.  1.  3.  1.  0.  1.  0.  3.
  2.  0.  1.  1.  2.  3.  1.  4.  0.  1.  0.  0.  2.  1.  1.  1.  2.  2.
  4.  2.  3.  3.  1.  4.  1.  4.  1.  2.  2.  1.  1.  1.  0.  1.  0.  1.
  0.  2.  0.  2.  3.  1.  2.  0.  1.  0.  2.  2.  0.  0.  0.  0.  2.  0.
  0.  1.  0.  0.  3.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  3.  0.  1.
  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  2.  0.  0.  3.  1.
  1.  1.  0.  1.  0.  1.  2.  0.  2.  1.  0.  0.  0

In [None]:
# Sanity check: show the label stored at index 3.
print(labels[3])

0.0


## Data preprocessing

In [None]:
# Rescale every feature column with MinMaxScaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# train/test split performed in the training cell, so test-set statistics leak
# into preprocessing — consider fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(freq_vectors)
freq_vectors = scaler.transform(freq_vectors)

# Neural Network

## Hyperparameters

In [None]:
# --- Optimisation settings -------------------------------------------------
EPOCHS, BATCH_SIZE = 2000, 128
LEARNING_RATE = 0.001

# --- Architecture settings -------------------------------------------------
DROPOUT_RATE = 0.25
RESIDUAL_ACTIVATION_TYPE = 'relu'

# Width of the network input: one unit per feature column.
freq_vector_size = freq_vectors.shape[1]

# --- Training callbacks (both watch the validation loss) --------------------
# Scale the learning rate by 0.7 once val_loss has failed to improve by at
# least 0.01 for 200 epochs; wait 100 epochs after each reduction and never go
# below 0.0001.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.7,
    patience=200,
    min_delta=0.01,
    cooldown=100,
    min_lr=0.0001,
)
# Stop after 250 epochs without a lower val_loss and roll the model back to the
# weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=250,
    restore_best_weights=True,
)

## Model Definition

In [None]:
def createOptimizer(model, learning_rate):
  """Compile ``model`` for binary classification with an SGD optimizer.

  Args:
    model: Keras model to compile; ``compile`` is called on this object.
    learning_rate: Learning rate handed to ``keras.optimizers.SGD``.

  Returns:
    The same ``model`` object, compiled with binary cross-entropy loss and an
    accuracy metric.
  """
  model.compile(
      loss="binary_crossentropy",
      optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
      metrics=['accuracy'],
  )
  return model

def _residual_block(block_input, activation, initializer):
  """Apply one residual block: a 32 -> 16 -> 8 dense stack plus a projected shortcut.

  Args:
    block_input: Tensor fed to both the dense stack and the shortcut.
    activation: Activation name used after each hidden layer and after the sum.
    initializer: Kernel initializer shared by every Dense layer in the block.

  Returns:
    The activated sum of the dense stack output and the shortcut (8 units).
  """
  def dense(units):
    return Dense(units=units, kernel_initializer=initializer, use_bias=True, bias_initializer='zeros')

  # Main path. Batch normalisation after each Dense layer was tried and is
  # currently disabled.
  main_path = dense(32)(block_input)
  main_path = Activation(activation=activation)(main_path)
  main_path = dense(16)(main_path)
  main_path = Activation(activation=activation)(main_path)
  main_path = dense(8)(main_path)

  # The shortcut is a learned Dense projection (not a pure identity) so that
  # its width matches the 8-unit main path before the element-wise sum.
  shortcut = dense(8)(block_input)

  merged = Add()([main_path, shortcut])
  return Activation(activation=activation)(merged)


def create_model():
  """Build and compile the residual feed-forward binary classifier.

  Architecture: input of ``freq_vector_size`` features -> dropout
  (``DROPOUT_RATE``) -> one residual block -> single sigmoid unit.
  The model is compiled through ``createOptimizer`` with ``LEARNING_RATE``.

  Returns:
    The compiled Keras ``Model``.
  """
  n_residual_blocks = 1
  initializer = keras.initializers.GlorotNormal()

  # `shape` must be a tuple. `(freq_vector_size)` is just the bare int, which
  # newer Keras releases refuse to convert to a shape.
  res_input = Input(shape=(freq_vector_size,))
  res_part = Dropout(DROPOUT_RATE)(res_input)

  for _ in range(n_residual_blocks):
    res_part = _residual_block(res_part, RESIDUAL_ACTIVATION_TYPE, initializer)

  output = Dense(1, activation='sigmoid')(res_part)

  model = Model(inputs=res_input, outputs=output)
  model = createOptimizer(model, LEARNING_RATE)

  return model


## Training

In [None]:
N_RUNS = 1      # number of independent split / train / evaluate repetitions
BASE_SEED = 42  # run i is seeded with BASE_SEED + i so results can be reproduced

# Bookkeeping for the best run. None is the "nothing recorded yet" marker.
test_acc_max = 0
best_model = None
x_test_max = None
y_true_max = None
best_history = None
scores = []

for i in range(N_RUNS):
    run_seed = BASE_SEED + i
    # Seed NumPy and TensorFlow so the split, weight initialisation, dropout
    # masks and batch shuffling are repeatable (some GPU kernels may still
    # introduce small nondeterminism).
    np.random.seed(run_seed)
    tf.random.set_seed(run_seed)

    fv_train, fv_test, y_train, y_test = train_test_split(
        freq_vectors, labels, test_size=0.2, shuffle=True, random_state=run_seed)
    fv_train = fv_train.astype('float32')
    fv_test = fv_test.astype('float32')

    model = create_model()
    # NOTE(review): the held-out split is also passed as validation data, and
    # both callbacks monitor val_loss (early stopping restores the best
    # weights), so the accuracy reported below is optimistically biased.
    # Consider carving a separate validation split out of the training data.
    history = model.fit(
        fv_train, y_train,
        validation_data=(fv_test, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        shuffle=True,
        verbose=2,
        callbacks=[reduce_lr, early_stopping],
    )
    test_loss, test_acc = model.evaluate(fv_test, y_test)
    scores.append(test_acc)

    # Always record the first run, so a run scoring 0 accuracy still leaves a
    # usable best_model for the Results section.
    if best_model is None or test_acc > test_acc_max:
        test_acc_max = test_acc
        best_model = model
        x_test_max = fv_test
        y_true_max = y_test
        best_history = history

print('Max accuracy:', test_acc_max)
print('Mean accuracy:', mean(scores))

Epoch 1/2000
965/965 - 3s - loss: 0.6954 - accuracy: 0.4750 - val_loss: 0.6944 - val_accuracy: 0.4760
Epoch 2/2000
965/965 - 3s - loss: 0.6946 - accuracy: 0.4783 - val_loss: 0.6938 - val_accuracy: 0.4834
Epoch 3/2000
965/965 - 3s - loss: 0.6940 - accuracy: 0.4851 - val_loss: 0.6933 - val_accuracy: 0.4894
Epoch 4/2000
965/965 - 3s - loss: 0.6936 - accuracy: 0.4904 - val_loss: 0.6930 - val_accuracy: 0.5015
Epoch 5/2000
965/965 - 3s - loss: 0.6932 - accuracy: 0.5015 - val_loss: 0.6927 - val_accuracy: 0.5117
Epoch 6/2000
965/965 - 3s - loss: 0.6930 - accuracy: 0.5052 - val_loss: 0.6924 - val_accuracy: 0.5201
Epoch 7/2000
965/965 - 3s - loss: 0.6927 - accuracy: 0.5113 - val_loss: 0.6921 - val_accuracy: 0.5249
Epoch 8/2000
965/965 - 3s - loss: 0.6924 - accuracy: 0.5166 - val_loss: 0.6918 - val_accuracy: 0.5313
Epoch 9/2000
965/965 - 3s - loss: 0.6921 - accuracy: 0.5221 - val_loss: 0.6915 - val_accuracy: 0.5350
Epoch 10/2000
965/965 - 3s - loss: 0.6918 - accuracy: 0.5273 - val_loss: 0.6912 - 

# Results


## Report

In [None]:
# Sigmoid outputs of the best model on its held-out split, flattened to 1-D.
y_pred = np.ravel(best_model.predict(x_test_max))
# Per-class precision / recall / F1 for decisions taken at the 0.5 threshold.
print(classification_report(y_true_max, np.greater(y_pred, 0.5)))

## ROC Curve

In [None]:
# ROC points computed from the predicted scores in `y_pred` (this cell expects
# `y_pred` to hold the continuous model outputs, not thresholded decisions).
fpr, tpr, thresholds = roc_curve(y_true_max, y_pred)

# Single-number summary: area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Model curve, then the two reference curves (chance diagonal, ideal classifier).
plt.plot(fpr, tpr, linewidth=2, label='ROC AUC: {:.2f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='random guessing')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='perfect performance')

plt.title('Receiver Operator Characteristic')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
# Small margin around [0, 1] so curves hugging the axes stay visible.
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

## Precision-Recall Curve

In [None]:
# Average precision summarises the precision-recall curve in a single number.
avg_precision = average_precision_score(y_true_max, y_pred)

# Curve points; note that `thresholds` is reassigned here and no longer holds
# the ROC thresholds.
precision, recall, thresholds = precision_recall_curve(y_true_max, y_pred)

label = 'Precision Recall AUC: {:.2f}'.format(avg_precision)
plt.plot(recall, precision, linewidth=2, label=label)
plt.title('Precision Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.tight_layout()
plt.show()

## Confusion Matrix

In [None]:
# Model outputs for the held-out split of the best run, and the hard class
# decisions at the 0.5 threshold. The decisions get their own name so `y_pred`
# keeps holding continuous scores: the ROC / precision-recall cells read
# `y_pred` and would otherwise silently use thresholded values when re-run.
y_pred = best_model.predict(x_test_max).ravel()
y_pred_labels = y_pred > 0.5

# NOTE(review): row/column 0 of the matrix corresponds to label 0 and
# row/column 1 to label 1, so this list names label 0 "Hotspot" — confirm that
# this matches the label encoding of the dataset.
class_names = ["Hotspot", "No Hotspot"]
# Fix the matrix size from the class list so it stays 2x2 even if one class is
# missing from this split.
con_mat = tf.math.confusion_matrix(
    labels=y_true_max,
    predictions=y_pred_labels,
    num_classes=len(class_names),
).numpy()

# Row-normalise (each true-label row sums to 1); a row with no samples stays at
# 0 instead of producing NaN from a division by zero.
row_totals = con_mat.sum(axis=1, keepdims=True)
con_mat_norm = np.around(
    np.divide(con_mat, row_totals, out=np.zeros(con_mat.shape, dtype=float), where=row_totals > 0),
    decimals=2,
)
con_mat_df = pd.DataFrame(con_mat_norm, index = class_names, columns = class_names)

print('Accuracy Y_test: ', accuracy_score(y_true_max, y_pred_labels))
figure = plt.figure(figsize=(8, 8))
sns.heatmap(con_mat_df, annot=True,cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Lay out after the axis labels exist so they are included in the fit.
plt.tight_layout()
plt.show()

## Accuracy

In [None]:
# Per-epoch accuracy of the best run: training curve first, then the curve for
# the data passed as validation_data during fit.
for history_key in ('accuracy', 'val_accuracy'):
    plt.plot(best_history.history[history_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Loss

In [None]:
# Per-epoch loss of the best run: training curve first, then the curve for the
# data passed as validation_data during fit.
for history_key in ('loss', 'val_loss'):
    plt.plot(best_history.history[history_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Save Data


In [None]:
# Persist the model that the Results section evaluated. `model` is only
# whatever the last training run left behind, which is not necessarily the
# best-scoring run once more than one run is performed.
best_model.save(root_dir+'ResNetmodel-2kEpochs.h5')