In [5]:
#Packages
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns

#Tools
from google.colab import drive
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from gensim.models import Word2Vec

from gensim.models.word2vec import LineSentence
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.utils import shuffle

ModuleNotFoundError: ignored

In [None]:
# Report the compute devices TensorFlow can see in this runtime.
from tensorflow.python.client import device_lib
# NOTE: the return value of this call is not displayed; a cell only echoes its
# final expression, which here is the device list below.
tf.test.gpu_device_name()
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 1898894068086318938]

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings emitted by
# the libraries used further down, so API drift will go unnoticed.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Detect whether the notebook is running inside Google Colab.
# Only a failed import means "not in Colab"; any other error (including
# KeyboardInterrupt) must surface instead of being swallowed by a bare except.
try:
  from google.colab import files
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

In [None]:
# Pick the project root: the mounted Google Drive when running in Colab,
# otherwise the current working directory.
if IN_COLAB:
  drive.mount('/content/drive', force_remount=True)

root_dir = "/content/drive/My Drive/" if IN_COLAB else "./"

Mounted at /content/drive


# Data Loading

In [None]:
# Load the k-mer sequences and their labels, then shuffle both together with
# a fixed seed so every run sees the same ordering.
hotspots, labels = shuffle(
    np.load(root_dir + "Data/hotspots/kmers/hotspots-5k-list.npy"),
    np.load(root_dir + "Data/hotspots/kmers/labels_hotspots-5k-list.npy"),
    random_state=0,
)

# Keep only the first fifth of the shuffled samples.
subset_size = round(len(hotspots) / 5)
hotspots, labels = hotspots[:subset_size], labels[:subset_size]

# Class balance of the retained subset: count of label 0, then label 1.
for class_id in (0, 1):
    print(np.count_nonzero(labels == class_id))

print('Hotspots loaded, length:', hotspots.shape)
# Downstream cells iterate over plain Python lists of k-mer tokens.
hotspots = hotspots.tolist()
print('Labels loaded, shape: ', labels.shape)

7641
7793
Hotspots loaded, length: (15434, 1496)
Labels loaded, shape:  (15434,)


# NLP Processing

In [None]:
# NOTE: disabled code. The whole cell is a bare string literal, so nothing in it
# runs; the cell merely echoes the string. It sketches building a k-mer
# vocabulary and fitting a Keras Tokenizer on `hotspots`.
"""def generateVocabulary(dataset):
    vocab = set()
    for kmer_list in dataset:
        for kmer in kmer_list:
            vocab.add(kmer)
    return vocab

vocabulary = generateVocabulary(hotspots)
print(len(vocabulary))

vocab_size = len(vocabulary)
oov_token = 'oov'
del vocabulary

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(hotspots)
word_index = tokenizer.word_index"""

"def generateVocabulary(dataset):\n    vocab = set()\n    for kmer_list in dataset:\n        for kmer in kmer_list:\n            vocab.add(kmer)\n    return vocab\n\nvocabulary = generateVocabulary(hotspots)\nprint(len(vocabulary))\n\nvocab_size = len(vocabulary)\noov_token = 'oov'\ndel vocabulary\n\ntokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)\ntokenizer.fit_on_texts(hotspots)\nword_index = tokenizer.word_index"

In [None]:
# NOTE: disabled code. This cell is a bare string literal and executes nothing;
# it sketches converting `hotspots` to integer sequences with a Keras Tokenizer.
"""hotspots = tokenizer.texts_to_sequences(hotspots)
hotspots = np.array(hotspots)"""

'hotspots = tokenizer.texts_to_sequences(hotspots)\nhotspots = np.array(hotspots)'

# Word2Vec

In [None]:
class callback(CallbackAny2Vec):
    """Word2Vec training hook that prints the loss of each finished epoch.

    The value read from ``model.get_latest_training_loss()`` is treated as a
    running total: the previous reading is subtracted from it to obtain the
    figure reported for the epoch that just ended.
    """

    def __init__(self):
        # Number of the epoch about to be reported (starts at 0).
        self.epoch = 0
        # Loss reading taken at the end of the previous epoch.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Print this epoch's loss and remember the reading for the next epoch."""
        cumulative = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, cumulative - self.loss_to_be_subed))
        self.loss_to_be_subed = cumulative
        self.epoch += 1

# Train Word2Vec on the k-mer sequences: 100-dimensional vectors, tokens seen
# fewer than 5 times are dropped, context window of 5, 20 passes over the data.
# The callback prints the loss after every pass.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names — confirm the
# installed gensim version before upgrading.
word_model = Word2Vec(
    hotspots,
    size=100,
    min_count=5,
    window=5,
    iter=20,
    compute_loss=True,
    callbacks=[callback()],
)

Loss after epoch 0: 3186364.25
Loss after epoch 1: 1974185.25
Loss after epoch 2: 1131395.5
Loss after epoch 3: 794508.5
Loss after epoch 4: 617892.5
Loss after epoch 5: 520931.0
Loss after epoch 6: 463826.0
Loss after epoch 7: 425784.0
Loss after epoch 8: 394244.0
Loss after epoch 9: 401155.0
Loss after epoch 10: 401154.0
Loss after epoch 11: 413598.0
Loss after epoch 12: 426489.0
Loss after epoch 13: 456005.0
Loss after epoch 14: 509537.0
Loss after epoch 15: 576781.0
Loss after epoch 16: 683499.0
Loss after epoch 17: 890379.0
Loss after epoch 18: 1194518.0
Loss after epoch 19: 1042843.0


In [None]:
# Pull the trained embedding matrix out of the Word2Vec model.
# `wv.vectors` replaces the deprecated `wv.syn0` alias and holds the same array.
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_dim = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

# Sanity check: list the 8 nearest neighbours of a sample k-mer.
# Similarity queries live on the KeyedVectors object (`wv`); calling
# `most_similar` on the model itself is deprecated.
print('Checking similar words:')
for word in ['acacc']:
  neighbours = word_model.wv.most_similar(word, topn=8)
  most_similar = ', '.join('%s (%.2f)' % (similar, score)
                           for similar, score in neighbours)
  print('  %s -> %s' % (word, most_similar))

def word2idx(word):
  """Return the index stored for `word` in the vocabulary of `word_model`.

  The lookup is a plain subscript on `word_model.wv.vocab`, so a word that
  is not in the vocabulary raises (presumably KeyError — verify).
  """
  vocab_entry = word_model.wv.vocab[word]
  return vocab_entry.index
def idx2word(idx):
  """Return the word found at position `idx` of `word_model.wv.index2word`."""
  index_to_word = word_model.wv.index2word
  return index_to_word[idx]

Result embedding shape: (1363, 100)
Checking similar words:
  acacc -> tcacc (0.96), gcacc (0.95), ccacc (0.93), acccn (0.44), gcggn (0.42), acagn (0.41), gcagn (0.41), tcagn (0.38)


In [None]:
# Convert every k-mer sequence into a fixed-length list of Word2Vec
# vocabulary indices.
# Index used for tokens missing from the Word2Vec vocabulary and for padding.
# NOTE(review): index 0 is also a real vocabulary entry, so padding/unknown
# tokens share an embedding row with that word — confirm this is intended.
PAD_INDEX = 0
# Every sequence is right-padded up to this many tokens (longer ones are kept as-is).
MAX_SEQUENCE_LENGTH = 1500

embedding = []

for sample in hotspots:
    current_obs = []
    for token in sample:
        try:
            current_obs.append(word2idx(token))
        except KeyError:
            # Token is not in the Word2Vec vocabulary; use the placeholder index.
            # Only the lookup failure is caught so that real errors still surface.
            current_obs.append(PAD_INDEX)

    # Integer padding keeps the whole list numeric (the previous "0" strings
    # forced numpy to build a string array). A negative count yields no padding.
    current_obs.extend([PAD_INDEX] * (MAX_SEQUENCE_LENGTH - len(sample)))

    embedding.append(current_obs)

#embedding = np.asarray(embedding)

# Neural Network

## Hyperparameters



In [None]:
# Number of passes over the training set requested from model.fit.
epochs = 50
# Step size handed to the Adam optimizer.
learning_rate = 0.01

## Model Definition

In [None]:
def createModel(vocab_size, embedding_dim):
  """Build the Embedding -> Dropout -> LSTM -> Dense sequence classifier.

  Args:
    vocab_size: number of rows of the embedding table (input_dim).
    embedding_dim: width of each embedding vector; also used as the number
      of LSTM units.

  Returns:
    An uncompiled Sequential model ending in a 2-unit softmax layer.

  The embedding layer is initialised from the module-level
  `pretrained_weights` array. The Dropout layer uses rate 0.
  """
  layer_stack = [
      Embedding(input_dim=vocab_size,
                output_dim=embedding_dim,
                weights=[pretrained_weights]),
      Dropout(0),
      LSTM(embedding_dim),
      Dense(2, activation='softmax'),
  ]
  return Sequential(layer_stack)

In [None]:
def createOptimizer(model):
  """Compile `model` with an Adam optimizer and return it.

  Args:
    model: a Keras model; it is compiled in place.

  Returns:
    The same model object, now compiled with binary cross-entropy loss and
    the accuracy metric.

  The step size comes from the module-level `learning_rate`.
  """
  # `learning_rate=` is the supported keyword; `lr=` is a deprecated alias.
  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, decay=1e-6)

  # NOTE(review): the network ends in a 2-unit softmax fed with one-hot labels;
  # "categorical_crossentropy" would be the conventional loss — confirm before changing.
  model.compile(loss="binary_crossentropy",
                optimizer=optimizer,
                metrics = ['accuracy'])
  return model

In [None]:
# Build the network, compile it, and print a layer-by-layer summary.
model = createOptimizer(createModel(vocab_size, embedding_dim))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         134800    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 215,402
Trainable params: 215,402
Non-trainable params: 0
_________________________________________________________________


## Training

In [None]:
# Turn the index sequences into an array and hold out 10% for evaluation.
# random_state pins the split so re-running the notebook yields the same
# train/test partition (the load-time shuffle is already seeded with 0).
embedding = np.array(embedding)
x_train, x_test, y_train, y_test = train_test_split(
    embedding, labels, test_size=0.1, shuffle=True, random_state=0)

#x_train = x_train[0:round((len(x_train)/4))]
#y_train = y_train[0:round((len(y_train)/4))]

In [None]:
# Keep the integer test labels before one-hot encoding; the evaluation cell
# compares predictions against them.
y_true_max = y_test

# Cast the model inputs to float32.
x_train, x_test = x_train.astype('float32'), x_test.astype('float32')

# One-hot encode the targets to match the 2-unit softmax output.
y_train, y_test = [to_categorical(target, num_classes=2) for target in (y_train, y_test)]

# Train, using the held-out split as validation data.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    shuffle=True,
    verbose=2,
)

Epoch 1/50
272/272 - 350s - loss: 0.7636 - accuracy: 0.4966 - val_loss: 0.6990 - val_accuracy: 0.4860
Epoch 2/50
272/272 - 349s - loss: 0.7297 - accuracy: 0.5046 - val_loss: 0.6969 - val_accuracy: 0.4870
Epoch 3/50
272/272 - 346s - loss: 0.7368 - accuracy: 0.5004 - val_loss: 0.6957 - val_accuracy: 0.4870
Epoch 4/50
272/272 - 347s - loss: 0.7437 - accuracy: 0.5102 - val_loss: 0.8596 - val_accuracy: 0.4870
Epoch 5/50
272/272 - 345s - loss: 0.7837 - accuracy: 0.4968 - val_loss: 1.0731 - val_accuracy: 0.4870
Epoch 6/50
272/272 - 346s - loss: 0.7461 - accuracy: 0.5089 - val_loss: 0.8936 - val_accuracy: 0.5130
Epoch 7/50
272/272 - 346s - loss: 0.7549 - accuracy: 0.4936 - val_loss: 0.7269 - val_accuracy: 0.4870
Epoch 8/50
272/272 - 346s - loss: 0.7416 - accuracy: 0.4984 - val_loss: 0.7033 - val_accuracy: 0.5130
Epoch 9/50
272/272 - 345s - loss: 0.7360 - accuracy: 0.4994 - val_loss: 0.7009 - val_accuracy: 0.5130
Epoch 10/50
272/272 - 345s - loss: 0.7366 - accuracy: 0.5063 - val_loss: 0.7250 - 

# Results

## Architecture

In [None]:
# Render the model architecture (with tensor shapes) to a PNG under root_dir.
keras.utils.plot_model(
    model,
    to_file=root_dir + 'multi_input_and_output_model.png',
    show_shapes=True,
)

## Confusion Matrix

In [None]:
# Evaluate on the held-out split: overall accuracy plus a row-normalised
# confusion matrix (each row is a true class, each column a predicted class).
# NOTE(review): index 0 is displayed as "Hotspot" and index 1 as "No Hotspot" —
# confirm this matches the label encoding of the dataset.
class_names = ["Hotspot", "No Hotspot"]

y_pred = np.argmax(model.predict(x_test), axis=-1)

# num_classes pins the matrix to len(class_names) x len(class_names) even if a
# class is absent from both the labels and the predictions of this split.
con_mat = tf.math.confusion_matrix(labels=y_true_max,
                                   predictions=y_pred,
                                   num_classes=len(class_names)).numpy()

# Normalise each row by its number of true samples; rows without any samples
# stay at 0 instead of producing NaN from a division by zero.
row_totals = con_mat.sum(axis=1, keepdims=True)
con_mat_norm = np.around(
    np.divide(con_mat, row_totals,
              out=np.zeros(con_mat.shape, dtype=float),
              where=row_totals != 0),
    decimals=2)
con_mat_df = pd.DataFrame(con_mat_norm, index=class_names, columns=class_names)

print('Accuracy Y_test: ', accuracy_score(y_true_max, y_pred))
figure = plt.figure(figsize=(8, 8))
sns.heatmap(con_mat_df, annot=True, cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Run the layout pass after the labels exist so they are included in it.
plt.tight_layout()
plt.show()

## Accuracy

In [None]:
# Accuracy per epoch: training curve first, then the validation curve
# (shown as 'test' in the legend).
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])

plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Loss

In [None]:
# Loss per epoch: training curve first, then the validation curve
# (shown as 'test' in the legend).
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()