## NeuroAlign - Training



In [1]:
import sys
!{sys.executable} -m pip install tensorflow_probability==0.11.0

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import CSVLogger
import Model as model
import Data as data
import Evaluation as eval

# Logical GPU devices visible to TensorFlow; an empty list means CPU-only execution.
GPUS = tf.config.experimental.list_logical_devices('GPU')
# A CPU-only run still counts as one device.
NUM_DEVICES = len(GPUS) if GPUS else 1

if not GPUS:
    print("Using CPU.")
else:
    print("Using ", NUM_DEVICES, " GPU devices.")

Collecting tensorflow_probability==0.11.0
  Using cached tensorflow_probability-0.11.0-py2.py3-none-any.whl (4.3 MB)
Collecting cloudpickle==1.3
  Using cached cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Collecting dm-tree
  Using cached dm_tree-0.1.5-cp38-cp38-manylinux2014_x86_64.whl (91 kB)
Installing collected packages: dm-tree, cloudpickle, tensorflow-probability
  Attempting uninstall: cloudpickle
    Found existing installation: cloudpickle 1.6.0
    Uninstalling cloudpickle-1.6.0:
      Successfully uninstalled cloudpickle-1.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
distributed 2.30.1 requires cloudpickle>=1.5.0, but you have cloudpickle 1.3.0 which is incompatible.[0m
Successfully installed cloudpickle-1.3.0 dm-tree-0.1.5 tensorflow-probability-0.11.0
Using  2  GPU devices.


In [2]:
# Configuration of this training run.
NUM_EPOCHS = 200
NAME = "gap_prob"
MODEL_PATH = f"./models/{NAME}"
CHECKPOINT_PATH = f"{MODEL_PATH}/model.ckpt"

# Create the output directory up front; an existing directory is not an error.
os.makedirs(MODEL_PATH, exist_ok=True)

# make_neuro_align_model returns a pair that is used throughout the notebook as
# the Keras model (`neuroalign`) and its string-keyed configuration
# (`neuroalign_config`).
neuroalign, neuroalign_config = model.make_neuro_align_model(NAME)

Configured model gap_prob and initialized weights randomly.


In [3]:
# Load every available Pfam alignment and build the training / validation generators.

# Local import keeps this cell self-contained: gc.collect() is used below and gc is
# not imported anywhere else in the notebook.
import gc

#Pfam protein families have identifiers of the form PF00001, PF00002, ...
#The largest id is PF19227, but the counting is not contiguous, there may be missing numbers
PFAM_ALIGNMENT_DIR = "../brain/Pfam/alignments/"
pfam = ["PF"+"{0:0=5d}".format(i) for i in range(1,19228)]
pfam_not_found = 0      # ids for which no alignment file exists (expected, see above)
pfam_load_failed = []   # ids whose file exists but could not be loaded

fasta = []

deciles_reported = 0
for i, family_id in enumerate(pfam):
    path = PFAM_ALIGNMENT_DIR + family_id + ".fasta"
    try:
        fasta.append(data.Fasta(path, gaps = True, contains_lower_case = True))
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt / SystemExit through.
        if os.path.isfile(path):
            # The file is there, so this is a genuine load problem: report it instead
            # of silently counting it as a missing id.
            pfam_load_failed.append(family_id)
            print("Skipping", family_id, "-", repr(err))
        else:
            pfam_not_found += 1

    # Progress is reported outside the try block so that a failure here can never be
    # mistaken for a missing file, and a missing file at a milestone index does not
    # suppress the message. Integer arithmetic avoids float comparisons.
    decile = (10 * (i + 1)) // len(pfam)
    if deciles_reported < decile < 10:
        deciles_reported = decile
        print(decile*10, "% loaded")
        gc.collect()

print(len(fasta), "families loaded,",
      pfam_not_found, "ids without alignment file,",
      len(pfam_load_failed), "files failed to load.")

if not fasta:
    raise RuntimeError("No Pfam alignments could be loaded from " + PFAM_ALIGNMENT_DIR)

# Seed both generators so that the shuffle (and hence the train/validation split)
# is reproducible.
np.random.seed(0)
random.seed(0)

indices = np.arange(len(fasta))
np.random.shuffle(indices)
family_size = neuroalign_config["family_size"]
if len(fasta) > 1000:
    print("Using the full dataset.")
    # The first (1 - validation_split) fraction of the shuffled families is used for
    # training, the remainder for validation.
    num_train = int(len(fasta)*(1-neuroalign_config["validation_split"]))
    train, val = np.split(indices, [num_train])
    train_gen = data.AlignmentSampleGenerator(train, fasta, neuroalign_config, family_size, NUM_DEVICES)
    # NOTE(review): the meaning of the trailing positional `False` is defined in Data.py
    # and not visible here - confirm what it switches off for validation.
    val_gen = data.AlignmentSampleGenerator(val, fasta, neuroalign_config, family_size, NUM_DEVICES, False)
else:
    print("Using a small test dataset.")
    # Small datasets are not split: both generators see all families (unshuffled).
    train_gen = data.AlignmentSampleGenerator(np.arange(len(fasta)), fasta, neuroalign_config, family_size, NUM_DEVICES)
    val_gen = data.AlignmentSampleGenerator(np.arange(len(fasta)), fasta, neuroalign_config, family_size, NUM_DEVICES, False)

10 % loaded
20 % loaded
30 % loaded
40 % loaded
50 % loaded
60 % loaded
70 % loaded
80 % loaded
90 % loaded
Using the full dataset.


In [4]:
#COLUMN_LOSS_WEIGHT = 0.02
#ATTENTION_LOSS_WEIGHT = 0.98

##################################################################################################
##################################################################################################

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warm-up followed by inverse-square-root decay.

    The learning rate at a given step is

        rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)

    so it grows linearly while step < warmup_steps and decays proportionally to
    1 / sqrt(step) afterwards.

    Args:
        warmup_steps: number of steps over which the learning rate increases.
        d_model: model dimension used for scaling. When None (the default), the
            value of neuroalign_config["col_dim"] is used.
    """

    def __init__(self, warmup_steps=4000, d_model=None):
        super(CustomSchedule, self).__init__()

        if d_model is None:
            d_model = neuroalign_config["col_dim"]
        # Keep the plain Python value so that get_config() stays serializable.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass the step as an integer tensor; rsqrt is only defined
        # for floating point inputs, so cast explicitly.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"warmup_steps": self.warmup_steps, "d_model": self._d_model_value}


##################################################################################################
##################################################################################################

# Adam with a constant learning rate and non-default beta_2 / epsilon.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-4,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

##################################################################################################
##################################################################################################


def losses_prefixed(losses, metrics, weights, prefix=""):
    """Register the loss and the metric of the "out_gaps" output under a prefixed name.

    The dictionaries are updated in place; nothing is returned.

    Args:
        losses: dict that receives a label-smoothed categorical cross-entropy
            under the key prefix + "out_gaps".
        metrics: dict that receives a categorical-accuracy metric under the same key.
        weights: dict of loss weights. Currently left untouched, because the only
            code writing to it is commented out below.
        prefix: string prepended to the output name.
    """
    # Column and attention losses are currently disabled; kept for reference.
    #if neuroalign_config["use_column_loss"]:
        #losses.update({prefix+"out_columns" : eval.kld})
        #weights.update({prefix+"out_columns" : COLUMN_LOSS_WEIGHT})
    #if neuroalign_config["use_attention_loss"]:
        #losses.update({prefix+"out_attention" : eval.att_loss})
        #metrics.update({prefix+"out_attention" : [eval.precision, eval.recall]})
        #weights.update({prefix+"out_attention" : ATTENTION_LOSS_WEIGHT})
    output_name = prefix + "out_gaps"
    losses[output_name] = keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
    metrics[output_name] = keras.metrics.CategoricalAccuracy()
        

# Assemble the model that is actually trained, compile it and run the training loop.
#
# The training model is deliberately NOT bound to the name `model`: that name refers
# to the imported `Model` module (`model.INPUT_DIM` below, `model.make_neuro_align_model`
# when the network is created). Rebinding it to a Keras model made this cell and the
# model-creation cell fail with an AttributeError when re-run.
losses, metrics, weights = {}, {}, {}
if NUM_DEVICES == 1:
    # Single device: train the NeuroAlign model directly, with un-prefixed output names.
    train_model = neuroalign
    losses_prefixed(losses, metrics, weights)
else:
    # Several GPUs: apply the same `neuroalign` model once per GPU, each application with
    # its own set of named inputs and its own named output.
    inputs, outputs = [], []
    for i, gpu in enumerate(GPUS):
        prefix = "GPU_"+str(i)+"_"
        with tf.device(gpu.name):
            sequences = keras.Input(shape=(None,model.INPUT_DIM), name=prefix+"sequences")
            aligned_sequences = keras.Input(shape=(None,model.INPUT_DIM), name=prefix+"aligned_sequences")
            sequences_residual_mask = keras.Input(shape=(None,None), name=prefix+"sequences_residual_mask")
            input_dict = {  "sequences" : sequences,
                            "aligned_sequences" : aligned_sequences,
                            "sequences_residual_mask" : sequences_residual_mask }
            #out_cols, A = neuroalign(input_dict)
            out_gaps = neuroalign(input_dict)
            # Identity layer whose only purpose is to give this output a unique name
            # that matches the keys registered in `losses` / `metrics`.
            outputs.append(layers.Lambda(lambda x: x, name=prefix+"out_gaps")(out_gaps))
            #outputs.append(layers.Lambda(lambda x: x, name=prefix+"out_attention")(A))
            inputs.extend([sequences, aligned_sequences, sequences_residual_mask])
        losses_prefixed(losses, metrics, weights, prefix)

    train_model = keras.Model(inputs=inputs, outputs=outputs)

train_model.compile(loss=losses, optimizer=optimizer, metrics=metrics, loss_weights=weights)

class ModelCheckpoint(keras.callbacks.Callback):
    """Callback that stores the weights of `neuroalign` after every epoch.

    The weights of the underlying `neuroalign` model are saved (not those of the
    multi-GPU wrapper). The same CHECKPOINT_PATH is used each time, so only the
    most recent checkpoint is kept.
    """

    def on_epoch_end(self, epoch, logs=None):
        neuroalign.save_weights(CHECKPOINT_PATH)
        print("Saved model to " + CHECKPOINT_PATH, flush=True)

# append=True keeps the log of earlier runs when training is restarted.
csv_logger = CSVLogger(MODEL_PATH + "/log.csv", append=True, separator=',')

history = train_model.fit(train_gen,
                          validation_data=val_gen,
                          epochs = NUM_EPOCHS,
                          verbose = 1,
                          callbacks=[ModelCheckpoint(), csv_logger])

Epoch 1/200
 1616/17346 [=>............................] - ETA: 1:23:01 - loss: 1.1649 - GPU_0_out_gaps_loss: 0.5974 - GPU_1_out_gaps_loss: 0.5675 - GPU_0_out_gaps_categorical_accuracy: 0.8901 - GPU_1_out_gaps_categorical_accuracy: 0.8898

ResourceExhaustedError:  OOM when allocating tensor with shape[12,8,834,834] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node functional_3/functional_1/neuro_align_layer/col_decoder/decoder_layer_3/multi_head_attention_10/MatMul (defined at /home/jovyan/NeuroAlignTransfo/Model.py:84) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_101652]

Errors may have originated from an input operation.
Input Source operations connected to node functional_3/functional_1/neuro_align_layer/col_decoder/decoder_layer_3/multi_head_attention_10/MatMul:
 functional_3/functional_1/neuro_align_layer/col_decoder/decoder_layer_3/multi_head_attention_10/transpose (defined at /home/jovyan/NeuroAlignTransfo/Model.py:96)

Function call stack:
train_function
