In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import numpy as np
import matplotlib.pyplot as plt
import random
import math 
from math import ceil
import os
import pickle
import gc

from OneNucleotideIndexer import OneNucleotideIndexer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence
from keras.models import Model, load_model
from tensorflow.keras.layers import Layer
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from keras.layers import Masking,Reshape,GlobalAveragePooling1D, ZeroPadding1D,Lambda, Concatenate,Input, Dense, BatchNormalization, Conv1D, Flatten, Activation, Embedding, MaxPooling1D, SeparableConv1D, Conv1DTranspose
from Metrics import specificity
import tempfile
import subprocess

2025-06-04 02:55:36.591540: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-04 02:55:36.630331: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
Data_for_model_path = '/media/zakaria/spin/SayantanProteinFamily/Uniref_Uniprot/Data_for_model' #'/mnt/e/GenerativeAI_generated_sequences/Data'
uniprotKB_file      = f'{Data_for_model_path}/uniprot_sprot.fasta'
bacterium_fastafile = f'{Data_for_model_path}/positive.fasta'
model_input_data = f'{Data_for_model_path}/uniref50_model_data/model_inputs'
saved_model_path   = f'{Data_for_model_path}/Models'
encoder_model_path = f'{saved_model_path}/network_classifier_features_20250217_091829.h5'
save_csv           = '/media/zakaria/spin/SayantanProteinFamily/Uniref_Uniprot/Htmls_results/Autoencoder_varieties_result'
indexer_file       = f'{model_input_data}/A_U_len1024_20250217_091829.pkl'

min_len = 50
max_len = 1024

In [3]:
train_fmatrix_embed = np.load(f'{model_input_data}/final_train_fmatrix_embedded_1024.npy')
val_fmatrix_embed   = np.load(f'{model_input_data}/final_val_fmatrixEmbed_1024.npy')
val1000_fmatrix_embed   = np.load(f'{model_input_data}/binned_final_val_fmatrixEmbed_1024.npy')

In [4]:
train_fmatrix_embed.shape,val_fmatrix_embed.shape,val1000_fmatrix_embed.shape

((757338, 1024, 3), (216382, 1024, 3), (1000, 1024, 3))

# Embedding layer from pre-trained encoder

In [5]:
class CustomConvLayer(Layer):
    def __init__(self, filter_num, filter_size, **kwargs):
        super(CustomConvLayer, self).__init__(**kwargs)
        self.filter_num = filter_num
        self.filter_size = filter_size
        self.conv1 = Conv1D(filters=filter_num, kernel_size=filter_size, use_bias=True,  activation='relu', name='conv1')
        self.conv2 = Conv1D(filters=filter_num, kernel_size=(2*filter_size)+1, use_bias=False, strides=2, name='conv2')# 
        self.bn = BatchNormalization(name='bn')
        self.activation = Activation(activation='relu', name='relu')

    def call(self, inputs):
        x = self.conv1(inputs)
        x = self.conv2(x)
        x = self.bn(x)
        x = self.activation(x)
        return x

    def get_config(self):
        config = super(CustomConvLayer, self).get_config()
        config.update({
            'filter_num': self.filter_num,
            'filter_size': self.filter_size
        })
        return config
    
    def set_weights(self, weight_list):
        self.conv1.set_weights(weight_list[:2])
        self.conv2.set_weights(weight_list[2:3])
        self.bn.set_weights(weight_list[3:])

    def freeze_layers(self):
        self.conv1.trainable = False
        self.conv2.trainable = False
        self.bn.trainable    = False

    def unfreeze_layers(self):
        self.conv1.trainable = True
        self.conv2.trainable = True
        self.bn.trainable    = True

In [6]:
encoder_model = load_model(encoder_model_path, custom_objects={'CustomConvLayer': CustomConvLayer, 'specificity': specificity})
embedding_layer = encoder_model.get_layer(name='embedding')



2025-06-04 02:55:41.668103: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13605 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:b3:00.0, compute capability: 8.6


In [7]:
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_tensor (InputLayer)   [(None, 1024)]            0         
                                                                 
 embedding (Embedding)       (None, 1024, 3)           81        
                                                                 
 custom_conv_layer (CustomC  (None, 508, 32)           7616      
 onvLayer)                                                       
                                                                 
 custom_conv_layer_1 (Custo  (None, 250, 64)           35136     
 mConvLayer)                                                     
                                                                 
 custom_conv_layer_2 (Custo  (None, 121, 128)          139904    
 mConvLayer)                                                     
                                                           

In [8]:
print("Input shape of the model:", encoder_model.input_shape)

Input shape of the model: (None, 1024)


### Freeze all encoder layers

In [9]:
for layer in encoder_model.layers:
    if isinstance(layer, CustomConvLayer):
        layer.freeze_layers()
    else:
        layer.trainable = False

### Get the trained embedding layer

In [10]:
embedding_matrix = embedding_layer.get_weights()[0]
# Display the vocabulary and their corresponding embedding values
vocab_size, embedding_dim = embedding_matrix.shape
print("Vocabulary Size:", vocab_size)
print("Embedding Dimension:", embedding_dim)
print("\nVocabulary and Embedding Values:\n")

for token_id in range(vocab_size):
    print(f"Token ID {token_id}: {embedding_matrix[token_id]}")

Vocabulary Size: 27
Embedding Dimension: 3

Vocabulary and Embedding Values:

Token ID 0: [-0.02140465  0.02830246 -0.05574557]
Token ID 1: [ 0.01456194 -0.04204811  0.03270236]
Token ID 2: [-0.06271231 -0.11801422 -0.05803983]
Token ID 3: [ 0.07430171  0.29197326 -0.23212597]
Token ID 4: [ 0.01646761  0.70462066 -0.6810645 ]
Token ID 5: [-0.28561434  0.14610802 -0.00950604]
Token ID 6: [-0.16307573 -0.5325746   0.07229828]
Token ID 7: [0.5686764  0.1469355  0.09788614]
Token ID 8: [-0.38822517  0.09971034 -0.5219342 ]
Token ID 9: [-0.27956656  0.05687758  0.07124779]
Token ID 10: [-0.34459665 -0.01498139 -0.14375386]
Token ID 11: [-0.00147066 -0.21718769 -0.10160962]
Token ID 12: [ 0.19393146 -0.26964355  0.35422647]
Token ID 13: [ 0.45520952 -0.15856832  0.37196437]
Token ID 14: [0.35017297 0.27239195 0.01043969]
Token ID 15: [-0.08567002  0.08376089 -0.37251484]
Token ID 16: [0.00457283 0.6379865  1.3526993 ]
Token ID 17: [-0.00031182 -0.06125099  0.29761612]
Token ID 18: [ 0.053938

In [11]:
def euclidean_distance(vec1, vec2):
    sum_squared_diff = 0
    for i in range(len(vec1)):
        diff = vec1[i] - vec2[i]
        sum_squared_diff += diff * diff
    return sum_squared_diff ** 0.5 # square root of the sum

In [12]:
# Later, to load the dictionary back:
with open(indexer_file, "rb") as f:
    new_index_table = pickle.load(f)

In [13]:
new_index_table.index_table

{'A': 2,
 'R': 3,
 'H': 4,
 'I': 5,
 'E': 6,
 'P': 7,
 'G': 8,
 'L': 9,
 'V': 10,
 'D': 11,
 'Y': 12,
 'W': 13,
 'S': 14,
 'Q': 15,
 'M': 16,
 'T': 17,
 'F': 18,
 'N': 19,
 'K': 20,
 'C': 21,
 'U': 22}

In [14]:
index_to_letter = {}
for letter, index in new_index_table.index_table.items():
    index_to_letter[index] = letter

In [15]:
def back_to_letters(sample, embedding_matrix=embedding_matrix, type_of_seq = 'Original'):
    recons_seq = []
    for num,new_vector in enumerate(sample):
        min_distance = float('inf')
        closest_token = None
        
                                                                   # vocab_size, embedding_dim = embedding_matrix.shape
        
        for token_id in range(2,23):
            vector = embedding_matrix[token_id]
            distance = euclidean_distance(new_vector , vector)
                                                                   # print(f'Distance of new vector from token {token_id}: {distance}')
            if distance < min_distance:
                min_distance = distance
                closest_token = token_id
        recons_seq.append(index_to_letter[closest_token])
                                                                   # print(f'Recons_Vector {i} closest match is token {closest_token} with distance {min_distance}')
    
    # Join the letters to form the tokenized string for the sequence.
    recons_seq_str = "".join(recons_seq)
        
        # output the closest match 
    # print(f'Recons_Vector {i} closest match is token {closest_token} with distance {min_distance}')
    # print(len(tokenized_seq))
    # all_tokenized_seq.append(tokenized_seq)
    
    #print(f'{type_of_seq} sequence is : {recons_seq_str}\n')
    return recons_seq_str

In [16]:
def extract_identity_score(output_file):
    try:
        with open(output_file, 'r') as f:
            for line in f:
                if "Identity:" in line:  # Look for the line containing identity
                    parts = line.split()
                    if len(parts) >= 4:
                        percentage = parts[-1]
                        cleaned_score = percentage.strip('() ').replace('%', '')
                        if cleaned_score:
                            return float(cleaned_score)
    except Exception as e:
        print(f"Error reading identity score: {e}")
    return None

In [17]:
def run_needle(seq1, seq2):
    # Create temporary files for seq1 and seq2
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as temp_seq1:
        temp_seq1.write(f">seq1\n{seq1}")
        temp_seq1.flush()  # Ensure that the content is written to disk
        temp_seq1_path = temp_seq1.name
        # Read the content of temp_seq1 immediately after flushing to disk
        # with open(temp_seq1_path, 'r') as f:
        #     print(f"Temporary file for seq1 (before running needle):\n{f.read()}")

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as temp_seq2:
        temp_seq2.write(f">seq2\n{seq2}")
        temp_seq2.flush()  # Ensure that the content is written to disk
        temp_seq2_path = temp_seq2.name
        # Read the content of temp_seq2 immediately after flushing to disk
        # with open(temp_seq2_path, 'r') as f:
        #     print(f"Temporary file for seq2 (before running needle):\n{f.read()}")

    # Temporary file for needle output
    with tempfile.NamedTemporaryFile(delete=False) as temp_output:
        temp_output_path = temp_output.name

    try:
        # Run needle and suppress standard output and error
        subprocess.run(
            [
                "needle", #/projects/academic/mshalfon/hzgirgis/projects/sp_unsupervised/EMBOSS-6.6.0/emboss/
                "-asequence", temp_seq1_path,
                "-bsequence", temp_seq2_path,
                "-sprotein1", 
                "-sprotein2", 
                "-gapopen", "10", 
                "-gapextend", "0.5", 
                "-outfile", temp_output_path
            ],
            check=True,
            stdout=subprocess.PIPE,  # Suppress standard output
            stderr=subprocess.PIPE   # Suppress standard error
        )

        # Print the contents of the output file to check
        #with open(temp_output_path, 'r') as output_file:
         #   needle_output = output_file.read()
          #  print("Needle Output:\n", needle_output)  # Print the output for debugging

        # Extract identity score from the output file
        identity_score = extract_identity_score(temp_output_path)

    finally:
        # Clean up temporary files
        os.remove(temp_seq1_path)
        os.remove(temp_seq2_path)
        os.remove(temp_output_path)

    return identity_score

In [18]:
def remove_masking(sample):
    
    # 1) build a boolean mask of shape (200,) that is True where all channels == 0
    all_zero_mask = np.all(sample == 0, axis=0)
    
    # 2) find the indices where that happens
    all_zero_indices = np.where(all_zero_mask)[0]
    
    # 3) grab the first one (if any)
    if all_zero_indices.size > 0:
        first_all_zero_index = all_zero_indices[0]
        
        
    else:
        first_all_zero_index = None
    return first_all_zero_index

# Network

In [19]:
@keras.saving.register_keras_serializable()
class Sampling(keras.layers.Layer):
    def call(self, z_mean, z_log_var):
        batch_size = tf.shape(z_mean)[0]
        z_size = tf.shape(z_mean)[1]
        epsilon = tf.random.normal(shape=(batch_size, z_size))  
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon 
    def get_config(self):
        # No extra args to serialize, just inherit base config
        config = super().get_config()
        return config

In [20]:
@keras.saving.register_keras_serializable()
class MaskedZero(keras.layers.Layer):
    def __init__(self, **kwargs):
        super(MaskedZero, self).__init__(**kwargs)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """
        inputs: Tensor of shape (batch_size, max_len, features)
        mask:   Boolean Tensor of shape (batch_size, max_len,) where False=masked
        """
        if mask is not None:
            
            mask = tf.cast(mask, inputs.dtype)   # where the mask values changes to 1s and 0s
             
            # expand mask dims so it broadcasts over the last axis which is the features
            # e.g. mask shape (B, T) -> (B, T, 1) for inputs (B, T, F)
            for _ in range(len(inputs.shape) - len(mask.shape)):                   #### 
                mask = tf.expand_dims(mask, axis=-1)

            inputs  = inputs * mask
            
        return inputs

    def compute_mask(self, inputs, mask=None):
        # propagate the input mask unchanged
        return mask

    def get_config(self):
        base_config = super(MaskedZero, self).get_config()
        return base_config

In [21]:
@keras.saving.register_keras_serializable()
class MaskedConv1D(keras.layers.Layer):
    def __init__(self, filter_num, filter_size,strides,pool_size=None,**kwargs):
        super().__init__(**kwargs)
        # self.supports_masking = True
        self.filter_num  = filter_num
        self.filter_size = filter_size
        self.strides     = strides
        self.pool_size   = pool_size
        self.bn = BatchNormalization(name='bn')
        self.activation = Activation(activation='relu', name='relu')
        self.conv1 = Conv1D(filters=filter_num, kernel_size=filter_size, strides = strides, padding='same', name='conv1')
        if pool_size is not None:
            self.max_pool = MaxPooling1D(pool_size=pool_size, padding='valid', name='mask_pool')  # MaxPooling1D for the mask
        else:
            self.max_pool = None
    def call(self, inputs):
        x = self.conv1(inputs)
        x = self.bn(x)
        x = self.activation(x)
        return x
    def compute_mask(self, inputs, mask=None):
        if self.max_pool is None:
            return mask
        if mask is not None:
            if len(mask.shape) == 2:
                mask = tf.expand_dims(mask, axis=-1)  
                mask = tf.cast(mask, dtype=tf.float32)  
            mask = self.max_pool(mask)
            mask = tf.squeeze(mask, axis=-1)
            return tf.cast(mask, tf.bool)   
    def get_config(self):
        config = super().get_config()
        config.update({
            'filter_num' :  self.filter_num,
            'filter_size': self.filter_size,
            'strides'    : self.strides,
            'pool_size'  : self.pool_size,
        })
        return config

In [22]:
@keras.saving.register_keras_serializable()
class ExtractMask(tf.keras.layers.Layer):
    def call(self, inputs):
        mask = tf.not_equal(inputs, 0.0)
        mask = tf.reduce_any(mask, axis=-1)  # (batch, time)
        return tf.cast(mask, tf.bool)

    def get_config(self):
        config = super().get_config()
        return config

In [23]:
@keras.saving.register_keras_serializable()
class AssertMaskedZeros(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
    def call(self, inputs, mask=None):
        input_tensor, embedding_output = inputs
        masked_positions = tf.where(tf.equal(input_tensor, 0))
        masked_vecs = tf.gather_nd(embedding_output, masked_positions)
        assert_op= tf.debugging.assert_near(
            masked_vecs,
            tf.zeros_like(masked_vecs),
            message="Embedding outputs at masked positions should be zero!"
        )
        # ensure the assert runs, but *then* return emb
        # with tf.control_dependencies([assert_op]):
        #     return tf.identity(embedding_output)
        return embedding_output
    def get_config(self):
        config = super().get_config()
        return config

In [24]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.sampler = Sampling()
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [self.total_loss_tracker,
                self.reconstruction_loss_tracker,
                self.kl_loss_tracker]
        
    def call(self, inputs, training=False):
        # Encoder already masks internally and returns the original mask
        z_mean, z_log_var, orig_mask = self.encoder(inputs) #, training=training
        z = self.sampler(z_mean, z_log_var)
        reconstruction = self.decoder([z, orig_mask]) # , training = training
        return reconstruction
    
    
    def train_step(self, data):
        with tf.GradientTape() as tape:
            
            z_mean, z_log_var, orig_mask = self.encoder(data)#data[0] if dataloader returned input twice , training=True
            z = self.sampler(z_mean, z_log_var)
            reconstruction = self.decoder([z, orig_mask]) #, training=True
            
            # print("Train step recons shape: ",reconstruction.shape)
            # mse = tf.keras.losses.MeanSquaredError(reduction="none")
            # loss_per_example = mse(data, reconstruction)  # shape: (batch,)
            # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(loss_per_example), axis=(1,2))
            # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(keras.losses.mse(data, reconstruction),axis=(1, 2)))  
            squared_error = tf.math.squared_difference(data, reconstruction)  # shape: (batch, 200, 3)
            loss_per_example = tf.reduce_sum(squared_error, axis=[1, 2])       # shape: (batch,)
            reconstruction_loss = tf.reduce_mean(loss_per_example) 
            
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            total_loss = (reconstruction_loss + tf.reduce_mean(kl_loss)) / (max_len * 3.0)
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "total_loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }
        
    def test_step(self, data):   # for validation loss calculation
        
        z_mean, z_log_var, orig_mask = self.encoder(data)#data[0] if dataloader returned input twice,training=False
        z = self.sampler(z_mean, z_log_var)
        reconstruction = self.decoder([z, orig_mask]) #,training=False
        # mse = tf.keras.losses.MeanSquaredError(reduction="none")
        # loss_per_example = mse(data, reconstruction)  # shape: (batch,)
        # reconstruction_loss = tf.reduce_mean(loss_per_example)
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(keras.losses.mse(data, reconstruction),axis=(1, 2)))

        squared_error = tf.math.squared_difference(data, reconstruction)  # shape: (batch, 200, 3)
        loss_per_example = tf.reduce_sum(squared_error, axis=[1, 2])       # shape: (batch,)
        reconstruction_loss = tf.reduce_mean(loss_per_example) 
        
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        total_loss = (reconstruction_loss + tf.reduce_mean(kl_loss)) / (max_len * 3.0)
        
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "total_loss": total_loss
        }

In [25]:
units = [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96]
filter_num = [128]
filter_size = [3]
for codings_size in units:
    for size in filter_size:
        for filte_r in filter_num:
            tf.keras.backend.clear_session() 
            flag = 1
            
            inputs = tf.keras.layers.Input(shape=[max_len, 3])
            masked_inputs = layers.Masking(mask_value=0.0)(inputs)   
            print("Original mask : ",masked_inputs._keras_mask)
            
            # #Extract the mask only from the masking layer
            mask = ExtractMask()(masked_inputs) 
            
            masked_zero_inputs = MaskedZero()(masked_inputs)
            assert_zero_0  = AssertMaskedZeros()([masked_inputs,masked_zero_inputs])
            print("mask after first assert: ",assert_zero_0._keras_mask)
            
            Z = MaskedConv1D(filter_num=filte_r, filter_size=size, strides = 1,pool_size=None)(assert_zero_0)
            
            masked_Z_conv1 = MaskedZero()(Z)
            assert_zero_1  = AssertMaskedZeros()([Z,masked_Z_conv1])
            
            Z = MaskedConv1D(filter_num=filte_r*2, filter_size=size,strides = 2,pool_size=2)(assert_zero_1)
            print("input shape after conv2: ",Z.shape)
            # pooled_mask = downsample_mask(mask, 2)
                        
            masked_Z_conv2 = MaskedZero()(Z)#, pooled_mask)
            print("pooled mask after conv2: ",masked_Z_conv2._keras_mask)
            assert_zero_2  = AssertMaskedZeros()([Z,masked_Z_conv2])
            print("mask after 2nd conv Assert: ",assert_zero_2._keras_mask)
        
            Z = tf.keras.layers.Flatten()(assert_zero_2)
            
            codings_mean    = tf.keras.layers.Dense(codings_size*25)(Z)  # μ
            codings_log_var = tf.keras.layers.Dense(codings_size*25)(Z)  # γ
            if flag==1:
                variational_encoder = tf.keras.Model(
                    inputs=[inputs], outputs=[codings_mean, codings_log_var, mask],
                    name = f"Encoder_Condensation{codings_size*25}_Kernel_size{size}_Filter{filte_r}")
                variational_encoder.summary()

            flag = 0
            decoder_inputs = tf.keras.layers.Input(shape=[codings_size*25])
            decoder_mask   = tf.keras.layers.Input(shape=[max_len], dtype='bool')
            x = tf.keras.layers.Reshape([25,codings_size])(decoder_inputs)
            x = tf.keras.layers.Conv1DTranspose(filte_r*2, kernel_size=size, padding="same",strides=2)(x)
            x = tf.keras.layers.BatchNormalization()(x)    
            x = tf.keras.layers.Activation("relu")(x)
            x = tf.keras.layers.Flatten()(x)
            x = tf.keras.layers.Dense(max_len*3)(x)
            x = tf.keras.layers.Reshape([max_len,3])(x)
            
            outputs = MaskedZero()(x, decoder_mask)
            print("Mask at decoder output : ",outputs._keras_mask)
            assert_zero_3  = AssertMaskedZeros()([x,outputs])
                        
            if flag == 0:
                variational_decoder = tf.keras.Model(inputs=[decoder_inputs,decoder_mask], outputs=[outputs],
                name = f"Decoder_Condensation_{codings_size*25}_Kernel_size{size}_Filter{filte_r}")
                variational_decoder.summary()

            #####
            #Model Summary
            #####
            variational_ae = VAE(variational_encoder, variational_decoder)
            variational_ae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))#,loss=lambda y_true, y_pred: 0.0) #placeholder for loss
            variational_ae(inputs)
            variational_ae.summary()

            #####
            #Training and Evaluation
            #####
            early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_total_loss',       
            patience=10,               
            restore_best_weights=True,
            mode = 'min')
            
            variational_ae.fit(train_fmatrix_embed, epochs=200, validation_data=[val_fmatrix_embed],callbacks=[early_stopping], verbose=1)
            with tf.device('/CPU:0'):
                eval_value = variational_ae.evaluate(val_fmatrix_embed, return_dict=True) #, return_dict=True
            val_loss_value = eval_value["total_loss"]

            #####
            # Identity
            #####
            identity_scores_list = []
            num_repeats = 5
            for i in range(val1000_fmatrix_embed.shape[0]):
                orig = val1000_fmatrix_embed[i] #1024,3
                # Determine original (true) sequence length
                orig_len = remove_masking(orig[np.newaxis, ...])
                orig_trimmed = orig[:orig_len, :]
 
                # Convert to letter sequence
                orig_letters = back_to_letters(
                    np.squeeze(orig_trimmed),
                    type_of_seq='Original'
                )
                # Run model 5 times on the same input
                all_preds = [variational_ae.predict(orig[np.newaxis, ...], verbose=0)[0] for _ in range(num_repeats)]
                # print(all_preds[0],all_preds[0].shape)
            
                # Compute identity scores
                scores_i = []
                for pred in all_preds:
                    pred_len = remove_masking(pred[np.newaxis, ...])
                    pred_trimmed = pred[:pred_len, :]
            
                    pred_letters = back_to_letters(
                        np.squeeze(pred_trimmed),
                        type_of_seq='Predicted'
                    )
                    score = run_needle(pred_letters, orig_letters)
                    scores_i.append(score)
                mean_score = np.mean(scores_i)
                identity_scores_list.append(mean_score)
            identity_scores_str = ', '.join(map(str, identity_scores_list))
            # once all batches done:
            mean_identity = np.mean(identity_scores_list)
            std_deviation = np.std(identity_scores_list)
            model_name  = f"GPConvVAE{max_len}_Condensed{25*codings_size}_{filter_num[0]}_k{filter_size[0]}"
            result_line = f"{model_name}|{val_loss_value}|{identity_scores_str}|{mean_identity}|{std_deviation}"
            with open(f"{save_csv}/SirDataConvVAEModelsCondensation_{max_len}.csv",'a') as f:   
                f.write(result_line + "\n")
            
            model_filename = f"{saved_model_path}/VAE_models/{model_name}.h5"
            variational_ae.save_weights(model_filename)
            
            # Cleanup
            del variational_encoder
            del variational_decoder
            del variational_ae
            tf.keras.backend.clear_session()
            gc.collect()

Original mask :  KerasTensor(type_spec=TensorSpec(shape=(None, 1024), dtype=tf.bool, name=None), name='masking/Squeeze:0')
mask after first assert:  KerasTensor(type_spec=TensorSpec(shape=(None, 1024), dtype=tf.bool, name=None), name='Placeholder_4:0')
input shape after conv2:  (None, 512, 256)
pooled mask after conv2:  KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.bool, name=None), name='Placeholder_2:0')
mask after 2nd conv Assert:  KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.bool, name=None), name='Placeholder_4:0')
Model: "Encoder_Condensation100_Kernel_size3_Filter128"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1024, 3)]            0         []                            
                                                                                          

2025-06-04 02:55:54.087449: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2025-06-04 02:55:55.094157: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2025-06-04 02:55:55.171530: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x791f4eaf0fb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-06-04 02:55:55.171553: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A4000, Compute Capability 8.6
2025-06-04 02:55:55.176246: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-06-04 02:55:55.293140: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

2025-06-04 16:11:22.110702: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.48GiB (rounded to 2658902016)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-06-04 16:11:22.110766: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2025-06-04 16:11:22.110795: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 92, Chunks in use: 91. 23.0KiB allocated for chunks. 22.8KiB in use in bin. 2.2KiB client-requested in use in bin.
2025-06-04 16:11:22.110814: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 24, Chunks in use: 23. 12.0KiB allocated for chunks. 11.5KiB in use in bin. 10.7KiB client-requested in use in bin.
2025-06-04 16:11:22.110830: I tensorflow/tsl/frame

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [31]:
with tf.device('/CPU:0'):
    eval_value = variational_ae.evaluate(val_fmatrix_embed, return_dict=True)

NameError: name 'variational_ae' is not defined

In [28]:
# eval_value = variational_ae.evaluate(val_fmatrix_embed, return_dict=True) #, return_dict=True
val_loss_value = eval_value["total_loss"]
#####
# Identity
#####
identity_scores_list = []
num_repeats = 5
for i in range(val1000_fmatrix_embed.shape[0]):
    orig = val1000_fmatrix_embed[i] #1024,3
    # Determine original (true) sequence length
    orig_len = remove_masking(orig[np.newaxis, ...])
    orig_trimmed = orig[:orig_len, :]

    # Convert to letter sequence
    orig_letters = back_to_letters(
        np.squeeze(orig_trimmed),
        type_of_seq='Original'
    )
    # Run model 5 times on the same input
    all_preds = [variational_ae.predict(orig[np.newaxis, ...], verbose=0)[0] for _ in range(num_repeats)]
    # print(all_preds[0],all_preds[0].shape)

    # Compute identity scores
    scores_i = []
    for pred in all_preds:
        pred_len = remove_masking(pred[np.newaxis, ...])
        pred_trimmed = pred[:pred_len, :]

        pred_letters = back_to_letters(
            np.squeeze(pred_trimmed),
            type_of_seq='Predicted'
        )
        score = run_needle(pred_letters, orig_letters)
        scores_i.append(score)
    mean_score = np.mean(scores_i)
    identity_scores_list.append(mean_score)
identity_scores_str = ', '.join(map(str, identity_scores_list))
# once all batches done:
mean_identity = np.mean(identity_scores_list)
std_deviation = np.std(identity_scores_list)
model_name  = f"GPConvVAE{max_len}_Condensed{25*codings_size}_{filter_num[0]}_k{filter_size[0]}"
result_line = f"{model_name}|{val_loss_value}|{identity_scores_str}|{mean_identity}|{std_deviation}"
with open(f"{save_csv}/SirDataConvVAEModelsCondensation_{max_len}.csv",'a') as f:   
    f.write(result_line + "\n")

model_filename = f"{saved_model_path}/VAE_models/{model_name}.h5"
variational_ae.save_weights(model_filename)

In [30]:
# Cleanup
del variational_encoder
del variational_decoder
del variational_ae
tf.keras.backend.clear_session()
gc.collect()

NameError: name 'variational_encoder' is not defined

In [82]:
for i in range(val1000_fmatrix_embed.shape[0]):
    orig = val1000_fmatrix_embed[i] #1024,3
    # Determine original (true) sequence length
    orig_len = remove_masking(orig[np.newaxis, ...])
    orig_trimmed = orig[:orig_len, :]

    # Convert to letter sequence
    orig_letters = back_to_letters(
        np.squeeze(orig_trimmed),
        type_of_seq='Original'
    )
    # Run model 5 times on the same input
    all_preds = [variational_ae.predict(orig[np.newaxis, ...], verbose=0)[0] for _ in range(num_repeats)]
    # print(all_preds[0],all_preds[0].shape)

    # Compute identity scores
    scores_i = []
    for pred in all_preds:
        pred_len = remove_masking(pred[np.newaxis, ...])
        pred_trimmed = pred[:pred_len, :]

        pred_letters = back_to_letters(
            np.squeeze(pred_trimmed),
            type_of_seq='Predicted'
        )
        score = run_needle(pred_letters, orig_letters)
        scores_i.append(score)
        mean_score = np.mean(scores_i)
        identity_scores_list.append(mean_score)
        
identity_scores_str = ', '.join(map(str, identity_scores_list))
# once all batches done:
mean_identity = np.mean(identity_scores_list)
std_deviation = np.std(identity_scores_list)
model_name  = f"GPConvVAE{max_len}_Condensed{25*codings_size}_{filter_num[0]}_k{filter_size[0]}"
result_line = f"{model_name}|{val_loss_value}|{identity_scores_str}|{mean_identity}|{std_deviation}"
with open(f"{save_csv}/GPConvVAEModelsCondensation_{max_len}.csv",'a') as f:   
    f.write(result_line + "\n")

model_filename = f"{saved_model_path}/VAE_models/{model_name}.h5"
variational_ae.save_weights(model_filename)

# Cleanup
del variational_encoder
del variational_decoder
del variational_ae
tf.keras.backend.clear_session()
gc.collect()


53
53
53
53
53
