In [2]:
import pickle

In [3]:
# Restore the trained Keras model from its saved HDF5 (.h5) file.
from keras.models import load_model
model = load_model(filepath='/Users/ricardorivero/Documents/CPTS-570/Project/Flu_LSTM/RNN_H5.h5')



In [4]:
#Utils
import numpy as np

# Single-letter codes of the 20 standard amino acids.
# The position in this list determines each residue's integer id below.
amino_acids = [
    'A', 'R', 'N', 'D', 'C',
    'Q', 'E', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P',
    'S', 'T', 'W', 'Y', 'V',
]

# Residue -> integer id. Ids are 1-based, which leaves 0 unassigned
# (generate_sequence falls back to 0 for residues it cannot look up).
aa_to_int = dict(zip(amino_acids, range(1, len(amino_acids) + 1)))
# Integer id -> residue; the exact inverse of aa_to_int.
int_to_aa = dict(enumerate(amino_acids, start=1))

In [5]:
import numpy as np
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import tensorflow as tf

# Reload the saved model so this cell is self-contained, then compile it with
# the categorical cross-entropy loss and the Adam optimizer.
# No device is selected here; generate_sequence pins prediction to the CPU itself.
model = load_model(filepath='/Users/ricardorivero/Documents/CPTS-570/Project/Flu_LSTM/RNN_H5.h5')
model.compile(optimizer='adam', loss='categorical_crossentropy')

def generate_sequence(seed_sequence, model, aa_to_int, int_to_aa, max_sequence_length, desired_length, temperature=1.0):
    """
    Generates a protein sequence of desired length using temperature-based sampling.

    The sequence is grown one residue at a time: the current sequence is
    integer-encoded, left-padded to ``max_sequence_length`` with
    ``pad_sequences``, passed to ``model.predict``, and the next residue is
    sampled from the returned distribution.

    Indices that have no residue in ``int_to_aa`` (e.g. 0) are masked out
    before sampling. The previous behaviour was to sample them, append an
    empty string and call the model again on the unchanged input; masking
    yields the same distribution over valid residues without the wasted
    prediction calls.

    Args:
        seed_sequence: The starting amino acid sequence (string).
        model: Trained Keras model for sequence prediction. Only
            ``model.predict(batch, verbose=0)`` is used; row 0 of its result
            must be a 1-D vector of non-negative class scores.
        aa_to_int: Dictionary mapping amino acids to integers. Residues
            missing from it are encoded as 0.
        int_to_aa: Dictionary mapping integers to amino acids.
        max_sequence_length: Maximum sequence length used during training.
        desired_length: The desired length of the generated sequence.
        temperature: Temperature parameter for controlling randomness.
            Must be > 0; values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        Generated amino acid sequence (string). If the seed is already at
        least ``desired_length`` long it is returned unchanged.

    Raises:
        ValueError: If ``temperature`` is not positive, if no output index
            maps to a residue, or if the model assigns no usable probability
            mass to any valid residue.
    """
    generated_sequence = seed_sequence

    # Nothing to generate: keep the historical behaviour of returning the seed as-is.
    if len(generated_sequence) >= desired_length:
        return generated_sequence

    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")

    # Encode the seed once and extend it as residues are appended, instead of
    # re-encoding the whole sequence on every iteration.
    int_seq = [aa_to_int.get(aa, 0) for aa in generated_sequence]
    valid_mask = None  # built lazily: the output width is only known after the first prediction

    while len(generated_sequence) < desired_length:
        padded_seq = pad_sequences([int_seq], maxlen=max_sequence_length, padding='pre')

        # Ensure prediction is done on CPU.
        # NOTE(review): Keras documents calling the model directly
        # (model(x, training=False)) as faster than predict() for small inputs
        # inside a loop; predict() is kept to preserve the existing call contract.
        with tf.device('/cpu:0'):
            prediction = model.predict(padded_seq, verbose=0)[0]

        # Work in float64 so the normalised vector sums to 1 within
        # np.random.choice's tolerance and the exponentials below do not
        # underflow as early as they would in float32.
        probs = np.asarray(prediction, dtype=np.float64)

        if valid_mask is None:
            valid_mask = np.array([bool(int_to_aa.get(idx, '')) for idx in range(len(probs))])
            if not valid_mask.any():
                raise ValueError("int_to_aa does not map any model output index to a residue")

        if temperature != 1.0:
            logits = np.log(probs + 1e-7) / temperature
            # Exclude invalid indices *before* taking the max so the shift is
            # relative to the best valid residue; subtracting the max keeps
            # exp() from underflowing to all zeros at small temperatures.
            logits = np.where(valid_mask, logits, -np.inf)
            weights = np.exp(logits - np.max(logits))
        else:
            weights = np.where(valid_mask, probs, 0.0)

        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("model assigned no usable probability to any valid residue")
        probs = weights / total

        predicted_index = int(np.random.choice(len(probs), p=probs))
        predicted_aa = int_to_aa[predicted_index]
        generated_sequence += predicted_aa
        # Encode exactly what was appended, the same way the seed was encoded.
        int_seq.extend(aa_to_int.get(aa, 0) for aa in predicted_aa)

    return generated_sequence

# Example usage: time how long generation takes for increasing target lengths.
seed_sequence = 'MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTH'
temperature = 0.8

# Passed to generate_sequence as max_sequence_length (the padded input width).
max_sequence_length = 569
# Target lengths 50, 100, ..., 550 (the stop value itself is excluded).
lengths = range(50, max_sequence_length, 50)
runtimes = {}

for length in lengths:
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # is not affected by system clock adjustments the way time.time() is.
    start_time = time.perf_counter()
    # Each run starts again from the seed, so sequences for different
    # lengths are sampled independently of each other.
    generated_seq = generate_sequence(
        seed_sequence,
        model,
        aa_to_int,
        int_to_aa,
        max_sequence_length,
        length,
        temperature
    )
    end_time = time.perf_counter()
    runtime = end_time - start_time
    runtimes[length] = runtime
    print(f"Generated sequence of length {length} in {runtime:.4f} seconds:\n{generated_seq}")

print("Runtimes for different lengths:")
for length, rt in runtimes.items():
    print(f"Length {length}: {rt:.4f} s")

2024-12-11 17:06:47.223420: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Generated sequence of length 50 in 2.9033 seconds:
MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILE
Generated sequence of length 100 in 20.3750 seconds:
MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKAN
Generated sequence of length 150 in 43.9953 seconds:
MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINRFEKIRIIPKSYWSSHEASLGVSAA
Generated sequence of length 200 in 73.2894 seconds:
MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGSFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYLGKSSFFRNVVWLIKKNAYPTIKKSYNNTNQEDLLILWGIHHSNNEA
Generated sequence of length 250 in 82.6282 seconds:
MEKIVLLLATVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCNLDGVKPLILRDCSVAGWLLGNPMCDEFLNVPEWSYIVEKINPANDLCYPGNFNYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYLGRSSFFRNVVWLIKKNNTYPTIKRSYNNTNQEDLLVLWGIHHPNDEAEQTRLYQNPTTYVSVGTSTLNQRLVPKIATRSK