In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import os

In [2]:
# ------------------ Load and Parse FASTA File ------------------
def load_fasta(file_path):
    """Read a FASTA file and return its sequences as a list of strings.

    Lines starting with '>' are treated as record headers and discarded. The
    remaining lines of a record are stripped of surrounding whitespace and
    concatenated into a single string. Blank lines are ignored, so a leading
    blank line or a blank separator between records never yields an empty
    sequence. Sequence lines that appear before the first header are collected
    as a record as well.

    Args:
        file_path: Path of the FASTA file; it is opened in text mode for reading.

    Returns:
        list: One concatenated sequence string per record, in file order.
        Records without any sequence lines are omitted.
    """
    sequences = []
    current_seq = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A whitespace-only line carries no residues; appending it would
                # make `current_seq` truthy and emit a phantom '' sequence.
                continue
            if line.startswith('>'):
                # A header closes the record collected so far (if any).
                if current_seq:
                    sequences.append(''.join(current_seq))
                    current_seq = []
            else:
                current_seq.append(line)
        # The last record has no following header to flush it.
        if current_seq:
            sequences.append(''.join(current_seq))
    return sequences

In [None]:
# Read every record from the FASTA file
fasta_path = "uniprot_sprot.fasta"
sequences = load_fasta(fasta_path)

# Length filter: keep records strictly longer than 30 and strictly shorter than 300 characters
sequences = [protein for protein in sequences if 30 < len(protein) < 300]

# ------------------ Character Mapping ------------------
# Codes start at 1 because index 0 is reserved for padding.
all_chars = sorted(set(''.join(sequences)))
char_to_idx = dict(zip(all_chars, range(1, len(all_chars) + 1)))
idx_to_char = {code: residue for residue, code in char_to_idx.items()}
vocab_size = len(all_chars) + 1  # distinct characters plus the padding slot (0)

# ------------------ Prepare Input and Target ------------------
# Slide a window of `seq_length` characters over each sequence: the window is
# the model input and the character right after it is the prediction target.
seq_length = 50
X, y = [], []

for seq in sequences:
    # Encode the sequence once, then cut the windows out of the encoded list.
    encoded = [char_to_idx[residue] for residue in seq]
    for start in range(len(encoded) - seq_length):
        stop = start + seq_length
        X.append(encoded[start:stop])
        y.append(encoded[stop])

X = np.array(X)
y = np.array(y)

In [None]:
# ------------------ Model Definition or Loading ------------------
# Reuse a previously saved model when the file exists; otherwise build, train
# and save a new one.
model_path = "protein_generator_model.h5"
if os.path.exists(model_path):
    print("Loading existing model...")
    # NOTE(review): the loaded model is not checked against the current
    # vocab_size / seq_length -- confirm it was trained on the same data.
    model = load_model(model_path)
else:
    print("Training new model...")
    model = Sequential([
        # `input_length` is deprecated in Keras 3 (ignored with a warning), so
        # the input shape is supplied through `model.build` below instead.
        Embedding(input_dim=vocab_size, output_dim=64),
        LSTM(256),
        # One output unit per vocabulary index (including the padding index 0).
        Dense(vocab_size, activation='softmax')
    ])
    # Build explicitly so the weights exist and `summary()` can report layer
    # output shapes and parameter counts before training starts.
    model.build(input_shape=(None, seq_length))
    # `y` holds integer indices rather than one-hot rows, hence the sparse loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # ------------------ Training ------------------
    model.fit(X, y, batch_size=128, epochs=10)
    model.save(model_path)

# ------------------ Sequence Generation Function ------------------
def generate_sequence(seed, length=100, temperature=1.0):
    """Extend `seed` by sampling characters from `model` one at a time.

    At every step the last `seq_length` characters of the text generated so far
    are encoded with `char_to_idx` (characters missing from the mapping become
    the padding index 0), padded to `seq_length` with `pad_sequences`, and
    passed to `model.predict`. The predicted distribution is re-scaled by
    `temperature` and one index is sampled from it.

    The padding index 0 is excluded from sampling: it has no entry in
    `idx_to_char`, so drawing it would append an empty string and the result
    would silently contain fewer than `length` new characters.

    Args:
        seed: Starting string; it is the prefix of the returned string.
        length: Number of sampling steps, i.e. characters appended to `seed`.
        temperature: Strictly positive divisor applied to the log-probabilities
            before re-normalizing; values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        str: `seed` followed by the sampled characters.

    Raises:
        ValueError: If `temperature` is not strictly positive, or (raised by
            `np.random.choice`) if the model's output width does not match
            `vocab_size`.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    result = seed
    for _ in range(length):
        input_seq = [char_to_idx.get(c, 0) for c in result[-seq_length:]]
        input_seq = pad_sequences([input_seq], maxlen=seq_length)
        pred = model.predict(input_seq, verbose=0)[0]

        # Work in float64: float32 rounding can leave the re-normalized
        # probabilities far enough from 1 for np.random.choice to reject them.
        pred = np.asarray(pred, dtype=np.float64)

        # Apply temperature for diversity, skipping the padding slot (index 0).
        logits = np.log(pred[1:] + 1e-8) / temperature
        # Subtracting the maximum keeps exp() from underflowing every entry to
        # zero at small temperatures, which would produce a 0/0 division.
        logits -= logits.max()
        probs = np.exp(logits)
        probs /= probs.sum()

        # Candidates are the real character indices 1 .. vocab_size - 1.
        next_idx = np.random.choice(np.arange(1, vocab_size), p=probs)
        next_char = idx_to_char.get(int(next_idx), '')
        result += next_char
    return result

In [None]:
# ------------------ Generate and Print ------------------
# Seed the generator with the first `seq_length` characters of the first
# filtered sequence, then sample 150 more characters at temperature 0.8.
seed_seq = sequences[0][:seq_length]
new_protein = generate_sequence(
    seed=seed_seq,
    temperature=0.8,
    length=150,
)
print("Generated Protein Sequence:\n", new_protein)