In [None]:
# Define the input and output file names
input_file = 'input_sequence.txt'  # Replace with your actual input file name
output_file = 'peptide_sequences.txt'

# Initialize an empty list to store the peptide sequences
peptide_sequences = []

# Open and read the input file
with open(input_file, 'r') as file:
    for line in file:
        line = line.strip()  # Remove any leading/trailing whitespace
        # Check if the line is not a sequence identifier
        if not line.startswith('>'):
            cleaned_sequence = line.replace('-', '')  # Remove any '-' signs
            peptide_sequences.append(cleaned_sequence)  # Add the cleaned sequence to the list

# Write the peptide sequences to the output file
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')  # Write the header
    for sequence in peptide_sequences:
        file.write(sequence + '\n')  # Write each peptide sequence on a new line

print(f"Peptide sequences have been written to {output_file}.")


Peptide sequences have been written to peptide_sequences.txt.


In [None]:
# Define the input and output file names
input_file = 'peptide_sequences.txt'  # The newly generated peptide_sequence file
output_file = 'filtered_peptide_sequences.txt'

# Set the minimum length of peptide sequences to keep
min_length = 5

# Initialize a set to store unique peptide sequences
unique_sequences = set()

# Open and read the input file
with open(input_file, 'r') as file:
    next(file)  # Skip the header line
    for line in file:
        sequence = line.strip()  # Remove any leading/trailing whitespace
        # Check if the sequence meets the minimum length requirement
        if len(sequence) >= min_length:
            unique_sequences.add(sequence)  # Add the sequence to the set (automatically removes duplicates)

# Write the filtered unique peptide sequences to the output file
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')  # Write the header
    for sequence in unique_sequences:
        file.write(sequence + '\n')  # Write each unique, valid sequence on a new line

print(f"Filtered peptide sequences have been written to {output_file}.")


Filtered peptide sequences have been written to filtered_peptide_sequences.txt.


In [None]:
# Define the input and output file names
input_file = 'peptide_sequences.txt'  # The newly generated peptide_sequence file
output_file = 'filtered_peptide_sequences.txt'

# Set the minimum length of peptide sequences to keep
min_length = 8  # Minimum sequence length is now 8 nucleotides (base pairs)

# Initialize a set to store unique peptide sequences
unique_sequences = set()

# Open and read the input file
with open(input_file, 'r') as file:
    next(file)  # Skip the header line
    for line in file:
        sequence = line.strip()  # Remove any leading/trailing whitespace
        # Check if the sequence meets the minimum length requirement
        if len(sequence) >= min_length:
            unique_sequences.add(sequence)  # Add the sequence to the set (automatically removes duplicates)

# Write the filtered unique peptide sequences to the output file
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')  # Write the header
    for sequence in unique_sequences:
        file.write(sequence + '\n')  # Write each unique, valid sequence on a new line

print(f"Filtered peptide sequences have been written to {output_file}.")


Filtered peptide sequences have been written to filtered_peptide_sequences.txt.


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [None]:

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i+1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model with increased complexity
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length-1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model for more epochs
model.fit(X, y, epochs=200, verbose=2)

# Generate novel peptide sequences with higher temperature for more randomness
def generate_sequences(seed_text, next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        generated_sequence = seed_text
        for _ in range(next_words):
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += " " + output_word
        generated_sequences.append(generated_sequence)
    return generated_sequences

# Example usage with higher temperature for more randomness
seed_sequence = "P"
generated_sequences = generate_sequences(seed_sequence, 8, model, max_sequence_length, num_sequences=5, temperature=1.5)
print("Generated Sequences:", generated_sequences)


Epoch 1/200




24/24 - 6s - 248ms/step - accuracy: 0.1016 - loss: 2.9567
Epoch 2/200
24/24 - 2s - 82ms/step - accuracy: 0.0963 - loss: 2.8644
Epoch 3/200
24/24 - 2s - 72ms/step - accuracy: 0.1187 - loss: 2.7710
Epoch 4/200
24/24 - 1s - 52ms/step - accuracy: 0.1240 - loss: 2.7100
Epoch 5/200
24/24 - 1s - 46ms/step - accuracy: 0.1491 - loss: 2.6477
Epoch 6/200
24/24 - 1s - 53ms/step - accuracy: 0.1517 - loss: 2.6028
Epoch 7/200
24/24 - 1s - 53ms/step - accuracy: 0.1847 - loss: 2.5597
Epoch 8/200
24/24 - 1s - 53ms/step - accuracy: 0.1781 - loss: 2.5012
Epoch 9/200
24/24 - 1s - 53ms/step - accuracy: 0.2032 - loss: 2.4526
Epoch 10/200
24/24 - 2s - 76ms/step - accuracy: 0.2375 - loss: 2.3978
Epoch 11/200
24/24 - 2s - 91ms/step - accuracy: 0.2678 - loss: 2.3425
Epoch 12/200
24/24 - 2s - 63ms/step - accuracy: 0.3153 - loss: 2.2908
Epoch 13/200
24/24 - 1s - 46ms/step - accuracy: 0.3562 - loss: 2.2222
Epoch 14/200
24/24 - 1s - 47ms/step - accuracy: 0.3694 - loss: 2.1363
Epoch 15/200
24/24 - 1s - 46ms/step - ac

In [1]:
#updated code


import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence = tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += " " + output_word  # Append the generated word to the sequence

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

print("Generated Sequences:", generated_sequences)

Epoch 1/200




24/24 - 6s - 263ms/step - accuracy: 0.0976 - loss: 2.9768
Epoch 2/200
24/24 - 2s - 79ms/step - accuracy: 0.1055 - loss: 2.8953
Epoch 3/200
24/24 - 3s - 141ms/step - accuracy: 0.1082 - loss: 2.8180
Epoch 4/200
24/24 - 1s - 62ms/step - accuracy: 0.1108 - loss: 2.7344
Epoch 5/200
24/24 - 1s - 49ms/step - accuracy: 0.1253 - loss: 2.6800
Epoch 6/200
24/24 - 1s - 49ms/step - accuracy: 0.1372 - loss: 2.6216
Epoch 7/200
24/24 - 2s - 79ms/step - accuracy: 0.1728 - loss: 2.5639
Epoch 8/200
24/24 - 2s - 98ms/step - accuracy: 0.2018 - loss: 2.5092
Epoch 9/200
24/24 - 1s - 50ms/step - accuracy: 0.2058 - loss: 2.4549
Epoch 10/200
24/24 - 1s - 48ms/step - accuracy: 0.2335 - loss: 2.4057
Epoch 11/200
24/24 - 1s - 53ms/step - accuracy: 0.2559 - loss: 2.3680
Epoch 12/200
24/24 - 1s - 48ms/step - accuracy: 0.2731 - loss: 2.3063
Epoch 13/200
24/24 - 1s - 50ms/step - accuracy: 0.3074 - loss: 2.2746
Epoch 14/200
24/24 - 1s - 52ms/step - accuracy: 0.3549 - loss: 2.1773
Epoch 15/200
24/24 - 1s - 50ms/step - a

In [4]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences with validation
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence = tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += output_word  # Append the generated word to the sequence # No space added here

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8 and validate
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Define a list of valid amino acid abbreviations
valid_amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Function to check sequence validity
def check_sequence_validity(sequence):
    for char in sequence:

SyntaxError: incomplete input (<ipython-input-4-0542e23a9e87>, line 74)

In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [7]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences with validation
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence = tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += output_word  # Append the generated word to the sequence # No space added here

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8 and validate
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Define a list of valid amino acid abbreviations
valid_amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Function to check sequence validity
def check_sequence_validity(sequence):
    for char in sequence:

SyntaxError: incomplete input (<ipython-input-7-0542e23a9e87>, line 74)

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences with validation
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence = tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += output_word  # Append the generated word to the sequence # No space added here

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8 and validate
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Define a list of valid amino acid abbreviations
valid_amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Function to check sequence validity
def check_sequence_validity(sequence):
    for char in sequence:

In [11]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences with validation
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence = tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")
            generated_sequence += output_word  # Append the generated word to the sequence # No space added here

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8 and validate
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Define a list of valid amino acid abbreviations
valid_amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Function to check sequence validity
def check_sequence_validity(sequence):
    for char in sequence:
        if char.upper() not in valid_amino_acids:
            return False  # Invalid amino acid found
    return True  # All characters are valid amino acids

Epoch 1/200




24/24 - 5s - 206ms/step - accuracy: 0.1029 - loss: 2.9713
Epoch 2/200
24/24 - 1s - 52ms/step - accuracy: 0.0910 - loss: 2.8601
Epoch 3/200
24/24 - 2s - 100ms/step - accuracy: 0.1201 - loss: 2.7842
Epoch 4/200
24/24 - 2s - 70ms/step - accuracy: 0.1240 - loss: 2.7371
Epoch 5/200
24/24 - 1s - 52ms/step - accuracy: 0.1280 - loss: 2.6948
Epoch 6/200
24/24 - 1s - 49ms/step - accuracy: 0.1438 - loss: 2.6525
Epoch 7/200
24/24 - 1s - 50ms/step - accuracy: 0.1570 - loss: 2.6096
Epoch 8/200
24/24 - 1s - 51ms/step - accuracy: 0.1979 - loss: 2.5401
Epoch 9/200
24/24 - 1s - 50ms/step - accuracy: 0.2150 - loss: 2.4918
Epoch 10/200
24/24 - 1s - 54ms/step - accuracy: 0.2296 - loss: 2.4476
Epoch 11/200
24/24 - 1s - 48ms/step - accuracy: 0.2427 - loss: 2.4111
Epoch 12/200
24/24 - 2s - 77ms/step - accuracy: 0.2876 - loss: 2.3407
Epoch 13/200
24/24 - 2s - 100ms/step - accuracy: 0.3113 - loss: 2.2957
Epoch 14/200
24/24 - 2s - 82ms/step - accuracy: 0.3483 - loss: 2.2183
Epoch 15/200
24/24 - 1s - 50ms/step - 

In [12]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [13]:

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# Function to generate random peptide sequences
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        generated_sequence = ""

        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence += tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)

            # Get the corresponding amino acid
            output_word = tokenizer.index_word.get(predicted_index, "")
            if output_word == "":  # If the output is not a valid word, break early
                break
            generated_sequence += output_word  # Append the generated amino acid to the sequence

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Write the generated sequences to a text file
with open('generated_peptide_sequences.txt', 'w') as f:
    for seq in generated_sequences:
        f.write(seq + '\n')

print(f"{num_sequences_to_generate} peptide sequences have been generated and saved to 'generated_peptide_sequences.txt'.")


Epoch 1/200




24/24 - 6s - 248ms/step - accuracy: 0.1069 - loss: 2.9472
Epoch 2/200
24/24 - 1s - 52ms/step - accuracy: 0.0976 - loss: 2.8679
Epoch 3/200
24/24 - 1s - 52ms/step - accuracy: 0.1135 - loss: 2.7980
Epoch 4/200
24/24 - 1s - 51ms/step - accuracy: 0.1201 - loss: 2.7476
Epoch 5/200
24/24 - 1s - 52ms/step - accuracy: 0.1253 - loss: 2.6844
Epoch 6/200
24/24 - 1s - 52ms/step - accuracy: 0.1332 - loss: 2.6420
Epoch 7/200
24/24 - 3s - 136ms/step - accuracy: 0.1662 - loss: 2.5813
Epoch 8/200
24/24 - 2s - 73ms/step - accuracy: 0.1768 - loss: 2.5286
Epoch 9/200
24/24 - 1s - 53ms/step - accuracy: 0.2190 - loss: 2.4808
Epoch 10/200
24/24 - 1s - 53ms/step - accuracy: 0.2388 - loss: 2.4348
Epoch 11/200
24/24 - 1s - 52ms/step - accuracy: 0.2493 - loss: 2.3915
Epoch 12/200
24/24 - 1s - 52ms/step - accuracy: 0.2955 - loss: 2.3298
Epoch 13/200
24/24 - 1s - 53ms/step - accuracy: 0.3430 - loss: 2.2497
Epoch 14/200
24/24 - 1s - 53ms/step - accuracy: 0.3562 - loss: 2.2085
Epoch 15/200
24/24 - 2s - 95ms/step - a

In [14]:
# Define a list of valid amino acid abbreviations
valid_amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# List of peptide sequences to check
peptides = [
    'nkfmdvyq',
    'syseniat',
    'hlyqrach',
    'dvyqrchp',
    'gmvfersf',
    'ychliekl',
    'ikfmkvys',
    'kfvevypr',
    'ldvsqrie',
    'ikymetqr'
]

# Function to check the validity of each peptide
def check_sequence_validity(sequence):
    for char in sequence.upper():  # Convert to uppercase to ensure case insensitivity
        if char not in valid_amino_acids:
            return False  # Invalid amino acid found
    return True  # All characters are valid amino acids

# Check and report valid/invalid peptides
valid_peptides = []
invalid_peptides = []

for peptide in peptides:
    if check_sequence_validity(peptide):
        valid_peptides.append(peptide)
    else:
        invalid_peptides.append(peptide)

# Output the results
print("Valid Peptides:")
for vp in valid_peptides:
    print(vp)

print("\nInvalid Peptides:")
for ip in invalid_peptides:
    print(ip)



Valid Peptides:
nkfmdvyq
syseniat
hlyqrach
dvyqrchp
gmvfersf
ychliekl
ikfmkvys
kfvevypr
ldvsqrie
ikymetqr

Invalid Peptides:


In [17]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import difflib

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# QK peptide sequence for comparison
QK_peptide_sequence = "KLTWQELYQLKYKGI"  # Strip off Ac- and -NH2 for now

# Tokenize QK peptide sequence
tokenized_qk_sequence = tokenizer.texts_to_sequences([QK_peptide_sequence])[0]

# Function to calculate sequence similarity
def sequence_similarity(seq1, seq2):
    return difflib.SequenceMatcher(None, seq1, seq2).ratio()

# Function to generate random peptide sequences
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        generated_sequence = ""

        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence += tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)

            # Get the corresponding amino acid
            output_word = tokenizer.index_word.get(predicted_index, "")
            if output_word == "":  # If the output is not a valid word, break early
                break
            generated_sequence += output_word  # Append the generated amino acid to the sequence

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Write the generated sequences to a text file
with open('generated_peptide_sequences.txt', 'w') as f:
    for seq in generated_sequences:
        f.write(seq + '\n')

# Compare the generated peptides with the QK peptide sequence
for generated_sequence in generated_sequences:
    similarity = sequence_similarity(generated_sequence, QK_peptide_sequence)
    print(f"Generated peptide: {generated_sequence}, Similarity with QK peptide: {similarity:.4f}")

print(f"{num_sequences_to_generate} peptide sequences have been generated and saved to 'generated_peptide_sequences.txt'.")


Epoch 1/200




24/24 - 5s - 197ms/step - accuracy: 0.0937 - loss: 2.9742
Epoch 2/200
24/24 - 1s - 50ms/step - accuracy: 0.0963 - loss: 2.8797
Epoch 3/200
24/24 - 1s - 50ms/step - accuracy: 0.1240 - loss: 2.8302
Epoch 4/200
24/24 - 1s - 52ms/step - accuracy: 0.1095 - loss: 2.7698
Epoch 5/200
24/24 - 2s - 93ms/step - accuracy: 0.1412 - loss: 2.7108
Epoch 6/200
24/24 - 2s - 87ms/step - accuracy: 0.1412 - loss: 2.6434
Epoch 7/200
24/24 - 1s - 51ms/step - accuracy: 0.1544 - loss: 2.5821
Epoch 8/200
24/24 - 1s - 51ms/step - accuracy: 0.1741 - loss: 2.5262
Epoch 9/200
24/24 - 1s - 52ms/step - accuracy: 0.2098 - loss: 2.4816
Epoch 10/200
24/24 - 1s - 54ms/step - accuracy: 0.2164 - loss: 2.4479
Epoch 11/200
24/24 - 2s - 103ms/step - accuracy: 0.2150 - loss: 2.3839
Epoch 12/200
24/24 - 1s - 52ms/step - accuracy: 0.2744 - loss: 2.3201
Epoch 13/200
24/24 - 2s - 68ms/step - accuracy: 0.3100 - loss: 2.2533
Epoch 14/200
24/24 - 2s - 100ms/step - accuracy: 0.3430 - loss: 2.1903
Epoch 15/200
24/24 - 1s - 62ms/step - 

In [20]:
!pip install python-Levenshtein biopython


Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein)
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-Levenshtein)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [22]:
from Levenshtein import distance as levenshtein_distance
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Function to calculate peptide properties
def calculate_peptide_properties(peptide):
    analysis = ProteinAnalysis(peptide)
    length = len(peptide)
    mol_weight = analysis.molecular_weight()
    hydrophobicity = analysis.gravy()  # Hydropathy index

    # Calculate net charge
    # pKa values for N-terminus, C-terminus, and amino acids
    pKa_values = {
        'A': 2.35,  # Aspartic acid
        'C': 1.71,  # Cysteine
        'D': 1.88,  # Aspartic acid
        'E': 2.19,  # Glutamic acid
        'F': 2.20,  # Phenylalanine
        'G': 2.34,  # Glycine
        'H': 6.00,  # Histidine
        'I': 2.36,  # Isoleucine
        'K': 2.18,  # Lysine
        'L': 2.36,  # Leucine
        'M': 2.28,  # Methionine
        'N': 2.18,  # Asparagine
        'P': 1.99,  # Proline
        'Q': 2.17,  # Glutamine
        'R': 2.17,  # Arginine
        'S': 2.19,  # Serine
        'T': 2.06,  # Threonine
        'V': 2.32,  # Valine
        'W': 2.46,  # Tryptophan
        'Y': 2.10   # Tyrosine
    }

    charge = 0
    for aa in peptide:
        if aa in pKa_values:
            # Calculate contribution of each amino acid to charge
            if aa in ['K', 'R']:
                charge += 1  # Positive charge for Lys and Arg
            elif aa in ['D', 'E']:
                charge -= 1  # Negative charge for Asp and Glu

    # Adding contributions from N-terminus and C-terminus
    charge += 1  # N-terminus (positive)
    charge -= 1  # C-terminus (negative)

    return length, mol_weight, hydrophobicity, charge

# Known VEGF peptide sequence
known_vegf_sequence = "KLTWQELYQLKYKGI"  # Adjusted for comparison without modifications

# Example generated sequences (replace with your actual generated sequences)
generated_sequences = ["ARFLEVWQRTYCKA", "KLTWQELYQLKYKGI", "DVYQRCHP", "SYSENIAT"]

# 1. Levenshtein Distance Comparison
print("Levenshtein Distance Comparisons:")
for seq in generated_sequences:
    similarity_score = levenshtein_distance(known_vegf_sequence, seq)
    print(f"Similarity between {known_vegf_sequence} and {seq}: {similarity_score}")

# 2. Amino Acid Composition Comparison
print("\nAmino Acid Composition Comparisons:")
known_analysis = ProteinAnalysis(known_vegf_sequence)
known_composition = known_analysis.get_amino_acids_percent()

for seq in generated_sequences:
    generated_analysis = ProteinAnalysis(seq)
    generated_composition = generated_analysis.get_amino_acids_percent()
    print(f"Composition of {seq}: {generated_composition}")

# 3. Physicochemical Property Comparison
print("\nPhysicochemical Property Comparisons:")
known_length, known_mol_weight, known_hydrophobicity, known_charge = calculate_peptide_properties(known_vegf_sequence)

for seq in generated_sequences:
    length, mol_weight, hydrophobicity, charge = calculate_peptide_properties(seq)
    print(f"Comparison with {seq}: Length = {length}, Mol_Wt = {mol_weight:.2f}, Hydrophobicity = {hydrophobicity:.2f}, Charge = {charge:.2f}")


Levenshtein Distance Comparisons:
Similarity between KLTWQELYQLKYKGI and ARFLEVWQRTYCKA: 12
Similarity between KLTWQELYQLKYKGI and KLTWQELYQLKYKGI: 0
Similarity between KLTWQELYQLKYKGI and DVYQRCHP: 13
Similarity between KLTWQELYQLKYKGI and SYSENIAT: 14

Amino Acid Composition Comparisons:
Composition of ARFLEVWQRTYCKA: {'A': 0.14285714285714285, 'C': 0.07142857142857142, 'D': 0.0, 'E': 0.07142857142857142, 'F': 0.07142857142857142, 'G': 0.0, 'H': 0.0, 'I': 0.0, 'K': 0.07142857142857142, 'L': 0.07142857142857142, 'M': 0.0, 'N': 0.0, 'P': 0.0, 'Q': 0.07142857142857142, 'R': 0.14285714285714285, 'S': 0.0, 'T': 0.07142857142857142, 'V': 0.07142857142857142, 'W': 0.07142857142857142, 'Y': 0.07142857142857142}
Composition of KLTWQELYQLKYKGI: {'A': 0.0, 'C': 0.0, 'D': 0.0, 'E': 0.06666666666666667, 'F': 0.0, 'G': 0.06666666666666667, 'H': 0.0, 'I': 0.06666666666666667, 'K': 0.2, 'L': 0.2, 'M': 0.0, 'N': 0.0, 'P': 0.0, 'Q': 0.13333333333333333, 'R': 0.0, 'S': 0.0, 'T': 0.06666666666666667, 'V

In [23]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import difflib
from Levenshtein import distance as levenshtein_distance
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')  # Replace with your actual file path

# Extract peptide sequences from the 'peptide_sequence' column
peptide_sequences = df['peptide_sequence'].tolist()

# Tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences
sequences = []
for seq in peptide_sequences:
    tokenized_seq = tokenizer.texts_to_sequences([seq])[0]
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i + 1]
        sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_chars, 50, input_length=max_sequence_length - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)  # Adjust epochs as needed

# QK peptide sequence for comparison
QK_peptide_sequence = "KLTWQELYQLKYKGI"  # Strip off Ac- and -NH2 for now

# Tokenize QK peptide sequence
tokenized_qk_sequence = tokenizer.texts_to_sequences([QK_peptide_sequence])[0]

# Function to calculate sequence similarity
def sequence_similarity(seq1, seq2):
    return difflib.SequenceMatcher(None, seq1, seq2).ratio()

# Function to calculate peptide properties
def calculate_peptide_properties(peptide):
    analysis = ProteinAnalysis(peptide)
    length = len(peptide)
    mol_weight = analysis.molecular_weight()
    hydrophobicity = analysis.gravy()  # Hydropathy index

    # Calculate net charge
    pKa_values = {
        'A': 2.35,
        'C': 1.71,
        'D': 1.88,
        'E': 2.19,
        'F': 2.20,
        'G': 2.34,
        'H': 6.00,
        'I': 2.36,
        'K': 2.18,
        'L': 2.36,
        'M': 2.28,
        'N': 2.18,
        'P': 1.99,
        'Q': 2.17,
        'R': 2.17,
        'S': 2.19,
        'T': 2.06,
        'V': 2.32,
        'W': 2.46,
        'Y': 2.10
    }

    charge = 0
    for aa in peptide:
        if aa in pKa_values:
            if aa in ['K', 'R']:
                charge += 1  # Positive charge for Lys and Arg
            elif aa in ['D', 'E']:
                charge -= 1  # Negative charge for Asp and Glu

    # Adding contributions from N-terminus and C-terminus
    charge += 1  # N-terminus (positive)
    charge -= 1  # C-terminus (negative)

    return length, mol_weight, hydrophobicity, charge

# Function to generate random peptide sequences
def generate_random_sequences(next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    generated_sequences = []
    for _ in range(num_sequences):
        generated_sequence = ""

        # Randomly choose the first amino acid
        first_amino_acid_index = np.random.randint(1, total_chars)  # Assuming amino acids are indexed from 1
        generated_sequence += tokenizer.index_word[first_amino_acid_index]  # Get the amino acid from the tokenizer

        for _ in range(next_words - 1):  # Generate the remaining amino acids
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
            predicted_probs = model.predict(token_list, verbose=0)[0]
            predicted_probs = predicted_probs / np.sum(predicted_probs)  # Normalize probabilities to sum to 1
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)

            # Get the corresponding amino acid
            output_word = tokenizer.index_word.get(predicted_index, "")
            if output_word == "":  # If the output is not a valid word, break early
                break
            generated_sequence += output_word  # Append the generated amino acid to the sequence

        generated_sequences.append(generated_sequence)  # Add the generated sequence to the list
    return generated_sequences

# Example usage: Generate 10 random sequences of length 8
num_sequences_to_generate = 10
sequence_length = 8
generated_sequences = generate_random_sequences(sequence_length, model, max_sequence_length, num_sequences=num_sequences_to_generate, temperature=1.5)

# Write the generated sequences to a text file
with open('generated_peptide_sequences.txt', 'w') as f:
    for seq in generated_sequences:
        f.write(seq + '\n')

# Compare the generated peptides with the QK peptide sequence
print("Comparison with the QK peptide sequence:")
for generated_sequence in generated_sequences:
    # Calculate similarity
    similarity = sequence_similarity(generated_sequence, QK_peptide_sequence)
    print(f"Generated peptide: {generated_sequence}, Similarity with QK peptide: {similarity:.4f}")

    # Calculate Levenshtein distance
    levenshtein_dist = levenshtein_distance(QK_peptide_sequence, generated_sequence)
    print(f"Levenshtein distance: {levenshtein_dist}")

    # Calculate peptide properties
    length, mol_weight, hydrophobicity, charge = calculate_peptide_properties(generated_sequence)
    print(f"Properties: Length = {length}, Mol_Wt = {mol_weight:.2f}, Hydrophobicity = {hydrophobicity:.2f}, Charge = {charge:.2f}")

print(f"{num_sequences_to_generate} peptide sequences have been generated and saved to 'generated_peptide_sequences.txt'.")


Epoch 1/200




24/24 - 5s - 201ms/step - accuracy: 0.0976 - loss: 2.9691
Epoch 2/200
24/24 - 1s - 55ms/step - accuracy: 0.0923 - loss: 2.8863
Epoch 3/200
24/24 - 3s - 133ms/step - accuracy: 0.1095 - loss: 2.8239
Epoch 4/200
24/24 - 2s - 78ms/step - accuracy: 0.1016 - loss: 2.7432
Epoch 5/200
24/24 - 3s - 106ms/step - accuracy: 0.1398 - loss: 2.6929
Epoch 6/200
24/24 - 1s - 51ms/step - accuracy: 0.1187 - loss: 2.6498
Epoch 7/200
24/24 - 1s - 52ms/step - accuracy: 0.1464 - loss: 2.6122
Epoch 8/200
24/24 - 1s - 51ms/step - accuracy: 0.1675 - loss: 2.5587
Epoch 9/200
24/24 - 1s - 52ms/step - accuracy: 0.2256 - loss: 2.4792
Epoch 10/200
24/24 - 2s - 102ms/step - accuracy: 0.2309 - loss: 2.4298
Epoch 11/200
24/24 - 2s - 77ms/step - accuracy: 0.2744 - loss: 2.3675
Epoch 12/200
24/24 - 1s - 52ms/step - accuracy: 0.2942 - loss: 2.2979
Epoch 13/200
24/24 - 1s - 53ms/step - accuracy: 0.3734 - loss: 2.2306
Epoch 14/200
24/24 - 1s - 53ms/step - accuracy: 0.3681 - loss: 2.1575
Epoch 15/200
24/24 - 1s - 53ms/step -