In [None]:
# Define the input and output file names
input_file = 'input_sequence.txt'  # Replace with your actual input file name
output_file = 'peptide_sequences.txt'

# Collect one cleaned peptide sequence per data line of the input file.
# Lines starting with '>' are treated as sequence identifiers and skipped.
peptide_sequences = []

with open(input_file, 'r') as file:
    for line in file:
        line = line.strip()  # Remove any leading/trailing whitespace
        # Blank lines carry no sequence data; without this check they would be
        # stored as empty sequences and written out as empty rows.
        if not line or line.startswith('>'):
            continue
        cleaned_sequence = line.replace('-', '')  # Remove any '-' signs
        # A line made up only of '-' is empty after cleaning; do not record it.
        if cleaned_sequence:
            peptide_sequences.append(cleaned_sequence)

# Write the header followed by one peptide sequence per line
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')
    file.writelines(sequence + '\n' for sequence in peptide_sequences)

print(f"Peptide sequences have been written to {output_file}.")


Peptide sequences have been written to peptide_sequences.txt.


In [None]:
# Define the input and output file names
input_file = 'peptide_sequences.txt'  # The newly generated peptide_sequence file
output_file = 'filtered_peptide_sequences.txt'

# Set the minimum length of peptide sequences to keep
min_length = 5

# Initialize a set to store unique peptide sequences
unique_sequences = set()

# Open and read the input file
with open(input_file, 'r') as file:
    # Skip the header line; the None default keeps an empty input file from
    # raising StopIteration.
    next(file, None)
    for line in file:
        sequence = line.strip()  # Remove any leading/trailing whitespace
        # Keep only sequences that meet the minimum length requirement
        if len(sequence) >= min_length:
            unique_sequences.add(sequence)  # The set drops duplicates automatically

# Write the filtered unique peptide sequences to the output file
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')  # Write the header
    # Set iteration order is arbitrary and can change between kernel restarts,
    # so sort to make the output file reproducible.
    for sequence in sorted(unique_sequences):
        file.write(sequence + '\n')

print(f"Filtered peptide sequences have been written to {output_file}.")


Filtered peptide sequences have been written to filtered_peptide_sequences.txt.


In [None]:
# Define the input and output file names.
# This cell repeats the filter of the min_length = 5 cell with a stricter
# threshold and overwrites the same output file.
input_file = 'peptide_sequences.txt'  # The newly generated peptide_sequence file
output_file = 'filtered_peptide_sequences.txt'

# Set the minimum length of peptide sequences to keep
min_length = 8  # Minimum sequence length is now 8 characters (one per residue)

# Initialize a set to store unique peptide sequences
unique_sequences = set()

# Open and read the input file
with open(input_file, 'r') as file:
    # Skip the header line; the None default keeps an empty input file from
    # raising StopIteration.
    next(file, None)
    for line in file:
        sequence = line.strip()  # Remove any leading/trailing whitespace
        # Keep only sequences that meet the minimum length requirement
        if len(sequence) >= min_length:
            unique_sequences.add(sequence)  # The set drops duplicates automatically

# Write the filtered unique peptide sequences to the output file
with open(output_file, 'w') as file:
    file.write('peptide_sequence\n')  # Write the header
    # Set iteration order is arbitrary and can change between kernel restarts,
    # so sort to make the output file reproducible.
    for sequence in sorted(unique_sequences):
        file.write(sequence + '\n')

print(f"Filtered peptide sequences have been written to {output_file}.")


Filtered peptide sequences have been written to filtered_peptide_sequences.txt.


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [None]:

# Load the dataset
df = pd.read_csv('3_Training_set_moe.csv')

# Extract peptide sequences from the 'peptide_sequence' column.
# Empty cells are read by pandas as float NaN, which the character-level
# tokenizer cannot iterate over, so drop them and force every entry to str.
peptide_sequences = df['peptide_sequence'].dropna().astype(str).tolist()

# Tokenize the sequences character by character
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(peptide_sequences)
# +1 because the tokenizer never assigns index 0; it is used for padding below
total_chars = len(tokenizer.word_index) + 1

# Create input sequences and their corresponding output sequences:
# every prefix of length >= 2 becomes one sample whose last token is the label.
sequences = []
for tokenized_seq in tokenizer.texts_to_sequences(peptide_sequences):
    for i in range(1, len(tokenized_seq)):
        n_gram_sequence = tokenized_seq[:i+1]
        sequences.append(n_gram_sequence)

# Fail with a clear message instead of the bare "max() arg is an empty sequence"
if not sequences:
    raise ValueError(
        "No training samples could be built: every peptide sequence has fewer than 2 characters."
    )

max_sequence_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=total_chars)  # One-hot encode the labels

# Build the LSTM model: embedding -> two stacked LSTM layers -> softmax over characters.
# `input_length` is not passed to Embedding: recent Keras releases deprecate it
# and the layer takes its sequence length from the data it is given.
model = Sequential()
model.add(Embedding(total_chars, 50))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=200, verbose=2)

# Generate novel peptide sequences; temperature controls how random the sampling is
def generate_sequences(seed_text, next_words, model, max_sequence_length, num_sequences=1, temperature=1.5):
    """Sample peptide sequences one character at a time from a trained model.

    Uses the module-level ``tokenizer`` and ``pad_sequences`` to encode the
    text generated so far before each prediction.

    Args:
        seed_text: String every generated sequence starts with.
        next_words: Number of characters to sample and append after the seed.
        model: Object whose ``predict`` returns, for a batch holding one padded
            sequence, a probability vector over token indices.
        max_sequence_length: Padded sequence length used when building the
            training samples; the model input is ``max_sequence_length - 1`` long.
        num_sequences: How many independent sequences to generate.
        temperature: Positive sampling temperature. Values above 1 flatten the
            predicted distribution (more random), values below 1 sharpen it,
            and 1 leaves it unchanged.

    Returns:
        A list of ``num_sequences`` strings, each ``seed_text`` followed by the
        sampled characters.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    generated_sequences = []
    for _ in range(num_sequences):
        generated_sequence = seed_text
        for _ in range(next_words):
            token_list = tokenizer.texts_to_sequences([generated_sequence])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
            # Work in float64 so the renormalised vector passes the sum-to-1
            # check of np.random.choice.
            predicted_probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)
            # Temperature scaling: p ** (1/T) is exp(log(p) / T) and keeps exact
            # zeros at zero; renormalise afterwards.
            predicted_probs = np.power(predicted_probs, 1.0 / temperature)
            predicted_probs = predicted_probs / np.sum(predicted_probs)
            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
            # Index 0 (padding) has no character and contributes nothing.
            output_word = tokenizer.index_word.get(predicted_index, "")
            # The tokenizer is character-level, so append directly; a separating
            # space would put blanks between the characters of the sequence.
            generated_sequence += output_word
        generated_sequences.append(generated_sequence)
    return generated_sequences

# Demo: draw five sequences from the seed "P", sampling eight more characters
# for each one at temperature 1.5
seed_sequence = "P"
generated_sequences = generate_sequences(
    seed_sequence,
    8,
    model,
    max_sequence_length,
    num_sequences=5,
    temperature=1.5,
)
print("Generated Sequences:", generated_sequences)


Epoch 1/200




24/24 - 6s - 248ms/step - accuracy: 0.1016 - loss: 2.9567
Epoch 2/200
24/24 - 2s - 82ms/step - accuracy: 0.0963 - loss: 2.8644
Epoch 3/200
24/24 - 2s - 72ms/step - accuracy: 0.1187 - loss: 2.7710
Epoch 4/200
24/24 - 1s - 52ms/step - accuracy: 0.1240 - loss: 2.7100
Epoch 5/200
24/24 - 1s - 46ms/step - accuracy: 0.1491 - loss: 2.6477
Epoch 6/200
24/24 - 1s - 53ms/step - accuracy: 0.1517 - loss: 2.6028
Epoch 7/200
24/24 - 1s - 53ms/step - accuracy: 0.1847 - loss: 2.5597
Epoch 8/200
24/24 - 1s - 53ms/step - accuracy: 0.1781 - loss: 2.5012
Epoch 9/200
24/24 - 1s - 53ms/step - accuracy: 0.2032 - loss: 2.4526
Epoch 10/200
24/24 - 2s - 76ms/step - accuracy: 0.2375 - loss: 2.3978
Epoch 11/200
24/24 - 2s - 91ms/step - accuracy: 0.2678 - loss: 2.3425
Epoch 12/200
24/24 - 2s - 63ms/step - accuracy: 0.3153 - loss: 2.2908
Epoch 13/200
24/24 - 1s - 46ms/step - accuracy: 0.3562 - loss: 2.2222
Epoch 14/200
24/24 - 1s - 47ms/step - accuracy: 0.3694 - loss: 2.1363
Epoch 15/200
24/24 - 1s - 46ms/step - ac