In [26]:
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import set_random_seed
from sklearn.model_selection import train_test_split

# Seed Python's `random`, NumPy and the Keras/TensorFlow backend in one call.
# Seeding NumPy alone leaves weight initialisation and dropout unseeded, so
# two runs of this notebook would still train different models.
set_random_seed(1337)

# Load the combined sequence table and keep only the rows whose 'len'
# column is at least 90.
data = pd.read_csv('combined.csv')
data = data[data['len'] >= 90]

# Character-level tokenisation: each distinct character found in the
# 'sequence' column is mapped to its own integer id.
max_length = 300
raw_sequences = data['sequence']

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(raw_sequences)
sequences_numeric = tokenizer.texts_to_sequences(raw_sequences)

# Bring every sequence to exactly `max_length` ids: shorter ones are padded
# at the end, longer ones have their tail cut off.
sequences_padded = pad_sequences(
    sequences_numeric,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

def one_hot_encode(padded_ids, depth):
    """One-hot encode a 2-D array of integer ids.

    Parameters
    ----------
    padded_ids : array-like of int, shape (n_rows, n_positions)
        Integer ids; the value 0 is treated as padding.
    depth : int
        Size of the one-hot axis; must be larger than the largest id.

    Returns
    -------
    numpy.ndarray of float64, shape (n_rows, n_positions, depth)
        ``out[i, j, padded_ids[i, j]] == 1`` for every non-zero id; padded
        positions (id 0) are left as all-zero vectors.
    """
    padded_ids = np.asarray(padded_ids)
    encoded = np.zeros(padded_ids.shape + (depth,))
    # Only the non-padding cells get a 1; a single fancy-indexed assignment
    # replaces the per-element Python loop.
    rows, cols = np.nonzero(padded_ids)
    encoded[rows, cols, padded_ids[rows, cols]] = 1
    return encoded


# One-hot encode the sequences
n_classes = len(tokenizer.word_index) + 1  # +1 because index 0 is not used
one_hot_sequences = one_hot_encode(sequences_padded, n_classes)

# Balancing the dataset
# Under-sample row positions instead of the flattened one-hot tensor. The
# sampler picks rows from the labels and `random_state` only, so the kept rows
# are unchanged, but no (n_rows, max_length * n_classes) matrix has to be
# built, validated and copied.
rus = RandomUnderSampler(random_state=42)
row_positions = np.arange(len(one_hot_sequences)).reshape(-1, 1)
kept_positions, y_res = rus.fit_resample(row_positions, data['hmmer'])
X_res = one_hot_sequences[kept_positions.ravel()]

# Splitting the dataset
# Stratify on the labels so the class balance created above is carried over
# to both the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=42,
    stratify=y_res,
)


In [27]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Model configuration
n_units = 100  # size of the LSTM layer

# Stack: one LSTM reading the (max_length, n_classes) one-hot input and
# returning only its final output, dropout of 0.2, then a single sigmoid unit.
model = Sequential([
    LSTM(n_units, input_shape=(max_length, n_classes), return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 100)               48800     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 48901 (191.02 KB)
Trainable params: 48901 (191.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
# Binary classification set-up: sigmoid output paired with binary
# cross-entropy, tracking accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected or plotted afterwards, instead of letting it fall through as the
# cell's output repr.
# NOTE(review): the test split is also used as validation data here, so any
# tuning based on these validation metrics is tuned against the test set --
# consider carving a separate validation split out of the training data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x14e13fe1f50>