In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Load the dataset
data = pd.read_csv('/content/dataset.csv')

# Features and target
categorical_features = ["parent_protein_id", "protein_seq", "peptide_seq"]
numerical_features = [
    "start_position",
    "end_position",
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability"
]
target_feature = "target"

# Encode categorical features
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

# Normalize numerical features
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Prepare input and target
X_categorical = data[categorical_features].values
X_numerical = data[numerical_features].values
y = data[target_feature].values

# Define model parameters
sequence_max_len = X_categorical.shape[1]  # Max length of categorical features
num_features = X_numerical.shape[1]  # Number of numerical features
vocab_size = max([data[feature].max() for feature in categorical_features]) + 1  # Vocabulary size for sequences
latent_dim = 64  # Latent dimension for LSTM

# Encoder
encoder_inputs_numeric = Input(shape=(num_features,), name="encoder_numeric_inputs")
encoder_inputs_seq = Input(shape=(sequence_max_len,), name="encoder_sequence_inputs")
encoder_embedding = Embedding(vocab_size, latent_dim, name="encoder_embedding")(encoder_inputs_seq)
encoder_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
_, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(sequence_max_len,), name="decoder_inputs")
decoder_embedding = Embedding(vocab_size, latent_dim, name="decoder_embedding")(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
seq2seq_model = Model([encoder_inputs_numeric, encoder_inputs_seq, decoder_inputs], decoder_outputs, name="seq2seq_model")
seq2seq_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Data Preparation for Training
decoder_input_data = np.zeros_like(X_categorical)  # Use shifted sequences as decoder inputs
X_numeric_train = X_numerical
X_seq_train = X_categorical
y_train = np.expand_dims(X_categorical, axis=-1)  # Target output as sequence

# Train the Model
seq2seq_model.fit([X_numeric_train, X_seq_train, decoder_input_data], y_train, batch_size=32, epochs=10)

# Reverse Prediction
def reverse_predict(target, stability, emini, isoelectric_point):
    # Prepare inputs
    sample_numeric = scaler.transform([[stability, emini, 0, 0, 0, isoelectric_point, 0, 0, 0, 0]])
    sample_seq = np.zeros((1, sequence_max_len))  # Empty sequence as input for decoder
    prediction = seq2seq_model.predict([sample_numeric, sample_seq, sample_seq])
    # Decode categorical outputs
    decoded_output = [label_encoders[feature].inverse_transform([np.argmax(pred)]) for feature, pred in zip(categorical_features, prediction[0])]
    return decoded_output

# Example: Predict features for target=1, stability=8.9, emini=0.16, isoelectric_point=6.6
result = reverse_predict(1, 8.9, 0.16, 6.6)
print("Predicted Features:", result)


Epoch 1/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 228ms/step - accuracy: 0.0181 - loss: 8.1502
Epoch 2/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 226ms/step - accuracy: 0.0243 - loss: 6.4369
Epoch 3/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 229ms/step - accuracy: 0.0696 - loss: 5.8510
Epoch 4/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 239ms/step - accuracy: 0.1791 - loss: 5.2944
Epoch 5/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 223ms/step - accuracy: 0.2882 - loss: 4.8021
Epoch 6/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 229ms/step - accuracy: 0.4077 - loss: 4.3337
Epoch 7/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 229ms/step - accuracy: 0.4896 - loss: 3.9410
Epoch 8/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 231ms/step - accuracy: 0.5271 - loss: 3.6441
Epoch 9/



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417ms/step
Predicted Features: [array(['P62314'], dtype=object), array(['MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQSGASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNGSQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTNTPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSEYSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEKTNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'],
      dtype=object), array(['ANQAFKLTS'], dtype=object)]
