<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/Drug_Discovery_15_2_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
print("TensorFlow Version:", tf.__version__)


TensorFlow Version: 2.18.0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

# Load dataset
# The CSV path is relative to the current working directory.
data = pd.read_csv('final_output_15_2_25.csv')

# Extract necessary columns
# Both arrays come from the same DataFrame, so row i of each refers to the same record.
protein_sequences = data['Sequence'].values
smiles_strings = data['SMILE'].values

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sequence (list/array) of protein strings, one letter per residue.
        max_seq_len: Number of positions per encoded sequence. Shorter sequences
            are zero-padded at the end; longer ones are truncated. Defaults to
            5038, the value that used to be hard-coded here.

    Returns:
        Array of shape (len(protein_seq), max_seq_len, 25). Entry [i, j, k] is 1
        when residue j of sequence i is the k-th letter of the alphabet below.
        Letters outside the alphabet (it omits 'B') leave an all-zero position.
    """
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    # Dense array: memory grows as len(protein_seq) * max_seq_len * num_aa.
    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))

    for i, seq in enumerate(protein_seq):
        # Slice first so an over-long sequence is truncated instead of raising IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences
# The result is a dense 3-D array (rows, positions, alphabet size) built by one_hot_encoding above.
X_seq = one_hot_encoding(protein_sequences)

# 2. Preprocess SMILES strings (Integer encoding)
def smiles_to_int(smiles_strings, max_length=1000):
    """Integer-encode a batch of SMILES strings into a zero-padded matrix.

    Args:
        smiles_strings: Sequence of SMILES strings.
        max_length: Number of columns in the result; characters beyond this
            position are dropped.

    Returns:
        Integer array of shape (len(smiles_strings), max_length). Known
        characters map to 1..70 by their position in the alphabet below;
        unknown characters and padding are both 0.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    lookup = {symbol: code for code, symbol in enumerate(alphabet, start=1)}

    encoded = np.zeros((len(smiles_strings), max_length), dtype=int)

    for row, smile in zip(encoded, smiles_strings):
        # Each `row` is a view into `encoded`, so writing to it fills the matrix.
        codes = np.array([lookup.get(symbol, 0) for symbol in smile[:max_length]], dtype=int)
        row[:len(codes)] = codes
    return encoded

# Apply integer encoding to SMILES strings
X_smiles = smiles_to_int(smiles_strings)

# 3. Split the data into training and test sets
# Both arrays go through the same shuffled split, so protein/SMILES rows stay paired.
X_train_seq, X_test_seq, X_train_smiles, X_test_smiles = train_test_split(
    X_seq, X_smiles, test_size=0.2, random_state=42
)

# 4. Pad the SMILES sequences (to 1000)
# NOTE(review): smiles_to_int already returns rows of width 1000, so this padding looks redundant — confirm.
X_train_smiles = pad_sequences(X_train_smiles, maxlen=1000, padding='post')
X_test_smiles = pad_sequences(X_test_smiles, maxlen=1000, padding='post')

# 5. Prepare target variable (assuming y is your target, with 1000 classes)
# NOTE(review): placeholder labels — every sample is assigned class 0, so the accuracy and
# loss reported by training say nothing about real predictive power. Replace with real targets.
y_train = np.zeros(len(X_train_smiles))  # Dummy target: all zeros
y_test = np.zeros(len(X_test_smiles))  # Dummy target: all zeros

# 6. One-hot encode the target variable
y_train = tf.keras.utils.to_categorical(y_train, num_classes=1000)  # One-hot encoding
y_test = tf.keras.utils.to_categorical(y_test, num_classes=1000)

# 7. Build the model
# Two-input network: a Conv1D branch over the one-hot protein tensor and a Dense
# branch over the integer-encoded SMILES vector, concatenated before the softmax head.

# Protein sequence model (1D Conv)
seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
x = Conv1D(64, 3, activation='relu')(seq_input)
x = MaxPooling1D(2)(x)
# Flattening the whole pooled sequence gives a very wide feature vector, so the
# output Dense layer below holds most of the model's parameters.
x = Flatten()(x)

# SMILES sequence model (Dense)
# NOTE(review): the Dense layer consumes raw character codes as if they were numeric
# magnitudes (no Embedding layer) — confirm this is intended.
smiles_input = Input(shape=(X_train_smiles.shape[1],))  # (1000,)
y = Dense(128, activation='relu')(smiles_input)

# Merge both models
merged = tf.keras.layers.concatenate([x, y])

# Output layer for multi-class classification
output = Dense(1000, activation='softmax')(merged)  # Output for 1000 classes

# Compile the model
model = Model(inputs=[seq_input, smiles_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# 8. Train the model
# The held-out split is passed as validation data, so the 'Test' curves below are
# the per-epoch validation metrics on that same split.
history = model.fit(
    [X_train_seq, X_train_smiles], y_train,
    epochs=100,
    batch_size=200,
    validation_data=([X_test_seq, X_test_smiles], y_test)
)

# 9. Evaluate the model
test_loss, test_acc = model.evaluate([X_test_seq, X_test_smiles], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# Optionally, plot training history (accuracy and loss)
# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()


Epoch 1/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1358s[0m 11s/step - accuracy: 0.9573 - loss: 0.3992 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/100
[1m 13/126[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m18:44[0m 10s/step - accuracy: 1.0000 - loss: 0.0000e+00

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import os

# Load dataset
# The CSV path is relative to the current working directory.
data = pd.read_csv('final_output_15_2_25.csv')

# Extract necessary columns
# Both arrays come from the same DataFrame, so row i of each refers to the same record.
protein_sequences = data['Sequence'].values
smiles_strings = data['SMILE'].values

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sequence (list/array) of protein strings, one letter per residue.
        max_seq_len: Number of positions per encoded sequence. Shorter sequences
            are zero-padded at the end; longer ones are truncated. Defaults to
            5038, the value that used to be hard-coded here.

    Returns:
        Array of shape (len(protein_seq), max_seq_len, 25). Entry [i, j, k] is 1
        when residue j of sequence i is the k-th letter of the alphabet below.
        Letters outside the alphabet (it omits 'B') leave an all-zero position.
    """
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    # Dense array: memory grows as len(protein_seq) * max_seq_len * num_aa.
    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))

    for i, seq in enumerate(protein_seq):
        # Slice first so an over-long sequence is truncated instead of raising IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences
# The result is a dense 3-D array (rows, positions, alphabet size) built by one_hot_encoding above.
X_seq = one_hot_encoding(protein_sequences)

# 2. Preprocess SMILES strings (Integer encoding)
def smiles_to_int(smiles_strings, max_length=1000):
    """Integer-encode a batch of SMILES strings into a zero-padded matrix.

    Args:
        smiles_strings: Sequence of SMILES strings.
        max_length: Number of columns in the result; characters beyond this
            position are dropped.

    Returns:
        Integer array of shape (len(smiles_strings), max_length). Known
        characters map to 1..70 by their position in the alphabet below;
        unknown characters and padding are both 0.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    lookup = {symbol: code for code, symbol in enumerate(alphabet, start=1)}

    encoded = np.zeros((len(smiles_strings), max_length), dtype=int)

    for row, smile in zip(encoded, smiles_strings):
        # Each `row` is a view into `encoded`, so writing to it fills the matrix.
        codes = np.array([lookup.get(symbol, 0) for symbol in smile[:max_length]], dtype=int)
        row[:len(codes)] = codes
    return encoded

# Apply integer encoding to SMILES strings
X_smiles = smiles_to_int(smiles_strings)

# 3. Split the data into training and test sets
# Both arrays go through the same shuffled split, so protein/SMILES rows stay paired.
X_train_seq, X_test_seq, X_train_smiles, X_test_smiles = train_test_split(
    X_seq, X_smiles, test_size=0.2, random_state=42
)

# 4. Pad the SMILES sequences (to 1000)
# NOTE(review): smiles_to_int already returns rows of width 1000, so this padding looks redundant — confirm.
X_train_smiles = pad_sequences(X_train_smiles, maxlen=1000, padding='post')
X_test_smiles = pad_sequences(X_test_smiles, maxlen=1000, padding='post')

# 5. Prepare target variable (assuming y is your target, with 1000 classes)
# NOTE(review): placeholder labels — every sample is assigned class 0, so the accuracy and
# loss reported by training say nothing about real predictive power. Replace with real targets.
y_train = np.zeros(len(X_train_smiles))  # Dummy target: all zeros
y_test = np.zeros(len(X_test_smiles))  # Dummy target: all zeros

# 6. One-hot encode the target variable
y_train = tf.keras.utils.to_categorical(y_train, num_classes=1000)  # One-hot encoding
y_test = tf.keras.utils.to_categorical(y_test, num_classes=1000)

# 7. Build the model
# Two-input network: a Conv1D branch over the one-hot protein tensor and a Dense
# branch over the integer-encoded SMILES vector, concatenated before the softmax head.

# Protein sequence model (1D Conv)
seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
x = Conv1D(64, 3, activation='relu')(seq_input)
x = MaxPooling1D(2)(x)
# Flattening the whole pooled sequence gives a very wide feature vector, so the
# output Dense layer below holds most of the model's parameters.
x = Flatten()(x)

# SMILES sequence model (Dense)
# NOTE(review): the Dense layer consumes raw character codes as if they were numeric
# magnitudes (no Embedding layer) — confirm this is intended.
smiles_input = Input(shape=(X_train_smiles.shape[1],))  # (1000,)
y = Dense(128, activation='relu')(smiles_input)

# Merge both models
merged = tf.keras.layers.concatenate([x, y])

# Output layer for multi-class classification
output = Dense(1000, activation='softmax')(merged)  # Output for 1000 classes

# Compile the model
model = Model(inputs=[seq_input, smiles_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# 8. Set up Model Checkpoint to save the best model during training
checkpoint_dir = './model_checkpoints'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.h5'),  # Save best model as .h5 file
    monitor='val_loss',  # Monitor validation loss
    save_best_only=True,  # Save the best model only
    save_weights_only=False,  # Save the entire model (architecture + weights)
    mode='min',  # Save the model when validation loss is minimum
    verbose=1  # Print a message when saving the best model
)

# 9. Train the model and save the best model
# The held-out split is passed as validation data, so the checkpoint's 'val_loss'
# and the 'Test' curves below are all computed on that same split.
history = model.fit(
    [X_train_seq, X_train_smiles], y_train,
    epochs=100,
    batch_size=200,
    validation_data=([X_test_seq, X_test_smiles], y_test),
    callbacks=[checkpoint_callback]  # Use checkpoint callback to save best model
)

# 10. Evaluate the model
# This evaluates the in-memory model after the final epoch, not the checkpointed best model.
test_loss, test_acc = model.evaluate([X_test_seq, X_test_smiles], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# Optionally, plot training history (accuracy and loss)
# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Save the training history to a file
# One row per epoch, one column per logged metric.
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)


In [None]:
# Prediction

import numpy as np
import tensorflow as tf

# Define your functions to preprocess the protein sequence and SMILES
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a single protein sequence as a batch of one.

    Args:
        protein_seq: Protein string, one letter per residue.
        max_seq_len: Number of positions in the encoded output. Shorter input
            is zero-padded at the end; longer input is truncated.

    Returns:
        Array of shape (1, max_seq_len, 25). Letters outside the alphabet
        below (it omits 'B') leave an all-zero position.
    """
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    # One-hot encode the protein sequence
    one_hot = np.zeros((1, max_seq_len, num_aa))  # Shape (1, max_seq_len, num_aa)

    # Slice first so an over-long sequence is truncated instead of raising IndexError.
    for j, aa in enumerate(protein_seq[:max_seq_len]):
        if aa in aa_dict:
            one_hot[0, j, aa_dict[aa]] = 1
    return one_hot

# Define the function to preprocess SMILES string (use the same as during training)
def smiles_to_int(smiles, max_length=1000):
    """Integer-encode one SMILES string as a zero-padded batch of one.

    Args:
        smiles: SMILES string.
        max_length: Number of columns in the result; characters beyond this
            position are dropped.

    Returns:
        Integer array of shape (1, max_length). Known characters map to 1..70
        by their position in the alphabet below; unknown characters and
        padding are both 0.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    lookup = dict(zip(alphabet, range(1, len(alphabet) + 1)))

    encoded = np.zeros((1, max_length), dtype=int)
    codes = np.array([lookup.get(symbol, 0) for symbol in smiles[:max_length]], dtype=int)
    encoded[0, :len(codes)] = codes
    return encoded

# Assuming `model` is already loaded and trained
# NOTE(review): `model` is not created in this cell; it must already exist in the
# kernel from a training cell, so this cell fails on a fresh kernel.

# Sample protein sequence to predict SMILES (for example)
protein_sequence = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYGAEEMPTLNRRAKQL"
# Step 1: Preprocess the input protein sequence
X_seq = one_hot_encoding(protein_sequence)

# Step 2: Make prediction using the model
# Note: We're passing the protein sequence and dummy SMILES input (because the model is expecting both)
dummy_smiles = np.zeros((1, 1000))  # Dummy input for SMILES (since it's a multi-input model)
# NOTE(review): despite the variable name, the models defined earlier in this notebook end in a
# 1000-unit softmax/sigmoid layer, so this is presumably a (1, 1000) array of floats — confirm.
predicted_smiles_int = model.predict([X_seq, dummy_smiles])

# Step 3: Convert predicted SMILES from integer to character string
def int_to_smiles(smiles_int):
    """Decode the first row of an integer-encoded SMILES matrix back to text.

    Args:
        smiles_int: 2-D array; only row 0 is read. Values 1..70 index the
            alphabet below (1-based); 0 is treated as padding.

    Returns:
        The decoded string. Zeros and values with no matching character are
        skipped.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    code_to_char = dict(enumerate(alphabet, start=1))  # Reverse mapping

    width = smiles_int.shape[1]
    pieces = []
    for position in range(width):
        code = smiles_int[0, position]
        if code == 0:  # Skip padding (0 represents padding)
            continue
        pieces.append(code_to_char.get(code, ''))
    return "".join(pieces)

# Step 4: Convert the predicted SMILES integers back to string
# NOTE(review): int_to_smiles looks values up as integer character codes. If
# predicted_smiles_int holds model probabilities rather than codes, almost no value
# will match a dictionary key and the result will be (nearly) empty — confirm.
predicted_smiles = int_to_smiles(predicted_smiles_int)

print("Predicted SMILES:", predicted_smiles)


In [None]:
#MULTIPLE SMILE
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import os

# Load dataset
# The CSV path is relative to the current working directory.
data = pd.read_csv('final_output_15_2_25.csv')

# Extract necessary columns
# Both arrays come from the same DataFrame, so row i of each refers to the same record.
protein_sequences = data['Sequence'].values
smiles_strings = data['SMILE'].values

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sequence (list/array) of protein strings, one letter per residue.
        max_seq_len: Number of positions per encoded sequence. Shorter sequences
            are zero-padded at the end; longer ones are truncated. Defaults to
            5038, the value that used to be hard-coded here.

    Returns:
        Array of shape (len(protein_seq), max_seq_len, 25). Entry [i, j, k] is 1
        when residue j of sequence i is the k-th letter of the alphabet below.
        Letters outside the alphabet (it omits 'B') leave an all-zero position.
    """
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    # Dense array: memory grows as len(protein_seq) * max_seq_len * num_aa.
    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))

    for i, seq in enumerate(protein_seq):
        # Slice first so an over-long sequence is truncated instead of raising IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences
# The result is a dense 3-D array (rows, positions, alphabet size) built by one_hot_encoding above.
X_seq = one_hot_encoding(protein_sequences)

# 2. Preprocess SMILES strings (Integer encoding)
def smiles_to_int(smiles_strings, max_length=1000):
    """Integer-encode a batch of SMILES strings into a zero-padded matrix.

    Args:
        smiles_strings: Sequence of SMILES strings.
        max_length: Number of columns in the result; characters beyond this
            position are dropped.

    Returns:
        Integer array of shape (len(smiles_strings), max_length). Known
        characters map to 1..70 by their position in the alphabet below;
        unknown characters and padding are both 0.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    lookup = {symbol: code for code, symbol in enumerate(alphabet, start=1)}

    encoded = np.zeros((len(smiles_strings), max_length), dtype=int)

    for row, smile in zip(encoded, smiles_strings):
        # Each `row` is a view into `encoded`, so writing to it fills the matrix.
        codes = np.array([lookup.get(symbol, 0) for symbol in smile[:max_length]], dtype=int)
        row[:len(codes)] = codes
    return encoded

# Apply integer encoding to SMILES strings
X_smiles = smiles_to_int(smiles_strings)

# 3. Split the data into training and test sets
# Both arrays go through the same shuffled split, so protein/SMILES rows stay paired.
X_train_seq, X_test_seq, X_train_smiles, X_test_smiles = train_test_split(
    X_seq, X_smiles, test_size=0.2, random_state=42
)

# 4. Pad the SMILES sequences (to 1000)
# NOTE(review): smiles_to_int already returns rows of width 1000, so this padding looks redundant — confirm.
X_train_smiles = pad_sequences(X_train_smiles, maxlen=1000, padding='post')
X_test_smiles = pad_sequences(X_test_smiles, maxlen=1000, padding='post')

# 5. Prepare target variable (assuming y is your target, with 1000 classes)
# NOTE(review): these targets are random 0/1 values drawn without a seed — they are unrelated
# to the inputs and differ on every run, so the metrics below carry no meaning. Replace with real labels.
y_train = np.random.randint(0, 2, size=(len(X_train_smiles), 1000))  # Example of multi-label target
y_test = np.random.randint(0, 2, size=(len(X_test_smiles), 1000))  # Example of multi-label target

# 6. Modify the model output for multi-label classification
# Same two-branch layout as the earlier cells; only the head (sigmoid) and the
# loss (binary cross-entropy) differ, so each of the 1000 outputs is an independent 0/1 score.
# Protein sequence model (1D Conv)
seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
x = Conv1D(64, 3, activation='relu')(seq_input)
x = MaxPooling1D(2)(x)
x = Flatten()(x)

# SMILES sequence model (Dense)
# NOTE(review): the Dense layer consumes raw character codes as if they were numeric
# magnitudes (no Embedding layer) — confirm this is intended.
smiles_input = Input(shape=(X_train_smiles.shape[1],))  # (1000,)
y = Dense(128, activation='relu')(smiles_input)

# Merge both models
merged = tf.keras.layers.concatenate([x, y])

# Output layer for multi-label classification
output = Dense(1000, activation='sigmoid')(merged)  # Sigmoid for multi-label classification

# Compile the model
model = Model(inputs=[seq_input, smiles_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# 7. Set up Model Checkpoint to save the best model during training
checkpoint_dir = './model_checkpoints'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep only the full model (architecture + weights) with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.h5'),
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    mode='min',
    verbose=1
)

# 8. Train the model
# A single epoch with batch_size=2: each plotted curve below therefore has only one point.
history = model.fit(
    [X_train_seq, X_train_smiles], y_train,
    epochs=1,
    batch_size=2,
    validation_data=([X_test_seq, X_test_smiles], y_test),
    callbacks=[checkpoint_callback]
)

# 9. Evaluate the model
test_loss, test_acc = model.evaluate([X_test_seq, X_test_smiles], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# 10. Plot training history (optional)
# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Save the training history to a file
# One row per epoch, one column per logged metric.
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)


In [None]:
#MULTIPLE PREDICTION

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

# Load your dataset (replace with your actual file path)
data = pd.read_csv('final_output_15_2_25.csv')

# Every other cell that reads this same CSV takes the sequences from the 'Sequence'
# column; 'Protein_Sequence' is not used anywhere else in this notebook and would raise a
# KeyError unless the file happens to contain both columns.
protein_sequences = data['Sequence'].values  # Protein sequences

# 1. Preprocess Protein Sequences (Integer encoding)
def protein_to_int(protein_sequences, max_length=1000):
    """Integer-encode a batch of protein sequences into a zero-padded matrix.

    Args:
        protein_sequences: Sequence of protein strings, one letter per residue.
        max_length: Number of columns in the result; residues beyond this
            position are dropped.

    Returns:
        Integer array of shape (len(protein_sequences), max_length). Letters in
        the 21-symbol alphabet below map to 1..21 by position; any other letter
        and padding are both 0.
    """
    # 20 standard residues plus 'X'.
    alphabet = 'ACDEFGHIKLMNPQRSTVWXY'
    residue_code = dict(zip(alphabet, range(1, len(alphabet) + 1)))

    encoded = np.zeros((len(protein_sequences), max_length), dtype=int)

    for row, sequence in zip(encoded, protein_sequences):
        # Each `row` is a view into `encoded`, so writing to it fills the matrix.
        codes = np.array([residue_code.get(aa, 0) for aa in sequence[:max_length]], dtype=int)
        row[:len(codes)] = codes
    return encoded

# Apply integer encoding to protein sequences
X_proteins = protein_to_int(protein_sequences)

# 2. Pad the protein sequences (to 1000, or whatever max_length you prefer)
# NOTE(review): protein_to_int already returns rows of width 1000 by default, so this
# padding only matters if max_length is changed here without changing the call above — confirm.
max_length = 1000
X_proteins = pad_sequences(X_proteins, maxlen=max_length, padding='post')

# 3. Parameters for the VAE model
latent_dim = 256  # Dimensionality of the latent space

# Encoder Model: token ids -> embedding -> single LSTM summary vector
inputs = layers.Input(shape=(max_length,))
embedding = layers.Embedding(input_dim=256, output_dim=128)(inputs)
x = layers.LSTM(256, return_sequences=False)(embedding)

# Latent space (mean and log variance for the VAE)
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)

# Sampling function
def sampling(args):
    """Sample z = z_mean + exp(0.5 * z_log_var) * epsilon, with epsilon ~ N(0, 1).

    Args:
        args: Pair (z_mean, z_log_var) of tensors with identical 2-D shape.

    Returns:
        Tensor with the same shape as z_mean.
    """
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Decoder Model, built as its own sub-model so it can also be applied to arbitrary latent vectors
latent_inputs = layers.Input(shape=(latent_dim,))
x = layers.Dense(256, activation='relu')(latent_inputs)
x = layers.RepeatVector(max_length)(x)
x = layers.LSTM(256, return_sequences=True)(x)
decoder_outputs = layers.TimeDistributed(layers.Dense(256, activation='softmax'))(x)
decoder = Model(latent_inputs, decoder_outputs, name='decoder')

# Feed the sampled latent vector through the decoder. Previously the decoder hung off a
# separate Input that was never connected to `inputs`, so Model(inputs, outputs) had a
# disconnected graph and `z` was unused.
outputs = decoder(z)

# Instantiate the VAE Model
vae = Model(inputs, outputs)
# The training targets are integer token ids of shape (batch, max_length), not one-hot
# vectors, so the sparse variant of the loss is required to match the
# (batch, max_length, 256) softmax output.
# NOTE(review): no KL-divergence term is added, so this trains as a stochastic
# autoencoder rather than a full VAE — add one if VAE behaviour is needed.
vae.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

vae.summary()

# 4. Set up Model Checkpoint to save the best model during training
checkpoint_dir = './model_checkpoints'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.h5'),  # Save best model as .h5 file
    monitor='loss',  # Monitor training loss (you can also monitor 'val_loss')
    save_best_only=True,  # Save the best model only
    save_weights_only=False,  # Save the entire model (architecture + weights)
    mode='min',  # Save the model when the loss is minimum
    verbose=1  # Print a message when saving the best model
)

# 5. Set up CSVLogger to save the training history
# append=True keeps rows from earlier runs in the same file.
history_csv_file = './training_history.csv'
csv_logger = CSVLogger(history_csv_file, append=True)

# 6. Train the model
# No validation data is passed, so only the training loss is logged.
history = vae.fit(
    X_proteins, X_proteins,  # We use protein sequences as both input and output
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint_callback, csv_logger]  # Use both callbacks
)

# 7. Save final model after training
vae.save('./vae_final_model.h5')  # Save the final model after training

# 8. Optionally, you can load the model later using:
# vae = tf.keras.models.load_model('./vae_final_model.h5')

# 9. Training history
# NOTE(review): this writes to the same path as the CSVLogger above and overwrites
# what the logger appended — confirm which of the two files is wanted.
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)

# Optionally, plot training history (accuracy and loss)
# Only the training loss exists in the history, so a single curve is drawn.
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train'], loc='upper left')
plt.show()


In [None]:
#PREDICTION MULTIPLE

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Load the trained VAE model
# The path matches the file written by vae.save() in the training cell.
# NOTE(review): the saved model contains a Lambda layer wrapping a Python function;
# loading may need that function to be available (e.g. via custom_objects) — confirm.
vae = tf.keras.models.load_model('./vae_final_model.h5')

# Function to encode protein sequences into integers
def protein_to_int(protein_sequences, max_length=1000):
    """Integer-encode a batch of protein sequences into a zero-padded matrix.

    Args:
        protein_sequences: Sequence of protein strings, one letter per residue.
        max_length: Number of columns in the result; residues beyond this
            position are dropped.

    Returns:
        Integer array of shape (len(protein_sequences), max_length). Letters in
        the 21-symbol alphabet below map to 1..21 by position; any other letter
        and padding are both 0.
    """
    # 20 standard residues plus 'X'.
    alphabet = 'ACDEFGHIKLMNPQRSTVWXY'
    residue_code = dict(zip(alphabet, range(1, len(alphabet) + 1)))

    encoded = np.zeros((len(protein_sequences), max_length), dtype=int)

    for row, sequence in zip(encoded, protein_sequences):
        # Each `row` is a view into `encoded`, so writing to it fills the matrix.
        codes = np.array([residue_code.get(aa, 0) for aa in sequence[:max_length]], dtype=int)
        row[:len(codes)] = codes
    return encoded

# Function to generate SMILES from protein sequence
def generate_smiles(protein_sequence, vae, max_length=1000, latent_dim=256):
    """Run one protein sequence through the trained model and decode the output as text.

    Args:
        protein_sequence: Protein string, one letter per residue.
        vae: Trained Keras model that maps an integer array of shape
            (batch, max_length) to per-position class probabilities of shape
            (batch, max_length, n_classes).
        max_length: Input length the model was trained with.
        latent_dim: Unused; kept so existing calls keep working. The model
            samples its own latent vector internally.

    Returns:
        String built from the most probable class at each position, using the
        SMILES alphabet below. Class 0 and classes above 70 produce no character.
    """
    # Step 1: Encode the sequence. protein_to_int already returns exactly
    # max_length columns, so no further padding is needed.
    protein_int = protein_to_int([protein_sequence], max_length)

    # Step 2: One full forward pass. The model's Lambda layer outputs the single
    # sampled tensor z (not a (z_mean, z_log_var) pair), so the earlier attempt to
    # unpack two values from it and to rebuild a decoder from that layer could not
    # work. Calling the whole model performs encode -> sample -> decode in one step.
    probabilities = np.asarray(vae.predict(protein_int, verbose=0))

    # Step 3: Most probable class for each position of the first (only) sample.
    smiles_generated = probabilities.argmax(axis=-1)[0]

    # Step 4: Convert integers back to characters.
    # NOTE(review): the training cell fits the model to reconstruct protein token
    # ids, yet the output is decoded here with the SMILES alphabet — confirm the
    # model/targets are what this function is meant to decode.
    char_set = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    int_to_char = {i+1: char for i, char in enumerate(char_set)}  # Reverse mapping from integer to character

    smiles = ''.join(int_to_char.get(int(i), '') for i in smiles_generated)
    return smiles

# Example: Predict SMILES for a protein sequence
# Uses the default max_length and latent_dim of generate_smiles.
protein_sequence = 'MKTIIALSYIFCLVFA'  # Replace this with your input protein sequence
predicted_smiles = generate_smiles(protein_sequence, vae)

print(f"Predicted SMILES for the protein sequence '{protein_sequence}': {predicted_smiles}")


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# Check for TPU and initialize TPU strategy if available
# `strategy` is used later to scope model construction; only ValueError from the
# TPU setup is treated as "no TPU" — any other exception propagates.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # Detect TPU
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)  # Correct initialization method
    strategy = tf.distribute.TPUStrategy(resolver)
    print("TPU initialized")
except ValueError:
    strategy = tf.distribute.get_strategy()  # Use default strategy if no TPU is found
    print("No TPU detected, using default strategy (CPU/GPU)")

# Load dataset
data = pd.read_csv('updated_final_output_MORGAN.csv')

# Extract protein sequences and Morgan Fingerprints
protein_sequences = data['Sequence'].values

# Handle NaN and ensure all Morgan Fingerprint values are strings
# Note: this mutates the 'Morgan_Fingerprint' column of `data` in place.
data['Morgan_Fingerprint'] = data['Morgan_Fingerprint'].fillna('')  # Replace NaNs with empty strings
data['Morgan_Fingerprint'] = data['Morgan_Fingerprint'].astype(str)  # Ensure all values are strings

# Convert Morgan Fingerprints from string to integer arrays
# Tokens are split on ','; any token that is not purely digits (including one with
# surrounding whitespace) is silently dropped, and an empty string yields an empty array.
# NOTE(review): rows with missing or malformed fingerprints therefore end up shorter than
# the rest, which would make the target array ragged further down — confirm the data is clean.
morgan_fingerprints = data['Morgan_Fingerprint'].apply(lambda x: np.array([int(bit) for bit in x.split(',') if bit.isdigit()])).values

# Calculate max sequence length for both protein sequences and Morgan fingerprints
max_seq_len = max([len(seq) for seq in protein_sequences])  # Maximum protein sequence length
max_fp_len = max([len(fp) for fp in morgan_fingerprints])  # Maximum Morgan fingerprint length

print(f"Max protein sequence length: {max_seq_len}")
print(f"Max Morgan fingerprint length: {max_fp_len}")

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sequence (list/array) of protein strings, one letter per residue.
        max_seq_len: Number of positions per encoded sequence. Shorter sequences
            are zero-padded at the end; longer ones are truncated.

    Returns:
        Array of shape (len(protein_seq), max_seq_len, 25). Entry [i, j, k] is 1
        when residue j of sequence i is the k-th letter of the alphabet below.
        Letters outside the alphabet (it omits 'B') leave an all-zero position.
    """
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))  # Pad all sequences to max length

    for i, seq in enumerate(protein_seq):
        # Slice first: a sequence longer than max_seq_len (e.g. a new input at
        # prediction time) is truncated instead of raising IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences
X_seq = one_hot_encoding(protein_sequences, max_seq_len)

# 2. No padding for Morgan fingerprints (directly use them as output)
y = np.array(morgan_fingerprints)  # Morgan fingerprints as output

# Convert y to a proper Numpy array with numeric dtype
# This only yields a rectangular (rows, bits) array when every fingerprint has the same length.
y = np.array([np.array(fp, dtype=np.int32) for fp in y])

# 3. Split the data into training and test sets
# Inputs and targets go through the same shuffled split, so rows stay paired.
X_train_seq, X_test_seq, y_train, y_test = train_test_split(
    X_seq, y, test_size=0.2, random_state=42
)

# 4. Build the model inside the strategy scope
# Single-input network: protein one-hot tensor -> Conv1D -> Dense -> one sigmoid unit per fingerprint bit.
with strategy.scope():
    # Protein sequence model (1D Conv)
    seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
    x = Conv1D(64, 3, activation='relu')(seq_input)
    x = MaxPooling1D(2)(x)
    x = Flatten()(x)

    # Additional hidden layer
    x = Dense(128, activation='relu')(x)  # New hidden layer with 128 units and ReLU activation

    # Output layer for binary classification (per bit of fingerprint)
    # The layer width follows max_fp_len computed from the data (1024 in the recorded run).
    output = Dense(max_fp_len, activation='sigmoid')(x)  # 1024 output units for binary classification (0 or 1)

    # Compile the model with binary cross-entropy loss
    model = Model(inputs=seq_input, outputs=output)
    model.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=['accuracy'])

    # Display model summary
    model.summary()



# 5. Set up Model Checkpoint and Early Stopping to save the best model during training
checkpoint_dir = '/content'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(checkpoint_dir, exist_ok=True)

# The checkpoint previously monitored 'accuracy' with mode='min', which keeps the model
# with the LOWEST training accuracy. Monitor validation loss instead, matching both the
# stated intent and the EarlyStopping callback below.
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.h5'),  # Save best model as .h5 file
    monitor='val_loss',  # Monitor validation loss
    save_best_only=True,  # Save the best model only
    save_weights_only=False,  # Save the entire model (architecture + weights)
    mode='min',  # Save the model when validation loss is minimum
    verbose=1  # Print a message when saving the best model
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=5,  # Stop training if validation loss doesn't improve for 5 epochs
    restore_best_weights=True,  # Restore best weights after stopping
    verbose=1
)

# 6. Train the model and save the best model
# NOTE(review): with epochs=2 the EarlyStopping patience of 5 can never be reached,
# so early stopping has no effect at this setting — raise epochs for a real run.
history = model.fit(
    X_train_seq, y_train,
    epochs=2,  # Number of passes over the training data
    batch_size=8,  # Samples per gradient update
    validation_data=(X_test_seq, y_test),
    callbacks=[checkpoint_callback, early_stopping_callback]  # Use checkpoint and early stopping
)

# 7. Evaluate the model
test_loss, test_acc = model.evaluate(X_test_seq, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# Optionally, plot training history (accuracy and loss)
# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Save the training history to a file
# One row per epoch, one column per logged metric.
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)


No TPU detected, using default strategy (CPU/GPU)
Max protein sequence length: 5038
Max Morgan fingerprint length: 1024


In [3]:
# PREDICTION SEQUENCE TO MORGAN FINGERPRINT TO SMILE (MULTIPLE SMILE)
!pip install rdkit
from tensorflow.keras.models import load_model
from rdkit import Chem
from rdkit.Chem import DataStructs
import numpy as np
import pandas as pd

# Load trained model
# The path is relative to the current working directory; the recorded run of this cell
# failed with FileNotFoundError because no 'best_model.h5' existed there.
# NOTE(review): the training cell writes the checkpoint under '/content' — confirm the
# working directory matches, or that training was run in this session first.
model = load_model('best_model.h5')

# Load dataset for SMILES reference
data = pd.read_csv("updated_final_output_MORGAN.csv")
reference_smiles = data['SMILE'].values
# Unlike the training cell, this parse has no NaN handling or digit filter, so a missing
# or malformed fingerprint raises here; np.stack also requires all rows to have equal length.
reference_fingerprints = np.stack([np.array([int(bit) for bit in x.split(',')]) for x in data['Morgan_Fingerprint'].values])

# Function to find closest SMILES match based on similarity threshold range
def get_closest_smiles(predicted_fp, min_similarity=0.7, max_similarity=0.9):
    """Return reference SMILES whose fingerprint similarity to a prediction lies in a range.

    Args:
        predicted_fp: 1-D array of per-bit scores (e.g. sigmoid outputs) or 0/1
            values. Entries >= 0.5 are treated as set bits.
        min_similarity: Inclusive lower bound on the similarity score.
        max_similarity: Inclusive upper bound on the similarity score.

    Returns:
        List of (smiles, similarity) tuples, sorted by similarity, highest first.
        Empty when nothing falls inside the range.

    Reads the module-level `reference_smiles` and `reference_fingerprints`.
    """
    def _to_bitvect(bits):
        # RDKit builds a bit vector from a string of '0'/'1' characters.
        return DataStructs.CreateFromBitString(''.join(map(str, bits)))

    # Threshold before casting: astype(int) alone truncates every probability below
    # 1.0 to 0, which turned the whole prediction into an all-zero fingerprint.
    predicted_bits = (np.asarray(predicted_fp) >= 0.5).astype(int)
    # The predicted vector is the same for every comparison, so build it once.
    predicted_bv = _to_bitvect(predicted_bits)

    matches = []  # List to store SMILES that meet the similarity criteria
    for smiles, ref_fp in zip(reference_smiles, reference_fingerprints):
        similarity = DataStructs.FingerprintSimilarity(
            predicted_bv,
            _to_bitvect(ref_fp.astype(int))
        )
        # If similarity is within the given range, add it to the matches
        if min_similarity <= similarity <= max_similarity:
            matches.append((smiles, similarity))

    # Sort matches by similarity score, best first
    matches.sort(key=lambda x: x[1], reverse=True)

    return matches

# Function to predict multiple SMILES based on protein sequence and similarity range
def predict_multiple_smiles(protein_sequence, min_similarity=0.7, max_similarity=0.9):
    """Predict a fingerprint for one protein and look up reference SMILES near it.

    Args:
        protein_sequence: Protein string, one letter per residue.
        min_similarity: Inclusive lower bound passed to get_closest_smiles.
        max_similarity: Inclusive upper bound passed to get_closest_smiles.

    Returns:
        Whatever get_closest_smiles returns for the predicted fingerprint.

    Relies on `model`, `one_hot_encoding` and `max_seq_len` already existing at
    module level; none of them is defined in this cell.
    """
    # Encode as a batch of one, using the module-level max_seq_len.
    encoded_batch = one_hot_encoding([protein_sequence], max_seq_len)

    # Take the first (only) row of the batch prediction.
    fingerprint_batch = model.predict(encoded_batch)
    first_fingerprint = fingerprint_batch[0]

    return get_closest_smiles(first_fingerprint, min_similarity, max_similarity)

# Example protein sequence input (change this to your input)
protein_input = "MGNASNDSQSEDCETRQWLPPGESPAISSVMFSAGVLGNLIALALLARRWRGDVGCSAGRRSSLSLFHVLVTELVFTDLLGTCLISPVVLASYARNQTLVALAPESRACTYFAFAMTFFSLATMLMLFAMALERYLSIGHPYFYQRRVSRSGGLAVLPVIYAVSLLFCSLPLLDYGQYVQYCPGTWCFIRHGRTAYLQLYATLLLLLIVSVLACNFSVILNLIRMHRRSRRSRCGPSLGSGRGGPGARRRGERVSMAEETDHLILLAIMTITFAVCSLPFTIFAYMNETSSRKEKWDLQALRFLSINSIIDPWVFAILRPPVLRLMRSVLCCRISLRTQDATQTSCSTQSDASKQADL"

# Predict multiple SMILES with similarity between 0.7 and 0.9 (both bounds inclusive)
predicted_matches = predict_multiple_smiles(protein_input, min_similarity=0.7, max_similarity=0.9)

# Display the predictions
# Nothing is printed when no reference fingerprint falls inside the similarity range.
for smiles, similarity in predicted_matches:
    print(f"Predicted SMILES: {smiles}, Similarity: {similarity}")


Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.5


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'best_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)