<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/Drug_Discovery_15_2_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import tensorflow as tf
# Show which TensorFlow release this kernel is running, so the recorded
# outputs in this notebook can be tied to a specific version.
print("TensorFlow Version:", tf.__version__)


TensorFlow Version: 2.18.0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

# Load dataset
# pandas represents empty CSV cells as float NaN, which the character-level
# encoders below cannot iterate. Rows missing either column are dropped here,
# in one step, so the two extracted arrays stay aligned row for row.
data = (
    pd.read_csv('final_output_15_2_25.csv')
    .dropna(subset=['Sequence', 'SMILE'])
    .reset_index(drop=True)
)

# Extract necessary columns as parallel arrays: element i of each array comes
# from the same CSV row.
protein_sequences = data['Sequence'].values
smiles_strings = data['SMILE'].values

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sized iterable of sequences (strings of one-letter
            residue codes), e.g. a list or a 1-D NumPy array of strings.
        max_seq_len: Number of positions per encoded sequence. Shorter
            sequences are zero-padded at the end; longer sequences are
            truncated to this length. Defaults to 5038, the value that was
            previously hard-coded in the body.

    Returns:
        A float64 array of shape ``(len(protein_seq), max_seq_len, 25)``.
        Entry ``[i, j, k]`` is 1 when character ``j`` of sequence ``i`` is the
        ``k``-th letter of the alphabet below, and 0 otherwise.
    """
    # 25 upper-case letters (every letter except 'B'). A character outside
    # this alphabet leaves its position as an all-zero row.
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

    one_hot = np.zeros((len(protein_seq), max_seq_len, len(aa_dict)))

    for i, seq in enumerate(protein_seq):
        # Slice first: without it, a sequence longer than max_seq_len would
        # index past axis 1 and raise IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences.
# The result is a single dense float64 array with 5038 * 25 entries per
# sequence (about 1 MB each), so the whole encoded dataset is held in memory.
X_seq = one_hot_encoding(protein_sequences)

# 2. Preprocess SMILES strings (Integer encoding)
def smiles_to_int(smiles_strings, max_length=1000):
    """Encode a batch of SMILES strings as fixed-width rows of integer codes.

    Every character is replaced by its 1-based position in the vocabulary
    below. Code 0 is used both for the padding after the end of a string and
    for any character that is not in the vocabulary. Characters at index
    ``max_length`` or beyond are ignored.

    Returns:
        An integer array of shape ``(len(smiles_strings), max_length)``.
    """
    vocabulary = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    code_of = {symbol: code for code, symbol in enumerate(vocabulary, start=1)}

    encoded = np.zeros((len(smiles_strings), max_length), dtype=int)

    for row, smile in enumerate(smiles_strings):
        # zip() stops at whichever input runs out first, so at most
        # max_length characters are written per row.
        for col, symbol in zip(range(max_length), smile):
            encoded[row, col] = code_of.get(symbol, 0)
    return encoded

# Apply integer encoding to SMILES strings (default width: 1000 codes per
# string; 0 marks both padding and characters outside the encoder's vocabulary).
X_smiles = smiles_to_int(smiles_strings)

# 3. Split the data into training and test sets
# 80/20 split with a fixed seed; both arrays are passed to the same call so
# each protein row stays paired with its SMILES row.
X_train_seq, X_test_seq, X_train_smiles, X_test_smiles = train_test_split(
    X_seq, X_smiles, test_size=0.2, random_state=42
)

# 4. Pad the SMILES sequences (to 1000)
# NOTE(review): smiles_to_int already returns rows of exactly 1000 codes, so
# this step should leave the values unchanged - confirm before removing it.
X_train_smiles = pad_sequences(X_train_smiles, maxlen=1000, padding='post')
X_test_smiles = pad_sequences(X_test_smiles, maxlen=1000, padding='post')

# 5. Prepare target variable (placeholder labels)
# FIXME(review): no label column is read from `data`; every sample is given
# label 0. With a constant target the network can only learn to always
# predict class 0, so the accuracy and loss reported below do not measure
# anything about the data. Replace with the real target before trusting results.
y_train = np.zeros(len(X_train_smiles))  # Placeholder: all samples are class 0
y_test = np.zeros(len(X_test_smiles))  # Placeholder: all samples are class 0

# 6. One-hot encode the target variable
# Each label becomes a length-1000 vector; with the placeholder labels above
# only column 0 is ever set.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=1000)  # One-hot encoding
y_test = tf.keras.utils.to_categorical(y_test, num_classes=1000)

# 7. Build the model
# Two-input network: a convolutional branch for the one-hot protein tensor
# and a dense branch for the SMILES codes, concatenated into one softmax head.

# Protein sequence model (1D Conv)
# 64 filters of width 3 slide along the sequence axis, pooling halves the
# length, and Flatten turns the result into one long feature vector.
seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
x = Conv1D(64, 3, activation='relu')(seq_input)
x = MaxPooling1D(2)(x)
x = Flatten()(x)

# SMILES sequence model (Dense)
# The 1000 integer codes are fed straight into a Dense layer, i.e. they are
# treated as numeric magnitudes rather than categorical symbols.
# NOTE(review): no embedding or scaling is applied - confirm this is intended.
# `y` below is a layer output, not a label array (labels are y_train / y_test).
smiles_input = Input(shape=(X_train_smiles.shape[1],))  # (1000,)
y = Dense(128, activation='relu')(smiles_input)

# Merge both models
merged = tf.keras.layers.concatenate([x, y])

# Output layer for multi-class classification
output = Dense(1000, activation='softmax')(merged)  # Output for 1000 classes

# Compile the model
# categorical_crossentropy matches the one-hot targets produced by to_categorical.
model = Model(inputs=[seq_input, smiles_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# 8. Train the model
# 100 epochs in batches of 200. The held-out split is passed as
# validation_data here and again to evaluate() below, so the "validation"
# and "test" figures are computed on the same samples.
history = model.fit(
    [X_train_seq, X_train_smiles], y_train,
    epochs=100,
    batch_size=200,
    validation_data=([X_test_seq, X_test_smiles], y_test)
)

# 9. Evaluate the model
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
test_loss, test_acc = model.evaluate([X_test_seq, X_test_smiles], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# Optionally, plot training history: one figure per metric, drawing the
# training curve first and the validation curve second.
for train_key, val_key, title, ylabel in (
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    # The second curve comes from validation_data, which is the held-out split.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


Epoch 1/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1358s[0m 11s/step - accuracy: 0.9573 - loss: 0.3992 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/100
[1m 13/126[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m18:44[0m 10s/step - accuracy: 1.0000 - loss: 0.0000e+00

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import os

# Load dataset
# pandas represents empty CSV cells as float NaN, which the character-level
# encoders below cannot iterate. Rows missing either column are dropped here,
# in one step, so the two extracted arrays stay aligned row for row.
data = (
    pd.read_csv('final_output_15_2_25.csv')
    .dropna(subset=['Sequence', 'SMILE'])
    .reset_index(drop=True)
)

# Extract necessary columns as parallel arrays: element i of each array comes
# from the same CSV row.
protein_sequences = data['Sequence'].values
smiles_strings = data['SMILE'].values

# 1. Preprocess protein sequences (One-hot encoding)
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a batch of protein sequences into a fixed-size array.

    Args:
        protein_seq: Sized iterable of sequences (strings of one-letter
            residue codes), e.g. a list or a 1-D NumPy array of strings.
        max_seq_len: Number of positions per encoded sequence. Shorter
            sequences are zero-padded at the end; longer sequences are
            truncated to this length. Defaults to 5038, the value that was
            previously hard-coded in the body.

    Returns:
        A float64 array of shape ``(len(protein_seq), max_seq_len, 25)``.
        Entry ``[i, j, k]`` is 1 when character ``j`` of sequence ``i`` is the
        ``k``-th letter of the alphabet below, and 0 otherwise.
    """
    # 25 upper-case letters (every letter except 'B'). A character outside
    # this alphabet leaves its position as an all-zero row.
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

    one_hot = np.zeros((len(protein_seq), max_seq_len, len(aa_dict)))

    for i, seq in enumerate(protein_seq):
        # Slice first: without it, a sequence longer than max_seq_len would
        # index past axis 1 and raise IndexError.
        for j, aa in enumerate(seq[:max_seq_len]):
            if aa in aa_dict:
                one_hot[i, j, aa_dict[aa]] = 1
    return one_hot

# Apply one-hot encoding to protein sequences.
# The result is a single dense float64 array with 5038 * 25 entries per
# sequence (about 1 MB each), so the whole encoded dataset is held in memory.
X_seq = one_hot_encoding(protein_sequences)

# 2. Preprocess SMILES strings (Integer encoding)
def smiles_to_int(smiles_strings, max_length=1000):
    """Encode a batch of SMILES strings as fixed-width rows of integer codes.

    Every character is replaced by its 1-based position in the vocabulary
    below. Code 0 is used both for the padding after the end of a string and
    for any character that is not in the vocabulary. Characters at index
    ``max_length`` or beyond are ignored.

    Returns:
        An integer array of shape ``(len(smiles_strings), max_length)``.
    """
    vocabulary = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    code_of = {symbol: code for code, symbol in enumerate(vocabulary, start=1)}

    encoded = np.zeros((len(smiles_strings), max_length), dtype=int)

    for row, smile in enumerate(smiles_strings):
        # zip() stops at whichever input runs out first, so at most
        # max_length characters are written per row.
        for col, symbol in zip(range(max_length), smile):
            encoded[row, col] = code_of.get(symbol, 0)
    return encoded

# Apply integer encoding to SMILES strings (default width: 1000 codes per
# string; 0 marks both padding and characters outside the encoder's vocabulary).
X_smiles = smiles_to_int(smiles_strings)

# 3. Split the data into training and test sets
# 80/20 split with a fixed seed; both arrays are passed to the same call so
# each protein row stays paired with its SMILES row.
X_train_seq, X_test_seq, X_train_smiles, X_test_smiles = train_test_split(
    X_seq, X_smiles, test_size=0.2, random_state=42
)

# 4. Pad the SMILES sequences (to 1000)
# NOTE(review): smiles_to_int already returns rows of exactly 1000 codes, so
# this step should leave the values unchanged - confirm before removing it.
X_train_smiles = pad_sequences(X_train_smiles, maxlen=1000, padding='post')
X_test_smiles = pad_sequences(X_test_smiles, maxlen=1000, padding='post')

# 5. Prepare target variable (placeholder labels)
# FIXME(review): no label column is read from `data`; every sample is given
# label 0. With a constant target the network can only learn to always
# predict class 0, so the accuracy and loss reported below do not measure
# anything about the data. Replace with the real target before trusting results.
y_train = np.zeros(len(X_train_smiles))  # Placeholder: all samples are class 0
y_test = np.zeros(len(X_test_smiles))  # Placeholder: all samples are class 0

# 6. One-hot encode the target variable
# Each label becomes a length-1000 vector; with the placeholder labels above
# only column 0 is ever set.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=1000)  # One-hot encoding
y_test = tf.keras.utils.to_categorical(y_test, num_classes=1000)

# 7. Build the model
# Two-input network: a convolutional branch for the one-hot protein tensor
# and a dense branch for the SMILES codes, concatenated into one softmax head.

# Protein sequence model (1D Conv)
# 64 filters of width 3 slide along the sequence axis, pooling halves the
# length, and Flatten turns the result into one long feature vector.
seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))  # (5038, 25)
x = Conv1D(64, 3, activation='relu')(seq_input)
x = MaxPooling1D(2)(x)
x = Flatten()(x)

# SMILES sequence model (Dense)
# The 1000 integer codes are fed straight into a Dense layer, i.e. they are
# treated as numeric magnitudes rather than categorical symbols.
# NOTE(review): no embedding or scaling is applied - confirm this is intended.
# `y` below is a layer output, not a label array (labels are y_train / y_test).
smiles_input = Input(shape=(X_train_smiles.shape[1],))  # (1000,)
y = Dense(128, activation='relu')(smiles_input)

# Merge both models
merged = tf.keras.layers.concatenate([x, y])

# Output layer for multi-class classification
output = Dense(1000, activation='softmax')(merged)  # Output for 1000 classes

# Compile the model
# categorical_crossentropy matches the one-hot targets produced by to_categorical.
model = Model(inputs=[seq_input, smiles_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# 8. Set up Model Checkpoint to save the best model during training
checkpoint_dir = './model_checkpoints'
# exist_ok=True makes this a no-op when the directory is already there and
# removes the gap between a separate existence check and the creation call.
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep only the best whole-model snapshot, judged by the lowest validation loss.
# NOTE(review): '.h5' selects the HDF5 file format; confirm that is the format
# downstream loading code expects.
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.h5'),  # Save best model as .h5 file
    monitor='val_loss',  # Monitor validation loss
    save_best_only=True,  # Save the best model only
    save_weights_only=False,  # Save the entire model (architecture + weights)
    mode='min',  # Save the model when validation loss is minimum
    verbose=1  # Print a message when saving the best model
)

# 9. Train the model and save the best model
# 100 epochs in batches of 200. The held-out split is passed as
# validation_data here and again to evaluate() below, so the "validation"
# and "test" figures are computed on the same samples; the checkpoint's
# val_loss is therefore measured on that same split.
history = model.fit(
    [X_train_seq, X_train_smiles], y_train,
    epochs=100,
    batch_size=200,
    validation_data=([X_test_seq, X_test_smiles], y_test),
    callbacks=[checkpoint_callback]  # Use checkpoint callback to save best model
)

# 10. Evaluate the model
# This scores the in-memory model after the last epoch, not the checkpoint file.
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
test_loss, test_acc = model.evaluate([X_test_seq, X_test_smiles], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

# Optionally, plot training history: one figure per metric, drawing the
# training curve first and the validation curve second.
for train_key, val_key, title, ylabel in (
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    # The second curve comes from validation_data, which is the held-out split.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

# Save the training history to a file
# history.history maps each metric name to its list of per-epoch values, so
# the CSV gets one column per metric and one row per epoch (no index column).
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)


In [None]:
# Prediction

import numpy as np
import tensorflow as tf

# Define your functions to preprocess the protein sequence and SMILES
def one_hot_encoding(protein_seq, max_seq_len=5038):
    """One-hot encode a single protein sequence as a batch of one.

    Args:
        protein_seq: One sequence as a string of one-letter residue codes.
        max_seq_len: Number of positions in the encoded output. A shorter
            sequence is zero-padded at the end; a longer one is truncated to
            this length.

    Returns:
        A float64 array of shape ``(1, max_seq_len, 25)``. Entry ``[0, j, k]``
        is 1 when character ``j`` is the ``k``-th letter of the alphabet
        below, and 0 otherwise.
    """
    # 25 upper-case letters (every letter except 'B'). A character outside
    # this alphabet leaves its position as an all-zero row.
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

    # Leading axis of size 1 so the result can be fed to model.predict directly.
    one_hot = np.zeros((1, max_seq_len, len(aa_dict)))

    # Slice first: without it, a sequence longer than max_seq_len would index
    # past axis 1 and raise IndexError.
    for j, aa in enumerate(protein_seq[:max_seq_len]):
        if aa in aa_dict:
            one_hot[0, j, aa_dict[aa]] = 1
    return one_hot

# Define the function to preprocess SMILES string (use the same as during training)
def smiles_to_int(smiles, max_length=1000):
    """Encode one SMILES string as a ``(1, max_length)`` row of integer codes.

    Every character is replaced by its 1-based position in the vocabulary
    below. Code 0 is used both for the padding after the end of the string
    and for any character that is not in the vocabulary. Characters at index
    ``max_length`` or beyond are ignored.
    """
    vocabulary = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    code_of = {symbol: code for code, symbol in enumerate(vocabulary, start=1)}

    encoded = np.zeros((1, max_length), dtype=int)

    # zip() stops at whichever input runs out first, so at most max_length
    # characters are written.
    for col, symbol in zip(range(max_length), smiles):
        encoded[0, col] = code_of.get(symbol, 0)
    return encoded

# Assuming `model` is already loaded and trained
# NOTE(review): `model` is not defined in this cell. It has to be left over in
# the kernel from a training cell, so this cell fails on a fresh kernel unless
# training (or an explicit model load) runs first.
# NOTE(review): the one_hot_encoding / smiles_to_int definitions in this cell
# take a single sequence and replace the batch versions defined by the
# training cells; re-running a training cell after this one will pick up its
# own definitions again.

# Sample protein sequence to predict SMILES (for example)
protein_sequence = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYGAEEMPTLNRRAKQL"
# Step 1: Preprocess the input protein sequence
X_seq = one_hot_encoding(protein_sequence)

# Step 2: Make prediction using the model
# Note: We're passing the protein sequence and dummy SMILES input (because the model is expecting both)
# FIXME(review): the training cells give `model` a Dense(1000, softmax) output,
# so predict() returns one row of 1000 class probabilities, not integer
# character codes. The decoding step further down expects integer codes -
# confirm what this cell is meant to produce.
dummy_smiles = np.zeros((1, 1000))  # Dummy input for SMILES (since it's a multi-input model)
predicted_smiles_int = model.predict([X_seq, dummy_smiles])

# Step 3: Convert predicted SMILES from integer to character string
def int_to_smiles(smiles_int):
    """Decode the first row of an integer-coded SMILES array back to text.

    Only row 0 of the 2-D input is read. Each value is looked up as a 1-based
    position in the vocabulary below; zeros (padding) are skipped and values
    with no vocabulary entry contribute nothing to the result.
    """
    vocabulary = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()=#[]+-'
    symbol_of = dict(enumerate(vocabulary, start=1))  # code -> character

    width = smiles_int.shape[1]
    first_row_codes = (smiles_int[0, col] for col in range(width))
    return "".join(
        symbol_of.get(code, '') for code in first_row_codes if code != 0
    )

# Step 4: Convert the predicted SMILES integers back to string
# NOTE(review): predicted_smiles_int comes straight from model.predict(). If
# `model` is the softmax network built by the training cells, these are
# fractional probabilities; int_to_smiles looks each value up in a dict keyed
# by the integers 1..70, so fractional values match no key and add nothing to
# the string. Verify the printed result is meaningful before relying on it.
predicted_smiles = int_to_smiles(predicted_smiles_int)

print("Predicted SMILES:", predicted_smiles)
