<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/SEQUENCE_SMILE_CNN_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**CNN-RNN approach**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (
    Concatenate,
    Conv1D,
    Dense,
    Embedding,
    GlobalMaxPooling1D,
    Input,
    LSTM,
    MaxPooling1D,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Read the full table of protein/drug pairs from disk.
# NOTE: 'your_dataset.csv' is a placeholder path; point it at the real file.
data = pd.read_csv('your_dataset.csv')

# Model input is the protein sequence column; the target is the drug SMILES
# column (the column header in the CSV is spelled 'Drug SMILE').
X, y = data['Protein Sequence'], data['Drug SMILE']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a character-level vocabulary from the training sequences only, so the
# test split does not influence which symbols receive an index.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)

# Map every protein sequence to its list of integer token ids.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# The common padded length is the longest encoded sequence found in either
# split, so no sequence is truncated by the padding step below.
longest_train_sequence = max(map(len, X_train_sequences))
longest_test_sequence = max(map(len, X_test_sequences))
max_sequence_length = max(longest_train_sequence, longest_test_sequence)

# Bring both splits to that common length (pad_sequences is otherwise left at
# its default settings).
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Convert drug SMILES strings to one-hot class labels.
#
# The encoder is fitted on every label in the dataset (`y`, i.e. train and
# test together) rather than on `y_train` alone: LabelEncoder.transform raises
# ValueError for a label it was not fitted on, so any SMILES string that occurs
# only in the test split would otherwise abort the script at this point.
# Consequence: `num_classes` counts the distinct SMILES strings in the whole
# dataset, including classes the model never sees during training.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One output unit per distinct SMILES string known to the encoder.
num_classes = len(label_encoder.classes_)

# Passing num_classes explicitly keeps both one-hot matrices the same width
# even when a split does not contain the highest class index.
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Define the model architecture:
#   token ids -> embedding -> parallel Conv1D/MaxPooling1D branches (one per
#   kernel size) -> channel-wise concatenation -> LSTM -> softmax over the
#   encoded drug classes.
input_shape = (max_sequence_length,)
num_filters = 128
filter_sizes = [3, 4, 5]
embedding_dim = 128
hidden_units = 64

input_layer = Input(shape=input_shape)
# Vocabulary size is the number of fitted tokens plus one — presumably to
# leave index 0 free for the padding value; confirm against the Tokenizer docs.
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim)(input_layer)
conv_layers = []
for filter_size in filter_sizes:
    # padding='same' keeps every branch at the input's time length. With the
    # default 'valid' padding, each kernel size shortens the sequence by a
    # different amount, the pooled branches end up with different lengths, and
    # Concatenate (which joins along the channel axis and needs all other
    # dimensions to match) rejects them.
    conv_layer = Conv1D(
        num_filters,
        filter_size,
        padding='same',
        activation='relu',
    )(embedding_layer)
    pooled_layer = MaxPooling1D()(conv_layer)
    conv_layers.append(pooled_layer)

# Concatenate needs at least two inputs, so a single branch is passed through
# unchanged.
concat_layer = Concatenate()(conv_layers) if len(conv_layers) > 1 else conv_layers[0]
# The LSTM consumes the pooled feature sequence; its output feeds the
# classification head.
lstm_layer = LSTM(hidden_units)(concat_layer)
output_layer = Dense(num_classes, activation='softmax')(lstm_layer)

model = Model(input_layer, output_layer)

# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs in mini-batches of 32.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the per-epoch validation figures are computed on the test data.
model.fit(
    X_train_padded,
    y_train_categorical,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_padded, y_test_categorical),
)
