<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/SEQUENCE_SMILE_DIRECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Protein Sequence to Drug SMILES: Direct ML/DL Approach**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

# End-to-end baseline: embed each protein sequence as the mean of its
# per-residue Word2Vec vectors, then train a small dense classifier that
# predicts the drug SMILES string as a categorical label.


def _tokenize(seq):
    """Return the residues of *seq* as a list of single-character tokens."""
    # str.split() only splits on whitespace, so a contiguous sequence such as
    # "MKTAYI" would become ONE token. Strip any whitespace first, then split
    # per character, so both contiguous and space-separated inputs work.
    return list("".join(seq.split()))


def _mean_embedding(tokens, keyed_vectors, dim):
    """Return the mean vector of *tokens*, or zeros if none are in the vocabulary."""
    vectors = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if not vectors:
        # Same dtype as the Word2Vec vectors so the stacked array stays float32.
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vectors, axis=0)


# Load the dataset from a CSV file
data = pd.read_csv('your_dataset.csv')

# Rows missing either column cannot be tokenized or labelled; exclude them up
# front instead of failing later inside the tokenizer.
labelled = data.dropna(subset=['Protein Sequence', 'Drug SMILE'])

# Split the dataset into input (protein sequence) and output (drug SMILE) columns
X = labelled['Protein Sequence']
y = labelled['Drug SMILE']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize protein sequences into individual amino acids
X_train_tokens = [_tokenize(seq) for seq in X_train]
X_test_tokens = [_tokenize(seq) for seq in X_test]

# Train Word2Vec model to learn embeddings (on the training split only)
embedding_dim = 100
w2v_kwargs = dict(sentences=X_train_tokens, window=5, min_count=1, workers=4)
try:
    # gensim >= 4.0 names the dimensionality parameter `vector_size`.
    word2vec_model = Word2Vec(vector_size=embedding_dim, **w2v_kwargs)
except TypeError:
    # gensim < 4.0 only accepts the old `size` keyword.
    word2vec_model = Word2Vec(size=embedding_dim, **w2v_kwargs)

# Convert protein sequences to fixed-length embeddings
X_train_embeddings = np.array(
    [_mean_embedding(tokens, word2vec_model.wv, embedding_dim) for tokens in X_train_tokens]
)
X_test_embeddings = np.array(
    [_mean_embedding(tokens, word2vec_model.wv, embedding_dim) for tokens in X_test_tokens]
)

# Preprocess the output variable.
# Fit on ALL labels: a SMILES string that lands only in the test split would
# otherwise make transform(y_test) raise "y contains previously unseen labels".
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the model architecture: one hidden layer, softmax over every known SMILES class
model = tf.keras.Sequential([
    tf.keras.Input(shape=(embedding_dim,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax'),
])

# Compile the model (sparse loss because labels are integer class ids, not one-hot)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_embeddings, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_embeddings, y_test_encoded))
