<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/DD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential


In [9]:
def one_hot_encoding(protein_seq):
    amino_acids = 'ACDEFGHIJKLMNOPQRSTUVWXYZ'
    aa_dict = {amino_acids[i]: i for i in range(len(amino_acids))}
    num_aa = len(aa_dict)

    max_seq_len = max(len(seq) for seq in protein_seq)
    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))

    for i, seq in enumerate(protein_seq):
        for j, aa in enumerate(seq):
            one_hot[i, j, aa_dict[aa]] = 1

    return one_hot
#3D array of one-hot encoding
#len(prtein_seq) = number of sequences in data
#max_seq_len = maximum length of sequence
#num_aa = actual one-hot or number of aminoacid

In [None]:
#@title DOWNLOAD DATA FROM KAGGLE
# DOWNLOAD DATA FROM KAGGLE (!IMPORTANT!: REFRESH RUNTIME BEFORE RE-RUNNING THE CODE)
#%%capture
from google.colab import files
files.upload()  #this will prompt you to upload the kaggle.json

#Make Directory of Kaggle and set its permission for access.
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json  # set permission

# Download Data from Kaggle Fast and Unzip them in /content
!kaggle datasets download -d drswayamprakashpatel/DD-Dataset-csv  -p /content # For model download

#Unzip data (Two Folders - Training and Validation)
import os
os.chdir('/content')
#create a directory named train/
!unzip -q DD-Dataset-csv.zip #Unzip Model

Saving kaggle.json to kaggle (1).json
kaggle.json
Downloading DD-Dataset-csv.zip to /content
  0% 0.00/5.59M [00:00<?, ?B/s]
100% 5.59M/5.59M [00:00<00:00, 64.3MB/s]


In [10]:
# load protein sequence data
protein_data = pd.read_csv("/content/DATASET_Final.csv") #Data in excel/csv. two columns: "sequence" and "Fingerprint of 881 bit binary" 
protein_data.dropna(inplace=True)
protein_data = protein_data.iloc[0:500, :]
# convert protein sequences to one-hot encoding
protein_seqs = protein_data["Sequence"].tolist()
one_hot = one_hot_encoding(protein_seqs)

# pad sequences to same length
padded_seqs = pad_sequences(one_hot, padding='post', maxlen=1000) #set maxlen value as per the longest lenght of protein sequence in your data

Fingerprint = protein_data.iloc[:,7:888]

X = padded_seqs
Y = Fingerprint
output_nodes = Y.shape[1]

In [None]:


#SPLIT DATA

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X, Y, test_size=0.3,random_state = 42 )
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5, random_state= 42)

# define CNN model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(1000, 25)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=output_nodes, activation='sigmoid'))

import tensorflow as tf
import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

filepath = '/content/drive/MyDrive/Model_DE/DDModel.hdf5'
 
checkpoint = [tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_accuracy', mode='max', save_best_only=True, Save_weights_only = False, verbose = 1), 
              tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=25, verbose =1), [tensorboard_callback]]


model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss=tf.keras.losses.BinaryCrossentropy(from_logits = False), metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs= 2000, callbacks=[checkpoint],validation_data=(X_val, Y_val), batch_size= None)


Epoch 1/2000
Epoch 1: val_accuracy improved from -inf to 0.00000, saving model to /content/drive/MyDrive/Model_DE/DDModel.hdf5
Epoch 2/2000
Epoch 2: val_accuracy did not improve from 0.00000
Epoch 3/2000
Epoch 3: val_accuracy did not improve from 0.00000
Epoch 4/2000
Epoch 4: val_accuracy did not improve from 0.00000
Epoch 5/2000
Epoch 5: val_accuracy improved from 0.00000 to 0.06667, saving model to /content/drive/MyDrive/Model_DE/DDModel.hdf5
Epoch 6/2000
Epoch 6: val_accuracy did not improve from 0.06667
Epoch 7/2000
Epoch 7: val_accuracy did not improve from 0.06667
Epoch 8/2000
Epoch 8: val_accuracy did not improve from 0.06667
Epoch 9/2000
Epoch 9: val_accuracy improved from 0.06667 to 0.08000, saving model to /content/drive/MyDrive/Model_DE/DDModel.hdf5
Epoch 10/2000

In [None]:
model.evaluate(X_test, Y_test)

Y_train_predict = np.round(model.predict(X_train))
Y_train_l = tf.argmax(Y_train, axis = 1)
Y_train_predict_l = tf.argmax(Y_train_predict, axis =1)
import sklearn.metrics as skm
cm = skm.multilabel_confusion_matrix(Y_train_l, Y_train_predict_l)
print(cm)
print( skm.classification_report(Y_train_l, Y_train_predict_l))
 
train_acc = max(hist.history['precision'])
val_acc = max(hist.history['val_precision'])
train_loss = min(hist.history['loss'])
val_loss = min(hist.history['val_loss'])
print('Training Precision is')
print(train_acc)
print('Validation Precision is')
print(val_acc)
print('Training loss is')
print(train_loss)
print('Validation loss is')
print(val_loss)
 
# Error Graph
import matplotlib.pyplot as plt
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
#plt.show()
#plt.figure(figsize=(5,5), dpi=300)
#plt.gcf()
plt.savefig('Loss.svg', DPI = 3500, Transperent = True)
 
plt.plot(hist.history['precision'])
plt.plot(hist.history['val_precision'])
plt.title('Model accuracy')
plt.ylabel('Precision')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='lower right')
#plt.show()
#plt.figure(figsize=(5,5), dpi=300)
#plt.gcf()
plt.savefig('Precision.svg', DPI = 3500, Transperent = True)


plt.scatter(Y_train,model.predict(X_train), label = 'Train', c='blue')
plt.title('Neural Network Predictor')
plt.xlabel('Actual Interaction')
plt.ylabel('Predicted Interaction')
plt.scatter(y_test,model.predict(X_test),c='lightgreen', label='Test', alpha = 0.8)
plt.legend(loc=4)
#plt.show()
plt.savefig('Act_vs_Pred.svg', DPI = 3500, Transperent = True)

In [None]:
# train the model
model.fit(padded_seqs, Fingerprint, epochs=10, batch_size=64, validation_split=0.2)
