In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential


In [2]:
def one_hot_encoding(protein_seq):
    """One-hot encode a collection of protein sequences.

    Parameters
    ----------
    protein_seq : iterable of str
        Protein sequences written with one-letter amino-acid codes.
        Lower-case letters are treated like their upper-case form.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sequences, max_seq_len, 20)`` where
        ``max_seq_len`` is the length of the longest input sequence.
        Channel ``k`` corresponds to the k-th letter of
        ``'ACDEFGHIKLMNPQRSTVWY'``. Positions past the end of a shorter
        sequence, and residues outside that alphabet (e.g. ``X``, ``B``,
        ``U``), are left as all-zero rows. An empty input yields an array
        of shape ``(0, 0, 20)``.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    num_aa = len(aa_dict)

    # Materialise once so generators and other one-shot iterables work with
    # both the max() scan and the encoding loop below.
    protein_seq = list(protein_seq)

    # default=0 keeps an empty dataset from raising ValueError in max().
    max_seq_len = max((len(seq) for seq in protein_seq), default=0)
    one_hot = np.zeros((len(protein_seq), max_seq_len, num_aa))

    for i, seq in enumerate(protein_seq):
        for j, aa in enumerate(seq):
            # Unknown / ambiguous residues have no channel: leave the row at
            # zero instead of failing with KeyError.
            idx = aa_dict.get(aa.upper())
            if idx is not None:
                one_hot[i, j, idx] = 1

    return one_hot
# one_hot_encoding returns a 3D array of shape (len(protein_seq), max_seq_len, num_aa):
#   len(protein_seq) = number of sequences in the data
#   max_seq_len      = length of the longest sequence
#   num_aa           = number of amino acids (size of each one-hot vector)

In [14]:
# Load the protein dataset from Excel. The cells below read the sequences from
# the "Sequence" column and the fingerprint targets from columns 7..887 (by position).
protein_data = pd.read_excel("/content/Trial_Dataset.xlsx")

# convert protein sequences to one-hot encoding
protein_seqs = protein_data["Sequence"].tolist()
one_hot = one_hot_encoding(protein_seqs)

# Bring every sequence to exactly `maxlen` positions. Padding is appended at the
# end, so truncation is also done at the end: the pad_sequences default
# (truncating='pre') would instead drop the *start* of any sequence longer than
# maxlen. Set maxlen to at least the longest sequence in your data.
padded_seqs = pad_sequences(one_hot, padding='post', truncating='post', maxlen=1000)

# Target matrix: 881 columns selected by position.
Fingerprint = protein_data.iloc[:,7:888]
assert Fingerprint.shape[1] == 881, (
    f"expected 881 fingerprint columns, got {Fingerprint.shape[1]}"
)

In [15]:
# Sanity check: expect (number of sequences, padded length, 20 amino-acid channels).
print(padded_seqs.shape)

(8, 1000, 20)


In [None]:
# Preview only the first five positions of the first sequence instead of
# dumping the whole tensor; slicing (not indexing) keeps this safe when empty.
print(padded_seqs[:1, :5])

In [20]:
# CNN: three Conv1D + max-pooling stages followed by a dense head
# that ends in 881 sigmoid outputs.
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(1000, 20)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(0.2),
    Dense(units=881, activation='sigmoid'),
])

# Adam optimizer, binary cross-entropy loss, accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# train the model
# Hand Keras a plain float32 array: a DataFrame read from Excel may carry
# integer or object dtypes, and a non-numeric cell fails here with a clear error.
fingerprint_targets = np.asarray(Fingerprint, dtype="float32")

# `epochs` is an upper bound. Training stops once val_loss has not improved for
# 20 consecutive epochs, and the best weights seen so far are restored.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=20, restore_best_weights=True
)

# Keep the History object so loss curves can be inspected afterwards.
history = model.fit(
    padded_seqs,
    fingerprint_targets,
    epochs=1000,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
)
