In [None]:
'''
This script trains an attention-based model on features extracted by a CNN from the hmm/pssm profiles, combined with bigram features computed from the same profiles.
'''

# import libraries
import glob
import random
import numpy as np
import pandas as pd
import csv
import h5py
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Dense, Dropout, LSTM, GRU, Conv1D, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, UpSampling2D
from tensorflow.keras.layers import concatenate, GlobalMaxPooling1D, Flatten, BatchNormalization
from tensorflow.keras.layers import Activation, Reshape, TimeDistributed, Embedding, Input
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adamax, Adadelta, Adagrad, Nadam
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l1, l2
from keras import backend as K
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

import parse_files as p
from features import bigram_features0, bigram_features1, bigram_features2, bigram_features3, bigram_features4, bigram_features5

from numpy.random import seed
from tensorflow.python.keras.backend import set_session
from keras import regularizers

In [None]:
# Experiment configuration: edit these values before running the cells below.

# Dataset to use: dd / edd / tg / SCOPe / 25_SCOPe_DDEDDTG
dataset = "SCOPe"

# Profile type used as raw input: hmm / pssm
rawdata = "hmm"

# Prediction target: Class / Fold
predtype = "Fold"

# Set to True to use class features (only when predtype is Fold), otherwise False
clsfeature = False

In [None]:
# set seed (python, numpy and tensorflow each keep their own random state)
seed = 420
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

seqlen = 200 # every profile is padded/truncated to exactly this many rows
no_filters1 = 4 # number of filters of the Conv2D layer built in the training cell

# the six bigram feature extractors, in the order the model inputs expect them
bigram_extractors = (bigram_features0, bigram_features1, bigram_features2,
                     bigram_features3, bigram_features4, bigram_features5)

# load all the filenames of the profiles (hmm or pssm, depending on rawdata).
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# sample order (and therefore the seeded cross-validation splits) reproducible.
filelist = sorted(glob.glob('./data/'+dataset+'/'+rawdata+'/*.txt'))
if not filelist:
    raise FileNotFoundError("No "+rawdata+" files found in ./data/"+dataset+"/"+rawdata+"/")

# read all the labels of the given dataset
if dataset == "SCOPe":
    label_for_seq = pd.read_csv("./astral_2_08_final.csv") # make sure all the sequences are in uppercase
else:
    label_for_seq = p.load_labels('./data/'+dataset+'_'+predtype+'_labels.txt')


def _parse_profile(path):
    """Parse one profile file and return (uppercase sequence, probability matrix)."""
    if rawdata == 'hmm':
        parsed_seq, prob, _extras = p.parse_hmm(path)
    else:
        parsed_seq, prob, _lprob, _extras = p.parse_pssm(path)
    return parsed_seq.upper(), prob


def _lookup_label(sequence):
    """Return the label of an uppercase sequence for the configured dataset.

    For SCOPe the labels live in a DataFrame with "sequence" and "fold" columns,
    so the row has to be selected by value; indexing the DataFrame directly with
    the sequence would look up a column and raise KeyError. For the other
    datasets label_for_seq is indexed directly by the sequence.
    """
    if dataset == "SCOPe":
        folds = label_for_seq.loc[label_for_seq["sequence"] == sequence, "fold"].values
        if len(folds) == 0:
            raise KeyError("sequence not found in ./astral_2_08_final.csv: "+sequence)
        return folds[0]
    return label_for_seq[sequence]


def _fit_to_length(prob, length):
    """Offset the profile by 0.01, then zero-pad or truncate it to `length` rows."""
    norm = prob + 0.01
    missing = length - len(norm)
    if missing <= 0:
        return norm[:length] # truncate the sequence if it is more than the required length
    if len(norm) == 0:
        raise ValueError("cannot pad an empty profile")
    # norm[:1] keeps the row two-dimensional, so a single concatenate appends all
    # padding rows at once instead of re-copying the array once per missing row
    zero_row = norm[:1] * 0
    return np.concatenate([norm] + [zero_row] * missing)


# initialise empty lists for storing data
labels = []
seq = []
profiles = []
bigram_columns = [[] for _ in bigram_extractors]

# read all the HMM or PSSM matrices of the given dataset
for path in filelist:
    tempseq, prob = _parse_profile(path)
    label = _lookup_label(tempseq)
    seq.append(tempseq)
    labels.append(label)

    if clsfeature: # if use class features, append them (use only with Fold prediction)
        # NOTE(review): whether p.get_class_label accepts SCOPe fold labels cannot be
        # confirmed from this notebook -- verify before enabling clsfeature on SCOPe
        class_label = p.get_class_label(dataset, label)
    for column, extract in zip(bigram_columns, bigram_extractors):
        features = extract(prob)
        if clsfeature:
            features = np.append(features, class_label).reshape(1, -1)
        column.append(features)

    profiles.append(_fit_to_length(prob, seqlen))

# convert everything to numpy arrays
labels = np.array(labels)
num_classes = len(np.unique(labels))
foldlabels = pd.get_dummies(labels).values # one-hot encoded labels
sequences = np.array(seq)

(biGram_features0, biGram_features1, biGram_features2,
 biGram_features3, biGram_features4, biGram_features5) = bigram_columns
biGram0, biGram1, biGram2, biGram3, biGram4, biGram5 = (np.array(column) for column in bigram_columns)

# only the profile type selected by rawdata is populated; the other stays empty
matrixdata = np.array(profiles)
if rawdata == 'hmm': # use hmm data
    hmm, pssm = matrixdata, np.array([])
else: # use pssm data
    hmm, pssm = np.array([]), matrixdata

In [None]:
with tf.device('/device:GPU:0'): # use GPU
    # let the TF1-style session grow its GPU memory on demand, capped at 20% of the device
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

    acc_k_fold = [] # store the accuracy of each fold
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # 10-fold cross validation

    # bigram feature arrays paired with the names of the model inputs that consume them
    bigram_arrays = [biGram0, biGram1, biGram2, biGram3, biGram4, biGram5]
    bigram_names = ['bigram_input%d' % idx for idx in range(len(bigram_arrays))]

    # model saving path; the checkpoint callback needs the directory to exist
    model_dir = "./model/"
    load_file = model_dir+dataset+"_SXG_BiGram_best.h5"
    tf.io.gfile.makedirs(model_dir)

    for f, (train, test) in enumerate(kf.split(sequences, labels), start=1): # f is the fold number
        # gather the training and testing inputs once, keyed by model input name
        train_inputs = {'cnn_input': matrixdata[train]}
        test_inputs = {'cnn_input': matrixdata[test]}
        for input_name, features in zip(bigram_names, bigram_arrays):
            train_inputs[input_name] = features[train]
            test_inputs[input_name] = features[test]

        # the model output carries a length-1 middle axis, so the one-hot labels are
        # reshaped from (samples, classes) to (samples, 1, classes) to match it
        Y_train, Y_test = foldlabels[train], foldlabels[test]
        Y_train = tf.reshape(Y_train, [Y_train.shape[0], 1, Y_train.shape[1]])
        Y_test = tf.reshape(Y_test, [Y_test.shape[0], 1, Y_test.shape[1]])

        # CNN branch over the profile matrix
        cnn_input = Input(shape=(seqlen,20), name='cnn_input') # input layer
        c_input = Reshape((seqlen,20,1))(cnn_input) # add a channel axis for Conv2D
        c_output1 = Conv2D(no_filters1, (5,5), activation='tanh', strides=5, padding='same')(c_input) # convolution layer
        m_output1 = MaxPooling2D((3,3), strides=3, padding='same')(c_output1) # max pooling layer
        f_input = Flatten()(m_output1) # flatten the output
        f_input = tf.expand_dims(f_input, axis=1) # (batch, 1, features) so it can be joined with the bigram inputs

        # one input layer per bigram feature array (each exactly once)
        bigram_inputs = [
            Input(shape=(features.shape[1], features.shape[2]), name=input_name)
            for input_name, features in zip(bigram_names, bigram_arrays)
        ]

        # concatenate the CNN features and the bigram features along the feature axis
        hybrid_features = concatenate([f_input] + bigram_inputs, axis=2)

        # self-attention over the hybrid features with a residual connection and layer normalization
        attention_layer = layers.MultiHeadAttention(num_heads=2, key_dim=2)
        attention_out1 = attention_layer(hybrid_features, hybrid_features)
        l_output1 = layers.Add()([attention_out1, hybrid_features])
        l_output1 = layers.LayerNormalization(epsilon=1e-6)(l_output1)

        d_output1 = Dense(512, activation='relu')(l_output1) # dense layer
        # NOTE(review): 2512 units may be a typo for 512 -- confirm before changing
        d_output1 = Dense(2512, activation='relu')(d_output1) # dense layer

        d_output_last = Dense(512, activation='tanh')(d_output1) # dense layer
        d_output_last = Dense(128, activation='tanh')(d_output_last) # dense layer
        # output layer
        main_output = Dense(foldlabels.shape[1], activation='softmax', name='main_output', kernel_regularizer=l2(0.01))(d_output_last)

        # create and compile the model
        model = Model(inputs=[cnn_input] + bigram_inputs, outputs=[main_output])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

        # fresh callbacks per fold so the "best so far" tracking restarts for every model
        # NOTE(review): the test fold doubles as validation data for early stopping and
        # checkpoint selection, so the accuracies reported below are optimistic
        earlyStopping = EarlyStopping(monitor='val_accuracy', patience=50, verbose=0, mode='auto') # early stopper
        checkpointer = ModelCheckpoint(monitor='val_accuracy', filepath=load_file, verbose=0, save_best_only=True) # checkpointer

        history = model.fit(train_inputs, {'main_output': Y_train},
            validation_data=(test_inputs, {'main_output': Y_test}),
            epochs=500, batch_size=64, callbacks=[checkpointer, earlyStopping], verbose=0) # train the model

        model.load_weights(load_file) # load the best model
        # score using the best model
        score = model.evaluate(test_inputs, {'main_output': Y_test}, verbose=0, batch_size=1)
        print("Fold-",f, ": ", score) # print score

        acc_k_fold.append(score[1]) # append accuracy to the list

        # predict the scores
        pred_scores = model.predict(test_inputs)
        print("pred_scores shape =", pred_scores.shape)


    resdata = "Class Features = "+str(clsfeature)+" -- 10-cross fold accuracy of Protein "+predtype+" Prediction of "+dataset+" using "+rawdata+" is :"+str(np.mean(acc_k_fold))+"\n"

    print(resdata)
    print("10 Fold Accuracies:", acc_k_fold)

print("# of Labels:", num_classes)
print("# of Labels:", foldlabels.shape[1])
print("hybrid_features count:", hybrid_features.shape)


Fold- 1 :  [1.637765645980835, 0.8464000225067139]
pred_scores shape = (625, 1, 171)
Fold- 2 :  [1.6080728769302368, 0.8464000225067139]
pred_scores shape = (625, 1, 171)
Fold- 3 :  [1.7202656269073486, 0.8399999737739563]
pred_scores shape = (625, 1, 171)
Fold- 4 :  [1.7978019714355469, 0.8159999847412109]
pred_scores shape = (625, 1, 171)
Fold- 5 :  [1.7183433771133423, 0.8223999738693237]
pred_scores shape = (625, 1, 171)
Fold- 6 :  [2.2258787155151367, 0.7247999906539917]
pred_scores shape = (625, 1, 171)
Fold- 7 :  [1.7343553304672241, 0.8288000226020813]
pred_scores shape = (625, 1, 171)
Fold- 8 :  [1.5747936964035034, 0.8543999791145325]
pred_scores shape = (625, 1, 171)
Fold- 9 :  [1.838038682937622, 0.8080000281333923]
pred_scores shape = (625, 1, 171)
Fold- 10 :  [1.6883172988891602, 0.8333333134651184]
pred_scores shape = (624, 1, 171)
Class Features = False -- 10-cross fold accuracy of Protein Fold Prediction of SCOPe using hmm is :0.8220533311367035

# of Labels: 171
# of

Fold- 1 :  [1.1856778860092163, 0.8428571224212646]
3/3 [==============================] - 1s 33ms/step
pred_scores shape = (70, 1, 27)
Fold- 2 :  [0.7193385362625122, 0.8714285492897034]
3/3 [==============================] - 1s 40ms/step
pred_scores shape = (70, 1, 27)
Fold- 3 :  [0.7449996471405029, 0.9142857193946838]
3/3 [==============================] - 1s 32ms/step
pred_scores shape = (70, 1, 27)
Fold- 4 :  [1.0007456541061401, 0.8714285492897034]
3/3 [==============================] - 1s 25ms/step
pred_scores shape = (70, 1, 27)
Fold- 5 :  [1.1810381412506104, 0.8571428656578064]
3/3 [==============================] - 1s 43ms/step
pred_scores shape = (70, 1, 27)
Fold- 6 :  [0.9316855072975159, 0.8695651888847351]
3/3 [==============================] - 1s 28ms/step
pred_scores shape = (69, 1, 27)
Fold- 7 :  [1.3379615545272827, 0.8405796885490417]
3/3 [==============================] - 1s 23ms/step
pred_scores shape = (69, 1, 27)
Fold- 8 :  [0.9299620389938354, 0.8840579986572266]
3/3 [==============================] - 1s 33ms/step
pred_scores shape = (69, 1, 27)
Fold- 9 :  [1.2655516862869263, 0.8550724387168884]
3/3 [==============================] - 1s 35ms/step
pred_scores shape = (69, 1, 27)
Fold- 10 :  [0.6408186554908752, 0.8985507488250732]
3/3 [==============================] - 1s 26ms/step
pred_scores shape = (69, 1, 27)
Class Features = False -- 10-cross fold accuracy of Protein Fold Prediction of dd using hmm is :0.8704968869686127
