In [None]:
import numpy as np
import tensorflow as tf
import keras
from keras import Model, activations, layers, losses, optimizers, callbacks, regularizers
from keras.layers import Dense, Convolution1D, Dropout, BatchNormalization, concatenate, TimeDistributed
import matplotlib.pyplot as plt

In [None]:
# Residue alphabet used for one-hot encoding: 21 symbols, 'X' last
# (presumably the placeholder for unknown residues -- confirm against the data).
all_aa = "ARNDCEQGHILKMFPSTWYVX"
# Residue letter -> column index of its one-hot vector.
aa_onehot_dict = dict()
for i, aa in enumerate(all_aa):
    aa_onehot_dict[aa] = i

# NOTE(review): machine-specific absolute path; every parser below reads from it.
path = "C:/Users/vinicius/Downloads/data/training/"

def aa_onehot_encoding(seq, max_len=800):
    """One-hot encode a residue sequence and zero-pad it to ``max_len`` rows.

    Args:
        seq: Iterable of residue letters; each must be a key of ``aa_onehot_dict``.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors, one per position. Each vector has
        one entry per symbol in ``aa_onehot_dict``; padding rows are all zeros.

    Raises:
        KeyError: If ``seq`` contains a letter missing from ``aa_onehot_dict``.
        ValueError: If ``seq`` has more than ``max_len`` residues. The previous
            ``while len(profile) != 800`` padding loop never terminated here.
    """
    n_symbols = len(aa_onehot_dict)
    profile = []
    for aa in seq:
        encoded = np.zeros(n_symbols)
        encoded[aa_onehot_dict[aa]] = 1
        profile.append(encoded)
    if len(profile) > max_len:
        raise ValueError(
            "sequence has %d residues, more than max_len=%d" % (len(profile), max_len)
        )
    # Pad with all-zero rows so every sample has the same number of positions.
    profile.extend(np.zeros(n_symbols) for _ in range(max_len - len(profile)))
    return profile

def parse_dssp(dssp_file):
    """Read the secondary-structure string for one sample.

    Opens ``<path>dssp/<dssp_file>.dssp``, discards the first line
    (presumably a header -- confirm against the data files) and returns the
    second line with trailing whitespace removed.
    """
    dssp_path = path + "dssp/" + dssp_file + ".dssp"
    with open(dssp_path, 'r') as handle:
        handle.readline()  # first line is not used
        second_line = handle.readline()
    return second_line.rstrip()

def parse_pssm(pssm_filename, max_len=800):
    """Parse ``<path>pssm/<pssm_filename>.pssm`` into a padded profile and sequence.

    The first 3 and last 6 lines of the file are skipped. For every remaining
    row, whitespace-split field 1 is taken as the residue letter and fields
    ``22:-2`` are converted to floats and divided by 100 (presumably the
    percentage block of a PSI-BLAST ASCII PSSM -- confirm against the files).

    Args:
        pssm_filename: Sample identifier, i.e. the file name without directory
            and without the ``.pssm`` extension.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        ``(profile, seq)``: ``profile`` is a list of ``max_len`` rows, parsed
        rows first, then zero vectors of length 20 as padding; ``seq`` is the
        residue string read from the file.

    Raises:
        ValueError: If the file holds more than ``max_len`` residue rows. The
            previous ``while len(profile) != 800`` loop never terminated here.
    """
    profile = []
    residues = []
    with open(path+"pssm/"+pssm_filename+".pssm", 'r') as pssm:
        pssm_lines = pssm.readlines()
    for line in pssm_lines[3:-6]:
        fields = line.split()
        if not fields:
            # Tolerate stray blank rows instead of failing on fields[1].
            continue
        residues.append(fields[1])
        profile.append([float(value) / 100 for value in fields[22:-2]])
    if len(profile) > max_len:
        raise ValueError(
            "%s.pssm has %d residues, more than max_len=%d"
            % (pssm_filename, len(profile), max_len)
        )
    # Pad with all-zero rows so every sample has the same number of positions.
    profile.extend(np.zeros(20) for _ in range(max_len - len(profile)))
    return profile, ''.join(residues)


def parse_fasta(file):
    """Placeholder for a FASTA parser; not implemented yet.

    The ``file`` argument is currently ignored and the function returns ``None``.
    """
    return None

# Secondary-structure letter -> class index used for the one-hot labels.
ss_map = {'C': 0, 'H': 1, 'E': 2}

def ss_onehot_encoding(ss_sequence, max_len=800):
    """One-hot encode a secondary-structure string and zero-pad it to ``max_len`` rows.

    Args:
        ss_sequence: Iterable of structure letters; each must be a key of ``ss_map``.
        max_len: Number of rows in the returned encoding (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors with one entry per class in
        ``ss_map``; padding rows are all zeros.

    Raises:
        KeyError: If a letter is missing from ``ss_map``.
        ValueError: If ``ss_sequence`` has more than ``max_len`` positions. The
            previous ``while len(ss_encoded) != 800`` loop never terminated here.
    """
    n_classes = len(ss_map)
    ss_encoded = []
    for struc in ss_sequence:
        encoding = np.zeros(n_classes)
        encoding[ss_map[struc]] = 1
        ss_encoded.append(encoding)
    if len(ss_encoded) > max_len:
        raise ValueError(
            "structure string has %d positions, more than max_len=%d"
            % (len(ss_encoded), max_len)
        )
    # Pad with all-zero rows so labels line up with the padded feature matrix.
    ss_encoded.extend(np.zeros(n_classes) for _ in range(max_len - len(ss_encoded)))
    return ss_encoded

def get_data(file, encode_y=True):
    """Build feature and label arrays for every sample listed in ``<path><file>``.

    Each non-blank line of the list file is a sample identifier. For each one
    the PSSM profile and the one-hot encoded sequence are concatenated along
    the feature axis (sequence columns first), and the DSSP string is read
    with ``'-'`` replaced by ``'C'``.

    Args:
        file: Name of the list file, relative to ``path``.
        encode_y: When True the labels are one-hot encoded and padded with
            ``ss_onehot_encoding``; when False the raw label strings are kept.

    Returns:
        ``(x, y)`` as NumPy arrays: ``x`` stacks one feature matrix per sample,
        ``y`` stacks either the one-hot label matrices or the label strings.
    """
    x = []
    y = []
    with open(path+file, 'r') as sample_file:
        for line in sample_file:
            sample_id = line.rstrip()
            if not sample_id:
                # A blank (e.g. trailing) line would otherwise be looked up as
                # "pssm/.pssm" and abort the whole load.
                continue
            pssm, sequence = parse_pssm(sample_id)
            sequence_hot = aa_onehot_encoding(sequence)
            x.append(np.concatenate((sequence_hot, pssm), axis=1))

            # TODO: check that the DSSP string and the PSSM sequence have equal length.
            dssp = parse_dssp(sample_id).replace('-','C')
            if encode_y:
                dssp = ss_onehot_encoding(dssp)
            y.append(dssp)
    return np.array(x), np.array(y)

In [None]:
# The first N_TEST samples of list.txt are held out for evaluation.
N_TEST = 101

x_data, y_data = get_data('list.txt')
# Second pass only for the un-encoded label strings, which the evaluation cells
# compare character by character. They get their own name so the one-hot
# y_data is no longer silently overwritten.
y_labels = get_data('list.txt', encode_y=False)[1]
assert len(x_data) == len(y_data) == len(y_labels)

x_train, y_train = x_data[N_TEST:], y_data[N_TEST:]
x_test, y_test = x_data[:N_TEST], y_labels[:N_TEST]

In [None]:
# Candidate kernel-size lists for the naive inception block (one Conv1D branch
# per entry). Only n1 is used as the default below.
n = [1,1,1,3,3,3,3,3,5,5,5,5,5,5,5,7,7,7,7,7,7,7,7,7]
n1 = [1,3,3,3,5,5,5,5,5,7,7,7,7,7,7,7]
n2 = [1,3,3,5,5,5,7,7,7,7]
class InceptionNet_naive(layers.Layer):
    """Naive inception block: parallel same-padded Conv1D branches, concatenated, then ReLU.

    Args:
        num_layers: Stored as ``n_layers``; not used to build the block (the
            number of branches is ``len(kernel_sizes)``).
        num_features: Number of filters of every Conv1D branch.
        kernel_sizes: Iterable of kernel sizes, one branch per entry. Defaults
            to the module-level ``n1`` list.
    """
    def __init__(self, num_layers=7, num_features=41, kernel_sizes=None):
        super().__init__()
        self.n_features = num_features
        self.n_layers = num_layers
        self.kernel_sizes = tuple(n1 if kernel_sizes is None else kernel_sizes)
        self.layers = [
            layers.Conv1D(self.n_features, kernel_size=size, padding='same')
            for size in self.kernel_sizes
        ]

    def call(self, inputs):
        """Apply every branch to ``inputs`` and return ReLU of the concatenation."""
        # Branch outputs stay local: storing per-call tensors on ``self`` keeps
        # stale graph tensors alive between calls.
        branch_outputs = [branch(inputs) for branch in self.layers]
        X = layers.concatenate(branch_outputs)
        return activations.relu(X)
    
# Baseline model: two stacked naive inception blocks and a per-position
# 3-way softmax over inputs of shape (800, 41).
inputs = layers.Input((800, 41))
X = inputs
for _ in range(2):
    X = InceptionNet_naive()(X)
# Alternative idea (not used): three parallel InceptionNet_naive blocks on the
# same input, concatenated and passed through ReLU.

Y = layers.Dense(3, activation='softmax')(X)
model = Model(inputs=inputs, outputs=Y)

loss_fn = losses.CategoricalCrossentropy()
opt = optimizers.RMSprop()
# Other options to try: categorical_focal_crossentropy, adam,
# sparse_categorical_crossentropy.
model.compile(loss=loss_fn,
              optimizer=opt,
              metrics=['accuracy'])

# Prefer the second GPU as before, but fall back to whatever exists so the
# cell also runs on single-GPU and CPU-only machines.
available_gpus = tf.config.list_logical_devices('GPU')
if len(available_gpus) > 1:
    train_device = '/GPU:1'
elif available_gpus:
    train_device = '/GPU:0'
else:
    train_device = '/CPU:0'

with tf.device(train_device):
    history = model.fit(x_train, y_train,
                        epochs=5,
                        batch_size=128,
                        validation_split=0.1)

In [None]:
ss_map = {'C': 0, 'H': 1, 'E': 2}
# Inverse of ss_map: class index -> secondary-structure letter.
from_aa = {0: 'C', 1: 'H', 2: 'E'}

predictions_hot = model.predict(x_test)
# Most likely class per position, decoded back into a structure string.
predictions = [
    ''.join(from_aa[int(class_id)] for class_id in sample_classes)
    for sample_classes in np.argmax(predictions_hot, axis=-1)
]

# Preview a few predictions, cut to the true length, instead of dumping every
# padded 800-character string.
for prediction, truth in list(zip(predictions, y_test))[:3]:
    print(prediction[:len(truth)])

# Per-residue accuracy over the real (un-padded) positions only.
total = 0
TP = 0
for prediction, truth in zip(predictions, y_test):
    for pred_ss, truth_ss in zip(prediction[:len(truth)], truth):
        total += 1
        if pred_ss == truth_ss:
            TP += 1

# Guard against an empty test set instead of raising ZeroDivisionError.
accuracy = TP / total if total else float('nan')
print(accuracy)

In [None]:
class inception_conv_simple(layers.Layer):
    """Wrapper layer around one same-padded, stride-1, ReLU-activated Conv1D.

    Args:
        kernel_s: Kernel size of the convolution.
        num_features: Number of convolution filters (default 41).
    """

    def __init__(self, kernel_s, num_features=41):
        super().__init__()
        self.conv = layers.Conv1D(
            num_features,
            kernel_size=kernel_s,
            strides=1,
            padding='same',
            activation='relu',
        )

    def call(self, inputs):
        """Return the convolution output for ``inputs``."""
        return self.conv(inputs)

class InceptionNet_paper_simple(layers.Layer):
    """Three-branch inception block built from ``inception_conv_simple`` layers.

    Branches, all fed with the same input and concatenated at the end:
      1. a single kernel-1 convolution,
      2. kernel-1 followed by one kernel-3 convolution,
      3. kernel-1 followed by three kernel-3 convolutions.
    """

    def __init__(self):
        super().__init__()
        # Creates conv1_1..conv1_3 and then conv3_1..conv3_4, in that order.
        for kernel_size, count in ((1, 3), (3, 4)):
            for idx in range(1, count + 1):
                setattr(self, f"conv{kernel_size}_{idx}", inception_conv_simple(kernel_size))

    def call(self, inputs):
        """Run the three branches on ``inputs`` and concatenate their outputs."""
        short_branch = self.conv1_1(inputs)

        mid_branch = self.conv1_2(inputs)
        mid_branch = self.conv3_1(mid_branch)

        long_branch = self.conv1_3(inputs)
        for conv in (self.conv3_2, self.conv3_3, self.conv3_4):
            long_branch = conv(long_branch)

        # No activation is applied after the concatenation.
        return layers.concatenate((short_branch, mid_branch, long_branch))


In [None]:
class inception_conv(layers.Layer):
    """Conv1D (ReLU, L2-regularised) followed by dropout and batch normalisation.

    Args:
        kernel_s: Kernel size of the convolution.
        num_features: Number of convolution filters (default 100).
        dropout_rate: Dropout rate applied to the convolution output (default 0.3).
    """
    def __init__(self, kernel_s, num_features=100, dropout_rate=0.3):
        super().__init__()
        self.conv = Convolution1D(num_features, kernel_size=kernel_s, kernel_regularizer= regularizers.l2(0.001), strides=1, padding='same', activation='relu')
        self.b_norm = BatchNormalization()
        # Sub-layers belong in __init__: building Dropout inside call() made a
        # fresh, untracked layer on every invocation/trace.
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs):
        """Return ``b_norm(dropout(conv(inputs)))``."""
        X = self.conv(inputs)
        X = self.dropout(X)
        X = self.b_norm(X)
        return X


class InceptionNet_paper(layers.Layer):
    """Inception block made of ``inception_conv`` layers with batch norm at both ends.

    The input is batch-normalised, sent through three branches (kernel-1;
    kernel-1 then kernel-3; kernel-1 then three kernel-3 convolutions), and
    the concatenated branch outputs are batch-normalised again.
    """

    def __init__(self):
        super().__init__()
        # Kernel-size-1 entry convolutions, one per branch.
        self.conv1_1 = inception_conv(1)
        self.conv1_2 = inception_conv(1)
        self.conv1_3 = inception_conv(1)
        # Kernel-size-3 convolutions: one for branch 2, three for branch 3.
        self.conv3_1 = inception_conv(3)
        self.conv3_2 = inception_conv(3)
        self.conv3_3 = inception_conv(3)
        self.conv3_4 = inception_conv(3)
        # Input and output normalisation.
        self.b_norm1 = BatchNormalization()
        self.b_norm2 = BatchNormalization()

    def call(self, inputs):
        """Return the normalised concatenation of the three branch outputs."""
        normed = self.b_norm1(inputs)

        branch_1 = self.conv1_1(normed)

        branch_2 = self.conv1_2(normed)
        branch_2 = self.conv3_1(branch_2)

        branch_3 = self.conv1_3(normed)
        branch_3 = self.conv3_2(branch_3)
        branch_3 = self.conv3_3(branch_3)
        branch_3 = self.conv3_4(branch_3)

        merged = concatenate([branch_1, branch_2, branch_3])
        return self.b_norm2(merged)


class DeepInception_block(layers.Layer):
    """Inception-of-inceptions block with three branches of ``InceptionNet_paper`` layers.

    Branch depths are 1, 2 and 4 inception blocks; all branches read the same
    input, and their concatenated outputs are batch-normalised.
    """

    def __init__(self):
        super().__init__()
        # Branch 1: a single inception block.
        self.inception1 = InceptionNet_paper()
        # Branch 2: two stacked inception blocks.
        self.inception2_1 = InceptionNet_paper()
        self.inception2_2 = InceptionNet_paper()
        # Branch 3: four stacked inception blocks.
        self.inception3_1 = InceptionNet_paper()
        self.inception3_2 = InceptionNet_paper()
        self.inception3_3 = InceptionNet_paper()
        self.inception3_4 = InceptionNet_paper()
        self.b_norm = BatchNormalization()

    def call(self, inputs):
        """Return the batch-normalised concatenation of the three branches."""
        shallow = self.inception1(inputs)

        middle = self.inception2_1(inputs)
        middle = self.inception2_2(middle)

        deep = self.inception3_1(inputs)
        deep = self.inception3_2(deep)
        deep = self.inception3_3(deep)
        deep = self.inception3_4(deep)

        merged = concatenate([shallow, middle, deep])
        return self.b_norm(merged)

In [None]:
# Deep model: two DeepInception blocks, a wide Conv1D, and a per-position
# dense head ending in a 3-way softmax.
inputs = layers.Input((800, 41))

X = DeepInception_block()(inputs)
X = DeepInception_block()(X)

X = Convolution1D(100, 11, activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.001))(X)
X = TimeDistributed(Dense(256, activation='relu'))(X)
X = Dropout(0.4)(X)
Y = TimeDistributed(Dense(3, activation='softmax'))(X)


model = Model(inputs=inputs, outputs=Y)
opt = optimizers.Adam(learning_rate=0.001)
# Other options to try: categorical_focal_crossentropy,
# sparse_categorical_crossentropy.
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

stop_monitor_loss = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=8,
    mode='min',
    verbose=1,
    restore_best_weights=True
)

# The model only logs 'accuracy'/'val_accuracy'. The previous monitor,
# 'val_truncated_accuracy', is never produced, so with save_best_only=True no
# checkpoint was ever written.
# TODO: add a padding-aware accuracy metric if truncated accuracy is wanted.
checkpoint = callbacks.ModelCheckpoint(
    'ss_pred_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)

# Prefer the second GPU as before, but fall back to whatever exists so the
# cell also runs on single-GPU and CPU-only machines.
available_gpus = tf.config.list_logical_devices('GPU')
if len(available_gpus) > 1:
    train_device = '/GPU:1'
elif available_gpus:
    train_device = '/GPU:0'
else:
    train_device = '/CPU:0'

with tf.device(train_device):
    history = model.fit(x_train, y_train,
                        epochs=40,
                        batch_size=12,
                        validation_split=0.1,
                        callbacks=[stop_monitor_loss, checkpoint])

In [None]:
# Class index <-> secondary-structure letter lookups.
ss_map = {'C': 0, 'H': 1, 'E': 2}
from_aa = {0: 'C', 1: 'H', 2: 'E'}

# Decode the per-position class probabilities into one structure string per sample.
predictions_hot = model.predict(x_test)
predictions = [
    ''.join(from_aa[np.argmax(position_probs)] for position_probs in sample_probs)
    for sample_probs in predictions_hot
]

# Count matches over the true (un-padded) length of every test label string.
total = 0
TP = 0
for predicted_ss, true_ss in zip(predictions, y_test):
    total += len(true_ss)
    TP += sum(1 for pos, label in enumerate(true_ss) if label == predicted_ss[pos])

accuracy = TP/total
print(accuracy)

In [None]:
# Sweep the width of the penultimate Dense layer (200, 220, ..., 380) and keep
# one training history per width.
histories = []

# Prefer the second GPU as before, but fall back to whatever exists so the
# sweep also runs on single-GPU and CPU-only machines.
sweep_gpus = tf.config.list_logical_devices('GPU')
if len(sweep_gpus) > 1:
    sweep_device = '/GPU:1'
elif sweep_gpus:
    sweep_device = '/GPU:0'
else:
    sweep_device = '/CPU:0'

for dense_units in range(200, 400, 20):
    inputs = layers.Input((800, 41))
    X = inputs
    for _ in range(3):
        X = InceptionNet_paper()(X)
    X = inception_conv(11)(X)
    X = layers.Dense(dense_units, activation='relu')(X)
    Y = layers.Dense(3, activation='softmax')(X)

    model = Model(inputs=inputs, outputs=Y)
    # Other options to try: categorical_focal_crossentropy, adam,
    # sparse_categorical_crossentropy.
    model.compile(loss='categorical_crossentropy',
                  optimizer="sgd",
                  metrics=['accuracy'])
    with tf.device(sweep_device):
        history = model.fit(x_train, y_train,
                            epochs=5,
                            batch_size=128,
                            validation_split=0.1)
    histories.append(history)

In [None]:
# x values must be the Dense widths the sweep actually trained
# (range(200, 400, 20), 10 models). The previous range(10, 400, 10) has 39
# points and made plt.plot fail with a shape mismatch.
final_accuracies = [his.history['accuracy'][-1] for his in histories]
# Slicing keeps the plot usable when the sweep was interrupted part-way.
dense_widths = list(range(200, 400, 20))[:len(final_accuracies)]
plt.plot(dense_widths, final_accuracies)
plt.xlabel('Dense layer units')
plt.ylabel('Final training accuracy')
plt.title('Accuracy vs. dense layer width')
plt.show()

input:
1. one hot encoded sequence
2. PSSM

Model:
1D convolutional neural network

output:
multiclass classification - dense layer with softmax activation, 3 units (one per class: C, H, E)

validation metric - accuracy + model specific measures

sources:
https://www.csbj.org/article/S2001-0370(22)00506-2/fulltext
