In [257]:
import numpy as np
import tensorflow as tf
import keras
from keras import Model, layers, activations, losses

In [106]:
# Residue alphabet: 20 one-letter amino-acid codes followed by 'X'.
# A residue's position in this string is its one-hot index.
all_aa = "ARNDCEQGHILKMFPSTWYVX"
aa_onehot_dict = {residue: index for index, residue in enumerate(all_aa)}

In [203]:
# Root directory of the training data. The functions below build file names by
# plain string concatenation (e.g. path + "dssp/" + name + ".dssp"), so the
# trailing "/" is required.
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
path = "C:/Users/vinicius/Downloads/data/training/"

def aa_onehot_encoding(seq, max_len=800):
    """One-hot encode a residue sequence and zero-pad it to a fixed length.

    Args:
        seq: iterable of one-letter residue codes; every code must be a key of
            ``aa_onehot_dict``.
        max_len: number of rows in the returned profile (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors of length ``len(aa_onehot_dict)``.
        The first ``len(seq)`` rows are one-hot; the remaining rows are all zeros.

    Raises:
        KeyError: if ``seq`` contains a code that is not in ``aa_onehot_dict``.
        ValueError: if ``seq`` has more than ``max_len`` residues. (The previous
            ``while len(profile) != 800`` padding loop never terminated in that case.)
    """
    width = len(aa_onehot_dict)
    profile = []
    for aa in seq:
        encoded = np.zeros(width)
        encoded[aa_onehot_dict[aa]] = 1
        profile.append(encoded)

    if len(profile) > max_len:
        raise ValueError(
            f"sequence has {len(profile)} residues, more than max_len={max_len}"
        )

    # Pad with all-zero rows up to the fixed length.
    profile.extend(np.zeros(width) for _ in range(max_len - len(profile)))
    return profile

def parse_dssp(dssp_file):
    """Return the second line of ``<path>dssp/<dssp_file>.dssp``.

    The first line of the file is read and discarded; the second line is
    returned with trailing whitespace removed.

    Args:
        dssp_file: file name without directory or the ``.dssp`` extension.

    Returns:
        The second line of the file as a string (empty if the file has fewer
        than two lines).
    """
    dssp_path = path + "dssp/" + dssp_file + ".dssp"
    with open(dssp_path, 'r') as handle:
        handle.readline()  # first line is skipped
        structure_line = handle.readline()
    return structure_line.rstrip()

def parse_pssm(pssm_filename, max_len=800):
    """Read a PSSM file into a zero-padded profile plus its residue sequence.

    The file ``<path>pssm/<pssm_filename>.pssm`` is read, its first 3 and last
    6 lines are dropped, and each remaining line is split on whitespace.
    Field 1 is taken as the residue letter; fields ``[22:-2]`` are converted to
    floats and divided by 100.
    NOTE(review): presumably the 20 percentage columns of a PSI-BLAST ASCII
    PSSM - confirm against the files.

    Args:
        pssm_filename: file name without directory or the ``.pssm`` extension.
        max_len: number of rows in the returned profile (default 800).

    Returns:
        ``(profile, seq)``: ``profile`` is a list of ``max_len`` rows (parsed rows
        are lists of floats, padding rows are 20-element zero vectors) and ``seq``
        is the string of residue letters in file order.

    Raises:
        ValueError: if the file holds more than ``max_len`` residue rows. (The
            previous ``while len(profile) != 800`` padding loop never terminated
            in that case.)
    """
    with open(path + "pssm/" + pssm_filename + ".pssm", 'r') as pssm:
        pssm_lines = pssm.readlines()

    profile = []
    residues = []
    for line in pssm_lines[3:-6]:
        fields = line.rstrip().split()
        residues.append(fields[1])
        profile.append([float(n) / 100 for n in fields[22:-2]])

    if len(profile) > max_len:
        raise ValueError(
            f"{pssm_filename}: {len(profile)} residues, more than max_len={max_len}"
        )

    # Pad with all-zero rows up to the fixed length.
    profile.extend(np.zeros(20) for _ in range(max_len - len(profile)))
    return profile, ''.join(residues)


def parse_fasta(file):
    """Placeholder for a FASTA reader; not implemented yet.

    Args:
        file: currently ignored.

    Returns:
        None.
    """
    return None

# Secondary-structure symbol -> class index; 'C' and '-' share index 0.
ss_map = {'C': 0, '-': 0 , 'H': 1, 'E': 2}

def ss_onehot_encoding(ss_sequence, max_len=800):
    """One-hot encode a secondary-structure string and zero-pad it to a fixed length.

    Args:
        ss_sequence: iterable of symbols; every symbol must be a key of ``ss_map``.
        max_len: number of rows in the returned list (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors with one entry per class index in
        ``ss_map``. The first ``len(ss_sequence)`` rows are one-hot; the
        remaining rows are all zeros.

    Raises:
        KeyError: if a symbol is not in ``ss_map``.
        ValueError: if ``ss_sequence`` has more than ``max_len`` symbols. (The
            previous ``while len(ss_encoded) != 800`` padding loop never
            terminated in that case.)
    """
    num_classes = max(ss_map.values()) + 1
    ss_encoded = []
    for struc in ss_sequence:
        encoding = np.zeros(num_classes)
        encoding[ss_map[struc]] = 1
        ss_encoded.append(encoding)

    if len(ss_encoded) > max_len:
        raise ValueError(
            f"structure string has {len(ss_encoded)} symbols, more than max_len={max_len}"
        )

    # Pad with all-zero rows up to the fixed length.
    ss_encoded.extend(np.zeros(num_classes) for _ in range(max_len - len(ss_encoded)))
    return ss_encoded

In [289]:
def get_data(file):
    """Build feature and label arrays for every sample ID listed in a file.

    Each non-blank line of ``<path><file>`` is used as a sample ID. For every ID
    the PSSM profile and residue sequence come from ``parse_pssm`` and the
    structure string from ``parse_dssp``; the one-hot sequence and the PSSM
    profile are concatenated per position.

    Args:
        file: path of the ID list, relative to ``path``.

    Returns:
        ``(x, y)`` as NumPy arrays: ``x`` stacks one feature matrix per sample
        (one-hot residue columns followed by PSSM columns), ``y`` stacks one
        one-hot label matrix per sample.
    """
    x = []
    y = []
    with open(path+file, 'r') as sample_file:
        for line in sample_file:
            sample_id = line.rstrip()
            if not sample_id:
                # A blank line would otherwise be turned into the file name ".pssm".
                continue

            pssm, sequence = parse_pssm(sample_id)
            sequence_hot = aa_onehot_encoding(sequence)
            x.append(np.concatenate((sequence_hot, pssm), axis=1))

            y.append(ss_onehot_encoding(parse_dssp(sample_id)))
    return np.array(x), np.array(y)

In [282]:
# Build the training arrays from the sample IDs listed in cv/train1.txt
# (the file name is resolved relative to `path` inside get_data).
x_train, y_train = get_data('cv/train1.txt')

In [159]:
class InceptionNet_naive(layers.Layer):
    """Draft of a naive Inception-style block built from parallel Conv2D branches.

    NOTE(review): a second class with the same name is defined in a later cell
    and replaces this one once that cell has run.
    """

    def __init__(self, num_features=2):
        super().__init__()
        self.k = num_features  # stored but not read by call()

    def call(self, inputs, num_layers=3, layer_size=8):
        """Apply ``num_layers`` parallel convolutions and merge them.

        Branch ``i`` uses kernel size ``2*i + 1`` (1, 3, 5 for the default
        ``num_layers=3``) with ``layer_size`` filters; every branch reads
        ``inputs``. The branch outputs are concatenated and passed through ReLU.

        The third branch previously read ``X`` before it was assigned, which
        raised UnboundLocalError; all branches now read ``inputs``.

        NOTE(review): the Conv2D layers are instantiated inside call(), so new
        layers are created on every invocation - Keras recommends creating
        sublayers in __init__/build instead.
        """
        branches = []
        for i in range(num_layers):
            fs = i * 2 + 1
            conv = layers.Conv2D(layer_size, kernel_size=fs, strides=1, padding='same')
            branches.append(conv(inputs))
        X = layers.concatenate(branches)
        return activations.relu(X)


In [218]:
# Sanity check on the stacked training features: (samples, positions, features per position).
x_train.shape

(1200, 800, 41)

In [294]:
class InceptionNet_naive(layers.Layer):
    """Naive Inception-style block for sequence data.

    Three Conv1D branches with kernel sizes 1, 3 and 5 read the same input.
    Their outputs are concatenated along the last axis and passed through ReLU.
    Every branch has ``num_features`` filters, so the output has
    ``3 * num_features`` channels.

    Args:
        num_features: number of filters in each branch.
        num_layers: stored as ``self.num_l``; not used by the layer.
        layer_size: stored as ``self.size_l``; not used by the layer.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``), which also lets
            the layer be rebuilt from its config.
    """

    def __init__(self, num_features=41, num_layers=3, layer_size=8, **kwargs):
        super().__init__(**kwargs)
        self.k = num_features
        self.num_l = num_layers
        self.size_l = layer_size
        self.conv1 = layers.Conv1D(self.k, kernel_size=1, strides=1, padding='same') # try with activation
        self.conv2 = layers.Conv1D(self.k, kernel_size=3, strides=1, padding='same')
        self.conv3 = layers.Conv1D(self.k, kernel_size=5, strides=1, padding='same')
        # Sublayers are created once here rather than on every call().
        self.concat = layers.Concatenate()
        self.act = layers.Activation('relu')

    def call(self, inputs):
        """Run the three branches on ``inputs`` and return ReLU(concat(branches))."""
        branches = [self.conv1(inputs), self.conv2(inputs), self.conv3(inputs)]
        return self.act(self.concat(branches))

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "num_features": self.k,
            "num_layers": self.num_l,
            "layer_size": self.size_l,
        })
        return config
num_labels = 3
num_positions = 800
num_features = 41  # 21 one-hot residue columns + 20 PSSM columns per position

inputs = layers.Input((num_positions, num_features))
X = inputs
# X = layers.Masking(mask_value=0)(X)
for i in range(3):
    X = InceptionNet_naive()(X)
# Dense acts on the last axis, giving one softmax over the classes per position.
Y = layers.Dense(num_labels, activation='softmax')(X)
# Y = layers.Reshape((num_positions,num_labels))(Y)

loss_fn = losses.CategoricalFocalCrossentropy()

# NOTE(review): padded positions carry all-zero label vectors (see
# ss_onehot_encoding) and are not masked here, so they are likely counted as
# class 0 by the accuracy metric - confirm, and consider masking or sample weights.
model = Model(inputs=inputs, outputs=Y)
model.compile(loss=loss_fn, # try some: "categorical_focal_crossentropy", "adam", "sparse_categorical_crossentropy"
              optimizer="adam",
              metrics=['accuracy'])
model.summary()

In [295]:
# Train for 10 epochs in batches of 128, holding out 10% of the training
# arrays for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.1,
)

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 628ms/step - accuracy: 0.8078 - loss: 0.0238 - val_accuracy: 0.1305 - val_loss: 0.0257
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 552ms/step - accuracy: 0.6401 - loss: 0.0210 - val_accuracy: 0.9060 - val_loss: 0.0221
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 554ms/step - accuracy: 0.9193 - loss: 0.0188 - val_accuracy: 0.9140 - val_loss: 0.0201
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 531ms/step - accuracy: 0.9268 - loss: 0.0170 - val_accuracy: 0.9180 - val_loss: 0.0190
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 538ms/step - accuracy: 0.9293 - loss: 0.0164 - val_accuracy: 0.9206 - val_loss: 0.0185
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 514ms/step - accuracy: 0.9303 - loss: 0.0163 - val_accuracy: 0.9243 - val_loss: 0.0178
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━

In [296]:
# Build the test arrays from the sample IDs listed in cv/test1.txt
# (the file name is resolved relative to `path` inside get_data).
x_test, y_test = get_data('cv/test1.txt')

In [297]:
# Softmax outputs of the model for the test features (one 3-way distribution per position).
predictions = model.predict(x_test)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step


array([[0.42524293, 0.23900193, 0.33575514],
       [0.32573733, 0.23130693, 0.4429557 ],
       [0.2700767 , 0.17463085, 0.5552924 ],
       ...,
       [0.37862006, 0.29379633, 0.3275836 ],
       [0.37082148, 0.305559  , 0.32361948],
       [0.35886025, 0.3175122 , 0.32362762]], dtype=float32)

input:
1. one hot encoded sequence
2. PSSM

Model:
1D convolutional neural network

output:
multiclass classification - dense layer with softmax activation - 3 units (one per class)

validation metric - accuracy + model-specific measures

sources:
https://www.csbj.org/article/S2001-0370(22)00506-2/fulltext
