In [3]:
import numpy as np
import tensorflow as tf
import keras
from keras import Model, layers, activations, losses

In [8]:
# One-letter residue codes; a code's position in this string is the index of
# its "hot" entry in the one-hot vectors built from aa_onehot_dict.
all_aa = "ARNDCEQGHILKMFPSTWYVX"
aa_onehot_dict = {residue: column for column, residue in enumerate(all_aa)}

In [16]:
# Root directory of the training data. It must end with a separator because
# the loaders in this cell build file names by plain string concatenation
# (e.g. path + "dssp/" + name + ".dssp").
# NOTE(review): absolute, machine-specific location -- adjust before running
# this notebook on another machine.
path = "C:/Users/vinicius/Downloads/data/training/"

def aa_onehot_encoding(seq, max_len=800):
    """One-hot encode a residue sequence and zero-pad it to ``max_len`` rows.

    Args:
        seq: String of one-letter residue codes. Every character must be a key
            of the module-level ``aa_onehot_dict``.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors of length 21. Row ``i`` has a
        single 1 at the index of ``seq[i]``; rows past the end of ``seq`` are
        all zeros.

    Raises:
        ValueError: If ``seq`` has more than ``max_len`` residues. The previous
            ``while len(profile) != 800`` padding loop never terminated in
            that case.
        KeyError: If ``seq`` contains a character missing from
            ``aa_onehot_dict``.
    """
    # Validate up front so an over-long sequence fails fast instead of hanging.
    if len(seq) > max_len:
        raise ValueError(
            f"sequence has {len(seq)} residues, more than max_len={max_len}"
        )

    profile = []
    for aa in seq:
        encoded = np.zeros(21)
        encoded[aa_onehot_dict[aa]] = 1
        profile.append(encoded)

    # Pad with all-zero rows so every sample has the same number of positions.
    profile.extend(np.zeros(21) for _ in range(max_len - len(profile)))
    return profile

def parse_dssp(dssp_file):
    """Return the second line of ``<path>dssp/<dssp_file>.dssp``.

    The first line of the file is read and discarded; the second line is
    returned with trailing whitespace removed.
    """
    dssp_path = path + "dssp/" + dssp_file + ".dssp"
    with open(dssp_path, 'r') as handle:
        handle.readline()  # skip the first line
        structure_line = handle.readline()
    return structure_line.rstrip()

def parse_pssm(pssm_filename, max_len=800):
    """Read ``<path>pssm/<pssm_filename>.pssm`` into a padded profile.

    The first 3 and last 6 lines of the file are skipped. For every remaining
    line, whitespace-separated field 1 is taken as the residue letter, and
    fields 22 up to (not including) the last two are converted to floats and
    divided by 100.
    NOTE(review): this presumably selects the 20 percentage columns of a
    PSI-BLAST ASCII PSSM -- confirm against the data files.

    Args:
        pssm_filename: File name without directory or ``.pssm`` extension.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        ``(profile, seq)``: ``profile`` is a list of ``max_len`` rows (parsed
        rows first, then all-zero NumPy vectors of length 20 as padding) and
        ``seq`` is the residue string read from the file.

    Raises:
        ValueError: If the file holds more than ``max_len`` residue rows. The
            previous ``while len(profile) != 800`` loop never terminated in
            that case.
    """
    with open(path + "pssm/" + pssm_filename + ".pssm", 'r') as pssm:
        pssm_lines = pssm.readlines()

    profile = []
    residues = []
    for line in pssm_lines[3:-6]:
        fields = line.split()
        residues.append(fields[1])
        profile.append([float(n) / 100 for n in fields[22:-2]])

    if len(profile) > max_len:
        raise ValueError(
            f"{pssm_filename}: {len(profile)} residues exceed max_len={max_len}"
        )

    # Pad with all-zero rows so every sample has the same number of positions.
    profile.extend(np.zeros(20) for _ in range(max_len - len(profile)))
    return profile, ''.join(residues)


def parse_fasta(file):
    """Placeholder for a FASTA parser: not implemented, always returns ``None``."""
    return None

# Class index of each secondary-structure symbol in the one-hot label vectors.
ss_map = {'C': 0, 'H': 1, 'E': 2}

def ss_onehot_encoding(ss_sequence, max_len=800):
    """One-hot encode a secondary-structure string and zero-pad it.

    Args:
        ss_sequence: String whose characters are all keys of ``ss_map``.
        max_len: Number of rows in the returned encoding (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors of length ``len(ss_map)``. Row
        ``i`` has a single 1 at ``ss_map[ss_sequence[i]]``; rows past the end
        of ``ss_sequence`` are all zeros.

    Raises:
        ValueError: If ``ss_sequence`` is longer than ``max_len``. The previous
            ``while len(ss_encoded) != 800`` loop never terminated in that
            case.
        KeyError: If a character is not a key of ``ss_map``.
    """
    if len(ss_sequence) > max_len:
        raise ValueError(
            f"structure string has {len(ss_sequence)} positions, "
            f"more than max_len={max_len}"
        )

    num_classes = len(ss_map)
    ss_encoded = []
    for struc in ss_sequence:
        encoding = np.zeros(num_classes)
        encoding[ss_map[struc]] = 1
        ss_encoded.append(encoding)

    # Pad with all-zero rows so every sample has the same number of positions.
    ss_encoded.extend(
        np.zeros(num_classes) for _ in range(max_len - len(ss_encoded))
    )
    return ss_encoded

def get_data(file, encode_y=True):
    """Build feature and label arrays for every sample listed in ``path + file``.

    Each non-blank line of the list file is used as a sample name. For every
    sample the one-hot sequence encoding and the PSSM profile are concatenated
    column-wise into one feature matrix, and the DSSP string (with '-'
    replaced by 'C') becomes the label.

    Args:
        file: Path of the sample-list file, relative to the module-level
            ``path``.
        encode_y: When True (default) labels are one-hot encoded and padded via
            ``ss_onehot_encoding``; when False they stay plain strings.

    Returns:
        ``(x, y)`` as NumPy arrays with one entry per sample.

    NOTE(review): the DSSP string length is not compared with the PSSM
    sequence length, so misaligned inputs are not detected here.
    """
    x = []
    y = []
    with open(path + file, 'r') as sample_file:
        for line in sample_file:
            sample_id = line.strip()
            # Blank lines (for example a trailing newline at the end of the
            # list) used to be looked up as ".pssm" and raised
            # FileNotFoundError; skip them.
            if not sample_id:
                continue

            pssm, sequence = parse_pssm(sample_id)
            sequence_hot = aa_onehot_encoding(sequence)
            # Column-wise join: one-hot columns first, then PSSM columns.
            features = np.concatenate((sequence_hot, pssm), axis=1)
            x.append(features)

            dssp = parse_dssp(sample_id).replace('-', 'C')
            if encode_y:
                dssp = ss_onehot_encoding(dssp)
            y.append(dssp)
    return np.array(x), np.array(y)

In [17]:
# Load the training split: x_train holds the per-position features and
# y_train the one-hot encoded labels (encode_y defaults to True).
x_train, y_train = get_data('cv/train1.txt')

In [13]:
class InceptionNet_naive(layers.Layer):
    """Early draft of a naive Inception-style block with parallel Conv2D branches.

    NOTE: a later cell in this notebook defines ``InceptionNet_naive`` again
    (with Conv1D branches created in ``__init__``); when the cells are run in
    order that later definition replaces this one.
    """

    def __init__(self, num_features=2):
        super().__init__()
        self.k = num_features  # stored, but not read by call()

    def call(self, inputs, num_layers=3, layer_size=8):
        """Apply 1-, 3- and 5-wide convolutions to ``inputs`` and merge them.

        Args:
            inputs: Tensor fed to each of the three branches.
            num_layers: Accepted for compatibility; not used.
            layer_size: Number of filters in each branch.

        Returns:
            ReLU of the concatenated branch outputs.

        NOTE(review): the convolution layers are instantiated inside call(),
        so every invocation creates fresh layers -- confirm whether this draft
        is still needed.
        """
        X1 = layers.Conv2D(layer_size, kernel_size=1, strides=1, padding='same')(inputs)
        X2 = layers.Conv2D(layer_size, kernel_size=3, strides=1, padding='same')(inputs)
        # Fixed: this branch used to be applied to `X`, a local that is only
        # assigned on the next line, which raised UnboundLocalError.
        X3 = layers.Conv2D(layer_size, kernel_size=5, strides=1, padding='same')(inputs)
        X = layers.concatenate((X1, X2, X3))
        return activations.relu(X)


In [None]:
# Inspect the dimensions of the training feature array.
x_train.shape

In [14]:
class InceptionNet_naive(layers.Layer):
    """Naive Inception-style block for sequence data.

    Three parallel ``Conv1D`` branches with kernel sizes 1, 3 and 5 (stride 1,
    'same' padding) read the same input. Their outputs are concatenated and
    passed through a ReLU activation.

    Args:
        num_features: Number of filters in each branch.
        num_layers: Stored as ``self.num_l``; not used by the block itself.
        layer_size: Stored as ``self.size_l``; not used by the block itself.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, num_features=41, num_layers=3, layer_size=8, **kwargs):
        super().__init__(**kwargs)
        self.k = num_features
        self.num_l = num_layers
        self.size_l = layer_size
        self.conv1 = layers.Conv1D(self.k, kernel_size=1, strides=1, padding='same')
        self.conv2 = layers.Conv1D(self.k, kernel_size=3, strides=1, padding='same')
        self.conv3 = layers.Conv1D(self.k, kernel_size=5, strides=1, padding='same')
        # Create the merge and activation layers once instead of on every
        # forward pass.
        self.concat = layers.Concatenate()
        self.relu = layers.Activation('relu')

    def call(self, inputs):
        """Run the three branches on ``inputs`` and return ReLU(concat(branches))."""
        branches = [self.conv1(inputs), self.conv2(inputs), self.conv3(inputs)]
        return self.relu(self.concat(branches))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "num_features": self.k,
            "num_layers": self.num_l,
            "layer_size": self.size_l,
        })
        return config
# --- Model assembly ---------------------------------------------------------
num_labels = 3       # size of the softmax output at every position
num_positions = 800  # number of positions in every (padded) input

inputs = layers.Input(shape=(num_positions, 41))
X = inputs
# Masking is currently disabled:
# X = layers.Masking(mask_value=0)(X)
# NOTE(review): with masking off, zero-padded positions appear to take part in
# the loss/accuracy computation -- confirm this is intended.

# Stack three Inception blocks, each one reading the previous block's output.
for i in range(3):
    X = InceptionNet_naive()(X)

# Dense softmax head producing class probabilities.
Y = layers.Dense(num_labels, activation='softmax')(X)
# Y = layers.Reshape((num_positions,num_labels))(Y)

loss_fn = losses.CategoricalFocalCrossentropy()

model = Model(inputs=inputs, outputs=Y)
model.compile(
    loss=loss_fn,  # alternatives to try: "categorical_focal_crossentropy", "sparse_categorical_crossentropy"
    optimizer="adam",
    metrics=['accuracy'],
)
model.summary()




In [15]:
# Train for 10 epochs with mini-batches of 128 samples; validation_split=0.1
# sets aside 10% of the training data for validation. The returned History
# object is kept in `history`.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_split=0.1)

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 769ms/step - accuracy: 0.8796 - loss: 0.0239 - val_accuracy: 0.1273 - val_loss: 0.0259
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 713ms/step - accuracy: 0.2511 - loss: 0.0219 - val_accuracy: 0.9014 - val_loss: 0.0223
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 654ms/step - accuracy: 0.7437 - loss: 0.0183 - val_accuracy: 0.9133 - val_loss: 0.0201
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 641ms/step - accuracy: 0.9274 - loss: 0.0168 - val_accuracy: 0.9196 - val_loss: 0.0189
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 747ms/step - accuracy: 0.9291 - loss: 0.0168 - val_accuracy: 0.9227 - val_loss: 0.0185
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 638ms/step - accuracy: 0.9323 - loss: 0.0161 - val_accuracy: 0.9264 - val_loss: 0.0178
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━

In [10]:
# Load the test split. encode_y=False keeps each label as its structure string
# instead of a padded one-hot matrix.
x_test, y_test = get_data('cv/test1.txt', encode_y=False)

In [25]:
# Symbol -> class index (repeats the mapping defined next to ss_onehot_encoding).
ss_map = {'C': 0, 'H': 1, 'E': 2}
# Class index -> symbol. Class 0 is written as '-' here, whereas ss_map uses 'C'.
# NOTE(review): the name suggests amino acids, but the values are
# secondary-structure symbols.
from_aa = {0: '-', 1: 'H', 2: 'E'}
# Model output for every test sample.
predictions = model.predict(x_test)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


In [26]:
# Print the raw model output (NumPy abbreviates large arrays with "...").
print(predictions)
def decode(probabilities, lengths=None, labels="-HE"):
    """Turn per-position class probabilities into secondary-structure strings.

    Completes the unfinished ``def decode`` stub, which was a SyntaxError.

    Args:
        probabilities: Array-like whose last axis holds one score per class,
            e.g. the (samples, positions, classes) output of ``model.predict``.
        lengths: Optional iterable with the true (unpadded) length of each
            sample. When given, every decoded string is cut to that length so
            padded positions are dropped.
        labels: Symbol for each class index. The default follows the
            ``from_aa`` mapping (0 -> '-', 1 -> 'H', 2 -> 'E').

    Returns:
        A list with one string per sample, holding the symbol of the
        highest-scoring class at each position.
    """
    # Highest-scoring class per position; ties resolve to the lowest index.
    class_ids = np.argmax(np.asarray(probabilities), axis=-1)
    decoded = ["".join(labels[k] for k in row) for row in class_ids]
    if lengths is not None:
        decoded = [text[:n] for text, n in zip(decoded, lengths)]
    return decoded

[[[0.3390992  0.27493414 0.38596666]
  [0.27472723 0.24978751 0.47548538]
  [0.26043558 0.23897931 0.50058514]
  ...
  [0.40488154 0.2931451  0.30197334]
  [0.3925489  0.3069762  0.30047497]
  [0.37286958 0.32049984 0.30663064]]

 [[0.46454218 0.18135689 0.3541009 ]
  [0.51027834 0.12536204 0.36435956]
  [0.48334906 0.11182442 0.40482655]
  ...
  [0.40488154 0.2931451  0.30197334]
  [0.3925489  0.3069762  0.30047497]
  [0.37286958 0.32049984 0.30663064]]

 [[0.59053826 0.17190258 0.23755912]
  [0.6190752  0.16275613 0.21816868]
  [0.59574187 0.25193238 0.1523257 ]
  ...
  [0.40488154 0.2931451  0.30197334]
  [0.3925489  0.3069762  0.30047497]
  [0.37286958 0.32049984 0.30663064]]

 ...

 [[0.5236421  0.22716263 0.24919525]
  [0.38879642 0.23500012 0.37620345]
  [0.2704487  0.35954654 0.3700048 ]
  ...
  [0.40488154 0.2931451  0.30197334]
  [0.3925489  0.3069762  0.30047497]
  [0.37286958 0.32049984 0.30663064]]

 [[0.42188632 0.40742004 0.17069364]
  [0.4506171  0.4099823  0.1394006 ]


input:
1. one hot encoded sequence
2. PSSM

Model:
1D convolutional neural network

output:
multiclass classification - dense layer with relu activation - 3?

validation metric - accuracy + model specific measures

sources:
https://www.csbj.org/article/S2001-0370(22)00506-2/fulltext
