In [1]:
import numpy as np
import tensorflow as tf
import keras
from keras import Model, layers, activations, losses




In [2]:
# Residue alphabet; a letter's position in this string is its one-hot index
# (presumably "X" stands for an unknown residue - TODO confirm).
all_aa = "ARNDCEQGHILKMFPSTWYVX"
# Lookup table: residue letter -> index into `all_aa`.
aa_onehot_dict = {residue: index for index, residue in enumerate(all_aa)}

In [3]:
# Root directory of the training data. File names are built by plain string
# concatenation (e.g. path + "dssp/" + name + ".dssp"), so the trailing slash
# is required.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = "C:/Users/vinic/Downloads/data/training/"

def aa_onehot_encoding(seq, max_len=800):
    """One-hot encode a residue sequence and zero-pad it to a fixed length.

    Args:
        seq: Iterable of residue letters; each letter must be a key of the
            module-level ``aa_onehot_dict``.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors, each ``len(aa_onehot_dict)`` wide.
        Row ``i`` has a single 1 at the index of ``seq[i]``; rows past the end
        of the sequence are all zeros.

    Raises:
        KeyError: If a residue letter is not in ``aa_onehot_dict``.
        ValueError: If the sequence has more than ``max_len`` residues.
    """
    width = len(aa_onehot_dict)
    profile = []
    for aa in seq:
        encoded = np.zeros(width)
        encoded[aa_onehot_dict[aa]] = 1
        profile.append(encoded)

    # Padding can only grow the profile, so an over-long sequence must be
    # rejected explicitly; looping "until the length equals max_len" would
    # never terminate in that case.
    if len(profile) > max_len:
        raise ValueError(
            "sequence has %d residues, more than max_len=%d" % (len(profile), max_len)
        )
    profile.extend(np.zeros(width) for _ in range(max_len - len(profile)))
    return profile

def parse_dssp(dssp_file):
    """Return the second line of ``<path>dssp/<dssp_file>.dssp``.

    The first line is read and discarded (presumably a header - TODO confirm);
    trailing whitespace, including the newline, is stripped from the second.

    Args:
        dssp_file: File name without directory or ``.dssp`` extension.

    Returns:
        The second line of the file as a string, or ``""`` if there is none.
    """
    dssp_path = path + "dssp/" + dssp_file + ".dssp"
    with open(dssp_path, 'r') as handle:
        handle.readline()  # skip the first line
        structure_line = handle.readline()
    return structure_line.rstrip()

def parse_pssm(pssm_filename, max_len=800):
    """Read ``<path>pssm/<pssm_filename>.pssm`` into a padded profile and sequence.

    The first 3 and last 6 lines of the file are skipped. From every remaining
    line, whitespace-separated field 1 is taken as the residue letter and
    fields 22 up to (not including) the last two are parsed as numbers and
    divided by 100 (presumably the 20 percentage columns of a PSI-BLAST ASCII
    PSSM - TODO confirm).

    Args:
        pssm_filename: File name without directory or ``.pssm`` extension.
        max_len: Number of rows in the returned profile (default 800).

    Returns:
        ``(profile, seq)``: ``profile`` is a list of ``max_len`` rows, where
        parsed rows are lists of floats and padding rows are ``np.zeros(20)``;
        ``seq`` is the string of residue letters read from the file.

    Raises:
        ValueError: If the file holds more than ``max_len`` residue rows.
    """
    with open(path+"pssm/"+pssm_filename+".pssm", 'r') as pssm:
        pssm_lines = pssm.readlines()

    profile = []
    residues = []
    for line in pssm_lines[3:-6]:
        fields = line.split()
        residues.append(fields[1])
        profile.append([float(n) / 100 for n in fields[22:-2]])
    seq = ''.join(residues)

    # Padding can only grow the profile; an over-long protein would make a
    # "pad until the length equals max_len" loop run forever, so fail fast.
    if len(profile) > max_len:
        raise ValueError(
            "%s has %d rows, more than max_len=%d" % (pssm_filename, len(profile), max_len)
        )
    profile.extend(np.zeros(20) for _ in range(max_len - len(profile)))
    return profile, seq


def parse_fasta(file):
    """Placeholder: not implemented yet; does nothing and returns None."""
    pass

# Class index of each secondary-structure label; ss_onehot_encoding uses it as
# the position of the 1 in the 3-wide one-hot vector.
ss_map = {'C': 0, 'H': 1, 'E': 2}

def ss_onehot_encoding(ss_sequence, max_len=800):
    """One-hot encode a secondary-structure string and zero-pad it.

    Args:
        ss_sequence: Iterable of labels; each must be a key of the module-level
            ``ss_map``.
        max_len: Number of rows in the returned encoding (default 800).

    Returns:
        A list of ``max_len`` NumPy vectors, each ``len(ss_map)`` wide. Row
        ``i`` has a single 1 at ``ss_map[ss_sequence[i]]``; rows past the end
        of the input are all zeros.

    Raises:
        KeyError: If a label is not in ``ss_map``.
        ValueError: If the input has more than ``max_len`` labels.
    """
    width = len(ss_map)
    ss_encoded = []
    for struc in ss_sequence:
        encoding = np.zeros(width)
        encoding[ss_map[struc]] = 1
        ss_encoded.append(encoding)

    # Reject over-long input explicitly: padding only ever appends rows, so a
    # "pad until the length equals max_len" loop would never stop.
    if len(ss_encoded) > max_len:
        raise ValueError(
            "structure string has %d labels, more than max_len=%d" % (len(ss_encoded), max_len)
        )
    ss_encoded.extend(np.zeros(width) for _ in range(max_len - len(ss_encoded)))
    return ss_encoded

def get_data(file, encode_y=True):
    """Build feature and label arrays for every sample id listed in a file.

    Each non-blank line of ``<path><file>`` is a sample id. For every id the
    PSSM profile and the one-hot encoded sequence are concatenated column-wise
    into one feature matrix, and the DSSP string (with ``'-'`` replaced by
    ``'C'``) becomes the label.

    Args:
        file: Name of the list file, relative to the module-level ``path``.
        encode_y: If True, labels are one-hot encoded with
            ``ss_onehot_encoding``; if False, they stay raw label strings.

    Returns:
        ``(x, y)`` as NumPy arrays with one entry per sample. With the
        helpers' default padding each feature matrix is 800 rows by 41 columns
        (21 one-hot columns followed by 20 PSSM columns).
    """
    x = []
    y = []
    with open(path+file, 'r') as sample_file:
        for line in sample_file:
            sample_id = line.strip()
            # A blank (e.g. trailing) line is not a sample; without this guard
            # it would be looked up as the file "pssm/.pssm".
            if not sample_id:
                continue

            pssm, sequence = parse_pssm(sample_id)
            sequence_hot = aa_onehot_encoding(sequence)
            x.append(np.concatenate((sequence_hot, pssm), axis=1))

            dssp = parse_dssp(sample_id).replace('-', 'C')
            if encode_y:
                dssp = ss_onehot_encoding(dssp)
            y.append(dssp)
    return np.array(x), np.array(y)

In [13]:
# Index of the first test sample. The same boundary is used for both slices so
# that the training and test sets are disjoint (no sample leaks into both).
N_TRAIN = 1100

x_data, y_data = get_data('list.txt')
x_train, y_train = x_data[:N_TRAIN], y_data[:N_TRAIN]

# Test labels are kept as raw label strings (not one-hot) for the evaluation
# cell, which compares them character by character; the features of this
# second pass are not needed.
_x_unused, y_raw = get_data('list.txt', encode_y=False)
x_test, y_test = x_data[N_TRAIN:], y_raw[N_TRAIN:]

In [5]:
class InceptionNet_naive(layers.Layer):
    """First draft of a naive inception block: three parallel Conv2D branches.

    The branches use kernel sizes 1, 3 and 5 on the same input; their outputs
    are concatenated and passed through ReLU.

    NOTE(review): a later cell defines another class with this same name,
    which replaces this one once that cell is run.
    NOTE(review): the Conv2D layers are constructed inside ``call``, so new
    layer objects are created on every invocation.
    """

    def __init__(self, num_features=2):
        """Store ``num_features`` as ``self.k`` (not read by ``call``)."""
        super().__init__()
        self.k = num_features

    def call(self, inputs, num_layers=3, layer_size=8):
        """Apply the three branches to ``inputs`` and merge them.

        Args:
            inputs: Input tensor fed to every branch.
            num_layers: Accepted but not used.
            layer_size: Number of filters in each Conv2D branch.

        Returns:
            ReLU of the concatenated branch outputs.
        """
        X1 = layers.Conv2D(layer_size, kernel_size=(1), strides=1, padding='same')(inputs)
        X2 = layers.Conv2D(layer_size, kernel_size=(3), strides=1, padding='same')(inputs)
        # Every branch reads `inputs`; no other tensor exists at this point.
        X3 = layers.Conv2D(layer_size, kernel_size=(5), strides=1, padding='same')(inputs)
        X = layers.concatenate((X1, X2, X3))
        return activations.relu(X)


In [6]:
class InceptionNet_naive(layers.Layer):
    """Naive inception block made of seven parallel Conv1D branches.

    The branches have kernel sizes 1, 3, 5, 7, 9, 11 and 13, each with
    ``num_features`` filters and 'same' padding. ``call`` concatenates the
    branch outputs and applies ReLU.
    """

    def __init__(self, num_features=41, num_layers=7):
        """Create the seven branches.

        Args:
            num_features: Number of filters per branch (stored as ``self.k``).
            num_layers: Accepted but not used; there are always seven branches.
        """
        super().__init__()
        self.k = num_features
        self.conv_Xs = []
        kernel_sizes = (1, 3, 5, 7, 9, 11, 13)
        branches = [
            layers.Conv1D(self.k, kernel_size=size, strides=1, padding='same')
            for size in kernel_sizes
        ]
        # Keep the individual attributes conv1 ... conv7 as well as the list.
        (self.conv1, self.conv2, self.conv3, self.conv4,
         self.conv5, self.conv6, self.conv7) = branches
        self.conv_layers = list(branches)

    def call(self, inputs):
        """Run all seven branches on ``inputs`` and return ReLU of their concatenation."""
        ordered_branches = (self.conv1, self.conv2, self.conv3, self.conv4,
                            self.conv5, self.conv6, self.conv7)
        branch_outputs = tuple(conv(inputs) for conv in ordered_branches)
        merged = layers.concatenate(branch_outputs)
        relu = layers.Activation('relu')
        return relu(merged)
    



In [28]:
class InceptionNet_naive_chat(layers.Layer):
    """Naive inception block with a configurable number of Conv1D branches.

    Branch ``i`` (counting from 0) has kernel size ``2*i + 1``, so the default
    of seven branches uses kernel sizes 1, 3, ..., 13. ``call`` concatenates
    the branch outputs and applies ReLU.
    """

    def __init__(self, num_features=41, num_layers=7):
        """Create the branches.

        Args:
            num_features: Number of filters per branch (stored as ``self.k``).
            num_layers: Number of parallel branches.
        """
        super().__init__()
        self.k = num_features
        self.conv_Xs = []
        self.conv_layers = [
            layers.Conv1D(self.k, kernel_size=2*depth + 1, strides=1, padding='same')
            for depth in range(num_layers)
        ]

    def call(self, inputs):
        """Run every branch on ``inputs`` and return ReLU of their concatenation."""
        branch_outputs = []
        for conv in self.conv_layers:
            branch_outputs.append(conv(inputs))
        merged = layers.concatenate(branch_outputs)
        return layers.Activation('relu')(merged)

num_labels = 3
num_positions = 800

# Functional model: (num_positions, 41) features per sample -> three stacked
# inception blocks -> softmax over num_labels classes at every position.
inputs = layers.Input((num_positions, 41))
X = inputs
# X = layers.Masking(mask_value=0)(X)
for block_idx in range(3):
    X = InceptionNet_naive_chat()(X)
Y = layers.Dense(num_labels, activation='softmax')(X)
# Y = layers.Reshape((num_positions,num_labels))(Y)

# Not passed to compile() below, which selects the loss by its string name.
loss_fn = losses.CategoricalCrossentropy()

model = Model(inputs=inputs, outputs=Y)
# Alternatives to try: categorical_focal_crossentropy, adam,
# sparse_categorical_crossentropy.
compile_options = dict(
    loss='categorical_crossentropy',
    optimizer="sgd",
    metrics=['accuracy'],
)
model.compile(**compile_options)

In [29]:
# Train on the training split for 5 epochs in batches of 128;
# validation_split=0.1 sets aside 10% of the training arrays for validation.
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
1/8 [==>...........................] - ETA: 29s - loss: 0.2170 - accuracy: 0.8703

KeyboardInterrupt: 

In [27]:
ss_map = {'C': 0, 'H': 1, 'E': 2}
# Inverse of ss_map: class index -> secondary-structure label.
from_aa = {0: 'C', 1: 'H', 2: 'E'}
predictions_hot = model.predict(x_test)

# Decode every position of every sample to the label with the highest score.
predictions = [
    ''.join(from_aa[np.argmax(position_scores)] for position_scores in sample)
    for sample in predictions_hot
]

# Per-position accuracy. Only positions present in the truth string are
# scored, so anything predicted beyond the end of the truth string is ignored.
total = 0
TP = 0
for predicted, observed in zip(predictions, y_test):
    total += len(observed)
    TP += sum(1 for pos, label in enumerate(observed) if label == predicted[pos])

accuracy = TP/total
print(accuracy)

0.5986188579017264


input:
1. one hot encoded sequence
2. PSSM

Model:
1D convolutional neural network

output:
multiclass classification - dense layer with relu activation - 3?

validation metric - accuracy + model specific measures

sources:
https://www.csbj.org/article/S2001-0370(22)00506-2/fulltext
