In [0]:
import urllib.request
import io
import gzip

import numpy as np
import tensorflow as tf

In [0]:
import keras
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D, concatenate
from keras.models import Model

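The text data is organized in the following pattern. (Note: `load_dataset` further down reads only two colon-delimited fields per line, using field 0 as the amino-acid sequence and field 1 as the label, so the file it downloads may be a preprocessed variant of this layout.)

Each line is a sample and contains the following fields, delimited by colons (`:`):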



*   Protein ID
*   Random Letter
*   Primary Sequence (using one-letter codes)
*   Secondary structure classification (C - coil, H - Alpha Helix, E - Beta Strand)

Here is an example:

    3L3K:C:EEFGAAASF:CCCCCCCCC

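Amino acids and labels have to be converted to numbers before the network can process them. In the example below, X holds the `amino_lookup` code of each residue (E → 04, F → 05, G → 06, A → 01, S → 16).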

X: 04 04 05 06 01 01 01 16 05

Y1: 0 0 0 0 0 0 0 0 0

Y2: 1 1 1 1 1 1 1 1 1

Y3: 0 0 0 0 0 0 0 0 0

X is passed in as input to all 3 neural networks.

Y1 is output of neural network 1 (model 1)

Y2 is output of neural network 2 (model 2)

Y3 is output of neural network 3 (model 3)

Each neural network outputs, for every element, the probability that the element belongs to that network's corresponding class. For example, model 1 assigns each element the probability that it is of class 1.

In [0]:
# Amino-acid one-letter codes map to 1..20 in the order listed here.
_AMINO_ACID_LETTERS = "ACDEFGHIKLMNPQRSTVWY"
amino_lookup = {letter: code for code, letter in enumerate(_AMINO_ACID_LETTERS, start=1)}
# "0" maps to -1; presumably a padding/placeholder symbol in the data -- TODO confirm.
amino_lookup["0"] = -1

# Secondary-structure labels map to 1..3 (C, H, E in that order).
label_lookup = dict(zip("CHE", (1, 2, 3)))

def aa_to_number(aa_sequence):
  """Translate a sequence of one-letter amino-acid codes into integers.

  Each element of ``aa_sequence`` is looked up in ``amino_lookup``; symbols
  that are not keys of that table are encoded as 0.

  Returns:
    A list with one integer per element of ``aa_sequence``.
  """
  numbers = []
  for residue in aa_sequence:
    numbers.append(amino_lookup.get(residue, 0))
  return numbers

def label_to_number(label_sequence):
  """Translate a sequence of secondary-structure labels into integers.

  Each element of ``label_sequence`` is looked up in ``label_lookup``; symbols
  that are not keys of that table are encoded as 0.

  Returns:
    A list with one integer per element of ``label_sequence``.
  """
  numbers = []
  for symbol in label_sequence:
    numbers.append(label_lookup.get(symbol, 0))
  return numbers

In [0]:
def _download_lines(url):
  """Fetch a gzip-compressed text file and return its decoded lines.

  Args:
    url: Location of the ``.gz`` file to download.

  Returns:
    A list of ``str``, one entry per line of the decompressed text.
  """
  # Context managers release the HTTP response and the gzip reader even when
  # reading or decoding fails part-way through.
  with urllib.request.urlopen(url) as response:
    compressed_file = io.BytesIO(response.read())
  with gzip.GzipFile(fileobj=compressed_file) as decompressed_file:
    return decompressed_file.read().decode().splitlines()


def _encode_samples(lines, window=11, num_amino_acids=20, num_classes=3):
  """One-hot encode ``sequence:label`` lines into feature and label arrays.

  Args:
    lines: Sequence of strings; field 0 (before the first ':') is the
      amino-acid sequence and field 1 is the label symbol.
    window: Number of sequence positions allocated per sample.
    num_amino_acids: Width of the one-hot amino-acid axis.
    num_classes: Width of the one-hot label axis.

  Returns:
    ``(features, labels)`` with shapes ``(len(lines), window, num_amino_acids)``
    and ``(len(lines), num_classes)``.

  Raises:
    IndexError: If a line has no ':' separator, or its sequence is longer
      than ``window``.
  """
  num_of_samples = len(lines)
  features = np.zeros((num_of_samples, window, num_amino_acids))
  labels = np.zeros((num_of_samples, num_classes))

  for i, sample in enumerate(lines):
    fields = sample.split(":")

    for j, residue in enumerate(fields[0]):
      # Unknown symbols fall back to 0, so after the shift to 0-based they are
      # negative (like the "0" entry of amino_lookup) and the position stays
      # all-zero instead of raising a TypeError on a missing key.
      aa_number = amino_lookup.get(residue, 0) - 1
      if aa_number >= 0:
        # Plain index assignment; ndarray.itemset no longer exists in NumPy 2.
        features[i, j, aa_number] = 1

    label_number = label_lookup.get(fields[1], 0)
    # An unknown label yields 0; without this guard the index would be -1 and
    # silently mark the last class. Such rows are left all-zero instead.
    if label_number > 0:
      labels[i, label_number - 1] = 1

  return features, labels


def load_dataset(
    url='https://github.com/SRavit1/CNN-Protein-Secondary-Structure-Prediction/raw/master/ModifiedSSData1.gz',
    cutoff=400000):
  """Download the dataset and return one-hot encoded train/test splits.

  Args:
    url: Location of the gzip-compressed dataset.
    cutoff: Number of leading samples assigned to the training split; the
      remaining samples form the test split.

  Returns:
    ``X_train, Y_train, X_test, Y_test`` as NumPy arrays. Feature arrays have
    shape ``(n, 11, 20)`` and label arrays have shape ``(n, 3)``.
  """
  # The original author's note says the file should contain 467602 samples.
  lines = _download_lines(url)
  features, labels = _encode_samples(lines)

  X_train = features[:cutoff]
  Y_train = labels[:cutoff]
  X_test = features[cutoff:]
  Y_test = labels[cutoff:]

  return X_train, Y_train, X_test, Y_test

In [0]:
X_train, Y_train, X_test, Y_test = load_dataset()

# Append a trailing channel axis for the Conv2D layers. Deriving the new shape
# from the arrays themselves (rather than hard-coding the sample counts) keeps
# this cell working if the dataset size or the train/test cutoff changes.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

print ("Shape of")
print ("X_train:", X_train.shape)
print ("Y_train:", Y_train.shape)
print ("X_test:", X_test.shape)
print ("Y_test:", Y_test.shape)

In [0]:
def model(input_shape, model_name):
  """Build the two-branch convolutional classifier.

  The input feeds two parallel Conv2D branches (20 filters with a 1x20 kernel
  and 500 filters with an 8x20 kernel, both with stride 1). Each branch is
  flattened, the two are concatenated, passed through ReLU, and fed to a
  3-unit dense layer named 'fc'.

  Args:
    input_shape: Shape of a single input sample, passed to ``Input``.
    model_name: Name given to the returned Keras ``Model``.

  Returns:
    An uncompiled Keras ``Model``.
  """
  inputs = Input(input_shape)

  # Branch with the 1x20 kernel.
  narrow_branch = Conv2D(20, (1, 20), strides=(1, 1))(inputs)
  narrow_branch = Flatten()(narrow_branch)

  # Branch with the 8x20 kernel.
  wide_branch = Conv2D(500, (8, 20), strides=(1, 1))(inputs)
  wide_branch = Flatten()(wide_branch)

  merged = concatenate([narrow_branch, wide_branch])
  activated = Activation('relu')(merged)

  # NOTE(review): the output uses sigmoid while this notebook compiles the
  # network with categorical_crossentropy -- confirm whether softmax (or a
  # binary loss) was intended.
  outputs = Dense(3, activation='sigmoid', name='fc')(activated)
  return Model(inputs=inputs, outputs=outputs, name=model_name)

In [0]:
# NOTE: this assignment rebinds the name `model` from the builder function to
# the network it returns, so the cell that defines the builder has to be re-run
# before this cell can be executed a second time.
model = model((11, 20, 1), 'model')

# `learning_rate` is the supported keyword for the optimizer step size; the
# older `lr` alias is deprecated and rejected by current Keras releases.
model.compile(
  optimizer=keras.optimizers.Adadelta(learning_rate=4.0),
  loss="categorical_crossentropy",
  metrics=["accuracy"],
)

In [0]:
# Train on the training split for 10 epochs with mini-batches of 2048 samples.
model.fit(
  x=X_train,
  y=Y_train,
  batch_size=2048,
  epochs=10,
)

In [0]:
# Score the trained network on the held-out split; the report below reads
# entry 0 of the result as the loss and entry 1 as the accuracy.
preds = model.evaluate(x=X_test, y=Y_test)

# A leading empty string reproduces the blank separator line before the report.
print(
  "",
  "Loss = " + str(preds[0]),
  "Test Accuracy = " + str(preds[1]),
  sep="\n",
)

In [0]:
# Spot-check a single training sample: show its encoded input, its one-hot
# label, and the network's prediction for it.
index = 39494

print("X", X_train[index, :, :, 0], sep="\n")
print("Y", Y_train[index], sep="\n")
# predict expects a batch, so the sample is reshaped to a batch of one.
print("Predicted", model.predict(np.reshape(X_train[index], (1, 11, 20, 1))))

In [0]:
# Display the Keras summary of the trained network.
model.summary()