In [1]:
from keras.layers.convolutional import Convolution1D
from keras.layers.core import Dense, Activation, Flatten, RepeatVector
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import GRU, LSTM
from keras.layers import Dense, merge, Input, Dropout
from keras.layers import concatenate
from keras.optimizers import SGD
from keras.utils import np_utils, plot_model
from keras.models import Model, load_model, Sequential

import pandas as pd
import numpy as np
from keras.callbacks import CSVLogger, Callback, ModelCheckpoint

import os

from collections import OrderedDict

import csv

from collections import Iterable

from tqdm import tqdm_notebook
from tqdm import tqdm

import pickle

import uniprot


from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

Using TensorFlow backend.


In [2]:
# ---- Training settings ----
LEARNING_RATE = 0.01   # learning rate for the optimisation
NUM_EPOCHS = 100       # how many epochs the net is trained for
BATCH_SIZE = 100       # samples per batch
GRAD_CLIP = 100        # all gradients above this value are to be clipped

# ---- Sequence / model dimensions ----
MAX_LENGTH = 300       # maximum protein length
SEQ_NDIMS = 25         # channels per amino acid (X, B, Z)
LATENT_REP_SIZE = 300  # size of the latent representation

In [3]:
# Read the tab-separated interaction table and keep only the three columns used below.
prots = pd.read_csv(
    '../DeepPPI/dataset/Human/human_all_csv.tab',
    sep='\t',
).loc[:, ['protA', 'protB', 'Interaction']]

In [4]:
# Every distinct identifier that appears in either the protA or the protB column.
prots_to_download = set(prots['protA']).union(prots['protB'])

In [None]:
# Reuse the pickled metadata in 'uniprot_dict.bin' when it exists; otherwise
# obtain it from uniprot.batch_uniprot_metadata and write the pickle for next time.
# Files are opened with `with` so the handles are closed even if (un)pickling fails.
# NOTE: pickle.load can execute arbitrary code - only load a cache file that
# this notebook produced itself.
if os.path.exists('uniprot_dict.bin'):
    with open('uniprot_dict.bin', 'rb') as cache_file:
        uniprot_data = pickle.load(cache_file)
else:
    uniprot_data = uniprot.batch_uniprot_metadata(prots_to_download, 'cache')
    with open('uniprot_dict.bin', 'wb') as cache_file:
        pickle.dump(uniprot_data, cache_file)

In [None]:
# Keep only the rows whose protA and protB are both keys of uniprot_data.
# A vectorised membership mask replaces the per-row Python loop, and `filtered`
# is a DataFrame from the start instead of first being a dict of lists.
# The index is reset so the result has a fresh 0..n-1 index, as before.
known_proteins = set(uniprot_data)
has_both_proteins = (
    prots['protA'].isin(known_proteins) & prots['protB'].isin(known_proteins)
)
filtered = prots.loc[has_both_proteins].reset_index(drop=True)

In [None]:
# Remove, in place, every uniprot_data entry whose sequence is longer than MAX_LENGTH.
# The keys are snapshotted first because the dict is modified while looping.
prots_names = list(uniprot_data.keys())
for accession in prots_names:
    if len(uniprot_data[accession]['sequence']) <= MAX_LENGTH:
        continue
    del uniprot_data[accession]

In [None]:
class Data:
    """Encode protein sequences as zero-padded one-hot arrays and split them.

    Typical use: ``load(sequences)`` followed by ``split_data()``.
    """

    def __init__(self):
        # Encoded sequences; None until load() has been called.
        self.inputs = None
        # Residue -> one-hot table, filled on first use by _aa_table().
        self._aa_cache = None

    def load(self, data):
        """Encode every sequence in ``data`` and store the result in ``self.inputs``."""
        self.inputs = self.process_data(data)

    @staticmethod
    def convert_to_one_hot(amino_acid):
        """Return an int32 vector of length SEQ_NDIMS with a 1 at index ``amino_acid``."""
        one_hot = np.zeros(SEQ_NDIMS, dtype=np.int32)
        one_hot[amino_acid] = 1
        return one_hot

    def load_aa(self):
        """Read 'aa_traits.tsv' and build the residue -> one-hot mapping.

        The first line of the file is skipped (presumably a header - confirm
        against the file). For each following non-blank line, the first
        whitespace-separated field becomes a key, and its one-hot index is the
        0-based position of that line among the non-blank data lines.

        Returns:
            dict mapping the first field of each line to its one-hot vector.
        """
        table = dict()
        with open('aa_traits.tsv') as handle:
            handle.readline()
            index = 0
            for line in handle:
                fields = line.split()
                if not fields:
                    # A blank line carries no residue; skip it instead of
                    # failing on fields[0].
                    continue
                table[fields[0]] = self.convert_to_one_hot(index)
                index += 1
        return table

    def _aa_table(self):
        """Return the residue table, reading the file only on the first call."""
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, '_aa_cache', None)
        if cached is None:
            cached = self.load_aa()
            self._aa_cache = cached
        return cached

    def process_single(self, protein, max_length=MAX_LENGTH):
        """One-hot encode one sequence and zero-pad it to ``max_length`` rows.

        Args:
            protein: iterable of residue symbols; each must be a key of the
                table returned by load_aa().
            max_length: number of rows in the returned array.

        Returns:
            int32 array of shape (max_length, SEQ_NDIMS); rows past the end of
            the sequence are all zero. An empty sequence gives an all-zero array.

        Raises:
            ValueError: if the sequence is longer than ``max_length``.
            KeyError: if a residue symbol is not in the table.
        """
        aa = self._aa_table()
        residues = list(protein)
        if len(residues) > max_length:
            raise ValueError(
                'protein of length %d exceeds max_length=%d'
                % (len(residues), max_length))

        # Filling a pre-zeroed array performs the padding implicitly and also
        # handles the empty sequence.
        encoded = np.zeros((max_length, SEQ_NDIMS), dtype=np.int32)
        for position, residue in enumerate(residues):
            encoded[position] = aa[residue]
        return encoded

    def process_data(self, inputs):
        """Encode each sequence in ``inputs`` and stack the results in one array."""
        return np.array([self.process_single(protein) for protein in inputs])

    def split_data(self):
        """Split ``self.inputs`` by position into train, validation and test.

        The last 10% of the samples (rounded down) form the test set, the 10%
        before them the validation set, and the remainder the training set.
        No shuffling is done here. With fewer than 10 samples the validation
        and test sets are empty and every sample is in the training set.

        Returns:
            (train, validation, test) slices of ``self.inputs``.
        """
        total = self.inputs.shape[0]
        holdout = int(total * 0.1)
        # Explicit bounds avoid the [:-0] pitfall, which would yield an empty
        # train set when holdout == 0.
        validation_start = total - 2 * holdout
        test_start = total - holdout

        train = self.inputs[:validation_start]
        validation = self.inputs[validation_start:test_start]
        test = self.inputs[test_start:]

        return train, validation, test


class AutoEncoder:
    """Sequence autoencoder built from convolution, LSTM and dense layers.

    After construction:
      * ``self.input``       - the Input tensor of shape (MAX_LENGTH, SEQ_NDIMS)
      * ``self.encoder``     - the latent tensor produced by build_encoder
      * ``self.decoder``     - the per-position softmax tensor from build_decoder
      * ``self.autoencoder`` - the compiled Model mapping input to decoder output
    """

    def __init__(self):
        self.input = Input(shape=(MAX_LENGTH, SEQ_NDIMS))

        self.encoder = self.build_encoder(input=self.input)
        self.decoder = self.build_decoder(self.encoder, LATENT_REP_SIZE)

        self.autoencoder = Model(self.input, self.decoder)

        # NOTE(review): the optimizer is given by name, so the LEARNING_RATE and
        # GRAD_CLIP constants defined in the notebook are not applied here.
        self.autoencoder.compile(optimizer='Adam',
                                 loss='categorical_crossentropy',
                                 metrics=['accuracy'])

    @staticmethod
    def build_encoder(input):
        """Build the encoder graph on top of ``input`` and return the latent tensor.

        Two 1-D convolutions feed a forward and a ``go_backwards`` LSTM whose
        outputs are concatenated, flattened and projected by a linear Dense
        layer with LATENT_REP_SIZE units.
        """
        # NOTE(review): no padding argument is given, so the kernel_size=3
        # convolution presumably shortens the sequence axis - confirm.
        enc = Convolution1D(filters=30, kernel_size=3)(input)
        enc = Convolution1D(filters=5, kernel_size=1)(enc)

        enc_lstm_for = LSTM(128, return_sequences=True, name='enc_lstm_for')(enc)
        # NOTE(review): per the Keras docs go_backwards=True returns the
        # sequence reversed, so the two outputs joined below are presumably
        # not time-aligned - confirm this is intended.
        enc_lstm_back = LSTM(128, return_sequences=True, go_backwards=True, name='enc_lstm_back')(enc)
        # concatenate() replaces the legacy merge(..., mode='concat') call;
        # both join along the last axis by default.
        enc = concatenate([enc_lstm_for, enc_lstm_back])

        enc = Flatten(name='flatten')(enc)
        enc = Dense(units=LATENT_REP_SIZE, name='latent', activation='linear')(enc)

        return enc

    @staticmethod
    def build_decoder(latent, latent_size=LATENT_REP_SIZE, max_length=MAX_LENGTH):
        """Build the decoder graph on top of ``latent`` and return its output tensor.

        The latent vector is repeated ``max_length`` times, passed through a
        forward and a ``go_backwards`` LSTM whose outputs are concatenated, and
        mapped at every position to a softmax over SEQ_NDIMS channels.

        ``latent_size`` is accepted for backward compatibility but is not used.
        """
        dec = RepeatVector(max_length, name='repeat_vector')(latent)

        dec_lstm_for = LSTM(128, return_sequences=True, name='dec_lstm_for')(dec)
        dec_lstm_back = LSTM(128, return_sequences=True, go_backwards=True, name='dec_lstm_back')(dec)
        # Same replacement of the legacy merge() call as in build_encoder.
        dec = concatenate([dec_lstm_for, dec_lstm_back])

        return TimeDistributed(Dense(SEQ_NDIMS, activation='softmax'), name='decoded')(dec)


In [None]:
# Collect the 'sequence' field of every entry still present in uniprot_data.
prots_to_train = [entry['sequence'] for entry in uniprot_data.values()]

In [None]:
# Encode the training sequences; the resulting array is stored in data.inputs.
data = Data()
data.load(prots_to_train)

In [None]:
# Slice the encoded sequences into train / validation / test subsets.
train, validation, test = data.split_data()

In [None]:
# Build and compile the network; the trainable Keras model is model.autoencoder.
model = AutoEncoder()

In [None]:
# NOTE: this overrides the BATCH_SIZE = 100 assigned in the configuration cell;
# the training call below reads this value.
BATCH_SIZE = 64

In [None]:
# Callbacks: a tab-separated training log and a full-model checkpoint after every epoch.
csv_logger = CSVLogger('logs.tsv', separator='\t')
# monitor was misspelled 'var_loss'; the validation loss is reported as 'val_loss'.
checkpoints = ModelCheckpoint('checkpoints/epoch_{epoch:02d}.hdf5',
                              monitor='val_loss', verbose=1, save_best_only=False,
                              save_weights_only=False, mode='auto', period=1)

# Train the autoencoder to reproduce its own input.
# validation_split is no longer passed: validation_data is given explicitly and
# takes precedence over it.
# TODO: the callbacks above are still disabled because they failed when passed
# to fit(). Possibly the 'checkpoints/' directory does not exist - investigate
# and re-enable.
model.autoencoder.fit(train, train,
                      batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                      # callbacks=[csv_logger, checkpoints],
                      validation_data=(validation, validation),
                      shuffle=True, verbose=1)