# Setup

In [1]:
%matplotlib inline

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,trange
from Bio.Seq import *

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# ROOT_DIR - root directory
ROOT_DIR = os.getcwd()+'/'

# FEATURE_DIR - directory where feature dataframes are saved
DATA_DIR = ROOT_DIR + 'dataframes/'

In [3]:
DF_prest=pd.read_csv(DATA_DIR+'DF_prest.csv',index_col=0)
print(len(DF_prest))
DF_prest.head()

45206


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len,true_nt_seq
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139,ATTATGACAGCTCCCTCCAGTTTTGAGCAGTTTAAAGTGGCAATGA...
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144,ACCTACTATGCCTGGAAGCATGAGCTGCTGGGCTCTGGCACCTGCC...
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136,TCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTG...
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123,GCGAGAGCATTAAATGAAAGCAAAAGAGTTAATAATGGCAACACGG...
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124,CATCGGAAAGAGCCTGGGGCAAGGCTGGAGGCCACAAGAGGAGCTG...


In [4]:
target = 'expressed'
DF_prest.loc[:,target] = DF_prest.conc_cf > DF_prest.conc_cf.mean()

In [5]:
min_expressed = np.percentile(DF_prest['conc_cf'],75)
max_not_expressed = np.percentile(DF_prest['conc_cf'],25)

DF_prest_filtered = DF_prest[(DF_prest['conc_cf'] > min_expressed) | (DF_prest['conc_cf'] < max_not_expressed)]

print('Number expressed:',len(DF_prest_filtered[DF_prest_filtered[target]]))
print('Number not expressed:',len(DF_prest_filtered[DF_prest_filtered[target]==False]))
print('Min expression cutoff:',min_expressed)
print('Max non-expression:',max_not_expressed)

Number expressed: 11301
Number not expressed: 11302
Min expression cutoff: 7.2576
Max non-expression: 3.269225


In [6]:
train_inds = random.sample(DF_prest_filtered.index,int(len(DF_prest_filtered)*.6))
test_inds = random.sample(set(DF_prest_filtered.index) - set(train_inds), int(len(DF_prest_filtered)*.3))
val_inds = set(DF_prest_filtered.index) - set(train_inds) - set(test_inds)

In [7]:
print('Test-Train-Val Ratio: %.1f - %.1f - %.1f'%(len(train_inds)/len(DF_prest_filtered),
                                                  len(test_inds)/len(DF_prest_filtered),
                                                  len(val_inds)/len(DF_prest_filtered)))

Test-Train-Val Ratio: 0.6 - 0.3 - 0.1


In [8]:
train_vals = DF_prest_filtered.loc[train_inds,'expressed'].values.astype(np.float32).reshape((len(train_inds),1))
test_vals = DF_prest_filtered.loc[test_inds,'expressed'].values.astype(np.float32).reshape((len(test_inds),1))
val_vals = DF_prest_filtered.loc[val_inds,'expressed'].values.astype(np.float32).reshape((len(val_inds),1))

## Encode nucleotide sequences

In [9]:
onehot_nuc = {'A':[1,0,0,0],'G':[0,1,0,0],'C':[0,0,1,0],'T':[0,0,0,1]}

def encode(seq,padlength=0):
    raw_encoding = np.array([onehot_nuc[char] for char in list(seq)],dtype=np.float32)
    return np.vstack([raw_encoding,np.zeros((padlength - len(seq),4),dtype=np.float32)])

In [10]:
maxlen = int(np.ceil(max([len(seq) for seq in DF_prest_filtered.nt_seq])/4)*4)

In [11]:
encoded_train = np.vstack(([encode(seq,maxlen).flatten() for seq in DF_prest_filtered.loc[train_inds,'nt_seq']]))
encoded_test = np.vstack(([encode(seq,maxlen).flatten() for seq in DF_prest_filtered.loc[test_inds,'nt_seq']]))
encoded_val = np.vstack(([encode(seq,maxlen).flatten() for seq in DF_prest_filtered.loc[val_inds,'nt_seq']]))

In [12]:
print(encoded_train.shape)
print(encoded_test.shape)
print(encoded_val.shape)

(13561, 2256)
(6780, 2256)
(2262, 2256)


In [13]:
train_set = tf.contrib.learn.datasets.base.Dataset(encoded_train,train_vals)
test_set = tf.contrib.learn.datasets.base.Dataset(encoded_test,test_vals)
val_set = tf.contrib.learn.datasets.base.Dataset(encoded_val,val_vals)

In [14]:
data = tf.contrib.learn.datasets.base.Datasets(train_set,test_set,val_set)

# Create Neural Network

In [15]:
import tensorflow as tf

In [29]:
# Parameters
learning_rate = 0.0001
training_iters = 200000
batch_size = 128
display_step = 10

# Network Parameters
n_input = 4*maxlen # Nucleotide data input (img shape: 4*561)
dropout = 0.5 # Dropout, probability to keep units

# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None,1])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)

In [17]:
# Shape [#batches, height (1), # nucleotides, depth]
x0 = tf.reshape(encoded_train,shape=[-1,1,maxlen,4])
x0.get_shape()

TensorShape([Dimension(13561), Dimension(1), Dimension(564), Dimension(4)])

In [18]:
# Shape: [height (1), width, depth (4), number of convs]
W0 = np.zeros((1,3,4,5),dtype=np.float32)
W0.shape

(1, 3, 4, 5)

In [19]:
# Shape: [# batches, height, width, #conv]
conv_out = tf.nn.conv2d(x0,W0,strides=[1,1,1,1],padding='SAME')
conv_out.get_shape()

TensorShape([Dimension(13561), Dimension(1), Dimension(564), Dimension(5)])

In [20]:
tf.nn.max_pool(conv_out,ksize=[1,1,11,1],strides=[1,1,11,1],padding='SAME')

<tf.Tensor 'MaxPool:0' shape=(13561, 1, 52, 5) dtype=float32>

#### conv2d:
* Input x: Tensor [# samples, 1 (height), #nucleotides, 4(channels)]
* Convolution Matrix W: Tensor [1 (height), conv width, 4(channels), #convolutions(1)]
* Strides: [1,1,stride width, 1]

#### max_pool
* Input x: [1, 1 (height), pool width, #convolutions(1)]
* Pool size ksize: [1,1,pool width, 1]
* Stride size strides: [1,1,pool width, 1]

In [30]:
# Create some wrappers for simplicity
def conv1d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, 1, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)


def maxpool1d(x, k=2):
    # MaxPool1D wrapper
    return tf.nn.max_pool(x, ksize=[1, 1, k, 1], strides=[1, 1, k, 1],
                          padding='SAME')


# Create model
def conv_net(x, weights, biases, dropout):
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 1, maxlen, 4])

    # Convolution Layer
    conv1 = conv1d(x, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool1d(conv1, k=2)

    # Convolution Layer
    conv2 = conv1d(conv1, weights['wc2'], biases['bc2'])
    # Max Pooling (down-sampling)
    conv2 = maxpool1d(conv2, k=2)

    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)

    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out

In [31]:
# Store layers weight & bias
weights = {
    # 1x15 conv, 4 inputs, 32 outputs
    'wc1': tf.Variable(tf.random_normal([1, 15, 4, 32])),
    # 1x15 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([1, 15, 32, 64])),
    # fully connected, 564/4*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([int(564/4)*64, 1024])),
    # 1024 inputs, 1 output
    'out': tf.Variable(tf.random_normal([1024,1]))
}

biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([1]))
}

# Construct model
pred = conv_net(x, weights, biases, keep_prob)

# Define loss and optimizer
cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.greater(pred,0),tf.greater(y,0))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.initialize_all_variables()

In [32]:
input_data = tf.constant(data.train.data)
input_target = tf.constant(data.train.target)
slice_x, slice_y = tf.train.slice_input_producer([input_data, input_target])
prebatch_x, prebatch_y = tf.train.batch([slice_x, slice_y], batch_size=batch_size)

In [33]:
# Launch the graph
with tf.Session() as sess:
    # Initialize all variables
    sess.run(init)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x = sess.run(prebatch_x)
        batch_y = sess.run(prebatch_y)
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                                       keep_prob: dropout})
        if step % display_step == 0:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
                                                              y: batch_y,
                                                              keep_prob: 1.})
            print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy for 256 mnist test images
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={x: data.test.data[:256],
                                      y: data.test.target[:256],
                                      keep_prob: 1.}))

Iter 1280, Minibatch Loss= 1337338.250000, Training Accuracy= 0.59375
Iter 2560, Minibatch Loss= 1305518.500000, Training Accuracy= 0.60156
Iter 3840, Minibatch Loss= 1891828.000000, Training Accuracy= 0.46875
Iter 5120, Minibatch Loss= 2059806.125000, Training Accuracy= 0.48438
Iter 6400, Minibatch Loss= 2218297.500000, Training Accuracy= 0.44531
Iter 7680, Minibatch Loss= 1599985.250000, Training Accuracy= 0.50781
Iter 8960, Minibatch Loss= 1494785.500000, Training Accuracy= 0.53125
Iter 10240, Minibatch Loss= 1510159.000000, Training Accuracy= 0.57031
Iter 11520, Minibatch Loss= 1754207.250000, Training Accuracy= 0.43750
Iter 12800, Minibatch Loss= 1528396.375000, Training Accuracy= 0.48438
Iter 14080, Minibatch Loss= 2033380.375000, Training Accuracy= 0.50000


KeyboardInterrupt: 

In [25]:
# # Create the graph, etc.
# init_op = tf.initialize_all_variables()

# # Create a session for running operations in the Graph.
# sess = tf.Session()

# # Initialize the variables (like the epoch counter).
# sess.run(init_op)

# # Start input enqueue threads.
# coord = tf.train.Coordinator()
# threads = tf.train.start_queue_runners(sess=sess, coord=coord)

# try:
#     while not coord.should_stop():
#         # Create training batch
#         batch_x, batch_y = tf.train.batch([slice_x, slice_y], batch_size=batch_size)
#         batch_x = sess.run(batch_x)
#         batch_y = sess.run(batch_y)
#         # Run optimization op (backprop)
#         sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
#                                        keep_prob: dropout})
#         if step % display_step == 0:
#             # Calculate batch loss and accuracy
#             loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
#                                                               y: batch_y,
#                                                               keep_prob: 1.})
#             print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
#                   "{:.6f}".format(loss) + ", Training Accuracy= " + \
#                   "{:.5f}".format(acc))

# except tf.errors.OutOfRangeError:
#     print('Done training -- epoch limit reached')
# finally:
#     # When done, ask the threads to stop.
#     coord.request_stop()

# # Wait for threads to finish.
# coord.join(threads)
# sess.close()