In [14]:
import re
import os
import sys
import string
import pickle
import tempfile
import numpy  as np
import pandas as pd
from time import time
import tensorflow as tf
from nltk.util import ngrams
import tensorflow.contrib.rnn as rnn
import matplotlib.pyplot as plot

# Restrict TensorFlow's logger to messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

In [15]:
# Configuration -

# --- dataset size and thresholds (scaled relative to a 500000-row reference) ---
num_ex = 300000                                  # rows read from the raw table / filtered CSV
famTHR = 200 * num_ex / 500000                   # minimum number of examples a family must have
min_after_dequeue = 10000 * num_ex / 500000

# --- training / corpus settings ---
numgrams = 3  # for Glove
num_epochs = 1
batch_size = 1000
dummy = "end end end end end"

# --- residue alphabet: index 0 is the padding symbol '0', then 'A'..'Z' ---
alphabet = ['0'] + list(string.ascii_uppercase)
vocab_size = len(alphabet)

# --- file locations ---
filter_file = 'data/filtered_seqs_all.csv'
corpus_file = 'data/filtered_seq_corpus.txt'
num_classes_file = 'uniprot_num_classes'

# filepath='processed_uniprot.tfrecord'

In [34]:
def printbuf(x):
    """Write ``x`` to stdout followed by a carriage return.

    The trailing '\\r' moves the cursor back to the start of the line so the
    next write overwrites this one (used for in-place progress messages).
    """
    for chunk in (x, '\r'):
        sys.stdout.write(chunk)


In [16]:
def save_obj(obj, name, overwrite=1):
    """Pickle ``obj`` to ``data/<name>.pkl``.

    Args:
        obj: any picklable object.
        name: base name of the target file (without directory or extension).
        overwrite: when truthy (the default) an existing file is replaced;
            when falsy an existing file is left untouched.

    Returns:
        ``[]`` if the file already existed and was not overwritten,
        otherwise ``None``.
    """
    filename = 'data/' + name + '.pkl'
    # Check the real target path, and skip only when overwriting is NOT wanted.
    if not overwrite and os.path.exists(filename):
        return []
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    print('File saved to ' + filename)

def load_obj(name):
    """Unpickle and return the object stored in ``data/<name>.pkl``.

    Raises whatever ``open`` raises when the file does not exist.
    NOTE(review): ``pickle.load`` can execute arbitrary code contained in the
    file, so only use this on files this notebook wrote itself.
    """
    pkl_path = ''.join(('data/', name, '.pkl'))
    with open(pkl_path, 'rb') as handle:
        # The message is emitted before the actual unpickling happens.
        print('Loaded file ' + pkl_path)
        return pickle.load(handle)

def saveFilteredData(max_fam_count=3400):
    """Build the filtered, integer-encoded sequence table and persist it.

    Reads the first ``num_ex`` rows of ``./data/uniprot-all.tab.gz``, keeps rows
    that have both a sequence and a Pfam family, encodes residues as indices
    into ``alphabet`` and families as indices into the list of unique families
    (index 0 is reserved for the placeholder ``-1``), keeps only families whose
    example count lies in ``[famTHR, max_fam_count)``, sorts by sequence length
    and writes the result to ``filter_file``. The number of family codes is
    pickled under ``num_classes_file``.

    Args:
        max_fam_count: exclusive upper bound on examples per family
            (default 3400, the value previously hard-coded).

    Returns:
        None. Results are written to disk only.
    """
    fam = 'Cross-reference (Pfam)'
    seq = 'Sequence'
    main = pd.read_table("./data/uniprot-all.tab.gz", sep='\t', nrows=num_ex)
    # Keep only the first Pfam id of each row; missing values turn into the string 'nan'.
    main[fam] = main[fam].apply(lambda x: str(x).split(';')[0])
    keep = (main[fam] != 'nan') & main[fam].notnull() & main[seq].notnull()
    main = main[keep]
    alphafam = [-1] + list(main[fam].unique())
    num_classes = len(alphafam)
    # .copy(): the columns are rewritten below, which must not target a view of `main`.
    seq_records = main[[seq, fam, 'Length']].copy()
    seq_records.columns = ['seq', 'fam', 'length']
    # Dict lookups give the same codes as list.index() without a linear scan
    # per residue / per row.
    residue_code = {ch: i for i, ch in enumerate(alphabet)}
    family_code = {name: i for i, name in enumerate(alphafam)}
    seq_records['seq'] = seq_records['seq'].apply(lambda s: [residue_code[ch] for ch in s])
    seq_records['fam'] = seq_records['fam'].apply(lambda x: family_code[x])
    counts = seq_records.groupby(['fam']).size().reset_index(name='counts')
    #10801 -> 9254 families
    counts = counts[(counts['counts'] >= famTHR) & (counts['counts'] < max_fam_count)]
    #608 -> 555 families having count between 200 and 3400
    filtered_fams = list(counts['fam'])
    #323162 -> 302907 examples total satisfy it.
    filtered_seqs = seq_records[seq_records['fam'].isin(filtered_fams)]
    filtered_seqs = filtered_seqs.sort_values('length')
    # Save for future use. Note the 'seq' lists are written to CSV as text,
    # so readers of filter_file have to parse them back.
    save_obj(num_classes, num_classes_file)
    filtered_seqs.to_csv(filter_file, index=False)

    # # takes 3 seconds !
    # gb = seq_records.groupby('fam')    
    # family_wise_db = [gb.get_group(x) for x in gb.groups]

# Generate the filtered data only when it is missing, then ALWAYS load it from
# disk, so `num_classes` and `filtered_seqs` are defined on both paths
# (saveFilteredData() only writes files; it defines no globals).
if not os.path.exists(filter_file):
    print('File not found, preparing data again')
    p = time()
    saveFilteredData()
    print('time taken: ', time() - p)

num_classes = load_obj(num_classes_file)
filtered_seqs = pd.read_csv(filter_file, nrows=num_ex)
print("Loaded filtered_seqs")


Loaded file data/uniprot_num_classes.pkl
Loaded filtered_seqs


In [39]:
# genCorpus (below) turns every sequence into space-separated n-gram words (3-grams by default)
# and writes them in their original order, i.e. without disturbing their 'context'.
def func(x):
    """Return the letter at index ``x`` of ``alphabet``, printing ``x`` on the way.

    An index outside ``0 .. len(alphabet) - 1`` is reported with an "IndeX"
    message before the lookup is attempted (the lookup itself is not guarded).
    """
    # `>=`: len(alphabet) itself is already out of range.
    if x < 0 or x >= len(alphabet):
        print("IndeX", x)
    print(x)
    return alphabet[x]

def genCorpus(filtered_seqs, n=3):
    """Append the n-gram "words" of every sequence to ``corpus_file``.

    Each sequence (a list of indices into ``alphabet``) is split into
    overlapping n-grams; every n-gram is mapped back to letters and the words
    are written separated by single spaces, followed by one trailing space per
    sequence.

    Args:
        filtered_seqs: DataFrame with a 'seq' column holding either lists of
            ints or their text form (as produced by a CSV round trip).
        n: n-gram size.

    The file is opened in append mode, so the caller decides whether to
    truncate it first.
    """
    import ast  # local import: only needed to parse sequences stored as text

    total = filtered_seqs['seq'].size
    print('generating corpus : ')
    last_perc = -1
    with open(corpus_file, 'a') as f:
        for i, raw in enumerate(filtered_seqs['seq'], 1):
            # Sequences read back from CSV are the text of a list of ints.
            # literal_eval parses that without executing arbitrary code
            # (the previous eval() would run whatever the file contained).
            codes = raw if isinstance(raw, (list, tuple)) else ast.literal_eval(raw)
            words = (''.join(alphabet[c] for c in gram) for gram in ngrams(codes, n))
            f.write(' '.join(words))
            f.write(' ')
            # Report progress only when the integer percentage changes,
            # instead of writing to stdout for every single record.
            perc = (i * 100) // total
            if perc != last_perc:
                last_perc = perc
                printbuf('%d %% completed' % perc)
    print("Wrote corpus to file")

# Generate the corpus only when it is missing, then ALWAYS read it back so
# that `corpus` is defined on both paths.
if not os.path.exists(corpus_file):
    p = time()
    # genCorpus appends to the file, so start from an empty one.
    with open(corpus_file, 'w') as f:
        f.write('')
    genCorpus(filtered_seqs, numgrams)
    print('time taken: ', time() - p)

with open(corpus_file, 'r') as f:
    corpus = f.read()
print('loaded corpus')



generating corpus...
Wrote corpus to file
('time taken: ', 581.1159410476685)


In [None]:
class RnnForPfcModelOne:
    """LSTM sequence classifier built with the TensorFlow 1.x graph API.

    Pipeline: integer-encoded sequences -> one-hot (depth ``vocab_size``) ->
    single ``BasicLSTMCell`` run by ``tf.nn.dynamic_rnn`` -> dense layer ->
    softmax cross-entropy, trained with plain gradient descent.

    The constructor builds the graph, opens a session, initialises the
    variables and creates a summary writer at ``logs_path``.
    """

    def __init__(self,
                 batch_size,
                 num_classes=549,
                 hidden_units=100,
                 learning_rate=0.01,
                 logs_path='/tmp/tensorflow/logs'):
        """Build the graph.

        Args:
            batch_size: fixed number of examples per fed batch (the length and
                label placeholders are sized with it).
            num_classes: width of the output layer.
            hidden_units: LSTM state size.
            learning_rate: step size for gradient descent.
            logs_path: directory for TensorBoard summaries.
        """
        # Dense output layer parameters: hidden_units -> num_classes.
        self.weights = tf.Variable(tf.random_uniform(shape=[hidden_units, num_classes], maxval=1))
        self.biases = tf.Variable(tf.random_uniform(shape=[num_classes]))
        self.rnn_fcell = rnn.BasicLSTMCell(num_units=hidden_units,
                                           forget_bias=1.0,
                                           activation=tf.tanh)

        # Fed via feed_dict. int32 instead of uint8: uint8 wraps above 255 and
        # dynamic_rnn expects an int32/int64 sequence_length vector.
        self.len_data = tf.placeholder(tf.int32, [batch_size], name="Lengths")
        # Fed via feed_dict: batch_size * no_of_time_steps residue indices
        # (indices stay below vocab_size, so uint8 is sufficient here).
        self.x_input = tf.placeholder(tf.uint8, [None, None], name='x_ip')
        # batch_size * no_of_time_steps * vocab_size
        self.x_input_o = tf.one_hot(indices=self.x_input,
                                    depth=vocab_size,
                                    on_value=1.0,
                                    off_value=0.0,
                                    axis=-1)

        with tf.name_scope('Model'):
            self.outputs, self.states = tf.nn.dynamic_rnn(self.rnn_fcell,
                                                          self.x_input_o,
                                                          sequence_length=self.len_data,
                                                          dtype=tf.float32)

        # self.outputs is batch_size * no_of_time_steps * hidden_units.
        # With sequence_length given, dynamic_rnn zeroes the outputs past each
        # sequence's length, so outputs[:, -1, :] is all zeros for every padded
        # sequence. The final state is carried through the padding instead, and
        # its `h` part is the output at the last real time step.
        self.outputs_t = tf.reshape(self.states.h, [-1, hidden_units])
        # Single dense layer producing the class logits.
        self.y_predicted = tf.matmul(self.outputs_t, self.weights) + self.biases

        # Fed via feed_dict: one class id per example. int32 instead of uint8
        # because class ids can exceed 255 (the default num_classes is 549).
        self.y_input = tf.placeholder(tf.int32, [batch_size], name='y_ip')
        self.y_input_o = tf.one_hot(indices=self.y_input,
                                    depth=num_classes,
                                    on_value=1.0,
                                    off_value=0.0,
                                    axis=-1)

        # y_predicted and y_input_o are both batch_size * num_classes.
        with tf.name_scope('Loss'):
            # Mean over the batch: tf.summary.scalar below needs a scalar, and
            # the per-example vector returned by the cross-entropy op is not one.
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=self.y_predicted,
                                                        labels=self.y_input_o))
        with tf.name_scope('Trainer'):
            self.trainer = tf.train.GradientDescentOptimizer(learning_rate).minimize(self.loss)

        self.get_equal = tf.equal(tf.argmax(self.y_input_o, 1), tf.argmax(self.y_predicted, 1))
        with tf.name_scope('Accuracy'):
            self.accuracy = tf.reduce_mean(tf.cast(self.get_equal, tf.float32))

        # Summaries for TensorBoard, merged into a single op for sess.run.
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("accuracy", self.accuracy)
        self.merged_summary_op = tf.summary.merge_all()

        # Create the session only after the whole graph exists.
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

    def predict(self, x, y, len_data):
        """Return the class logits (batch_size * num_classes) for batch ``x``.

        ``y`` is accepted for signature compatibility but is not needed to
        compute the logits and is therefore not fed.
        """
        return self.sess.run(self.y_predicted,
                             feed_dict={self.x_input: x, self.len_data: len_data})

    def optimize(self, x, y, len_data, summary_index):
        """Run one gradient-descent step and log summaries.

        Args:
            x, y, len_data: padded sequences, class ids and true lengths.
            summary_index: global step under which the summary is recorded.

        Returns:
            The mean batch loss fetched in the same run as the update.
        """
        # One sess.run for loss, summaries and the update, instead of a
        # second full forward pass just for the training op.
        c, summary, _ = self.sess.run(
            [self.loss, self.merged_summary_op, self.trainer],
            feed_dict={self.x_input: x, self.y_input: y, self.len_data: len_data})
        self.summary_writer.add_summary(summary, summary_index)
        return c

    def cross_validate(self, x, y, len_data):
        """Return the classification accuracy on the given batch."""
        return self.sess.run(self.accuracy,
                             feed_dict={self.x_input: x, self.y_input: y, self.len_data: len_data})

    def close_summary_writer(self):
        """Flush and close the TensorBoard summary writer."""
        self.summary_writer.close()

In [None]:

def pad(x, max_len):
    """Right-pad the 1-D sequence ``x`` with zeros up to length ``max_len``.

    Args:
        x: sequence of numbers (list or 1-D array).
        max_len: target length; must be >= ``len(x)``.

    Returns:
        A numpy array of length ``max_len``: ``x`` followed by zeros.

    Raises:
        ValueError: from ``np.pad`` if ``x`` is longer than ``max_len``.
    """
    # np.pad is the public API; np.lib.pad is an alias that newer NumPy
    # releases no longer expose. Nothing is padded in front, so a single
    # fill value of 0 says the same as the old (-1, 0) pair.
    return np.pad(x, (0, max_len - len(x)), 'constant', constant_values=0)

class SimpleDataIterator():
    """Holds a DataFrame in shuffled order together with epoch/batch counters.

    Attributes set here: ``df`` (shuffled frame with a fresh 0..n-1 index),
    ``size`` (row count), ``epochs``, ``batch_no`` and ``cursor`` (all start at 0).
    """

    def __init__(self, df):
        self.df, self.size = df, len(df)
        self.epochs = self.batch_no = 0
        self.shuffle()

    def shuffle(self):
        """Randomly reorder all rows, renumber the index and rewind the cursor."""
        reordered = self.df.sample(frac=1)
        self.df = reordered.reset_index(drop=True)
        self.cursor = 0

class PaddedDataIterator(SimpleDataIterator):
    """Iterator that returns batches whose sequences are zero-padded to equal length."""

    def next_batch(self, n):
        """Return the next ``n`` rows with the 'seq' column padded.

        When fewer than ``n`` rows remain, the epoch counter is incremented,
        the batch counter is reset and the data is reshuffled before the batch
        is taken (the leftover rows of the old epoch are skipped).

        Args:
            n: batch size.

        Returns:
            A new DataFrame (a copy, not a view of the iterator's frame) whose
            'seq' entries are numpy arrays padded with trailing zeros to the
            largest 'length' value in the batch.
        """
        import ast  # local import: only needed to parse sequences stored as text

        if self.cursor + n > self.size:
            self.epochs += 1
            self.batch_no = 0
            self.shuffle()
        # .copy() so the padded column is not written into a slice of self.df.
        res = self.df.iloc[self.cursor:self.cursor + n].copy()
        self.cursor += n
        self.batch_no += 1

        # Sequences loaded from CSV arrive as the text of a list of ints and
        # must be parsed before padding; in-memory lists/arrays pass through.
        seqs = [s if isinstance(s, (list, tuple, np.ndarray)) else ast.literal_eval(s)
                for s in res['seq']]
        # Pad sequences with 0s so they are all the same length.
        maxlen = max(res['length'])
        # A real list (not a lazy map object) so the assignment also works on Python 3.
        res['seq'] = [pad(s, maxlen) for s in seqs]
        return res

# Batch iterator over the filtered sequences; constructing it shuffles the rows once.
it = PaddedDataIterator(filtered_seqs)

In [None]:
# Guard flag for the training cell: the model is built only while this is 0,
# after which the training cell sets it to 1.
runonce=0

In [None]:
# Build the model only once per kernel session; re-running this cell keeps
# training the existing model.
if not runonce:
    model = RnnForPfcModelOne(batch_size, num_classes=num_classes)
    runonce = 1

# Number of full batches in one pass over the data. The summary step below
# counts batches, so the per-epoch offset must be in batches as well
# (it used to be batch_size, i.e. a number of examples).
batches_per_epoch = it.size // batch_size

while it.epochs != num_epochs:
    df = it.next_batch(batch_size)
    batch_x, batch_y, len_data = map(list, (df['seq'], df['fam'], df['length']))
    step = it.batch_no + batches_per_epoch * it.epochs
    cost = model.optimize(batch_x, batch_y, len_data, step)
    accuracy_known = model.cross_validate(batch_x, batch_y, len_data)
    print("Iteration number, batch number, Cost : ", it.epochs, it.batch_no, cost,
          " Training data accuracy : ", accuracy_known)

# Make sure the recorded summaries reach disk; flush (not close) so the writer
# stays usable if this cell is run again.
model.summary_writer.flush()