# Classification on Splice-junction Gene Sequences 


In [20]:
#Imports
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Values fed to the boolean "mode" placeholder of the models below; that
# placeholder drives the `training` argument of their dropout layers.
MY_MODE_TRAINING = True
MY_MODE_PREDICTION = False


### Load the dataset

In [21]:
def load_data(file_path) :
    """Read the splice-junction data file into a DataFrame.

    The file is parsed as header-less CSV and its three columns are named
    ``classlabel``, ``name`` and ``sequence``.

    Args:
      file_path: path (or file-like object) handed to ``pd.read_csv``.

    Returns:
      A ``pandas.DataFrame`` with the columns ``classlabel``, ``name`` and
      ``sequence``.

    Raises:
      ValueError: if the parsed file does not have exactly three columns.
    """
    df = pd.read_csv(file_path, header=None)
    # Naming the columns after parsing makes pandas reject a file whose
    # column count is not exactly three.
    df.columns = ['classlabel', 'name', 'sequence']
    return df

### Pre-process the dataset
* Apply one-hot-encoding to input data
* Take 20% as test data 

In [22]:
def preprocess_data(df) :
    """Encode labels and sequences, then split them into train and test sets.

    Class labels are integer-encoded with ``LabelEncoder``. Every sequence is
    stripped of surrounding whitespace and each of its symbols is replaced by
    an 8-element one-hot code; the codes are concatenated into one flat
    float32 vector per sequence. As a side effect the vectors are stored in a
    new ``seqvec`` column of ``df``.

    Args:
      df: DataFrame with at least the columns ``classlabel`` and ``sequence``.

    Returns:
      ``(X_train, y_train, X_test, y_test)`` from a stratified shuffle split
      that holds out 20% of the samples as test data.

    Raises:
      KeyError: if a sequence contains a symbol other than A, G, C, T, D, N,
        S or R.
    """
    # Integer-encode the class labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['classlabel'].values)

    # One-hot code of every symbol that may occur in a sequence
    one_hot = {
        "A": [0, 0, 0, 0, 0, 0, 0, 1],
        "G": [0, 0, 0, 0, 0, 0, 1, 0],
        "C": [0, 0, 0, 0, 0, 1, 0, 0],
        "T": [0, 0, 0, 0, 1, 0, 0, 0],
        "D": [0, 0, 0, 1, 0, 0, 0, 0],
        "N": [0, 0, 1, 0, 0, 0, 0, 0],
        "S": [0, 1, 0, 0, 0, 0, 0, 0],
        "R": [1, 0, 0, 0, 0, 0, 0, 0],
    }

    def Seq2Vec(seq):
        # One row of 8 values per symbol, flattened into a single vector
        symbols = str(seq).strip()
        rows = [one_hot[symbol] for symbol in symbols]
        return np.asarray(rows, dtype=np.float32).flatten()

    df['seqvec'] = df['sequence'].apply(Seq2Vec)
    X = np.vstack(df['seqvec'].values)
    print("Total samples:", X.shape[0])

    # Three stratified 80/20 splits are generated; the loop runs through all
    # of them so that the indices of the last one remain bound afterwards.
    splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
    for train_index, test_index in splitter.split(X, y):
        pass
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    print("Training samples: ", X_train.shape[0], "Test samples: ", X_test.shape[0])

    return X_train, y_train, X_test, y_test

### DNN model

In [23]:
def model_dnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=5000) :
    """Train a fully connected classifier and print its accuracy while training.

    Builds a TensorFlow 1.x graph consisting of one dense ReLU layer of 200
    units, a dropout layer (rate 0.5) and a dense output layer with 3 logits.
    The loss is the mean sparse softmax cross entropy plus an L2 penalty on
    the dense kernels, minimised with Adam under a staircase exponential
    learning-rate decay. Scalar summaries (loss, learning rate, accuracy) are
    written to the directories ``train`` and ``test``.

    Args:
      X_train: training inputs, one row of 480 values per sample.
      y_train: integer class labels of the training samples.
      X_test: test inputs, same layout as ``X_train``.
      y_test: integer class labels of the test samples.
      batch_size: number of samples per training step.
      n_epochs: number of training steps; each loop iteration processes one
        mini-batch, not a full pass over the data.

    Returns:
      None. Every 10 steps the accuracy on the full training and test sets is
      printed.
    """
    tf.reset_default_graph()

    # Hyper parameters
    fc1_node = 200
    beta = 0.01                  # Regularization weight
    dropout_rate = 0.5           # Dropout rate for dropout layer
    starter_learning_rate = 0.001

    # Inputs. global_step is a placeholder here: the loop feeds the step number.
    X = tf.placeholder(tf.float32, shape=(None, 480), name="X")
    net_input = tf.reshape(X, [-1, 60*8])
    y = tf.placeholder(tf.int32, shape=(None), name="y")
    mode = tf.placeholder(tf.bool, shape=(None), name="mode")
    scale = tf.placeholder(tf.float32, shape=(None), name="scale")
    global_step = tf.placeholder(tf.int32, shape=(None), name="global_step")

    # Add the fully connected layer
    fc1 = tf.layers.dense(
        inputs=net_input,
        units=fc1_node,
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="fc1")

    # Adding a dropout layer to avoid overfitting
    dropout1 = tf.layers.dropout(inputs=fc1, rate=dropout_rate, training=mode)

    # The last output layer
    logits = tf.layers.dense(
        inputs=dropout1,
        units=3,
        activation=None,    #softmax is done in tf.nn.sparse_softmax_cross_entropy_with_logits
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="output_FC")

    # Cross entropy
    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

    # The regularization term vanishes in the prediction runs because they
    # feed scale=0.
    # NOTE(review): the regularizers already receive `scale` (fed with beta
    # while training) and the sum is multiplied by beta again here, so the
    # penalty appears to be weighted by beta*beta - confirm this is intended.
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(entropy) + beta*sum( tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) )
    tf.summary.scalar('loss', loss)

    with tf.name_scope('train'):
        # Decayed learning rate: starts at 0.001 and is halved every
        # n_epochs//6 steps (at least every step, so that decay_steps is
        # never zero for very short runs).
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                   decay_steps=max(n_epochs//6, 1),
                                                   decay_rate=0.5, staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        # Use Adam optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.nn.in_top_k(logits, y, 1)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries; they are written to ./train and ./test below
    merged = tf.summary.merge_all()

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('train', sess.graph)
        test_writer = tf.summary.FileWriter("test")
        try:
            init.run()
            # At least one round, so next_batch never gets rounds == 0 when
            # the training set is smaller than one batch.
            n_rounds = max(X_train.shape[0] // batch_size, 1)
            for epoch in range(n_epochs):
                X_batch, y_batch = next_batch(batch_size, X_train, y_train, epoch, n_rounds)
                summary_train, _ = sess.run([merged, optimizer],
                                            feed_dict={X: X_batch, y: y_batch,
                                                       mode: MY_MODE_TRAINING, scale: beta,
                                                       global_step: epoch})
                train_writer.add_summary(summary_train, epoch)

                if epoch % 10 == 0:
                    # Only the accuracy is needed for the training set; its
                    # summary was never written, so it is not computed.
                    acc_train = sess.run(accuracy,
                                         feed_dict={X: X_train, y: y_train,
                                                    mode: MY_MODE_PREDICTION, scale: 0,
                                                    global_step: epoch})

                    summary_test, acc_test = sess.run([merged, accuracy],
                                                      feed_dict={X: X_test, y: y_test,
                                                                 mode: MY_MODE_PREDICTION, scale: 0,
                                                                 global_step: epoch})
                    test_writer.add_summary(summary_test, epoch)

                    print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)
        finally:
            # Close the event files even if training is interrupted, so the
            # buffered summaries reach disk.
            train_writer.close()
            test_writer.close()

### Next batch - support function
* Get the next mini_batch from training data

In [24]:
def next_batch(num, data, labels, epoch, rounds):
    ''' 
    Return the next mini-batch of up to `num` samples and their labels.

    The sample order is a random permutation kept in the module-level
    variable `g_idx`. It is redrawn whenever a new pass over the data starts
    (epoch % rounds == 0), and also when no permutation of the right length
    exists yet (first call in the middle of a pass, or a data set of a
    different size than on the previous call).

    Arg: 
      num, the number of returned data size
      data, the input X
      labels, the label y (must expose .shape[0])
      epoch, the current epoch value
      rounds, rounds = label_size // num; values below 1 are treated as 1

    Returns:
      Return "num" of X and y array (fewer when the permutation holds fewer
      than `num` samples at the selected position)
    '''
    global g_idx
    n_samples = labels.shape[0]

    # A data set smaller than one batch yields rounds == 0; treat that as a
    # single round instead of failing with a division by zero.
    rounds = max(int(rounds), 1)
    set_cnt = epoch % rounds

    current = globals().get('g_idx')
    if set_cnt == 0 or current is None or len(current) != n_samples:
        g_idx = np.arange(0, n_samples)
        np.random.shuffle(g_idx)

    idx = g_idx[set_cnt*num:set_cnt*num+num]

    # Index array selection picks the whole batch in one step
    return np.asarray(data)[idx], np.asarray(labels)[idx]

### CNN Model

In [29]:
def model_cnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=2000) :
    """Train a convolutional classifier and print its accuracy while training.

    Graph: the 480 input values are reshaped to [batch, 60, 8, 1], passed
    through a 3x3 convolution with 64 filters ("same" padding, ReLU), a 2x2
    max-pooling layer with stride 2, a dense ReLU layer of 64 units, a dropout
    layer (rate 0.5), a dense ReLU layer of 16 units and a dense output layer
    with 3 logits. The loss is the mean sparse softmax cross entropy plus an
    L2 penalty on the kernels, minimised with Adam.

    Args:
      X_train: training inputs, one row of 480 values per sample.
      y_train: integer class labels of the training samples.
      X_test: test inputs, same layout as X_train.
      y_test: integer class labels of the test samples.
      batch_size: number of samples per training step.
      n_epochs: number of training steps; each loop iteration processes one
        mini-batch.

    Returns:
      None. Every 10 steps the accuracy on the full training and test sets is
      printed.
    """
    
    #CNN approach
    
    tf.reset_default_graph()
    
    #Hyper parameters
    conv1_depth = 64
    conv1_kernel_size = [3, 3]
    dense1_node = 64
    dense2_node = 16
    beta = 0.02                  # Regularization weight
    dropout_rate = 0.5           # Dropout layer for regularization
    starter_learning_rate = 0.001

    X = tf.placeholder(tf.float32, shape=(None, 480), name="X")
    input = tf.reshape(X, [-1, 60, 8, 1])
    y = tf.placeholder(tf.int32, shape=(None), name="y")
    mode = tf.placeholder(tf.bool, shape=(None), name="mode")
    scale = tf.placeholder(tf.float32, shape=(None), name="scale")
    # NOTE(review): global_step is a Variable, yet the session loop below
    # supplies its value through feed_dict on every run - confirm intended.
    global_step = tf.Variable(0, trainable=False)
    # NOTE(review): this Variable is never used; the name learning_rate is
    # rebound to the exponential_decay tensor further down.
    learning_rate = tf.Variable(starter_learning_rate, trainable=False)
    
    # Convolutional Layer #1
    # Computes 64 features (conv1_depth) using a 3x3 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 60, 8, 1]
    # Output Tensor Shape: [batch_size, 60, 8, 64]
    conv1 = tf.layers.conv2d(
        inputs=input,
        filters=conv1_depth,
        kernel_size=conv1_kernel_size,
        padding="same",
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        name="conv1")

    #Adding a pooling layer - Output Tensor Shape: [batch_size, 30, 4, 64]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    # Flatten the pooled feature maps for the dense layers
    pool_flat = tf.reshape(pool1, [-1, 30 * 4 * conv1_depth])

    #Adding a fully connected layer
    dense1 = tf.layers.dense(
        inputs=pool_flat, 
        units=dense1_node, 
        activation=tf.nn.relu, 
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="dense1")
    
    #Adding a dropout layer to avoid overfitting
    dropout1 = tf.layers.dropout(inputs=dense1, rate=dropout_rate, training=mode )  

    # Add the 2nd fully connected layer
    dense2 = tf.layers.dense(
        inputs=dropout1, 
        units=dense2_node, 
        activation=tf.nn.relu, 
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="dense2")
  
     # The last output layer
    logits = tf.layers.dense(
        inputs=dense2, 
        units=3,
        activation=None,    #softmax is done in tf.nn.sparse_softmax_cross_entropy_with_logits
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="output_FC")

    # Cross entropy
    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    
    # Regularization term: the training runs feed scale=beta, the prediction
    # runs feed scale=0.
    # NOTE(review): the collected losses already carry the fed `scale` and are
    # multiplied by beta again below, so the penalty appears to be weighted by
    # beta*beta - confirm this is intended.
    reg_losses = sum( tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) )
    loss_noreg = tf.reduce_mean(entropy)
    loss = loss_noreg + beta * reg_losses
    

    # Decayed learning rate
    # Starts at 0.001 and is halved every n_epochs//6 steps (staircase decay)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               decay_steps=n_epochs//6, decay_rate=0.5, staircase=True)
    
    # Use Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)   

    # A prediction counts as correct when the true label has the top logit
    correct_prediction = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        init.run()
        # Number of mini-batches that make up one pass over the training data
        n_rounds = X_train.shape[0] // batch_size
        for epoch in range(n_epochs):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train, epoch, n_rounds)
            sess.run(optimizer, feed_dict= {X: X_batch, y: y_batch, 
                                            mode: MY_MODE_TRAINING, scale: beta,
                                            global_step: epoch}) 

            # Report accuracy on the full training and test sets every 10 steps
            if (epoch % 10 == 0):
                acc_train = accuracy.eval(feed_dict={X: X_train, y: y_train, 
                                                     mode: MY_MODE_PREDICTION, scale: 0,
                                                     global_step: epoch}) 
                acc_test = accuracy.eval(feed_dict={X: X_test, y:y_test, 
                                                    mode: MY_MODE_PREDICTION, scale: 0,
                                                    global_step: epoch})  
                print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)

### RNN model

In [30]:
def model_rnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=2000) :
    """Train a recurrent classifier and print its accuracy while training.

    Graph: the input is reshaped to [batch, 60, 8] and run through a basic
    RNN cell of 64 units; the final state goes through a dropout layer (rate
    0.5) and a fully connected layer with 3 logits. The loss is the mean
    sparse softmax cross entropy plus an L2 penalty on the output weights,
    minimised with Adam under a staircase exponential learning-rate decay.

    Args:
      X_train: training inputs, one row of 60*8 values per sample.
      y_train: integer class labels of the training samples.
      X_test: test inputs, same layout as X_train.
      y_test: integer class labels of the test samples.
      batch_size: number of samples per training step.
      n_epochs: number of training steps; each loop iteration processes one
        mini-batch.

    Returns:
      None. Every 10 steps the accuracy on the full training and test sets is
      printed.
    """
    tf.reset_default_graph()

    # Data layout: 60 time steps of 8 inputs each, 3 output classes
    n_steps, n_inputs, n_outputs = 60, 8, 3

    # Hyper parameters
    n_neurons = 64
    beta = 0.1                   # Regularization weight
    dropout_rate = 0.5           # Dropout layer for regularization
    starter_learning_rate = 0.001

    X = tf.placeholder(tf.float32, shape=(None, n_steps * n_inputs), name="X")
    sequences = tf.reshape(X, [-1, n_steps, n_inputs])
    y = tf.placeholder(tf.int32, shape=(None), name="y")
    mode = tf.placeholder(tf.bool, shape=(None), name="mode")
    # `scale` is fed by the loop below but no layer of this graph reads it
    scale = tf.placeholder(tf.float32, shape=(None), name="scale")
    global_step = tf.Variable(0, trainable=False)
    # Placeholder value only: the name is rebound to the decay schedule below
    learning_rate = tf.Variable(starter_learning_rate, trainable=False)

    # Recurrent layer; only the final state is used downstream
    cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
    _, final_state = tf.nn.dynamic_rnn(cell, sequences, dtype=tf.float32)

    # Dropout on the final state to limit overfitting
    dropped = tf.layers.dropout(inputs=final_state, rate=dropout_rate, training=mode)

    # Output layer: Xavier initialisation, L2 regularisation, raw logits
    # (the softmax is part of the cross-entropy op)
    xavier = tf.contrib.layers.xavier_initializer()
    l2_penalty = tf.contrib.layers.l2_regularizer(scale=beta, scope=None)
    logits = tf.contrib.layers.fully_connected(
        inputs=dropped,
        num_outputs=n_outputs,
        weights_initializer=xavier,
        weights_regularizer=l2_penalty,
        activation_fn=None,
    )

    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

    # Total loss = mean cross entropy + beta * collected regularisation losses
    penalty = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    data_loss = tf.reduce_mean(entropy)
    loss = data_loss + beta * penalty

    # Learning rate starts at 0.001 and is halved every n_epochs//6 steps
    learning_rate = tf.train.exponential_decay(
        starter_learning_rate,
        global_step,
        decay_steps=n_epochs//6,
        decay_rate=0.5,
        staircase=True)

    # Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    hits = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    init = tf.global_variables_initializer()

    def make_feed(samples, targets, training, reg_scale, step):
        # Assemble the feed dictionary shared by training and evaluation runs
        return {X: samples, y: targets, mode: training,
                scale: reg_scale, global_step: step}

    with tf.Session() as sess:
        sess.run(init)
        n_rounds = X_train.shape[0] // batch_size
        print("n_rounds:", n_rounds)

        for epoch in range(n_epochs):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train, epoch, n_rounds)
            sess.run(optimizer,
                     feed_dict=make_feed(X_batch, y_batch, MY_MODE_TRAINING, beta, epoch))

            # Accuracy is reported only on every 10th step
            if epoch % 10 != 0:
                continue

            acc_train = sess.run(
                accuracy,
                feed_dict=make_feed(X_train, y_train, MY_MODE_PREDICTION, 0, epoch))
            acc_test = sess.run(
                accuracy,
                feed_dict=make_feed(X_test, y_test, MY_MODE_PREDICTION, 0, epoch))
            print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)



### Main 

In [31]:
def main(unused_argv):
    """Load the splice data set, preprocess it and train the selected model.

    Args:
      unused_argv: command-line arguments passed in by the caller; not used.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import os

    # Load data. The path is assembled from its components: the former
    # literal "..\data\splice.data" relied on the invalid escape sequences
    # "\d" and "\s" and on Windows path separators.
    df = load_data(os.path.join("..", "data", "splice.data"))

    # Preprocess data and split training and validation data
    X_train, y_train, X_test, y_test = preprocess_data(df)

    # DNN model
    #model_dnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=1000)

    # CNN model
    model_cnn(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, batch_size=100, n_epochs=2000)

    # RNN model
    #model_rnn(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, batch_size=100, n_epochs=3000)
    

In [32]:
# Script entry point.
# NOTE(review): tf.app.run() is expected to invoke main() and then exit the
# interpreter, which would explain the SystemExit shown in the notebook
# output - confirm against the TensorFlow version in use.
if __name__ == "__main__":
  tf.app.run()

Total samples: 3190
Training samples:  2552 Test samples:  638
0 Train accuracy: 0.518809 Test_accuracy: 0.518809
10 Train accuracy: 0.518809 Test_accuracy: 0.518809
20 Train accuracy: 0.518809 Test_accuracy: 0.518809
30 Train accuracy: 0.518809 Test_accuracy: 0.518809
40 Train accuracy: 0.643025 Test_accuracy: 0.637931
50 Train accuracy: 0.905172 Test_accuracy: 0.869906
60 Train accuracy: 0.914969 Test_accuracy: 0.880878
70 Train accuracy: 0.893025 Test_accuracy: 0.873041
80 Train accuracy: 0.927508 Test_accuracy: 0.894984
90 Train accuracy: 0.9471 Test_accuracy: 0.918495
100 Train accuracy: 0.951411 Test_accuracy: 0.924765
110 Train accuracy: 0.942398 Test_accuracy: 0.909091
120 Train accuracy: 0.935737 Test_accuracy: 0.909091
130 Train accuracy: 0.943182 Test_accuracy: 0.92163
140 Train accuracy: 0.960815 Test_accuracy: 0.929467
150 Train accuracy: 0.967476 Test_accuracy: 0.938872
160 Train accuracy: 0.959248 Test_accuracy: 0.932602
170 Train accuracy: 0.971003 Test_accuracy: 0.9420

1540 Train accuracy: 0.999608 Test_accuracy: 0.960815
1550 Train accuracy: 0.999608 Test_accuracy: 0.962382
1560 Train accuracy: 0.999608 Test_accuracy: 0.962382
1570 Train accuracy: 0.999608 Test_accuracy: 0.960815
1580 Train accuracy: 0.999608 Test_accuracy: 0.962382
1590 Train accuracy: 0.999608 Test_accuracy: 0.962382
1600 Train accuracy: 0.999608 Test_accuracy: 0.962382
1610 Train accuracy: 0.999608 Test_accuracy: 0.960815
1620 Train accuracy: 0.999608 Test_accuracy: 0.959248
1630 Train accuracy: 0.999608 Test_accuracy: 0.959248
1640 Train accuracy: 0.999608 Test_accuracy: 0.959248
1650 Train accuracy: 0.999608 Test_accuracy: 0.962382
1660 Train accuracy: 0.999608 Test_accuracy: 0.96395
1670 Train accuracy: 0.999608 Test_accuracy: 0.960815
1680 Train accuracy: 0.999608 Test_accuracy: 0.960815
1690 Train accuracy: 0.999608 Test_accuracy: 0.962382
1700 Train accuracy: 0.999608 Test_accuracy: 0.962382
1710 Train accuracy: 0.999608 Test_accuracy: 0.965517
1720 Train accuracy: 0.999608

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
