# Classification on Splice-junction Gene Sequences 


In [50]:
#Imports
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Values fed to the `mode` placeholder of the CNN/RNN graphs to select
# between the training pass and the prediction (evaluation) pass.
MY_MODE_TRAINING = True
MY_MODE_PREDICTION = False


### Load the dataset

In [27]:
def load_data(file_path):
    """Load the splice-junction data set into a DataFrame.

    The file is read as header-less CSV and its columns are renamed to
    ``classlabel``, ``name`` and ``sequence``.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the columns ``classlabel``, ``name`` and ``sequence``.

    Raises:
        ValueError: If the file does not contain exactly three columns
            (raised by pandas when the column names are assigned).
    """
    df = pd.read_csv(file_path, header=None)
    df.columns = ['classlabel', 'name', 'sequence']
    return df

### Pre-process the dataset
* Apply one-hot encoding to the input sequences
* Hold out 20% of the data as the test set

In [28]:
def preprocess_data(df):
    """Encode labels and sequences and split them into train/test sets.

    Class labels are mapped to integers with ``LabelEncoder``. Every
    character of a sequence is one-hot encoded over an 8-symbol alphabet
    and the per-character vectors are concatenated, so a sequence of
    length ``n`` becomes a flat float32 vector of length ``8 * n``.

    Side effects:
        Adds a ``seqvec`` column holding the encoded vectors to ``df`` and
        prints the encoded labels, the first sequence and the array shapes.

    Args:
        df: DataFrame with ``classlabel`` and ``sequence`` columns.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` from a stratified
        shuffle split with 20% of the rows held out as the test set.

    Raises:
        KeyError: If a sequence contains a character outside ``AGCTDNSR``.
    """
    # Encoding class labels
    class_le = LabelEncoder()
    y = class_le.fit_transform(df['classlabel'].values)
    print("y:", y)

    # One-hot code for every symbol that may appear in a sequence.
    # Built once here instead of once per encoded sequence.
    one_hot = {"A": [0, 0, 0, 0, 0, 0, 0, 1],
               "G": [0, 0, 0, 0, 0, 0, 1, 0],
               "C": [0, 0, 0, 0, 0, 1, 0, 0],
               "T": [0, 0, 0, 0, 1, 0, 0, 0],
               "D": [0, 0, 0, 1, 0, 0, 0, 0],
               "N": [0, 0, 1, 0, 0, 0, 0, 0],
               "S": [0, 1, 0, 0, 0, 0, 0, 0],
               "R": [1, 0, 0, 0, 0, 0, 0, 0]}

    def seq_to_vec(seq):
        # Surrounding whitespace is stripped before encoding; the
        # (length, 8) matrix is flattened to a 1-D feature vector.
        symbols = str(seq).strip()
        return np.asarray([one_hot[c] for c in symbols],
                          dtype=np.float32).flatten()

    df['seqvec'] = df['sequence'].apply(seq_to_vec)
    X = np.vstack(df['seqvec'].values)
    # Positional access: works even when the index does not start at 0.
    print("First sequence:", df['sequence'].iloc[0])
    print("X shape:", X.shape)

    # Split the data set into training/test set.
    # Three stratified splits are generated and only the last one is kept,
    # so the seeded partition is the third split of random_state=0.
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
    train_index, test_index = list(sss.split(X, y))[-1]
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, y_train, X_test, y_test

### DNN model

In [29]:
def model_dnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=5000):
    """Train a fully connected classifier and report its test accuracy.

    Builds a ``tf.contrib.learn.DNNClassifier`` with hidden layers of 300
    and 100 units and 3 output classes, fits it on the training data and
    scores the predictions on the test data.

    Args:
        X_train: Training feature matrix.
        y_train: Integer training labels.
        X_test: Test feature matrix.
        y_test: Integer test labels.
        batch_size: Mini-batch size passed to ``fit``.
        n_epochs: Number of training steps passed to ``fit`` as ``steps``
            (one mini-batch per step, not full passes over the data).

    Returns:
        Accuracy of the classifier on the test set.
    """
    from sklearn.metrics import accuracy_score

    # DNN approach
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
    dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300, 100], n_classes=3,
                                             feature_columns=feature_columns)
    # Honor the caller's batch size instead of a hard-coded value.
    dnn_clf.fit(x=X_train, y=y_train, batch_size=batch_size, steps=n_epochs)

    y_pred = list(dnn_clf.predict(X_test))
    accuracy = accuracy_score(y_test, y_pred)
    print("Test accuracy:", accuracy)
    return accuracy

### Next batch - support function
* Get the next mini-batch from the training data

In [87]:
def next_batch(num, data, labels, epoch=0, rounds=1):
    '''
    Return the next mini-batch of up to `num` samples and labels.

    A shuffled index over the whole data set is kept in the module-level
    variable `g_idx`. It is reshuffled whenever `epoch % rounds == 0`, and
    also when no index exists yet or its length no longer matches
    `labels`. Within one pass, consecutive calls return consecutive,
    non-overlapping slices of that shuffled index.

    Arg:
      num, the number of samples to return
      data, the input X (array-like, indexed along its first axis)
      labels, the label y (array-like, same length as data)
      epoch, the current step counter (default 0)
      rounds, number of batches per pass, i.e. label_size // num;
              values below 1 are treated as 1 (default 1)

    With the default `epoch` and `rounds`, every call reshuffles and so
    returns a fresh random batch.

    Returns:
      Tuple (X_batch, y_batch) of numpy arrays. Fewer than `num` rows are
      returned when the slice runs past the end of the data.
    '''
    global g_idx

    data = np.asarray(data)
    labels = np.asarray(labels)
    n_samples = labels.shape[0]

    # A data set smaller than one batch yields rounds == 0; clamp it so the
    # modulo below cannot divide by zero.
    rounds = max(int(rounds), 1)
    set_cnt = epoch % rounds

    # Read through globals() because g_idx may not have been created yet.
    current = globals().get('g_idx')
    if set_cnt == 0 or current is None or len(current) != n_samples:
        g_idx = np.arange(0, n_samples)
        np.random.shuffle(g_idx)

    start = set_cnt * num
    idx = g_idx[start:start + num]

    return data[idx], labels[idx]

### CNN Model

In [88]:
def model_cnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=2000):
    """Train a small CNN on the one-hot sequences and print its accuracy.

    Architecture: two 3x3 convolutions (32 and 64 filters, ReLU, "same"
    padding), each followed by 2x2 max pooling, then dense layers of 64
    and 16 units (dropout after the first) and a 3-unit logits layer.
    Every kernel carries an L2 regularizer whose scale is fed at run time
    through the ``scale`` placeholder.

    The default TensorFlow graph is reset on entry. Train and test
    accuracy are printed every 10 steps; nothing is returned.

    Args:
        X_train: Training features, shape (n_samples, 480).
        y_train: Integer training labels.
        X_test: Test features, shape (n_samples, 480).
        y_test: Integer test labels.
        batch_size: Number of samples per training step.
        n_epochs: Number of training steps (one mini-batch per step, not
            full passes over the training data).
    """
    # CNN approach
    tf.reset_default_graph()

    # Hyper parameters
    conv1_depth = 32
    conv2_depth = 64
    conv1_kernel_size = [3, 3]
    conv2_kernel_size = [3, 3]
    beta = 0.1          # L2 regularization weight
    dropout_rate = 0.5  # Dropout layer for regularization

    X = tf.placeholder(tf.float32, shape=(None, 480), name="X")
    # Each flat 480-vector is viewed as a 60x8 single-channel image.
    x_image = tf.reshape(X, [-1, 60, 8, 1])
    y = tf.placeholder(tf.int32, shape=(None), name="y")
    mode = tf.placeholder(tf.bool, shape=(None), name="mode")
    scale = tf.placeholder(tf.float32, shape=(None), name="scale")

    # Convolutional Layer #1
    # Computes 32 features using a 3x3 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 60, 8, 1]
    # Output Tensor Shape: [batch_size, 60, 8, 32]
    conv1 = tf.layers.conv2d(
        inputs=x_image,
        filters=conv1_depth,
        kernel_size=conv1_kernel_size,
        padding="same",
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        name="conv1")

    # Pooling halves both spatial dimensions: 60x8 -> 30x4.
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 3x3 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 30, 4, 32]
    # Output Tensor Shape: [batch_size, 30, 4, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=conv2_depth,
        kernel_size=conv2_kernel_size,
        padding="same",
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        name="conv2")

    # Pooling again: 30x4 -> 15x2, then flatten for the dense layers.
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    pool2_flat = tf.reshape(pool2, [-1, 15 * 2 * conv2_depth])

    # Fully connected layer
    dense1 = tf.layers.dense(
        inputs=pool2_flat,
        units=64,
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="dense1")

    # Dropout is only active when `mode` is fed as True (training).
    dropout1 = tf.layers.dropout(inputs=dense1, rate=dropout_rate, training=mode)

    dense2 = tf.layers.dense(
        inputs=dropout1,
        units=16,
        activation=tf.nn.relu,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name="dense2")

    logits = tf.layers.dense(
        inputs=dense2,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=scale, scope=None),
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        units=3
    )

    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

    # NOTE(review): each regularizer is already multiplied by `scale`
    # (fed as `beta` during training) and the sum is multiplied by `beta`
    # again here, so the effective L2 weight is beta**2. Left unchanged to
    # keep training results the same; confirm this is intended.
    reg_losses = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss_noreg = tf.reduce_mean(entropy)
    loss = loss_noreg + beta * reg_losses

    learning_rate = 1e-4
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        init.run()
        # At least one round, so a training set smaller than one batch does
        # not cause a division by zero inside next_batch.
        n_rounds = max(1, X_train.shape[0] // batch_size)
        for epoch in range(n_epochs):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train, epoch, n_rounds)
            sess.run(optimizer, feed_dict={X: X_batch, y: y_batch, mode: MY_MODE_TRAINING, scale: beta})

            if epoch % 10 == 0:
                acc_train = accuracy.eval(feed_dict={X: X_train, y: y_train, mode: MY_MODE_PREDICTION, scale: 0})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test, mode: MY_MODE_PREDICTION, scale: 0})
                print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)

### RNN model

In [84]:
def model_rnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=2000):
    """Train a basic RNN on the one-hot sequences and print its accuracy.

    Each flat input vector is reshaped to 60 time steps of 8 features and
    fed through a 150-unit ``BasicRNNCell``. The final state goes through
    dropout and a fully connected layer with 3 outputs.

    The default TensorFlow graph is reset on entry. Train and test
    accuracy are printed every 10 steps; nothing is returned.

    Args:
        X_train: Training features, shape (n_samples, 480).
        y_train: Integer training labels.
        X_test: Test features, shape (n_samples, 480).
        y_test: Integer test labels.
        batch_size: Number of samples per training step.
        n_epochs: Number of training steps (one mini-batch per step, not
            full passes over the training data).
    """
    # RNN approach
    tf.reset_default_graph()

    # Hyper parameters
    n_steps = 60
    n_inputs = 8
    n_neurons = 150
    n_outputs = 3
    beta = 0.1        # Regularization
    dropout_rate = 0.5

    X = tf.placeholder(tf.float32, shape=(None, n_steps * n_inputs), name="X")
    x_seq = tf.reshape(X, [-1, n_steps, n_inputs])
    y = tf.placeholder(tf.int32, shape=(None), name="y")
    # Boolean switch fed with MY_MODE_TRAINING / MY_MODE_PREDICTION.
    mode = tf.placeholder(tf.bool, shape=(None), name="mode")

    basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
    outputs, states = tf.nn.dynamic_rnn(basic_cell, x_seq, dtype=tf.float32)
    print("output.shape:", states.shape)

    # Dropout to avoid overfitting. The placeholder is passed directly so
    # dropout is active exactly when `mode` is fed as True; a Python-level
    # `mode == 0` comparison would not track the fed value correctly.
    dropout = tf.layers.dropout(inputs=states, rate=dropout_rate, training=mode)

    # NOTE(review): activation_fn is left at its default (ReLU), which
    # clamps negative logits to zero. This was a deliberate experiment;
    # pass activation_fn=None for plain linear logits.
    logits = tf.contrib.layers.fully_connected(
        inputs=dropout,
        num_outputs=n_outputs,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        weights_regularizer=tf.contrib.layers.l2_regularizer(scale=beta, scope=None),
    )

    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

    # NOTE(review): the regularizer is already scaled by `beta` and the sum
    # is multiplied by `beta` again, so the effective L2 weight is beta**2.
    reg_losses = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss_noreg = tf.reduce_mean(entropy)
    loss = loss_noreg + beta * reg_losses

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        init.run()
        # At least one round, so a training set smaller than one batch does
        # not cause a division by zero inside next_batch.
        n_rounds = max(1, X_train.shape[0] // batch_size)
        for epoch in range(n_epochs):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train, epoch, n_rounds)
            sess.run(optimizer, feed_dict={X: X_batch, y: y_batch, mode: MY_MODE_TRAINING})

            if epoch % 10 == 0:
                acc_train = accuracy.eval(feed_dict={X: X_train, y: y_train, mode: MY_MODE_PREDICTION})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test, mode: MY_MODE_PREDICTION})
                print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)




### Main Entry Function

In [89]:
def main(unused_argv):
    """Load the splice data, preprocess it and train the CNN model.

    Args:
        unused_argv: Command-line arguments handed over by the caller;
            ignored.
    """
    import os

    # Build the path from components: portable across platforms and free of
    # backslash sequences that Python would try to read as string escapes.
    data_path = os.path.join("..", "data", "splice.data")

    # Load data
    df = load_data(data_path)

    # Preprocess data and split training and validation data
    X_train, y_train, X_test, y_test = preprocess_data(df)

    # DNN model
    #model_dnn(X_train, y_train, X_test, y_test, batch_size=100, n_epochs=3000)

    # CNN model
    model_cnn(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, batch_size=100, n_epochs=4000)

    # RNN model
    #model_rnn(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, batch_size=100, n_epochs=4000)
    

In [90]:
if __name__ == "__main__":
    # Hand control to TensorFlow's app runner, which invokes main(argv).
    # The run ends by raising SystemExit, which shows up as a
    # "SystemExit" message when executed inside a notebook.
    tf.app.run(main=main)

y: [0 0 0 ..., 2 2 2]
First sequence:                CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG
X shape: (3190, 480)
(2552, 480) (638, 480) (2552,) (638,)
0 Train accuracy: 0.237069 Test_accuracy: 0.238245
10 Train accuracy: 0.518809 Test_accuracy: 0.518809
20 Train accuracy: 0.518809 Test_accuracy: 0.518809
30 Train accuracy: 0.518809 Test_accuracy: 0.518809
40 Train accuracy: 0.518809 Test_accuracy: 0.518809
50 Train accuracy: 0.518809 Test_accuracy: 0.518809
60 Train accuracy: 0.518809 Test_accuracy: 0.518809
70 Train accuracy: 0.518809 Test_accuracy: 0.518809
80 Train accuracy: 0.518809 Test_accuracy: 0.518809
90 Train accuracy: 0.518809 Test_accuracy: 0.518809
100 Train accuracy: 0.518809 Test_accuracy: 0.518809
110 Train accuracy: 0.518809 Test_accuracy: 0.518809
120 Train accuracy: 0.518809 Test_accuracy: 0.518809
130 Train accuracy: 0.518809 Test_accuracy: 0.518809
140 Train accuracy: 0.519201 Test_accuracy: 0.518809
150 Train accuracy: 0.548198 Test_accuracy:

1520 Train accuracy: 0.973746 Test_accuracy: 0.96395
1530 Train accuracy: 0.972962 Test_accuracy: 0.96395
1540 Train accuracy: 0.973354 Test_accuracy: 0.965517
1550 Train accuracy: 0.974138 Test_accuracy: 0.967085
1560 Train accuracy: 0.972179 Test_accuracy: 0.96395
1570 Train accuracy: 0.974138 Test_accuracy: 0.968652
1580 Train accuracy: 0.973354 Test_accuracy: 0.962382
1590 Train accuracy: 0.973354 Test_accuracy: 0.962382
1600 Train accuracy: 0.975313 Test_accuracy: 0.967085
1610 Train accuracy: 0.97453 Test_accuracy: 0.96395
1620 Train accuracy: 0.975313 Test_accuracy: 0.967085
1630 Train accuracy: 0.976097 Test_accuracy: 0.967085
1640 Train accuracy: 0.975313 Test_accuracy: 0.962382
1650 Train accuracy: 0.973746 Test_accuracy: 0.962382
1660 Train accuracy: 0.975313 Test_accuracy: 0.967085
1670 Train accuracy: 0.974922 Test_accuracy: 0.962382
1680 Train accuracy: 0.975705 Test_accuracy: 0.96395
1690 Train accuracy: 0.978056 Test_accuracy: 0.970219
1700 Train accuracy: 0.975313 Test

3050 Train accuracy: 0.988636 Test_accuracy: 0.962382
3060 Train accuracy: 0.991771 Test_accuracy: 0.976489
3070 Train accuracy: 0.989028 Test_accuracy: 0.973354
3080 Train accuracy: 0.990596 Test_accuracy: 0.971787
3090 Train accuracy: 0.988636 Test_accuracy: 0.968652
3100 Train accuracy: 0.989028 Test_accuracy: 0.967085
3110 Train accuracy: 0.989812 Test_accuracy: 0.974922
3120 Train accuracy: 0.989812 Test_accuracy: 0.967085
3130 Train accuracy: 0.98942 Test_accuracy: 0.973354
3140 Train accuracy: 0.991771 Test_accuracy: 0.971787
3150 Train accuracy: 0.991379 Test_accuracy: 0.974922
3160 Train accuracy: 0.991379 Test_accuracy: 0.974922
3170 Train accuracy: 0.992163 Test_accuracy: 0.976489
3180 Train accuracy: 0.989028 Test_accuracy: 0.970219
3190 Train accuracy: 0.991379 Test_accuracy: 0.970219
3200 Train accuracy: 0.98942 Test_accuracy: 0.973354
3210 Train accuracy: 0.990987 Test_accuracy: 0.974922
3220 Train accuracy: 0.989812 Test_accuracy: 0.968652
3230 Train accuracy: 0.989812 

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
