In [None]:
# Location of the input CSV that the notebook later loads with pd.read_csv.
#
# Google Colab only: the triple-quoted string below is an inert expression that
# holds the Colab setup (mount Google Drive and point `filepath` at the copy
# stored there). To use it, remove the surrounding quotes AND disable the local
# assignment further down, which would otherwise overwrite `filepath`.
'''
from google.colab import drive
drive.mount('/content/drive')
filepath = '/content/drive/My Drive/rnn-workshop/Human Data Sample of 10000 records.csv'
'''
# Local run (active): read the CSV from the current working directory.
filepath = './Human Data Sample of 10000 records.csv'

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import keras
from keras.models import Model
from keras.layers import LSTM,RNN, GRU, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
import keras.backend as K

In [None]:
# Load the comma-separated, Latin-1 encoded dataset from `filepath` and preview it.
df = pd.read_csv(filepath, sep=',', encoding='latin-1')
df.head()

In [None]:
# Overlapping k-mers (default k = 8)
def getKmers(V2, size=8):
    """Split a sequence into overlapping, lower-cased k-mers.

    Args:
        V2: The sequence string to slice.
        size: Length of each k-mer (window width); defaults to 8.

    Returns:
        List of the ``len(V2) - size + 1`` consecutive substrings of length
        ``size``, each lower-cased. Empty when ``V2`` is shorter than ``size``.
    """
    window_count = len(V2) - size + 1
    kmers = []
    for start in range(window_count):
        kmers.append(V2[start:start + size].lower())
    return kmers

In [None]:
# Create a new column 'words' holding the list of k-mers for each sequence in 'V2'.
# Series.apply works on the single column directly: it avoids building a Series
# for every row (DataFrame.apply with axis=1) and always yields one list per row.
df['words'] = df['V2'].apply(getKmers)

In [None]:
# Drop the raw sequence column 'V2' and preview the remaining columns.
df = df.drop(columns=['V2'])
df.head()

In [None]:
# Apply a label encoder to column 'V1' to get integer class ids
# (per the original note: Promoter -> 0, Terminator -> 1).
le = LabelEncoder()
Y = le.fit_transform(df['V1'])

In [None]:
# Split the data into training and test sets, holding out 15% for testing.
# The split is seeded so that "Restart & Run All" reproduces the same
# train/test partition instead of a different random one on every run.
RANDOM_STATE = 42
X = df.words
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=RANDOM_STATE)

In [None]:
# Fit the tokenizer on the training split only, then turn every training sample
# into a fixed-length sequence of integer token ids.
max_len = 150  # length passed to pad_sequences as `maxlen`
tok = Tokenizer()  # num_words is left at its default (None)
tok.fit_on_texts(X_train)
vocab_size = 1 + len(tok.word_index)  # number of distinct tokens, plus one
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
class RNNCell(keras.layers.Layer):
    """Minimal recurrent cell, used here wrapped in ``keras.layers.RNN``.

    For one time step it computes
    ``output = inputs . kernel + prev_output . recurrent_kernel`` and uses that
    value both as the step output and as the next state. There is no bias term
    and no activation function.

    Args:
        units: Size of the output/state vector.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, units, **kwargs):
        self.units = units
        # A single state vector with the same width as the output.
        self.state_size = units
        super(RNNCell, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the input and recurrent weight matrices.

        Args:
            input_shape: Shape of the per-step input; only its last dimension
                (the feature size) is used.
        """
        # Projects the current input to `units` dimensions.
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
                                      initializer='uniform',
                                      name='kernel')
        # Projects the previous output/state to `units` dimensions.
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform',
            name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        """Run one time step.

        Args:
            inputs: Input for the current step.
            states: List of state tensors; only ``states[0]`` (the previous
                output) is read.

        Returns:
            ``(output, [output])``: the step output and the new state list.
        """
        prev_output = states[0]
        h = K.dot(inputs, self.kernel)
        output = h + K.dot(prev_output, self.recurrent_kernel)
        return output, [output]

    def get_config(self):
        """Return the layer configuration, including ``units``.

        Without this override the constructor argument is not part of the
        config, so a model containing the cell cannot be faithfully
        serialized or re-created from its config.
        """
        config = super(RNNCell, self).get_config()
        config['units'] = self.units
        return config

In [None]:
# Build the (uncompiled) classification model
def sequenceModel():
    """Assemble the sequence classifier.

    Layers, in order: integer input of length ``max_len`` -> 50-dimensional
    embedding over ``vocab_size`` tokens -> ``RNN`` wrapping a 64-unit
    ``RNNCell`` -> Dense(256) -> ReLU -> Dropout(0.5) -> Dense(1) -> sigmoid.

    Reads the module-level names ``max_len`` and ``vocab_size``.

    Returns:
        A ``Model`` from the token-id input to the single sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(vocab_size, 50, input_length=max_len)(token_ids)
    # Alternative recurrent layers that can replace the custom cell:
    #   LSTM(64)(embedded)
    #   GRU(64)(embedded)
    recurrent = RNN(RNNCell(64))(embedded)

    dense_out = Dense(256, name='FC1')(recurrent)
    activated = Activation('relu')(dense_out)
    regularized = Dropout(0.5)(activated)
    logit = Dense(1, name='out_layer')(regularized)
    probability = Activation('sigmoid')(logit)
    return Model(inputs=token_ids, outputs=probability)

In [None]:
# Instantiate the network, print its layer summary, then configure training:
# binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model = sequenceModel()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit the model: up to 10 epochs, batches of 32, with early stopping.
#
# LSTM variant, kept for reference (monitors 'val_loss'):
#model.fit(sequences_matrix,Y_train,batch_size=32,epochs=10,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
# NOTE(review): neither call passes validation data, so the 'val_loss' variant
# would also need validation_split/validation_data -- confirm before enabling.
#
# RNN variant (active): early stopping watches the training loss and requires
# an improvement of at least 0.0001.
early_stopping = EarlyStopping(monitor='loss', min_delta=0.0001)
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=32,
    epochs=10,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate on the held-out split, reusing the tokenizer fitted on the training
# data and the same padded length.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)
accr = model.evaluate(test_sequences_matrix, Y_test)
# The first two entries are formatted as loss and accuracy.
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

In [None]:
# Predictions for the evaluation metrics (accuracy, precision, recall, F1).
# The model output is reshaped to one dimension, one value per test sample,
# and both shapes are printed as a sanity check.
y_pred = model.predict(test_sequences_matrix)
y_pred = y_pred.reshape((y_pred.shape[0],))
print(y_pred.shape, Y_test.shape, sep='\n')

print("Confusion matrix\n")

In [None]:
def get_metrics(y_test, y_predicted):
    """Print a confusion table and return weighted classification metrics.

    Args:
        y_test: Ground-truth class labels (1-D array-like).
        y_predicted: Predicted class labels (1-D array-like of the same
            length), already converted to discrete classes.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are computed with ``average='weighted'``.
    """
    # Build the table from the arguments rather than notebook globals, so it
    # describes exactly the labels being scored. Converting to plain arrays
    # drops any pandas index, so rows are paired by position.
    actual = pd.Series(np.asarray(y_test).ravel(), name='Actual')
    predicted = pd.Series(np.asarray(y_predicted).ravel(), name='Predicted')
    print(pd.crosstab(actual, predicted))

    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1
# Score the rounded predictions against the test labels and report each metric.
scores = get_metrics(Y_test, y_pred.round())
accuracy, precision, recall, f1 = scores
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % scores)