# RNN cross validation
Classify pc vs nc RNA.
Use K=2,3,4.

Set aside the 20% test set, stratified by length.
On the remaining 80%,
perform 5-fold cross validation.

Test subsets of the data with RNN.
To do: Read sequences (not K-mers).

In [67]:
import numpy as np
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import OneHotEncoder

# For the manual cross validation
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold

tf.keras.backend.set_floatx('float32')

In [68]:
# Assume file was preprocessed to contain one line per seq.
# Prefer Pandas dataframe but df does not support append.
# For conversion to tensor, must avoid python lists.
def load_fasta(filename,label):
    DEFLINE='>'
    labels=[]
    seqs=[]
    lens=[]
    nums=[]
    num=0
    with open (filename,'r') as infile:
        for line in infile:
            if line[0]!=DEFLINE:
                seq=line.rstrip()
                num += 1   # first seqnum is 1
                seqlen=len(seq)
                nums.append(num)
                labels.append(label)
                seqs.append(seq)
                lens.append(seqlen)
    df1=pd.DataFrame(nums,columns=['seqnum'])
    df2=pd.DataFrame(labels,columns=['class'])
    df3=pd.DataFrame(seqs,columns=['sequence'])
    df4=pd.DataFrame(lens,columns=['seqlen'])
    df=pd.concat((df1,df2,df3,df4),axis=1)
    return df

# Split into train/test stratified by sequence length.
def sizebin(df):
    return pd.cut(df["seqlen"],
                              bins=[0,1000,2000,4000,8000,16000,np.inf],
                              labels=[0,1,2,3,4,5])
def make_train_test(data):
    bin_labels= sizebin(data)
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=37863)
    # split(x,y) expects that y is the labels. 
    # Trick: Instead of y, give it it the bin labels that we generated.
    for train_index,test_index in splitter.split(data,bin_labels):
        train_set = data.iloc[train_index]
        test_set = data.iloc[test_index]
    return (train_set,test_set)

def prepare_data_set(data):
    y_test=   data[['class']].copy()
    X_test=   data.drop(columns=['class','seqnum','seqlen'])
    return (X_test,y_test)

def subset(data_set,min_len,max_len):
    print("original "+str(data_set.shape))
    too_short = data_set[ data_set['seqlen'] < min_len ].index
    no_short=data_set.drop(too_short)
    print("no short "+str(no_short.shape))
    too_long = no_short[ no_short['seqlen'] >= max_len ].index
    no_long_no_short=no_short.drop(too_long)
    print("no long, no short "+str(no_long_no_short.shape))
    return no_long_no_short


In [77]:
def do_cross_validation(X,y,K):
    cv_scores = []
    act="sigmoid"
    dt='float32'
    fold=0
    eps=100
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=37863)
    for train_index,valid_index in splitter.split(X):
        X_train=X.iloc[train_index]
        y_train=y.iloc[train_index]
        X_valid=X.iloc[valid_index]
        y_valid=y.iloc[valid_index]
        mlp = keras.models.Sequential([
            keras.layers.LayerNormalization(trainable=False),
            keras.layers.Dense(32, activation=act,dtype=dt),
            keras.layers.Dense(32, activation=act,dtype=dt),
            keras.layers.Dense(1,  activation=act,dtype=dt)
        ])
        seq_len=None  # none indicates variable length
        input_features=4**K   # 64 DNA K-mers at K=3
        rnn2 = keras.models.Sequential([
            keras.layers.SimpleRNN(16, return_sequences=True, 
                                   input_shape=[seq_len,input_features]),
            keras.layers.SimpleRNN(16, return_sequences=True),
            keras.layers.SimpleRNN(16, return_sequences=True),
            keras.layers.SimpleRNN(1),
        ])

        bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
        rnn2.compile(loss=bc, optimizer="Adam",metrics=["accuracy"])
        history=rnn2.fit(X_train, y_train, # batch_size=10, default=32 works nicely
                epochs=eps, verbose=0,  # verbose=1 for ascii art, verbose=0 for none
                validation_data=(X_valid,y_valid) )
                        
        fold += 1
        print("Fold %d, %d epochs"%(fold,eps))

        pd.DataFrame(history.history).plot(figsize=(8,5))
        plt.grid(True)
        plt.gca().set_ylim(0,1)
        plt.show()

        scores = mlp.evaluate(X_valid, y_valid, verbose=0)
        print("%s: %.2f%%" % (mlp.metrics_names[1], scores[1]*100))
        cv_scores.append(scores[1] * 100)
    print()
    print("Validation core mean %.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))


In [79]:
def generate_all_kmers(K):
    shorter_kmers=['']
    for i in range(K):
        longer_kmers=[]
        for mer in shorter_kmers:
            longer_kmers.append(mer+'A')
            longer_kmers.append(mer+'C')
            longer_kmers.append(mer+'G')
            longer_kmers.append(mer+'T')
        shorter_kmers = longer_kmers
    return shorter_kmers

def train_encoder(kmers):
    narray = np.array(kmers)
    array2d = narray.reshape(-1, 1)
    encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
    encoder.fit(array2d)
    return encoder


## Load and partition sequences

In [80]:
MINLEN=1000
MAXLEN=2000
nc_seq=load_fasta('ncRNA.fasta',0)
pc_seq=load_fasta('pcRNA.fasta',1)
all_seq=pd.concat((nc_seq,pc_seq),axis=0)

(train_set,test_set)=make_train_test(all_seq)
(X_test,y_test)=prepare_data_set(test_set)
train_set=subset(train_set,MINLEN,MAXLEN)
train_set

original (30290, 4)
no short (9273, 4)
no long, no short (3368, 4)


Unnamed: 0,seqnum,class,sequence,seqlen
12641,12642,1,GGCGGGGTCGACTGACGGTAACGGGGCAGAGAGGCTGTTCGCAGAG...,1338
2971,2972,1,TGACATGGGCAGAGTTTCTCTTGCCCTTAAAGTCTTACTTTCCACT...,1454
1941,1942,1,CGGTGCCACAGGGACGAGGCCTGGAGAGCAGTCGCTCCTAGAACCG...,1721
2258,2259,1,CGCCCGCGAGGGGCCGGGGTCGGGGCCGCCGGGGCCATGCGCGCGG...,1550
16687,16688,0,GTTCTTTAGTAGAAGGATAACATGACATAGCAGGAATAATACTGCC...,1889
...,...,...,...,...
9272,9273,1,AGCGAGCCCTGCGGCCGCCGGAGCAGCTCCCGCGGCGGAGCAGGAG...,1808
3122,3123,0,AGAGGCAGCGCTGGCGTTGGAGAGTGATGGCGGCATGGCGGTGCGG...,1317
1515,1516,0,GGTGAACTCACATAGATTCAGAAAGCAGGGATTCAGGAACAAGGAA...,1445
10982,10983,0,AGCATCAGACTAGCCCCTGAGAGCCAAAAACTGTTTGCCTTTCAGT...,1638


In [81]:
(X_train_all,y_train_all)=prepare_data_set(train_set)
#print(X_train_all.shape,y_train_all.shape)
(X_train_all,y_train_all)
# y: Pandas dataframe to Python list.
# y_train_all=y_train_all.values.tolist()


(                                                sequence
 12641  GGCGGGGTCGACTGACGGTAACGGGGCAGAGAGGCTGTTCGCAGAG...
 2971   TGACATGGGCAGAGTTTCTCTTGCCCTTAAAGTCTTACTTTCCACT...
 1941   CGGTGCCACAGGGACGAGGCCTGGAGAGCAGTCGCTCCTAGAACCG...
 2258   CGCCCGCGAGGGGCCGGGGTCGGGGCCGCCGGGGCCATGCGCGCGG...
 16687  GTTCTTTAGTAGAAGGATAACATGACATAGCAGGAATAATACTGCC...
 ...                                                  ...
 9272   AGCGAGCCCTGCGGCCGCCGGAGCAGCTCCCGCGGCGGAGCAGGAG...
 3122   AGAGGCAGCGCTGGCGTTGGAGAGTGATGGCGGCATGGCGGTGCGG...
 1515   GGTGAACTCACATAGATTCAGAAAGCAGGGATTCAGGAACAAGGAA...
 10982  AGCATCAGACTAGCCCCTGAGAGCCAAAAACTGTTTGCCTTTCAGT...
 10776  GTGTTTTGACGTCGGCGGTGCCCGCGTTCCGCGCCGAGTAACGGTC...
 
 [3368 rows x 1 columns],
        class
 12641      1
 2971       1
 1941       1
 2258       1
 16687      0
 ...      ...
 9272       1
 3122       0
 1515       0
 10982      0
 10776      0
 
 [3368 rows x 1 columns])

## K-mer sequence, K=2

In [100]:
def make_kmers(data,K,uniform_len):
    all_seqs=[]
    pad_char='N'
    pad_kmer=pad_char*K
    # pad_kmer=None      # Prefer this to the above. Will it work? No. Encoder balks.
    for seq in data['sequence']:
        i=0
        seqlen=len(seq)
        kmers=[]
        while i < seqlen-K+1:
            kmer=seq[i:i+K]
            kmers.append(kmer)
            i += 1
        while i < uniform_len:
            kmers.append(pad_kmer)
            i += 1
        all_seqs.append(kmers)
    return all_seqs

In [101]:
K=2
encoder=train_encoder(generate_all_kmers(2))

# X: List of string to List of uniform-length ordered lists of K-mers.
X_train_kmers=make_kmers(X_train_all,K,MAXLEN)

# X: true 2D array (no more lists)
X_train_2D=pd.DataFrame(X_train_kmers) 
X_train_2D.shape
#X_train_reshape=X_train_numpy.reshape(-1, 1)
#X_train_encoded=encoder.fit(X_train_numpy)
X_train_encoded=encoder.fit(X_train_2D)

In [None]:

#encoder = train_encoder(K)
#seq=nc_seqs[0].reshape(-1, 1)  # expects 2D array
#encoder.fit(X_train_all)
#print("Encoder categories")
#print(str(encoder.categories_))


#do_cross_validation(X_train_all,y_train_all,K):

## K-mer sequence, K=3

## K-mer sequence, K=4