# MLP GenCode 
Wen et al 2019 used DNN to distinguish GenCode mRNA/lncRNA.
Based on K-mer frequencies, K={1,2,3}, they reported 99% accuracy.
Their CNN used 2 Conv2D layers of 32 filters of width 3x3, max pool 2x2, 25% drop, dense 128.
Can we reproduce that with MLP layers instead of CNN?
Extract features as list of K-mer frequencies for K={1,2,3}.

In [1]:
import time
def show_time():
    t = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(t)))
show_time()

2021-07-10 10:19:18 EDT


In [2]:
PC_TRAINS=8000
NC_TRAINS=8000
PC_TESTS=2000
NC_TESTS=2000   # Wen et al 2019 used 8000 and 2000 of each class
PC_LENS=(200,4000)
NC_LENS=(200,4000)    # Wen et al 2019 used 250-3500 for lncRNA only
MAX_K = 3

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [4]:
import sys
IN_COLAB = False
try:
    from google.colab import drive
    IN_COLAB = True
except:
    pass
if IN_COLAB:
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    #drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/GenCodeTools.py')
    with open('GenCodeTools.py', 'w') as f:
        f.write(r.text)  
    from GenCodeTools import GenCodeLoader
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/KmerTools.py')
    with open('KmerTools.py', 'w') as f:
        f.write(r.text)  
    from KmerTools import KmerTools
else:
        print("CoLab not working. On my PC, use relative paths.")
        DATAPATH='data/'  # must end in "/"
        sys.path.append("..") # append parent dir in order to use sibling dirs
        from SimTools.GenCodeTools import GenCodeLoader
        from SimTools.KmerTools import KmerTools
MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

CoLab not working. On my PC, use relative paths.


## Data Load
Restrict mRNA to those transcripts with a recognized ORF.

In [5]:
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'
PC_FULLPATH=DATAPATH+PC_FILENAME
NC_FULLPATH=DATAPATH+NC_FILENAME

In [6]:
# Full GenCode ver 38 human is 106143 pc + 48752 nc and loads in 7 sec.
# Expect fewer transcripts if special filtering is used.
loader=GenCodeLoader()
loader.set_label(1)
loader.set_check_utr(True)
pcdf=loader.load_file(PC_FULLPATH)
print("PC seqs loaded:",len(pcdf))
loader.set_label(0)
loader.set_check_utr(False)
ncdf=loader.load_file(NC_FULLPATH)
print("NC seqs loaded:",len(ncdf))
show_time()

PC seqs loaded: 70825
NC seqs loaded: 48752
2021-07-10 10:19:27 EDT


## Data Prep

In [7]:
def dataframe_length_filter(df,low_high):
    (low,high)=low_high
    # The pandas query language is strange, 
    # but this is MUCH faster than loop & drop.
    return df[ (df['seqlen']>=low) & (df['seqlen']<=high) ]
def dataframe_shuffle(df):
    # The ignore_index option is new in Pandas 1.3. 
    # The default (False) replicates the old behavior: shuffle the index too.
    # The new option seems more logical th
    # After shuffling, df.iloc[0] has index == 0.
    return df.sample(frac=1,ignore_index=True)
def dataframe_extract_sequence(df):
    return df['sequence'].tolist()

pc_all = dataframe_extract_sequence(
    dataframe_shuffle(
    dataframe_length_filter(pcdf,PC_LENS)))
nc_all = dataframe_extract_sequence(
    dataframe_shuffle(
    dataframe_length_filter(ncdf,NC_LENS)))

show_time()
print("PC seqs pass filter:",len(pc_all))
print("NC seqs pass filter:",len(nc_all))
# Garbage collection to reduce RAM footprint
pcdf=None
ncdf=None

2021-07-10 10:19:27 EDT
PC seqs pass filter: 55381
NC seqs pass filter: 46919


In [8]:
# Any portion of a shuffled list is a random selection
pc_train=pc_all[:PC_TRAINS] 
nc_train=pc_all[:NC_TRAINS]
pc_test=pc_all[PC_TRAINS:PC_TESTS] 
nc_test=pc_all[NC_TRAINS:PC_TESTS]
# Garbage collection
pc_all=None
nc_all=None

In [9]:
def prepare_x_and_y(seqs1,seqs0):
    len1=len(seqs1)
    len0=len(seqs0)
    labels1=[1]*len1
    labels0=[0]*len0
    all_seqs = seqs1 + seqs0
    all_labels = labels1 + labels0
    X,y = shuffle(all_seqs,all_labels) # sklearn.utils.shuffle
    return X,y
Xseq,y=prepare_x_and_y(pc_train,nc_train)

In [10]:
def seqs_to_kmer_freqs(seqs,max_K):
    tool = KmerTools()  # from SimTools
    empty = tool.make_dict_upto_K(max_K)
    Xnew = []
    for seq in seqs:
        counts = empty
        counts = tool.update_count_one_K(counts,max_K,seq,True)
        counts = tool.harvest_counts_from_K(counts,max_K)
        freqs = tool.count_to_frequency(counts,max_K)
        Xnew.append(freqs)
    return Xnew
Xcnt=seqs_to_kmer_freqs(Xseq,MAX_K)
print ("X[0]:\n",Xcnt[0])

X[0]:
 {'A': 0.3153846153846154, 'C': 0.20586080586080585, 'G': 0.21135531135531135, 'T': 0.2673992673992674, 'AA': 0.10406742396482228, 'AC': 0.052766581165262, 'AG': 0.0754855258336387, 'AT': 0.08318065225357274, 'CA': 0.08171491388787101, 'CC': 0.04727006229388054, 'CG': 0.015390252839868083, 'CT': 0.061194576768046904, 'GA': 0.07145474532795897, 'GC': 0.04690362770245511, 'GG': 0.043972150971051664, 'GT': 0.049102235251007695, 'TA': 0.05826310003664346, 'TC': 0.05899596921949432, 'TG': 0.07658482960791499, 'TT': 0.07365335287651155, 'AAA': 0.03409090909090909, 'AAC': 0.015762463343108504, 'AAG': 0.025293255131964808, 'AAT': 0.028958944281524925, 'ACA': 0.021994134897360705, 'ACC': 0.009530791788856305, 'ACG': 0.004765395894428153, 'ACT': 0.016495601173020527, 'AGA': 0.024560117302052785, 'AGC': 0.019794721407624633, 'AGG': 0.014296187683284457, 'AGT': 0.01686217008797654, 'ATA': 0.01686217008797654, 'ATC': 0.020161290322580645, 'ATG': 0.023826979472140762, 'ATT': 0.0223607038123167

In [11]:
nc_counts = tool.make_dict_upto_K(MAX_K)
for sample in nc_all:
    tool.update_count_one_K(nc_counts,MAX_K,sample,True)
tool.harvest_counts_from_K(nc_counts,MAX_K)
print("NC counts:\n",nc_counts)
nc_freqs = tool.count_to_frequency(nc_counts,MAX_K)
print ("Frequency:\n",nc_freqs)

NameError: name 'tool' is not defined

In [None]:
X,y = prepare_inputs_len_x_alphabet(pc_train,nc_train,ALPHABET) # shuffles
print("Data ready.")