# GenCode Explore

Explore the human RNA sequences from GenCode.

This notebook assumes the user has downloaded the files from the GenCode release 38 [FTP site](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/)
into a subdirectory called `data`.

In 103, we reduced the PC mRNA to a set where none lacked ORFs. Here, we determine whether all of those filters were actually necessary. We also engineer the code.

In [1]:
import time 
def show_time():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS TZ'."""
    now = time.localtime(time.time())
    print(time.strftime('%Y-%m-%d %H:%M:%S %Z', now))
show_time()

2021-06-07 16:28:32 EDT


In [2]:
import numpy as np
import pandas as pd
import gzip
import sys
import re

# Pick the runtime environment.
# On Google CoLab: mount Google Drive for the data and download our helper
# modules from GitHub into the working directory so they can be imported.
# Anywhere else (or if any CoLab step fails): fall back to relative paths and
# import the helpers from the sibling SimTools package.
try:
    from google.colab import drive
    IN_COLAB = True
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    GITHUB_RAW='https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/'
    # Each file must be saved under its importable name (with the .py suffix),
    # otherwise the imports below cannot find it.
    for module_file in ('RNA_describe.py','GenCode_Protein_Include.py'):
        s = requests.get(GITHUB_RAW+module_file)
        s.raise_for_status()  # do not save an HTTP error page as a module
        with open(module_file, 'w') as f:
            f.write(s.text)  # writes to cloud local, delete the file later?
    from RNA_describe import ORF_counter
    from RNA_describe import assert_imported_RNA_describe
    # Same module that the local branch below imports from SimTools.
    from GenCode_Protein_Include import prot_incl
except Exception:
    # Deliberate best-effort fallback. Catch Exception rather than everything
    # so that Ctrl-C / kernel interrupt is not swallowed.
    print("CoLab not working. On my PC, use relative paths.")
    IN_COLAB = False
    DATAPATH='../data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import ORF_counter
    from SimTools.RNA_describe import assert_imported_RNA_describe
    from SimTools.GenCode_Protein_Include import prot_incl

MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

# Sanity check that the helper module really was imported.
if not assert_imported_RNA_describe():
    print("ERROR: Cannot use RNA_describe.")

CoLab not working. On my PC, use relative paths.


In [3]:
# Gzipped FASTA file names for the protein-coding (PC) and lncRNA (NC)
# transcripts; each is appended to DATAPATH to form the full path when loading.
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'

In [4]:
class GenCodeLoader():
    """Load a gzipped FASTA file of transcripts into a pandas DataFrame.

    Typical use:
        loader = GenCodeLoader()
        loader.set_label(1)              # required before load_file()
        loader.set_check_list(id_set)    # optional: keep only these IDs
        loader.set_check_utr(True)       # optional: require UTR5: and UTR3:
        df = loader.load_file(path)

    The returned frame has columns 'tid', 'class', 'sequence', 'seqlen'.
    After a load, the same data is also available as the lists
    self.ids, self.labels, self.seqs and self.lens.
    """
    def __init__(self):
        # A defline passes the UTR filter only if both patterns match it.
        self.pattern5=re.compile('.*UTR5:')
        self.pattern3=re.compile('.*UTR3:')
        self.check_list = None   # None means "do not filter by ID"
        self.check_utr = False
        self.label = None        # must be set with set_label() before loading
        # Accumulators exist from construction; load_file() resets them.
        self.labels=[]
        self.seqs=[]
        self.lens=[]
        self.ids=[]
    def set_label(self,label):
        """Set the class label stored with every sequence loaded afterwards."""
        self.label=label
    def set_check_list(self,check_list):
        """Keep only transcript IDs contained in check_list (None disables).

        A list or tuple is converted to a frozenset so that the per-record
        membership test does not scan the whole collection each time.
        Any other container is stored and used as given.
        """
        if isinstance(check_list,(list,tuple)):
            try:
                check_list=frozenset(check_list)
            except TypeError:
                pass  # unhashable items: keep the original container
        self.check_list=check_list
    def set_check_utr(self,check_utr):
        """If true, keep only records whose defline has both UTR5: and UTR3:."""
        self.check_utr=check_utr
    def __save_previous(self,one_def,one_seq):
        """Apply the filters to one record and append it to the accumulators.

        one_def is the full defline (including the leading '>') or None when
        no defline has been seen yet; one_seq is the joined sequence string.
        """
        if one_def is None:
            return
        if self.check_utr:
            if self.pattern5.match(one_def) is None:
                return
            if self.pattern3.match(one_def) is None:
                return
        VERSION = '.'
        # Drop the leading '>' and everything from the first '.' onward.
        one_id = one_def[1:].split(VERSION)[0]
        if self.check_list is not None:
            if one_id not in self.check_list:
                return
        self.labels.append(self.label)
        self.seqs.append(one_seq)
        self.lens.append(len(one_seq))
        self.ids.append(one_id)
    def load_file(self,filename):
        """Read a gzipped FASTA file and return the kept records as a DataFrame.

        Raises ValueError if set_label() has not been called, instead of
        failing part-way through the file with an AttributeError.
        """
        if self.label is None:
            raise ValueError("GenCodeLoader: call set_label() before load_file().")
        self.labels=[]  # usually 1 for protein-coding or 0 for non-coding
        self.seqs=[]    # usually strings of ACGT
        self.lens=[]    # sequence length
        self.ids=[]     # GenCode transcript ID, always starts ENST, excludes version
        DEFLINE='>'  # start of line with ids in a FASTA FILE
        EMPTY=''
        one_def = None
        chunks = []  # sequence lines of the current record, joined once at the end
        with gzip.open (filename,'rt') as infile:
            for line in infile:
                if line.startswith(DEFLINE):
                    self.__save_previous(one_def,EMPTY.join(chunks))
                    one_def=line
                    chunks = []
                else:
                    # Continue loading sequence lines till next defline.
                    chunks.append(line.rstrip())
            # Don't forget to save the last sequence after end-of-file.
            self.__save_previous(one_def,EMPTY.join(chunks))
        df=pd.DataFrame({
            'tid':self.ids,
            'class':self.labels,
            'sequence':self.seqs,
            'seqlen':self.lens,
        })
        return df

## Load the GenCode data.
Warning: GenCode has
over 100K protein-coding RNA (mRNA) 
and almost 50K non-coding RNA (lncRNA).

In [5]:
# Full GenCode ver 38 human is 106143 pc + 48752 nc and loads in 7 sec.
# Expect fewer transcripts if special filtering is used.
PC_FULLPATH=DATAPATH+PC_FILENAME
NC_FULLPATH=DATAPATH+NC_FILENAME
# One loader is reused for both files; its label and filters are reconfigured
# before each load, so the order of the calls below matters.
loader=GenCodeLoader()
show_time()
# Protein-coding: label 1, keep only IDs in prot_incl, and require both
# UTR5: and UTR3: in the defline.
loader.set_label(1)
loader.set_check_list(prot_incl)
loader.set_check_utr(True)
pcdf=loader.load_file(PC_FULLPATH)
print("PC seqs loaded:",len(pcdf))
show_time()
# Non-coding: label 0, with both filters switched off.
loader.set_label(0)
loader.set_check_list(None)
loader.set_check_utr(False)
ncdf=loader.load_file(NC_FULLPATH)
print("NC seqs loaded:",len(ncdf))
show_time()

2021-06-07 16:28:33 EDT
PC seqs loaded: 57076
2021-06-07 16:28:37 EDT
NC seqs loaded: 48752
2021-06-07 16:28:39 EDT


In [6]:
# Order both frames from shortest to longest transcript.
# The sorted frame is bound back to the same name instead of sorting in place.
print("Sorting PC...")
pcdf = pcdf.sort_values('seqlen', ascending=True)
print("Sorting NC...")
ncdf = ncdf.sort_values('seqlen', ascending=True)

Sorting PC...
Sorting NC...


In [7]:
# Display the non-coding frame, which the previous cell sorted by ascending seqlen.
ncdf

Unnamed: 0,tid,class,sequence,seqlen
659,ENST00000641727,0,TTGGGAGGCCGAGTCAGGTGGATCACCTGAGG,32
27605,ENST00000630527,0,CCACAATCTTCTGGACAAAGTGGGCCTGGTGAGCATGTGCAGCCTA...,54
17512,ENST00000612718,0,TTTTTTTTTTTTTTAAAAGTTCAAGTGATTCTCCTGGTTTTCCTCA...,60
9805,ENST00000678483,0,TGATCAGGCTGGTCTCGAACACCTGACCTTGTGATCCACCCACCTC...,60
9806,ENST00000679337,0,CTGGTCAGGGAGGAGTTTGGAATGTTTCTGGTTGGTGATGTTGTTT...,61
...,...,...,...,...
46918,ENST00000458178,0,GCAGAGCGGAGGAGCAGCAGCTGCCACCGGCCGGAAGCCTGCTGGC...,37852
47169,ENST00000624945,0,TGAAGGGGCCAGAGTACTGGCTGCCCTGAAATCAGAGCAGCAGCTA...,49287
27601,ENST00000597346,0,AGAACGGTCGCCGCGTCGCCTCAGCACGGACCTCCAGGGAGCTCCT...,91667
31002,ENST00000626826,0,GGAAGGAGCAATAACTGATTTTTCTGCAGTGTCTTCCTTCTGAAAG...,205012


## Look for short ORFs

In [8]:
def show_short(df,too_short,min_seqlen=200,progress_every=10000):
    """Print every transcript that is long but has only a short longest ORF.

    A row is reported when seqlen > min_seqlen and the longest ORF length
    reported by ORF_counter is <= too_short.

    Parameters:
        df: DataFrame with columns 'tid', 'sequence' and 'seqlen'.
        too_short: largest ORF length that still counts as "short".
        min_seqlen: rows with seqlen at or below this are never reported.
        progress_every: print a progress line every this many rows
            (0 or None disables the progress lines).

    Output is printed; nothing is returned.
    """
    oc = ORF_counter()
    # Walk the three columns in parallel instead of indexing each row with
    # df.iloc, which builds a new Series for every lookup.
    rows = zip(df['tid'],df['sequence'],df['seqlen'])
    for pos,(seqid,sequence,seqlen) in enumerate(rows):
        # Only scan for ORFs when the length filter can still be satisfied.
        if seqlen>min_seqlen:
            oc.set_sequence(sequence)
            orflen=oc.get_max_orf_len()
            # Compare against the argument, not a notebook-level global.
            if orflen<=too_short:
                print("%s len=%d orf=%d"%(seqid,seqlen,orflen))
        if progress_every and pos%progress_every==0:
            print("...up to position",pos)
    print("done")
# Protein-coding set: list transcripts with seqlen>200 whose longest ORF is at most 60.
TOO_SHORT=60
show_short(pcdf,TOO_SHORT)


...up to position 0
ENST00000620323 len=349 orf=60
ENST00000491674 len=365 orf=57
ENST00000403463 len=464 orf=60
ENST00000534683 len=562 orf=42
ENST00000526256 len=582 orf=51
...up to position 10000
...up to position 20000
...up to position 30000
...up to position 40000
...up to position 50000
done


In [9]:
# Non-coding set: list transcripts with seqlen>200 whose longest ORF is at most 9.
TOO_SHORT=9
show_short(ncdf,TOO_SHORT)


...up to position 0
ENST00000625894 len=203 orf=0
ENST00000609950 len=203 orf=0
ENST00000443801 len=205 orf=3
ENST00000623482 len=206 orf=0
ENST00000553679 len=207 orf=0
ENST00000622171 len=207 orf=0
ENST00000631268 len=207 orf=6
ENST00000580175 len=208 orf=0
ENST00000548846 len=209 orf=0
ENST00000508389 len=211 orf=0
ENST00000424083 len=212 orf=0
ENST00000630835 len=213 orf=0
ENST00000422353 len=214 orf=0
ENST00000462131 len=214 orf=0
ENST00000628097 len=216 orf=0
ENST00000624405 len=217 orf=3
ENST00000675758 len=219 orf=0
ENST00000600974 len=219 orf=0
ENST00000436963 len=220 orf=9
ENST00000601661 len=221 orf=0
ENST00000657976 len=221 orf=0
ENST00000474886 len=223 orf=0
ENST00000616952 len=225 orf=0
ENST00000513880 len=225 orf=0
ENST00000627210 len=225 orf=6
ENST00000519281 len=226 orf=0
ENST00000625124 len=226 orf=0
ENST00000539963 len=227 orf=0
ENST00000436501 len=229 orf=6
ENST00000587348 len=230 orf=0
ENST00000650499 len=231 orf=3
ENST00000634391 len=234 orf=6
ENST00000625037 len=