# GenCode Explore

Explore the human RNA sequences from GenCode.

This notebook assumes the user has downloaded the files from the GenCode release 38 [FTP](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/) site
to a subdirectory called data.

Builds on 102, which excluded mitochondrial genes by ID. (If the number of exclusions grows, we may need to move from an exclusion list to an annotation GFF parser.)

Explore the remaining protein-coding (PC) mRNAs that have tiny ORFs.

In [1]:
import time 
def show_time():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS TZ'."""
    stamp_format = '%Y-%m-%d %H:%M:%S %Z'
    now = time.localtime(time.time())
    print(time.strftime(stamp_format, now))
show_time()

2021-06-06 17:11:17 EDT


In [2]:
import numpy as np
import pandas as pd
import gzip
import sys
# Detect Google CoLab by whether its package can be imported. Only that one
# import is guarded, so a real failure further down (mount, download, import)
# is reported as itself instead of being mislabeled "CoLab not working".
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    SIMTOOLS_URL='https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/'
    # Save each module under its real .py name so the imports below can find it.
    for module_file in ('RNA_describe.py','GenCode_Protein_Include.py'):
        response = requests.get(SIMTOOLS_URL+module_file)
        response.raise_for_status()  # never save an HTTP error page as Python source
        with open(module_file, 'w') as f:
            f.write(response.text)  # writes to cloud local, delete the file later?
    from RNA_describe import *
    from GenCode_Protein_Include import prot_incl
else:
    print("CoLab not working. On my PC, use relative paths.")
    DATAPATH='../data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import *
    from SimTools.GenCode_Protein_Include import prot_incl

MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

# Sanity check that the star-import from RNA_describe actually succeeded.
if not assert_imported_RNA_describe():
    print("ERROR: Cannot use RNA_describe.")

CoLab not working. On my PC, use relative paths.


In [3]:
# GenCode release 38 FASTA files (gzip-compressed), relative to DATAPATH.
# PC = protein-coding transcripts, NC = non-coding (lncRNA) transcripts.
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'

In [4]:
def load_gencode(filename,label,check_list=None):
    """Load transcripts from a gzipped GenCode FASTA file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to a gzip-compressed FASTA file.
    label : any
        Value stored in the 'class' column of every row
        (usually 1 for protein-coding or 0 for non-coding).
    check_list : container of str, optional
        If given, keep only transcripts whose version-less ID is ``in`` this
        container. A set or dict gives the fastest lookups.

    Returns
    -------
    pandas.DataFrame
        Columns 'tid', 'class', 'sequence', 'seqlen', one row per kept
        record, in file order. Records with an empty sequence are skipped.
    """
    DEFLINE='>'  # start of line with ids in a FASTA FILE
    DELIM='|'    # character between ids
    VERSION='.'  # character between id and version
    labels=[]  # usually 1 for protein-coding or 0 for non-coding
    seqs=[]    # usually strings of ACGT
    lens=[]    # sequence length
    ids=[]     # GenCode transcript ID, always starts ENST, excludes version

    def keep_record(one_id,fragments):
        # Store the finished record if it has sequence and passes the filter.
        one_seq = ''.join(fragments)
        if one_seq and (check_list is None or one_id in check_list):
            labels.append(label)
            seqs.append(one_seq)
            lens.append(len(one_seq))
            ids.append(one_id)

    one_id = None
    fragments = []  # sequence lines of the record currently being read
    with gzip.open (filename,'rt') as infile:
        for line in infile:
            if line[0]==DEFLINE:
                keep_record(one_id,fragments)
                # Take the first '|' field, then drop the version suffix.
                # Splitting on '.' alone would swallow later fields whenever
                # the transcript ID itself carries no version.
                first_field = line[1:].split(DELIM)[0]
                one_id = first_field.split(VERSION)[0].strip()
                fragments = []
            else:
                # Continue loading sequence lines till next defline.
                fragments.append(line.rstrip())
    # Don't forget to save the last sequence after end-of-file.
    keep_record(one_id,fragments)

    return pd.DataFrame(
        {'tid':ids,'class':labels,'sequence':seqs,'seqlen':lens},
        columns=['tid','class','sequence','seqlen'])

## Load the GenCode data.
Warning: GenCode has
over 100K protein-coding RNA (mRNA) 
and almost 50K non-coding RNA (lncRNA).

In [5]:
# Full GenCode ver 38 human is 106143 pc + 48752 nc and loads in 7 sec.
# Expect fewer transcripts if special filtering is used.
# DATAPATH already ends with a slash, so plain joining yields the full path.
PC_FULLPATH=f"{DATAPATH}{PC_FILENAME}"
NC_FULLPATH=f"{DATAPATH}{NC_FILENAME}"
# Time-stamp before and after each load to show how long it takes.
show_time()
# Protein-coding transcripts are restricted to the IDs in prot_incl.
pcdf=load_gencode(PC_FULLPATH,label=1,check_list=prot_incl)
print("PC seqs loaded:",pcdf.shape[0])
show_time()
# Non-coding transcripts are loaded without any ID filter.
ncdf=load_gencode(NC_FULLPATH,label=0)
print("NC seqs loaded:",ncdf.shape[0])
show_time()

2021-06-06 17:11:17 EDT
PC seqs loaded: 60946
2021-06-06 17:11:22 EDT
NC seqs loaded: 48752
2021-06-06 17:11:24 EDT


In [6]:
# Order both frames from shortest to longest transcript.
print("Sorting PC...")
pcdf = pcdf.sort_values(by='seqlen')
print("Sorting NC...")
ncdf = ncdf.sort_values(by='seqlen')

Sorting PC...
Sorting NC...


In [7]:
# Show the non-coding frame, sorted by seqlen above (the display elides the middle rows).
ncdf

Unnamed: 0,tid,class,sequence,seqlen
659,ENST00000641727,0,TTGGGAGGCCGAGTCAGGTGGATCACCTGAGG,32
27605,ENST00000630527,0,CCACAATCTTCTGGACAAAGTGGGCCTGGTGAGCATGTGCAGCCTA...,54
17512,ENST00000612718,0,TTTTTTTTTTTTTTAAAAGTTCAAGTGATTCTCCTGGTTTTCCTCA...,60
9805,ENST00000678483,0,TGATCAGGCTGGTCTCGAACACCTGACCTTGTGATCCACCCACCTC...,60
9806,ENST00000679337,0,CTGGTCAGGGAGGAGTTTGGAATGTTTCTGGTTGGTGATGTTGTTT...,61
...,...,...,...,...
46918,ENST00000458178,0,GCAGAGCGGAGGAGCAGCAGCTGCCACCGGCCGGAAGCCTGCTGGC...,37852
47169,ENST00000624945,0,TGAAGGGGCCAGAGTACTGGCTGCCCTGAAATCAGAGCAGCAGCTA...,49287
27601,ENST00000597346,0,AGAACGGTCGCCGCGTCGCCTCAGCACGGACCTCCAGGGAGCTCCT...,91667
31002,ENST00000626826,0,GGAAGGAGCAATAACTGATTTTTCTGCAGTGTCTTCCTTCTGAAAG...,205012


## Look for short ORFs

In [8]:
def show_short(df,too_short,min_seqlen=200):
    """Print every transcript that is long but whose longest ORF is tiny.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'tid', 'sequence' and 'seqlen'.
    too_short : int
        Report a transcript when its maximum ORF length is <= this value.
    min_seqlen : int, optional
        Only transcripts strictly longer than this are considered
        (default 200, the cutoff previously hard-coded here).

    Prints one line per match as "<tid> len=<seqlen> orf=<orflen>",
    in the row order of df. Returns None.
    """
    # ORF_counter comes from RNA_describe; presumably get_max_orf_len()
    # returns the longest ORF length of the sequence last set -- TODO confirm.
    oc = ORF_counter()
    for seqid,sequence,seqlen in zip(df['tid'],df['sequence'],df['seqlen']):
        if seqlen<=min_seqlen:
            continue  # too short to be of interest; skip the ORF search
        oc.set_sequence(sequence)
        orflen=oc.get_max_orf_len()
        # Compare against the argument, not a module-level global.
        if orflen<=too_short:
            print("%s len=%d orf=%d"%(seqid,seqlen,orflen))

# Maximum-ORF-length threshold: transcripts at or below it are reported.
TOO_SHORT=50
# Scan the protein-coding frame for long transcripts with tiny ORFs.
show_short(pcdf,TOO_SHORT)


ENST00000629223 len=210 orf=0
ENST00000640179 len=210 orf=24
ENST00000638826 len=219 orf=0
ENST00000612485 len=225 orf=42
ENST00000640713 len=231 orf=42
ENST00000640869 len=237 orf=27
ENST00000638397 len=240 orf=0
ENST00000639871 len=243 orf=0
ENST00000548367 len=249 orf=0
ENST00000626868 len=249 orf=0
ENST00000618832 len=282 orf=0
ENST00000532589 len=331 orf=27
ENST00000640154 len=336 orf=15
ENST00000634235 len=351 orf=0
ENST00000635603 len=369 orf=0
ENST00000639772 len=421 orf=30
ENST00000515402 len=448 orf=30
ENST00000330910 len=468 orf=42
ENST00000613578 len=486 orf=27
ENST00000437752 len=525 orf=36
ENST00000528262 len=560 orf=27
ENST00000619146 len=561 orf=24
ENST00000534683 len=562 orf=42
ENST00000640148 len=596 orf=12
ENST00000621530 len=617 orf=39
ENST00000614767 len=762 orf=0
ENST00000639555 len=878 orf=0
ENST00000641094 len=1476 orf=0
