# GenCode Explore

Explore the human RNA sequences from GenCode.

This notebook assumes the user has downloaded the files from the GenCode release 38 [FTP](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/) site
to a subdirectory called `data`.

In [1]:
import time 
def show_time():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS TZ'."""
    now = time.localtime(time.time())
    print(time.strftime('%Y-%m-%d %H:%M:%S %Z', now))
show_time()

2021-05-26 15:35:12 EDT


In [2]:
import numpy as np
import pandas as pd
import gzip
import sys
# Choose the runtime environment: Google CoLab when its package is importable,
# otherwise a local PC. Either branch defines IN_COLAB and DATAPATH and
# star-imports RNA_describe.
try:
    from google.colab import drive
    IN_COLAB = True
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    s = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/RNA_describe.py')
    # Stop on an HTTP error rather than saving an error page as Python source.
    s.raise_for_status()
    with open('RNA_describe.py', 'w') as f:
        f.write(s.text)  # writes to cloud local, delete the file later?
    from RNA_describe import *
except Exception:
    # Any failure above (no CoLab package, mount refused, download failed)
    # falls back to the local layout. Catching Exception rather than using a
    # bare except lets KeyboardInterrupt and SystemExit propagate.
    print("CoLab not working. On my PC, use relative paths.")
    IN_COLAB = False
    DATAPATH='../data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import *

# Location for a saved model (not referenced by the other cells visible here).
MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

# Sanity check on the RNA_describe star-import done above.
# NOTE(review): assert_imported_RNA_describe presumably comes from RNA_describe;
# on failure this only prints a message and later cells still run — confirm intended.
if not assert_imported_RNA_describe():
    print("ERROR: Cannot use RNA_describe.")

CoLab not working. On my PC, use relative paths.


In [3]:
# Gzip-compressed FASTA file names; later cells prepend DATAPATH to each.
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'  # protein-coding transcripts
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'  # long non-coding transcripts
TEST_FILENAME='test.fa.gz'  # small file used by the demo cells

In [4]:
def load_gencode(filename,label):
    """Load a gzip-compressed FASTA file into a pandas DataFrame.

    Each record starts with a defline formatted like
    ``>transcript_ID|gene_ID|other_fields other_info|other_info``
    followed by one or more sequence lines, which are joined into one string.

    Args:
        filename: path to the gzip-compressed FASTA file.
        label: value stored in the 'class' column of every row
            (usually 1 for protein-coding or 0 for non-coding).

    Returns:
        DataFrame with one row per record and the columns
        'tid' (first defline field), 'class' (the label),
        'sequence' (joined sequence lines) and 'seqlen' (sequence length).
        Records that have no sequence lines are skipped.
    """
    DEFLINE='>'
    DELIM='|'
    labels=[]  # usually 1 for protein-coding or 0 for non-coding
    seqs=[]    # usually string of ACGT
    lens=[]    # sequence length
    ids=[]     # first field of each defline (GenCode transcript ID)

    def save_record(record_id,parts):
        # Append the record collected so far; skip it when it has no sequence.
        sequence=''.join(parts)
        if sequence:
            labels.append(label)
            seqs.append(sequence)
            lens.append(len(sequence))
            ids.append(record_id)

    one_id=None
    seq_parts=[]  # collect lines and join once instead of repeated string concatenation
    # Use gzip 'r' mode to open file in read-only mode.
    # Use gzip 't' mode to read each line of text as type string.
    with gzip.open(filename,'rt') as infile:
        for line in infile:
            if line.startswith(DEFLINE):
                # Save the previous sequence if one exists.
                save_record(one_id,seq_parts)
                # Strip the line ending before splitting so that a defline
                # without any delimiter does not keep its trailing newline.
                one_id=line[1:].rstrip().split(DELIM)[0]
                seq_parts=[]
            else:
                # Continue loading sequence lines till next defline.
                seq_parts.append(line.rstrip())
        # Don't forget to save the last sequence after end-of-file.
        save_record(one_id,seq_parts)

    return pd.DataFrame(
        {'tid':ids,'class':labels,'sequence':seqs,'seqlen':lens})

In [13]:
def get_the_facts(seqs,verbose=False):
    """Return the column means of RNA_describer.get_three_lengths(seqs).

    The three columns are reported as 5' UTR length, ORF length and
    3' UTR length (presumably one row per input sequence — confirm against
    RNA_describer).

    Args:
        seqs: sequences passed unchanged to RNA_describer.get_three_lengths().
        verbose: when True, print the array shape and each mean +/- standard
            deviation, both truncated to int.

    Returns:
        Tuple (mean 5' UTR, mean ORF, mean 3' UTR), not truncated.
    """
    describer = RNA_describer()
    lengths = np.asarray(describer.get_three_lengths(seqs))
    mean_5utr, mean_orf, mean_3utr = np.mean(lengths,axis=0)
    std_5utr, std_orf, std_3utr = np.std(lengths,axis=0)
    if verbose:
        report = (
            ("mean 5' UTR length:", mean_5utr, std_5utr),
            ("mean    ORF length:", mean_orf, std_orf),
            ("mean 3' UTR length:", mean_3utr, std_3utr),
        )
        print("Facts array:",lengths.shape)
        for caption, mean, std in report:
            print(caption,int(mean),"+/-",int(std))
    return mean_5utr, mean_orf, mean_3utr

## Demo on a small test set

In [14]:
# Load the small test file with every row labelled class 1, then display the frame.
FULLPATH=DATAPATH+TEST_FILENAME
df=load_gencode(FULLPATH,1)
df

Unnamed: 0,tid,class,sequence,seqlen
0,ENST00000641515.2,1,CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGT...,2618
1,ENST00000426406.4,1,ATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG...,939
2,ENST00000332831.4,1,AGCCCAGTTGGCTGGACCAATGGATGGAGAGAATCACTCAGTGGTA...,995
3,ENST00000616016.5,1,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...,3465
4,ENST00000618323.5,1,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...,3468
5,ENST00000437963.5,1,CAGCGCTTGGGGCTCGCGGGCCGCTCCCTCCGCTCGGAAGGGAAAA...,387
6,ENST00000342066.8,1,GCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGA...,2557
7,ENST00000616125.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1722
8,ENST00000618779.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1860
9,ENST00000622503.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,2049


In [15]:
# Reload the test file and print its length statistics (verbose=True).
# NOTE(review): the label 2 only fills the 'class' column, which is not used here.
FULLPATH=DATAPATH+TEST_FILENAME
testdf=load_gencode(FULLPATH,2)
# x receives the three means returned by get_the_facts().
x = get_the_facts( testdf['sequence'].tolist() ,True)

Facts array: (15, 3)
mean 5' UTR length: 132 +/- 199
mean    ORF length: 1533 +/- 710
mean 3' UTR length: 252 +/- 394


## Load the GenCode data.
Warning: this is slow.
The protein-coding file has over 100K RNA sequences (mRNA).
The non-coding file has almost 50K RNA sequences (lncRNA).

In [16]:
# Load both full GenCode sets, printing a timestamp before and after each load.
PC_FULLPATH=DATAPATH+PC_FILENAME
NC_FULLPATH=DATAPATH+NC_FILENAME
show_time()
# Protein-coding transcripts are labelled class 1.
pcdf=load_gencode(PC_FULLPATH,1)
print("PC seqs loaded:",len(pcdf))
show_time()
# Non-coding transcripts are labelled class 0.
ncdf=load_gencode(NC_FULLPATH,0)
print("NC seqs loaded:",len(ncdf))
show_time()

2021-05-26 15:53:36 EDT
PC seqs loaded: 106143
2021-05-26 15:53:41 EDT
NC seqs loaded: 48752
2021-05-26 15:53:43 EDT


In [18]:
# Warning: each get_the_facts() takes up to 5 minutes.
show_time()
print("Protein Coding set:")
pc_means = get_the_facts( pcdf['sequence'].tolist() ,True)
show_time()
print("Non Coding set:")
nc_means = get_the_facts( ncdf['sequence'].tolist() ,True)
show_time()

2021-05-26 16:03:35 EDT
Protein Coding set:
Facts array: (106143, 3)
mean 5' UTR length: 261 +/- 339
mean    ORF length: 1136 +/- 1556
mean 3' UTR length: 897 +/- 1385
2021-05-26 16:07:36 EDT
Non Coding set:
Facts array: (48752, 3)
mean 5' UTR length: 511 +/- 1344
mean    ORF length: 211 +/- 135
mean 3' UTR length: 606 +/- 1100
2021-05-26 16:08:03 EDT


Original:
get three
2021-05-26 15:07:53 EDT
as array
2021-05-26 15:09:13 EDT

Check ATG before method invocation:
get three
2021-05-26 15:19:32 EDT
as array
2021-05-26 15:19:48 EDT



In [10]:
# Keep only the non-coding rows whose sequence is shorter than 1000 characters.
# The length is recomputed from the strings; load_gencode() stores the same
# lengths in the 'seqlen' column.
mask = (ncdf['sequence'].str.len() < 1000)
subset = ncdf.loc[mask]
subset

Unnamed: 0,tid,class,sequence,seqlen
0,ENST00000473358.1,0,GTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCC...,712
1,ENST00000469289.1,0,TCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCTGGAATCCGTTGGC...,535
3,ENST00000461467.1,0,GGGGTTTCGGGGCTGTGGACCCTGTGCCAGGAAAGGAAGGGCGCAG...,590
5,ENST00000477740.5,0,GACAAGTTCGAGCATCTTAAAATGATTCAACAGGAGGAGATAAGGA...,491
6,ENST00000471248.1,0,GAAGCTCGAGGAAGAGAAAAAAAAACTGGAAGGAGAAATCATAGAT...,629
...,...,...,...,...
48746,ENST00000427373.5,0,AGCTCACTGTAACCTTGAACTCCTGGGCTCAAGTGATCTTCCTGCT...,842
48747,ENST00000611754.1,0,TGCACACACCTTCTTTTCCAAGGTTTGTGTGCAGAACATCCTGCCC...,723
48748,ENST00000306641.1,0,CACACAACATGGCCTTCAGCAATGCTGATTCAGGCTTTGTGGATGC...,795
48749,ENST00000417334.1,0,GGTTGCCACTTCAAGGGACTACATCATGATGTCCTGTTTCTAATGA...,344


In [11]:
# Print length statistics for the short non-coding subset; the returned means are displayed as the cell result.
get_the_facts( subset['sequence'].tolist() ,True)

constructor
2021-05-26 15:35:46 EDT
get three
2021-05-26 15:35:46 EDT
as array
2021-05-26 15:35:53 EDT
mean
2021-05-26 15:35:53 EDT
std
2021-05-26 15:35:53 EDT
print
2021-05-26 15:35:53 EDT
Facts array: (25607, 3)
mean 5' UTR length: 225 +/- 165
mean    ORF length: 160 +/- 85
mean 3' UTR length: 230 +/- 167


(225.5727730698637, 160.50954817042216, 230.8049361502714)

In [12]:
# Display the sequence column of the subset; the index keeps the row labels from ncdf.
subset['sequence']

0        GTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCC...
1        TCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCTGGAATCCGTTGGC...
3        GGGGTTTCGGGGCTGTGGACCCTGTGCCAGGAAAGGAAGGGCGCAG...
5        GACAAGTTCGAGCATCTTAAAATGATTCAACAGGAGGAGATAAGGA...
6        GAAGCTCGAGGAAGAGAAAAAAAAACTGGAAGGAGAAATCATAGAT...
                               ...                        
48746    AGCTCACTGTAACCTTGAACTCCTGGGCTCAAGTGATCTTCCTGCT...
48747    TGCACACACCTTCTTTTCCAAGGTTTGTGTGCAGAACATCCTGCCC...
48748    CACACAACATGGCCTTCAGCAATGCTGATTCAGGCTTTGTGGATGC...
48749    GGTTGCCACTTCAAGGGACTACATCATGATGTCCTGTTTCTAATGA...
48750    GTCTAGGTGCAGAGGCCAGAGGAAGTCATTGCTGTCCTGTCCCGCC...
Name: sequence, Length: 25607, dtype: object