# GenCode Explore

Explore the human RNA sequences from GenCode.

Assume user downloaded files from GenCode 38 [FTP](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/)
to a subdirectory called data.

Use our ORF_describer code.

In [1]:
import time 
def show_time():
    t = time.time()
    s = time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(t))
    print(s)
show_time()

2021-05-28 16:22:05 EDT


In [2]:
import numpy as np
import pandas as pd
import gzip
import sys
try:
    from google.colab import drive
    IN_COLAB = True
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    s = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/RNA_describe.py')
    with open('RNA_describe.py', 'w') as f:
        f.write(s.text)  # writes to cloud local, delete the file later?
    from RNA_describe import *
except:
    print("CoLab not working. On my PC, use relative paths.")
    IN_COLAB = False
    DATAPATH='../data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import *

MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

if not assert_imported_RNA_describe():
    print("ERROR: Cannot use RNA_describe.")

CoLab not working. On my PC, use relative paths.


In [3]:
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'
TEST_FILENAME='test.fa.gz'

In [4]:
def load_gencode(filename,label):
    DEFLINE='>'
    DELIM='|'
    EMPTY=''
    labels=[]  # usually 1 for protein-coding or 0 for non-coding
    seqs=[]    # usually string of ACGT
    lens=[]    # sequence length
    ids=[]     # GenCode transcript ID, always starts ENST
    one_seq = EMPTY
    one_id = None
    # Use gzip 'r' mode to open file in read-only mode.
    # Use gzip 't' mode to read each line of text as type string.
    with gzip.open (filename,'rt') as infile:
        for line in infile:
            if line[0]==DEFLINE:
                # Save the previous sequence if one exists.
                if not one_seq == EMPTY:
                    labels.append(label)
                    seqs.append(one_seq)
                    lens.append(len(one_seq))
                    ids.append(one_id)
                # Get ready to read the next sequence. 
                # Parse a GenCode defline that is formatted like this:
                # >transcript_ID|gene_ID|other_fields other_info|other_info
                one_id = line[1:].split(DELIM)[0]
                one_seq = EMPTY
            else:
                # Continue loading sequence lines till next defline.
                additional = line.rstrip()
                one_seq = one_seq + additional
        # Don't forget to save the last sequence after end-of-file.
        if not one_seq == EMPTY:
            labels.append(label)
            seqs.append(one_seq)
            lens.append(len(one_seq))
            ids.append(one_id)

    df1=pd.DataFrame(ids,columns=['tid'])
    df2=pd.DataFrame(labels,columns=['class'])
    df3=pd.DataFrame(seqs,columns=['sequence'])
    df4=pd.DataFrame(lens,columns=['seqlen'])
    df=pd.concat((df1,df2,df3,df4),axis=1)
    return df

In [5]:
def get_the_facts(seqs,verbose=False):
    rd = RNA_describer()
    facts = rd.get_three_lengths(seqs)
    facts_ary = np.asarray(facts) 
    mean_5utr, mean_orf, mean_3utr = np.mean(facts_ary,axis=0)
    std_5utr, std_orf, std_3utr = np.std(facts_ary,axis=0)
    if verbose:
        print("Facts array:",facts_ary.shape)
        print("mean 5' UTR length:",int(mean_5utr),"+/-",int(std_5utr))
        print("mean    ORF length:",int(mean_orf), "+/-",int(std_orf))
        print("mean 3' UTR length:",int(mean_3utr),"+/-",int(std_3utr))
    return mean_5utr, mean_orf, mean_3utr

## Demo on a small test set

In [16]:
FULLPATH=DATAPATH+TEST_FILENAME
df=load_gencode(FULLPATH,1)
df.sort_values('seqlen', ascending=True, inplace=True)
df

Unnamed: 0,tid,class,sequence,seqlen
5,ENST00000437963.5,1,CAGCGCTTGGGGCTCGCGGGCCGCTCCCTCCGCTCGGAAGGGAAAA...,387
1,ENST00000426406.4,1,ATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG...,939
2,ENST00000332831.4,1,AGCCCAGTTGGCTGGACCAATGGATGGAGAGAATCACTCAGTGGTA...,995
14,ENST00000327044.7,1,GCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGCGC...,1140
10,ENST00000618181.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1671
7,ENST00000616125.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1722
13,ENST00000455979.1,1,AGGCGCTGCTGCTGCCGCGGGAGCTGGGGCCCAGCATGGCCCCGGA...,1731
8,ENST00000618779.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1860
11,ENST00000617307.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1986
9,ENST00000622503.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,2049


In [7]:
FULLPATH=DATAPATH+TEST_FILENAME
testdf=load_gencode(FULLPATH,2)
x = get_the_facts( testdf['sequence'].tolist() ,True)

Facts array: (15, 3)
mean 5' UTR length: 132 +/- 199
mean    ORF length: 1533 +/- 710
mean 3' UTR length: 252 +/- 394


## Load the GenCode data.
Warning: GenCode has
over 100K protein-coding RNA (mRNA) 
and almost 50K non-coding RNA (lncRNA).

In [8]:
PC_FULLPATH=DATAPATH+PC_FILENAME
NC_FULLPATH=DATAPATH+NC_FILENAME
show_time()
pcdf=load_gencode(PC_FULLPATH,1)
print("PC seqs loaded:",len(pcdf))
show_time()
ncdf=load_gencode(NC_FULLPATH,0)
print("NC seqs loaded:",len(ncdf))
show_time()

2021-05-28 16:22:16 EDT
PC seqs loaded: 106143
2021-05-28 16:22:21 EDT
NC seqs loaded: 48752
2021-05-28 16:22:23 EDT


In [18]:
print("Sorting PC...")
pcdf.sort_values('seqlen', ascending=True, inplace=True)
print("Sorting NC...")
ncdf.sort_values('seqlen', ascending=True, inplace=True)

Sorting PC...
Sorting NC...


In [13]:
# Warning: each get_the_facts() can take up to 5 minutes.
# It is basically a 3-deep nested loop: for each seq, for each start, for each stop.
# Usually run this on subsets, not the whole data set.
def big_summary():
    show_time()
    print("Protein Coding set:")
    pc_means = get_the_facts( pcdf['sequence'].tolist() ,True)
    show_time()
    print("Non Coding set:")
    nc_means = get_the_facts( ncdf['sequence'].tolist() ,True)
    show_time()
#big_summary()

GenCode38  
```
2021-05-28 16:22:23 EDT  
Protein Coding set:  
Facts array: (106143, 3)  
mean 5' UTR length: 261 +/- 339  
mean    ORF length: 1136 +/- 1556  
mean 3' UTR length: 897 +/- 1385  
2021-05-28 16:26:34 EDT  
Non Coding set:  
Facts array: (48752, 3)  
mean 5' UTR length: 511 +/- 1344  
mean    ORF length: 211 +/- 135  
mean 3' UTR length: 606 +/- 1100  
2021-05-28 16:27:00 EDT  
```

## Subset by RNA length and analyze ORF lengths


In [19]:
# This is a fast way to slice if you have length thresholds.
mask = (ncdf['sequence'].str.len() < 1000)
subset = ncdf.loc[mask]
discard = get_the_facts( subset['sequence'].tolist() ,True)

Facts array: (25607, 3)
mean 5' UTR length: 225 +/- 165
mean    ORF length: 160 +/- 85
mean 3' UTR length: 230 +/- 167


In [27]:
def show_divisions(df,divisions,label):
    total=len(df)
    step=total//divisions
    for i in range(0,total+1,step):
        subset = df[i:i+step]
        first_len=subset.iloc[0]['seqlen']
        last_len=subset.iloc[-1]['seqlen']
        print("-- ",label,"RNA lengths",first_len,"to",last_len)
        discard = get_the_facts( subset['sequence'].tolist() ,True)
show_divisions(ncdf,10,"NC")
print()
show_divisions(pcdf,10,"PC")


--  NC RNA lengths 32 to 441
Facts array: (4875, 3)
mean 5' UTR length: 117 +/- 82
mean    ORF length: 111 +/- 68
mean 3' UTR length: 118 +/- 81
--  NC RNA lengths 441 to 548
Facts array: (4875, 3)
mean 5' UTR length: 175 +/- 111
mean    ORF length: 146 +/- 73
mean 3' UTR length: 174 +/- 110
--  NC RNA lengths 548 to 641
Facts array: (4875, 3)
mean 5' UTR length: 210 +/- 129
mean    ORF length: 159 +/- 76
mean 3' UTR length: 216 +/- 129
--  NC RNA lengths 641 to 777
Facts array: (4875, 3)
mean 5' UTR length: 258 +/- 159
mean    ORF length: 180 +/- 86
mean 3' UTR length: 269 +/- 161
--  NC RNA lengths 777 to 947
Facts array: (4875, 3)
mean 5' UTR length: 327 +/- 196
mean    ORF length: 193 +/- 91
mean 3' UTR length: 334 +/- 197
--  NC RNA lengths 947 to 1180
Facts array: (4875, 3)
mean 5' UTR length: 410 +/- 250
mean    ORF length: 212 +/- 99
mean 3' UTR length: 436 +/- 250
--  NC RNA lengths 1180 to 1456
Facts array: (4875, 3)
mean 5' UTR length: 514 +/- 318
mean    ORF length: 227 +/-