# GenCode Explore

Explore the human RNA sequences from GenCode.

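This notebook assumes the user has downloaded the files from the GenCode release 38 [FTP](http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/) site
into the `data` directory that the code reads from: `../data/` relative to this notebook when run locally, or `My Drive/data/` on Google CoLab.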

In [1]:
import time 
# Capture the current wall-clock time (seconds since the epoch) in `t`.
t = time.time()
# Format `t` as local time with the time-zone abbreviation; as the last
# expression in the cell, the resulting string is shown as the cell output.
time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(t))

'2021-05-26 10:18:54 EDT'

In [2]:
import numpy as np
import pandas as pd
import gzip
import sys
# Detect the runtime environment.
# Only a missing google.colab package means "not on CoLab". Any other failure
# (Drive mount, GitHub download, import of the downloaded module) is a real
# error and is allowed to surface instead of being silently treated as a
# local run, which would then fail later with a misleading message.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    # Bound the wait so a stalled connection cannot hang the notebook.
    s = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/RNA_describe.py', timeout=60)
    # Fail loudly on 4xx/5xx rather than saving an error page as Python source.
    s.raise_for_status()
    with open('RNA_describe.py', 'w') as f:
        f.write(s.text)  # writes to cloud local, delete the file later?
    from RNA_describe import *
else:
    print("CoLab not working. On my PC, use relative paths.")
    DATAPATH='../data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import *

MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

# Sanity check that the star-import above actually provided RNA_describe.
if not assert_imported_RNA_describe():
    print("ERROR: Cannot use RNA_describe.")

CoLab not working. On my PC, use relative paths.


In [3]:
# Input file names; each is appended to DATAPATH before loading.
# GenCode v38 protein-coding transcripts (gzipped FASTA).
PC_FILENAME='gencode.v38.pc_transcripts.fa.gz'
# GenCode v38 lncRNA (non-coding) transcripts (gzipped FASTA).
NC_FILENAME='gencode.v38.lncRNA_transcripts.fa.gz'
# File used for a quick trial run before the full files.
# NOTE(review): presumably a small hand-made subset -- confirm its origin.
TEST_FILENAME='test.fa.gz'

In [8]:
def load_gencode(filename,label):
    """Load a gzipped FASTA file of GenCode transcripts into a DataFrame.

    Deflines are expected to look like
    ``>transcript_ID|gene_ID|other_fields other_info|other_info``;
    only the text before the first ``|`` is kept as the transcript ID.

    Parameters
    ----------
    filename : str or path-like
        Path to a gzip-compressed FASTA file.
    label : any
        Value stored in the ``class`` column of every row
        (usually 1 for protein-coding or 0 for non-coding).

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``tid`` (transcript ID),
        ``class`` (the given label), ``sequence`` (all sequence lines of the
        record joined together) and ``seqlen`` (length of ``sequence``).
        Records that have a defline but no sequence are skipped.
    """
    DEFLINE='>'
    DELIM='|'
    labels=[]  # usually 1 for protein-coding or 0 for non-coding
    seqs=[]    # usually string of ACGT
    lens=[]    # sequence length
    ids=[]     # GenCode transcript ID, always starts ENST

    def save_record(one_id, pieces):
        # Join once per record; repeated `str + str` is quadratic on
        # long multi-line sequences.
        one_seq = ''.join(pieces)
        if one_seq:
            labels.append(label)
            seqs.append(one_seq)
            lens.append(len(one_seq))
            ids.append(one_id)

    one_id = None
    pieces = []   # sequence lines of the record currently being read
    # Use gzip 'r' mode to open file in read-only mode.
    # Use gzip 't' mode to read each line of text as type string.
    with gzip.open(filename,'rt') as infile:
        for line in infile:
            if line.startswith(DEFLINE):
                # Save the previous sequence if one exists.
                save_record(one_id, pieces)
                # Get ready to read the next sequence.
                # strip() matters when the defline has no delimiter:
                # otherwise the ID would keep the trailing newline.
                one_id = line[1:].split(DELIM)[0].strip()
                pieces = []
            else:
                # Continue loading sequence lines till next defline.
                pieces.append(line.rstrip())
        # Don't forget to save the last sequence after end-of-file.
        save_record(one_id, pieces)

    # Build the frame in one step instead of concatenating four
    # single-column frames.
    return pd.DataFrame(
        {'tid': ids, 'class': labels, 'sequence': seqs, 'seqlen': lens},
        columns=['tid', 'class', 'sequence', 'seqlen'])

In [9]:
def get_the_facts(seqs):
    """Print summary statistics of the three lengths reported per sequence.

    The sequences are passed to ``RNA_describer().get_three_lengths`` and the
    result is converted to a numpy array with one row per sequence.  Judging
    by the labels printed below, the three columns are presumably the 5' UTR,
    ORF and 3' UTR lengths -- confirm against RNA_describe.

    Prints the array type and shape, then the column-wise mean and standard
    deviation, each truncated to an integer.  Returns None.
    """
    table = np.asarray(RNA_describer().get_three_lengths(seqs))
    print("Facts array:",type(table))
    print("Facts array:",table.shape)
    # Column-wise statistics; unpacking requires exactly three columns.
    avg_5utr, avg_orf, avg_3utr = np.mean(table,axis=0)
    dev_5utr, dev_orf, dev_3utr = np.std(table,axis=0)
    report = (
        ("mean 5' UTR length:", avg_5utr, dev_5utr),
        ("mean    ORF length:", avg_orf, dev_orf),
        ("mean 3' UTR length:", avg_3utr, dev_3utr),
    )
    for caption, average, spread in report:
        print(caption,int(average),"+/-",int(spread))

## Protein coding RNA (mRNA)

In [10]:
# Trial run on the test file before loading the full GenCode files.
FULLPATH=DATAPATH+TEST_FILENAME
# The second argument (1) is stored in the 'class' column of every row.
df=load_gencode(FULLPATH,1)
# Last expression in the cell, so the notebook renders the DataFrame.
df

Unnamed: 0,tid,class,sequence,seqlen
0,ENST00000641515.2,1,CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGT...,2618
1,ENST00000426406.4,1,ATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG...,939
2,ENST00000332831.4,1,AGCCCAGTTGGCTGGACCAATGGATGGAGAGAATCACTCAGTGGTA...,995
3,ENST00000616016.5,1,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...,3465
4,ENST00000618323.5,1,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...,3468
5,ENST00000437963.5,1,CAGCGCTTGGGGCTCGCGGGCCGCTCCCTCCGCTCGGAAGGGAAAA...,387
6,ENST00000342066.8,1,GCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGA...,2557
7,ENST00000616125.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1722
8,ENST00000618779.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,1860
9,ENST00000622503.5,1,ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...,2049


In [11]:
# Print length statistics for the sequences currently held in `df`.
get_the_facts( df['sequence'].tolist() )

Facts array: <class 'numpy.ndarray'>
Facts array: (15, 3)
mean 5' UTR length: 132 +/- 199
mean    ORF length: 1533 +/- 710
mean 3' UTR length: 252 +/- 394


In [12]:
# Protein-coding transcripts: load with class label 1 and print length
# statistics.  Note this rebinds the notebook-level `df` and `FULLPATH`.
FULLPATH=DATAPATH+PC_FILENAME
df=load_gencode(FULLPATH,1)
get_the_facts( df['sequence'].tolist() )

Facts array: <class 'numpy.ndarray'>
Facts array: (106143, 3)
mean 5' UTR length: 261 +/- 339
mean    ORF length: 1136 +/- 1556
mean 3' UTR length: 897 +/- 1385


In [13]:
# Non-coding (lncRNA) transcripts: load and print length statistics.
# Label these rows 0, following load_gencode's convention
# (1 = protein-coding, 0 = non-coding); the label does not affect the
# statistics printed here, only the 'class' column of `df`.
# Note this rebinds the notebook-level `df` and `FULLPATH`.
FULLPATH=DATAPATH+NC_FILENAME
df=load_gencode(FULLPATH,0)
get_the_facts( df['sequence'].tolist() )

Facts array: <class 'numpy.ndarray'>
Facts array: (48752, 3)
mean 5' UTR length: 511 +/- 1344
mean    ORF length: 211 +/- 135
mean 3' UTR length: 606 +/- 1100
