# GenCode

GenCode release 42
[LINK](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/)

In [1]:
# Directory holding the GenCode downloads; the CSV files are written here too.
DATA_DIR = "/Users/jasonmiller/WVU/Localization/GenCode/"

# Input files from GenCode release 42.
ANNOTATION = "gencode.v42.basic.annotation.gff3"
CODING_SEQUENCE = "gencode.v42.pc_transcripts.fa"
NONCODING_SEQUENCE = "gencode.v42.lncRNA_transcripts.fa"

# Output files, one CSV per transcript FASTA.
CODING_CSV = "gencode.v42.pc_transcripts.csv"
NONCODING_CSV = "gencode.v42.lncRNA_transcripts.csv"

In [2]:
class fasta_reader():
    '''
    Parser for human transcripts FASTA file from GenCode.

    Reads a transcript FASTA file and writes a CSV file with one row per
    sequence and the columns transcript_id, gene_id, biotype, length,
    sequence. The transcript ID keeps its version suffix; the gene ID has
    its version suffix removed.
    '''
    def __init__(self,infile,outfile,biotype):
        '''
        Biotype should reflect the filename: either 'pc' or 'lncRNA'.

        infile: path of the FASTA file to read.
        outfile: path of the CSV file to write (overwritten if present).
        biotype: label copied verbatim into the biotype column of every row.
        '''
        self.infile = infile
        self.outfile = outfile
        self.biotype = biotype
        self.FASTA_DEFCHAR = '>'  # signals a defline = definition line
        self.count=0  # rows written by the most recent fasta_to_csv() call
        self.headers='transcript_id,gene_id,biotype,length,sequence\n'

    def print_one_sequence(self,handle,tran,gene,seq):
        '''
        Write one CSV row to handle and increment self.count.

        Does nothing when seq is None, which is the case when the first
        defline is encountered and no sequence has been collected yet.
        An empty string is still written (as a row of length 0).
        '''
        if seq is None:
            return
        fields = (tran,gene,self.biotype,str(len(seq)),seq)
        handle.write(','.join(fields)+'\n')
        self.count += 1

    @staticmethod
    def _join_parts(parts):
        '''Return the collected sequence lines as one string, or None if no defline was seen.'''
        return None if parts is None else ''.join(parts)

    def fasta_to_csv(self):
        '''
        Convert self.infile (FASTA) to self.outfile (CSV) and print the row count.

        Raises ValueError if sequence text appears before the first defline.
        Raises IndexError if a defline has no '|'-separated gene field.
        '''
        # Count only the rows of this run, even if the instance is reused.
        self.count = 0
        transcript_id = None
        gene_id = None
        seq_parts = None   # None until the first defline has been seen
        # Open the input first: if it is missing we fail before the
        # output file is created or truncated.
        with open(self.infile,'r') as fasta, open(self.outfile,'w') as handle:
            handle.write(self.headers)
            for line in fasta:
                if line.startswith(self.FASTA_DEFCHAR):
                    # The defline starts with '>'
                    # The defline has fields separated by vertical bar
                    # Wrap up the previous sequence before moving on to the next.
                    self.print_one_sequence(
                        handle,transcript_id,gene_id,self._join_parts(seq_parts))
                    # Strip the newline so it cannot leak into the last field.
                    tokens = line.rstrip().split('|')
                    transcript_id = tokens[0][1:] # chop off '>'
                    # chop off version number, as in ENSG00000198888.2
                    gene_id = tokens[1].partition('.')[0]
                    seq_parts = []   # get ready for one to many sequence lines
                    continue
                # In FASTA format, one sequence may continue to next line
                residues = line.strip()
                if not residues:
                    continue   # blank lines carry no sequence
                if seq_parts is None:
                    raise ValueError(
                        'Sequence data found before the first defline in %s'
                        % self.infile)
                seq_parts.append(residues)
            # The last record has no following defline to trigger its output.
            self.print_one_sequence(
                handle,transcript_id,gene_id,self._join_parts(seq_parts))
        print("Output sequences: %d"%self.count)

## mRNA
One gene can have several RNA transcripts, called isoforms.

Typical defline of the GenCode pc file:    

    >ENST00000641515.2|ENSG00000186092.7|OTTHUMG00000001094.4|OTTHUMT00000003223.4|OR4F5-201|OR4F5|2618|UTR5:1-60|CDS:61-1041|UTR3:1042-2618|

In [3]:
# Convert the protein-coding transcript FASTA to CSV.
infile = DATA_DIR + CODING_SEQUENCE    # FASTA to read
outfile = DATA_DIR + CODING_CSV        # CSV to write
converter = fasta_reader(infile=infile, outfile=outfile, biotype='pc')
converter.fasta_to_csv()

Output sequences: 111053


## lncRNA

Non-coding genes can also have different transcripts, called isoforms.

Typical defline of the GenCode lncRNA file:   

    >ENST00000456328.2|ENSG00000290825.1|-|OTTHUMT00000362751.1|DDX11L2-202|DDX11L2|1657|


In [4]:
# Convert the long non-coding RNA transcript FASTA to CSV.
infile = DATA_DIR + NONCODING_SEQUENCE    # FASTA to read
outfile = DATA_DIR + NONCODING_CSV        # CSV to write
converter = fasta_reader(infile=infile, outfile=outfile, biotype='lncRNA')
converter.fasta_to_csv()

Output sequences: 57936
