# Dictionaries

- Inputs: *gff3* file
- Programs: N/A
- Purpose: creates a dictionary that maps **locus_tag** to **id** from the **attributes** column of *gff3* files.

In [None]:
import os
import pandas as pd
import re
import sys


## GENOME_VERSION must already be defined by the ipynb that runs this ipynb.

# Touch the name so that a missing definition surfaces immediately with a
# readable message instead of a bare NameError further down.
try:
    GENOME_VERSION
except NameError:
    raise Exception('Please define GENOME_VERSION in the local ipynb to use DK_0911_dictionaries.')

# Assembly name and the output directory that holds its annotation files.
GENOME = 'DK_0911_%s' % GENOME_VERSION
GENOME_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s/' % GENOME_VERSION

# Annotation tables for the primary contigs (P_GFF) and the haplotigs (H_GFF).
# Both files are read as tab-separated text without a header row, so their
# columns are addressed by integer position.
P_GFF, H_GFF = (
    pd.read_csv(os.path.join(GENOME_PATH, GENOME + suffix), sep='\t', header=None)
    for suffix in ('_p_ctg.anno.gff3', '_h_ctg.anno.gff3')
)

def getLocusToIdDict(p_gff=P_GFF, h_gff=H_GFF):
    '''returns a dictionary with key: locus_tag and val: ID built from the
    mRNA rows of the primary contig and haplotig gff3 tables.

    p_gff, h_gff: DataFrames of gff3 files read without a header row, so
        that column 2 holds the feature type and column 8 holds the
        attributes string, e.g.:
        ID=evm.TU.pcontig_000.1;locus_tag=DK0911_00000;Name=EVM prediction pcontig_000.1

    Only rows whose feature type is 'mRNA' are used. Haplotig rows are
    processed before primary contig rows. If a locus_tag occurs more than
    once, a warning is printed and the last ID seen is kept.

    Raises ValueError if an mRNA row has no ID or no locus_tag attribute.'''

    # Anchor each key at the start of the string or right after a ';' so that
    # the ID attribute is found wherever it sits in the list (also when it is
    # the last entry, with no trailing ';') and so that a longer key such as
    # 'old_locus_tag=' is never mistaken for 'locus_tag='.
    idSearch = re.compile(r'(?:^|;)\s*ID=([^;]*)')
    locusSearch = re.compile(r'(?:^|;)\s*locus_tag=([^;]*)')

    d = {}

    for gff in (h_gff, p_gff):
        for attr in gff.loc[gff[2] == 'mRNA', 8]:
            locusMatch = locusSearch.search(attr)
            idMatch = idSearch.search(attr)
            if locusMatch is None or idMatch is None:
                # Fail with the offending attribute string rather than an
                # opaque "'NoneType' object has no attribute 'group'".
                raise ValueError(
                    'mRNA attributes lack an ID or locus_tag entry: %r' % (attr,))
            key = locusMatch.group(1)
            if key in d:
                print('Unexpected: locus tag: %s is already in dictionary!' % key)
            d[key] = idMatch.group(1)
    return d

def mapWithDict(x):
    '''maps a locus tag to its id tag using the dictionary returned by
    getLocusToIdDict().

    x: a locus tag, or the string 'NaN', which is returned unchanged.

    Returns the id stored for x.

    If x is not in the dictionary, the message is written to stderr and
    SystemExit is raised with a non-zero exit status.

    The dictionary is built on the first call that needs it and is then
    kept on the function object, so mapping this function over a whole
    column parses the gff3 tables only once.'''
    if x == 'NaN':
        return x

    try:
        locusToIdDict = mapWithDict._locus_to_id_cache
    except AttributeError:
        locusToIdDict = mapWithDict._locus_to_id_cache = getLocusToIdDict()

    if x in locusToIdDict:
        return locusToIdDict[x]

    # Passing the message to sys.exit reports the failure on stderr and gives
    # a non-zero exit status; a bare sys.exit() would signal success.
    sys.exit("x: %s\n is not in the dictionary mapping loci to id." % (x,))

def changeFastaToIdTag(fastaFiles, fOut, rewriteInfile=False):
    '''combines fasta files and rewrites headings from locus tags
    (DK0911_16805) to id tags (evm.model.pcontig_057.39 (protein)
    or evm.TU.xxx (gene)).

    fastaFiles: iterable of paths of the fasta files to combine, in order.
    fOut: path of the combined output file. It may be one of the input
        files, because the output is first written to fOut + '_tmp' and
        only renamed to fOut once every input has been processed. If fOut
        ends with 'gene.fa', 'model' is replaced by 'TU' in each heading.
    rewriteInfile: not used by this function; kept so that existing calls
        keep working.

    Returns True on success.

    Raises KeyError if a heading is not a known locus tag. The temporary
    file is removed if anything fails before the final rename.'''
    locusToIdDict = getLocusToIdDict()
    isGeneFile = fOut.endswith('gene.fa')
    tmpOut = fOut + '_tmp' # in case fOut is one of the in files
    try:
        with open(tmpOut, 'w') as outFile:
            for fastaFile in fastaFiles:
                with open(fastaFile, 'r') as inFile:
                    for line in inFile:
                        if line.startswith('>'):
                            # strip() copes with '\r\n' line endings and with
                            # a heading on the last line without a newline.
                            locusTag = line[1:].strip()
                            line = '>' + locusToIdDict[locusTag] + '\n'
                            if isGeneFile:
                                line = line.replace('model', 'TU')
                        elif not line.endswith('\n'):
                            # Last line of a file without a final newline:
                            # terminate it so the next file's heading does
                            # not end up glued onto this sequence line.
                            line += '\n'
                        outFile.write(line)
    except Exception:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(tmpOut):
            os.remove(tmpOut)
        raise
    os.rename(tmpOut, fOut)

    return True