In [2]:
import os
import pandas as pd
import re

# Directory holding the v03 genome annotation files.
GENOME_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v03/'

# The GFF3 annotations are read as tab-separated tables without a header row,
# so their columns are addressed by integer position (0-8) downstream.
# NOTE(review): no comment-line handling is configured, so presumably these
# files carry no '##' directive lines - confirm.

# Primary contig annotation.
P_GFF = pd.read_csv(
    os.path.join(GENOME_PATH, 'DK_0911_v03_p_ctg.anno.gff3'),
    sep='\t',
    header=None,
)

# Haplotig annotation.
H_GFF = pd.read_csv(
    os.path.join(GENOME_PATH, 'DK_0911_v03_h_ctg.anno.gff3'),
    sep='\t',
    header=None,
)

def getLocusToIdDict(p_gff=None, h_gff=None):
    '''Returns a dictionary with key: locus_tag and val: ID, built from the
    mRNA rows of a primary contig and a haplotig gff3 DataFrame.

    Only rows whose feature type (column index 2) equals 'mRNA' are used.
    Their attribute strings (column index 8, i.e. gff3 column 9) look like:
    ID=evm.model.pcontig_000.1;Parent=evm.TU.pcontig_000.1;locus_tag=DK0911_00000

    Parameters:
        p_gff: DataFrame of the primary contig gff3 (integer column labels).
               Defaults to the module-level P_GFF, looked up at call time.
        h_gff: DataFrame of the haplotig gff3 (integer column labels).
               Defaults to the module-level H_GFF, looked up at call time.

    Returns:
        dict mapping locus_tag -> ID. Haplotig rows are processed before
        primary contig rows; if a locus_tag occurs more than once a message
        is printed and the later entry overwrites the earlier one.

    Raises:
        AttributeError: if an mRNA row has no ID or no locus_tag attribute.
    '''
    # Resolve the defaults when the function is called rather than when it is
    # defined, so that re-loading P_GFF / H_GFF is picked up by later calls.
    if p_gff is None:
        p_gff = P_GFF
    if h_gff is None:
        h_gff = H_GFF

    p_gff_genes = p_gff.loc[p_gff[2] == 'mRNA']
    h_gff_genes = h_gff.loc[h_gff[2] == 'mRNA']

    dSeries = pd.concat([h_gff_genes[8], p_gff_genes[8]], ignore_index=True)

    # Each tag must start the string or directly follow a ';' so that e.g.
    # 'old_locus_tag=' is not mistaken for 'locus_tag='. The value runs up to
    # the next ';' or the end of the string, so the tag may appear at any
    # position in the attribute list, including last.
    idSearch = re.compile(r'(?:^|;)ID=([^;]*)')
    locusSearch = re.compile(r'(?:^|;)locus_tag=([^;]*)')

    d = {}

    for attr in dSeries:
        key = locusSearch.search(attr).group(1)
        val = idSearch.search(attr).group(1)
        if key in d:
            print('Unexpected: locus tag: %s is already in dictionary!' % key)
        d[key] = val
    return d

In [4]:
# Build the locus_tag -> ID mapping from the default primary contig and
# haplotig GFF frames; as the last expression, the returned dict is displayed.
getLocusToIdDict()

{'DK0911_23846': 'evm.model.hcontig_013_019.2',
 'DK0911_04238': 'evm.model.pcontig_005.218',
 'DK0911_22518': 'evm.model.hcontig_009_030.52',
 'DK0911_05639': 'evm.model.pcontig_009.100',
 'DK0911_27006': 'evm.model.hcontig_028_013.12',
 'DK0911_05251': 'evm.model.pcontig_008.207',
 'DK0911_26238': 'evm.model.hcontig_023_001.19',
 'DK0911_18056': 'evm.model.pcontig_088.1',
 'DK0911_18109': 'evm.model.pcontig_090.12',
 'DK0911_26213': 'evm.model.hcontig_022_021.1',
 'DK0911_15088': 'evm.model.pcontig_041.33',
 'DK0911_22281': 'evm.model.hcontig_009_011.36',
 'DK0911_06040': 'evm.model.pcontig_009.501',
 'DK0911_15606': 'evm.model.pcontig_046.79',
 'DK0911_01834': 'evm.model.pcontig_001.731',
 'DK0911_21095': 'evm.model.hcontig_005_017.1',
 'DK0911_01569': 'evm.model.pcontig_001.466',
 'DK0911_01502': 'evm.model.pcontig_001.399',
 'DK0911_12717': 'evm.model.pcontig_029.230',
 'DK0911_13218': 'evm.model.pcontig_032.22',
 'DK0911_09767': 'evm.model.pcontig_018.296',
 'DK0911_18096': 'evm.