In [59]:
import re

def getIdToLocusTagDict(df):
    """Build a dict mapping each mRNA feature's ID to its locus_tag.

    Only rows whose 'type' column equals 'mRNA' are considered. For each of
    those rows the 'attributes' string (GFF3 column 9, ';'-separated
    key=value pairs) is searched for the ID and locus_tag entries.

    The two keys may appear at any position in the attribute string, with or
    without a trailing ';'. Keys are matched exactly, so an entry such as
    'old_locus_tag=' is never mistaken for 'locus_tag='.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'type' and 'attributes'.

    Returns
    -------
    dict
        {mRNA ID: locus_tag}. If an ID occurs more than once a message is
        printed and the last occurrence wins.

    Raises
    ------
    ValueError
        If an mRNA row has no ID or no locus_tag attribute.
    """
    def _attrValue(attr, pattern, name):
        # Return the captured attribute value, or fail with a message that
        # identifies the offending row instead of an AttributeError on None.
        found = pattern.search(attr)
        if found is None:
            raise ValueError("mRNA attributes lack a %s entry: %r" % (name, attr))
        return found.group(1)

    df = df[df['type'] == 'mRNA']

    # Anchor each key to the start of the string or to a preceding ';' so the
    # key is matched as a whole and independently of its position.
    locusSearch = re.compile(r'(?:^|;)\s*locus_tag=([^;]*)')
    idSearch = re.compile(r'(?:^|;)\s*ID=([^;]*)')

    d = {}

    for attr in df['attributes']:
        val = _attrValue(attr, locusSearch, 'locus_tag')
        key = _attrValue(attr, idSearch, 'ID')
        if key in d:
            print("Unexpected: id key: %s already in dictionary." % key)
        d[key] = val
    return d

In [60]:
import pandas as pd
import os
# Directory containing the annotated genome GFF3 files read below
# (absolute path on the original author's machine).
BASE_DIR = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v03/'
# Directory that receives the rewritten GFF files produced in this notebook.
OUT_DIR = '/home/gamran/genome_analysis/Warrior/Richard/output/defining_alleles/Proteinortho/'

# Full paths of the two input annotation files (h_ctg and p_ctg).
H_CTG_GFF_LOC = os.path.join(BASE_DIR, 'DK_0911_v03_h_ctg.anno.gff3')
P_CTG_GFF_LOC = os.path.join(BASE_DIR, 'DK_0911_v03_p_ctg.anno.gff3')
        

def rewriteGffForProteinOrtho(gff_in_loc, gff_out_loc):
    """Write a CDS-only copy of a GFF file whose attributes are 'ID=<locus_tag>;'.

    The input is read as a nine-column, tab-separated table; the first line
    of the file is skipped (presumably the '##gff-version' directive --
    confirm for new inputs). Each CDS row is linked to a locus_tag through
    CDS.Parent -> mRNA.ID -> mRNA.locus_tag, and its attributes column is
    replaced by 'ID=<locus_tag>;' so that it matches the IDs used in the
    .faa file.

    Parameters
    ----------
    gff_in_loc : str
        Path of the annotation GFF file to read.
    gff_out_loc : str
        Path of the tab-separated file to write (no header, no index).

    Returns
    -------
    pandas.DataFrame
        The CDS rows that were written, keeping their original row index.

    Raises
    ------
    ValueError
        If a CDS row has no Parent attribute, or if its Parent is not the ID
        of any mRNA row (this includes comma-separated multi-parent values).
        Previously such rows were silently written as 'ID=nan;'.
    """
    ctg_df = pd.read_table(
        gff_in_loc, skiprows=1, header=None,
        names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])

    # get dict mapping ID to locus_tag from mRNA entries
    # need attribute to have 'ID=locus_tag' because .faa file has ID=locus_tag
    # will be used in this way: CDS.parent = mRNA.id =(d)> locus_tag
    d = getIdToLocusTagDict(ctg_df)

    # subset DataFrame to only CDS type (only type required for proteinortho analysis);
    # .copy() makes the subset independent, so the column assignment below does
    # not act on a view of the full table (avoids SettingWithCopyWarning)
    cds_df = ctg_df[ctg_df['type'] == 'CDS'].copy()

    # extract the Parent value; the key is anchored to the string start or a
    # preceding ';' so it is matched exactly and at any position
    parents = cds_df['attributes'].str.extract(r'(?:^|;)\s*Parent=([^;]*)', expand=False)
    n_missing = int(parents.isnull().sum())
    if n_missing:
        raise ValueError("%d CDS row(s) in %s have no Parent attribute."
                         % (n_missing, gff_in_loc))

    # map CDS.parent values (= mRNA.id) to locus_tag and refuse to continue if
    # any parent is unknown, rather than emitting 'ID=nan;'
    locus_tags = parents.map(d)
    unmapped = parents[locus_tags.isnull()].unique()
    if len(unmapped):
        raise ValueError("%d CDS Parent value(s) in %s match no mRNA ID, e.g. %s"
                         % (len(unmapped), gff_in_loc, ', '.join(map(str, unmapped[:5]))))

    # make attribute column into format 'ID=locus_tag' to be readable by proteinortho
    cds_df['attributes'] = locus_tags.map(lambda tag: 'ID=%s;' % tag)

    with open(gff_out_loc, 'w') as outfile:
        cds_df.to_csv(outfile, sep='\t', header=False, index=False)
    return cds_df

# Rewrite both annotation files into CDS-only GFFs inside OUT_DIR.
# os.path.join is used (as for the input paths) so the result does not depend
# on OUT_DIR ending with a path separator.
htg_df = rewriteGffForProteinOrtho(H_CTG_GFF_LOC, os.path.join(OUT_DIR, 'DK_0911_v03_h_ctg.protein.gff'))
pCtg_df = rewriteGffForProteinOrtho(P_CTG_GFF_LOC, os.path.join(OUT_DIR, 'DK_0911_v03_p_ctg.protein.gff'))

In [62]:
# Preview the first rows of the rewritten h_ctg table to check the new attributes column.
htg_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
2,hcontig_000_001,EVM,CDS,2114,2632,.,-,0,ID=DK0911_18236;
4,hcontig_000_001,EVM,CDS,1903,2006,.,-,0,ID=DK0911_18236;
6,hcontig_000_001,EVM,CDS,1679,1837,.,-,1,ID=DK0911_18236;
8,hcontig_000_001,EVM,CDS,1463,1595,.,-,1,ID=DK0911_18236;
10,hcontig_000_001,EVM,CDS,1168,1381,.,-,0,ID=DK0911_18236;


In [None]:
# Make OUT_DIR the working directory for the rest of the session.
os.chdir(OUT_DIR)

# Proteinortho command line, kept commented out so it does not run with this cell.
# It names the two .protein.faa files by relative path.
# !/home/gamran/anaconda3/proteinortho_v5.16b/proteinortho5.pl -project=test -synteny DK_0911_v03_h_ctg.protein.faa DK_0911_v03_p_ctg.protein.faa