In [1]:
import pandas as pd
import os
import re

# Root directory that holds one sub-directory of annotation files per genome version.
ANNOTATION_PATH = '/home/gamran/genome_analysis/Warrior/annotation/'

# Input GFF3 annotations for the haplotigs (h) and primary contigs (p).
# Both paths are built with os.path.join(dir, relative) so they do not depend on
# ANNOTATION_PATH ending in a path separator.
H_GFF_PATH_IN = os.path.join(ANNOTATION_PATH, 'DK_0911_v01_h_ctg/DK_0911_v01_h_ctg.evm.all.lt.gff3')
P_GFF_PATH_IN = os.path.join(ANNOTATION_PATH, 'DK_0911_v01_p_ctg/DK_0911_v01_p_ctg.evm.all.lt.gff3')

# Names of the output genome versions; each is used both as the output
# sub-directory name and as the prefix of the output file name.
P_GENOME_OUT = 'DK_0911_v031_p_ctg'
H_GENOME_OUT = 'DK_0911_v031_h_ctg'

P_GFF_PATH_OUT = os.path.join(ANNOTATION_PATH, P_GENOME_OUT, '%s.anno.gff3' % P_GENOME_OUT)
H_GFF_PATH_OUT = os.path.join(ANNOTATION_PATH, H_GENOME_OUT, '%s.anno.gff3' % H_GENOME_OUT)

# Column names applied to the nine tab-separated fields of each GFF3 record.
GFF_COLUMNS = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

# Load both annotation tables, indexed by seqid (column 0).
# skiprows=1 discards the first line of each file -- presumably the GFF header
# line; TODO confirm against the input files.
pCtg_df = pd.read_table(P_GFF_PATH_IN, skiprows=1, header=None, index_col=0,
                        names=GFF_COLUMNS)
htg_df = pd.read_table(H_GFF_PATH_IN, skiprows=1, header=None, index_col=0,
                       names=GFF_COLUMNS)

def assign(pwohNum, pwhNum, pCtg_df, htg_df):
    '''Reassign one primary contig without haplotigs (pwoh) to a primary
    contig with haplotigs (pwh).

    All rows of pCtg_df indexed by 'pcontig_<pwohNum>' are removed from
    pCtg_df and appended to htg_df under the index label
    'hcontig_<pwhNum>_<pwohNum>'. For example, pcontig_086 that is a pwoh
    belonging to pcontig_039 is renamed to hcontig_039_086.

    Parameters:
        pwohNum: number of the pwoh contig (int or str); zero-padded to 3 digits.
        pwhNum: number of the pwh contig (int or str); zero-padded to 3 digits.
        pCtg_df: primary-contig GFF DataFrame indexed by seqid.
        htg_df: haplotig GFF DataFrame indexed by seqid.

    Returns:
        (pCtg_df, htg_df): new DataFrames; the inputs are not modified.

    Raises:
        KeyError: if 'pcontig_<pwohNum>' is not present in pCtg_df's index.

    NOTE that this does not handle duplicates (e.g. if hcontig_039_086
    already exists, the moved rows are simply appended under the same label).
    Only the index is renamed; the contents of the columns are left untouched.
    '''

    # e.g. 86 -> '086'
    pwohNum = str(pwohNum).zfill(3)
    pwhNum = str(pwhNum).zfill(3)

    oldSeqid = 'pcontig_' + pwohNum
    newSeqid = 'hcontig_%s_%s' % (pwhNum, pwohNum)

    if oldSeqid not in pCtg_df.index:
        raise KeyError(oldSeqid)

    # Select with a list of labels so the result is always a DataFrame. A bare
    # label returns a Series when exactly one row matches, which would then be
    # re-indexed and concatenated incorrectly.
    htg = pCtg_df.loc[[oldSeqid]]
    htg.index = [newSeqid] * len(htg.index)

    pCtg_df = pCtg_df.drop(oldSeqid)
    htg_df = pd.concat([htg_df, htg])

    return pCtg_df, htg_df

# pCtg_df, htg_df = assign(86, 39, pCtg_df, htg_df)

def testAssign(pCtg_df, htg_df):
    '''Smoke-test assign() on the single case (pwoh 86 -> pwh 39).

    Verifies that, after the call, no row labelled 'pcontig_086*' is left in
    the primary-contig table, and that the number of 'hcontig_039*' rows in the
    haplotig table grew by exactly the number of rows that were moved.
    Returns True when both assertions hold. The caller's variables are not
    rebound; only local names receive the reassigned tables.'''

    def countWithPrefix(frame, prefix):
        # number of rows whose index label begins with `prefix`
        return sum(1 for label in frame.index if label.startswith(prefix))

    rowsToMove = countWithPrefix(pCtg_df, 'pcontig_086')
    rowsBefore = countWithPrefix(htg_df, 'hcontig_039')

    pCtg_df, htg_df = assign(86, 39, pCtg_df, htg_df)

    assert countWithPrefix(pCtg_df, 'pcontig_086') == 0

    rowsAfter = countWithPrefix(htg_df, 'hcontig_039')
    assert rowsAfter == rowsToMove + rowsBefore

    return True

# Run the self-check on the freshly loaded tables; the returned (reassigned)
# tables are discarded, so pCtg_df and htg_df keep their original bindings here.
testAssign(pCtg_df, htg_df)

def assignMany(pairs, pCtg_df, htg_df):
    '''Apply assign() once per (pwohNum, pwhNum) pair, in order, feeding the
    tables returned by each call into the next one. Returns the final
    (pCtg_df, htg_df) tuple.'''
    primaries, haplotigs = pCtg_df, htg_df
    for pair in pairs:
        pwoh, pwh = pair
        primaries, haplotigs = assign(pwoh, pwh, primaries, haplotigs)
    return primaries, haplotigs

# Manual reassignments as (pwoh number, pwh number): each pwoh primary contig
# is moved into the haplotig table under the given pwh.
pairs = [
    (86, 39),
    (96, 33),
    (97, 39),
    (100, 33),
    (103, 74),
]

pCtg_df, htg_df = assignMany(pairs, pCtg_df, htg_df)

# pCtg_df[[s.startswith('pcontig_086') for s in pCtg_df.index]]
# htg_df[[s.startswith('hcontig_074') for s in htg_df.index]]

In [2]:
# Write the haplotig table to H_GFF_PATH_OUT (tab-separated, seqid index as the
# first field, no header row), creating the output directory first if needed.
hOutDir = os.path.join(ANNOTATION_PATH, H_GENOME_OUT)
if not os.path.exists(hOutDir):
    os.mkdir(hOutDir)
with open(H_GFF_PATH_OUT, 'w') as outfile:
    htg_df.to_csv(outfile, header=False, sep='\t')

In [3]:
# Write the primary-contig table to P_GFF_PATH_OUT (tab-separated, seqid index
# as the first field, no header row), creating the output directory first if needed.
pOutDir = os.path.join(ANNOTATION_PATH, P_GENOME_OUT)
if not os.path.exists(pOutDir):
    os.mkdir(pOutDir)
with open(P_GFF_PATH_OUT, 'w') as outfile:
    pCtg_df.to_csv(outfile, header=False, sep='\t')

In [4]:
# fix attributes column
def fixHtgAttributes(row):
    '''Return the attributes string of one GFF row with its contig
    identifier brought in line with the row's seqid.

    Rows that were manually moved to the haplotig table keep their old
    pcontig-based identifiers in the attributes column, e.g. for
    seqid hcontig_074_103:
        ID=cds.evm.model.pcontig_103.5;Parent=evm.model.pcontig_103.5
    which this function turns into:
        ID=cds.evm.model.hcontig_074_103.5;Parent=evm.model.hcontig_074_103.5

    `row` must provide 'attributes', 'contigLoc' (e.g. 'pcontig_103.5') and
    'seqid' (e.g. 'hcontig_074_103'). Attributes that do not mention
    'pcontig' are returned unchanged.'''

    attrText = row['attributes']
    oldLoc = row['contigLoc']
    newContig = row['seqid']

    # the number after the last '.', e.g. '5' in 'pcontig_103.5'
    geneNumber = oldLoc.rsplit('.', 1)[-1]

    if 'pcontig' in attrText:
        # every occurrence of the old location is rewritten
        return attrText.replace(oldLoc, '%s.%s' % (newContig, geneNumber))
    return attrText

# Column names applied to the nine tab-separated fields of each GFF3 record.
GFF_COLUMNS = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

# Re-read the files written by the previous cells, this time keeping seqid as
# an ordinary column so that fixHtgAttributes can read it from each row.
pCtg_gff_df = pd.read_table(P_GFF_PATH_OUT, header=None, index_col=None, names=GFF_COLUMNS)
htg_gff_df = pd.read_table(H_GFF_PATH_OUT, header=None, index_col=None, names=GFF_COLUMNS)

# Group 1 is the value of the ID attribute (up to the next ';' or end of string).
ID_SEARCH = re.compile(r'^.*ID=(.*?)(;|$)')
# P_CONTIG_LOC_SEARCH.match('cds.evm.model.pcontig_000.1').group(1)
# will extract 'pcontig_000.1'
P_CONTIG_LOC_SEARCH = re.compile(r'^.*\.(pcontig_\d{3}\.\d+)(?:\.|$)')
# H_CONTIG_LOC_SEARCH.match('evm.model.hcontig_006_028.2').group(1)
# will extract 'hcontig_006_028.2'; plain pcontig locations are accepted too.
H_CONTIG_LOC_SEARCH = re.compile(r'^.*\.((?:p|h)contig_\d{3}(?:_\d{3})?\.\d+)(?:\.|$)')


def _firstGroupOf(pattern):
    '''Build a function returning group 1 of `pattern` matched against a string.'''
    def extract(text):
        return pattern.match(text).group(1)
    return extract


_extractId = _firstGroupOf(ID_SEARCH)

# Helper columns: the ID attribute, and the contig location embedded in it.
pCtg_gff_df['contigID'] = pCtg_gff_df['attributes'].apply(_extractId)
pCtg_gff_df['contigLoc'] = pCtg_gff_df['contigID'].apply(_firstGroupOf(P_CONTIG_LOC_SEARCH))

htg_gff_df['contigID'] = htg_gff_df['attributes'].apply(_extractId)
htg_gff_df['contigLoc'] = htg_gff_df['contigID'].apply(_firstGroupOf(H_CONTIG_LOC_SEARCH))

# Fix the attributes column of the haplotig table only, then discard the
# helper columns from both tables.
htg_gff_df['attributes'] = htg_gff_df.apply(fixHtgAttributes, axis=1)
htg_gff_df = htg_gff_df.drop(['contigID', 'contigLoc'], axis=1)
pCtg_gff_df = pCtg_gff_df.drop(['contigID', 'contigLoc'], axis=1)

# Overwrite both output files in place (no header row, no index column).
htg_gff_df.to_csv(H_GFF_PATH_OUT, sep='\t', header=None, index=None)
pCtg_gff_df.to_csv(P_GFF_PATH_OUT, sep='\t', header=None, index=None)