In [212]:
import os

import pandas as pd

# Root folder that holds one sub-folder of annotation files per assembly.
BASE_DIR = '/home/gamran/genome_analysis/Warrior/annotation/'

# v01 GFF3 annotation files for the haplotigs (h_ctg) and primary contigs (p_ctg).
H_CTG_GFF_LOC = BASE_DIR + 'DK_0911_v01_h_ctg/DK_0911_v01_h_ctg.evm.all.lt.gff3'
P_CTG_GFF_LOC = BASE_DIR + 'DK_0911_v01_p_ctg/DK_0911_v01_p_ctg.evm.all.lt.gff3'

# Both files are parsed the same way: the first line is skipped, the remaining
# rows are read without a header using the nine column names below, and the
# first column (seqid) becomes the index. Primary contigs are read first.
pCtg_df, htg_df = [
    pd.read_table(
        gff_loc,
        skiprows=1,
        header=None,
        index_col=0,
        names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
    )
    for gff_loc in (P_CTG_GFF_LOC, H_CTG_GFF_LOC)
]

def assign(pwohNum, pwhNum, pCtg_df, htg_df):
    '''Move one pwoh out of the primary-contig frame and into the haplotig frame.

    Every row of pCtg_df whose index label is 'pcontig_<pwohNum>' is removed
    from pCtg_df and appended to htg_df under the index label
    'hcontig_<pwhNum>_<pwohNum>'. For example, pcontig_086 that is a pwoh
    belonging to pcontig_039 is renamed to hcontig_039_086.
    (pwoh / pwh presumably stand for "primary contig without / with
    haplotigs" -- confirm against the project glossary.)

    Parameters
    ----------
    pwohNum : int or str
        Number of the contig to move; zero-padded to three digits.
    pwhNum : int or str
        Number of the primary contig it is assigned to; zero-padded likewise.
    pCtg_df, htg_df : pandas.DataFrame
        GFF frames indexed by contig name. Neither input is modified.

    Returns
    -------
    (pCtg_df, htg_df) : tuple of pandas.DataFrame
        New frames with the rows removed / appended.

    Raises
    ------
    KeyError
        If pCtg_df has no row labelled 'pcontig_<pwohNum>'.

    NOTE that this does not handle duplicates (e.g. if hcontig_039_086
    already exists in htg_df, the moved rows are simply appended after it).
    '''
    # e.g. 86 -> '086'
    pwohNum = str(pwohNum).zfill(3)
    pwhNum = str(pwhNum).zfill(3)

    old_name = 'pcontig_' + pwohNum
    new_name = 'hcontig_%s_%s' % (pwhNum, pwohNum)

    # A boolean mask always yields a DataFrame. A scalar .loc lookup would
    # return a Series when the contig has exactly one row, and relabelling
    # that Series would rename its fields instead of the row.
    is_moved = pCtg_df.index == old_name
    if not is_moved.any():
        raise KeyError(old_name)

    htg = pCtg_df[is_moved].copy()
    # Keep the index name so the concatenated frame's index stays labelled.
    htg.index = pd.Index([new_name] * len(htg), name=pCtg_df.index.name)

    return pCtg_df[~is_moved], pd.concat([htg_df, htg])

# pCtg_df, htg_df = assign(86, 39, pCtg_df, htg_df)

def testAssign(pCtg_df, htg_df):
    '''Exercise assign() on one fixed case: pcontig_086 assigned to pcontig_039.

    Asserts that afterwards no index label in the primary-contig frame starts
    with 'pcontig_086', and that the number of haplotig rows whose label starts
    with 'hcontig_039' grew by exactly the number of rows that were moved.
    Returns True when both assertions hold.'''

    def count_prefixed(frame, prefix):
        # number of rows whose index label begins with `prefix`
        return sum(1 for label in frame.index if label.startswith(prefix))

    rows_to_move = count_prefixed(pCtg_df, 'pcontig_086')
    rows_before = count_prefixed(htg_df, 'hcontig_039')

    # The results are bound to the local names only.
    pCtg_df, htg_df = assign(86, 39, pCtg_df, htg_df)

    assert count_prefixed(pCtg_df, 'pcontig_086') == 0
    assert count_prefixed(htg_df, 'hcontig_039') == rows_to_move + rows_before

    return True

# Sanity-check assign() on the loaded data. The frames that assign() returns
# inside testAssign are bound to that function's local names only, so the
# module-level pCtg_df / htg_df names are not rebound by this call.
testAssign(pCtg_df, htg_df)

def assignMany(pairs, pCtg_df, htg_df):
    '''Apply assign() once per (pwohNum, pwhNum) pair, in the order given.

    Each call receives the frames returned by the previous call, so the
    reassignments accumulate. Returns the final (pCtg_df, htg_df) tuple;
    with no pairs the inputs are returned as they were passed in.'''
    remaining, haplotigs = pCtg_df, htg_df
    for pwoh, pwh in pairs:
        remaining, haplotigs = assign(pwoh, pwh, remaining, haplotigs)
    return remaining, haplotigs

# (pwohNum, pwhNum) pairs: the first contig of each pair is moved into the
# haplotig frame under the second contig's number.
pairs = [
    (86, 39),
    (96, 33),
    (97, 39),
    (100, 33),
    (103, 74),
]

# Rebinds both frames to the reassigned versions. Running this cell a second
# time without reloading the GFFs is expected to fail, because the listed
# pcontig_* rows have already been removed from pCtg_df.
pCtg_df, htg_df = assignMany(pairs, pCtg_df, htg_df)

# pCtg_df[[s.startswith('pcontig_086') for s in pCtg_df.index]]
# htg_df[[s.startswith('hcontig_074') for s in htg_df.index]]

In [213]:
# write haplotigs to new file (v03)
# NOTE(review): the loader skipped the first line of the v01 file (skiprows=1)
# and nothing re-emits it here. If that line was the '##gff-version 3' pragma,
# the v03 file is written without it -- confirm whether downstream tools need it.
base_filename = 'DK_0911_v03_h_ctg.evm.all.lt.gff3'
folder_path = BASE_DIR + 'DK_0911_v03_h_ctg'
# Creates the folder if needed and is a no-op when it already exists, so the
# cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)
with open(os.path.join(folder_path, base_filename), 'w') as outfile:
    # Tab-separated, no header row; the index (contig name) is written as the
    # first column.
    htg_df.to_csv(outfile, sep='\t', header = False)

In [214]:
# write primary contigs to new file (v03)
# NOTE(review): the loader skipped the first line of the v01 file (skiprows=1)
# and nothing re-emits it here. If that line was the '##gff-version 3' pragma,
# the v03 file is written without it -- confirm whether downstream tools need it.
base_filename = 'DK_0911_v03_p_ctg.evm.all.lt.gff3'
folder_path = BASE_DIR + 'DK_0911_v03_p_ctg'
# Creates the folder if needed and is a no-op when it already exists, so the
# cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)
with open(os.path.join(folder_path, base_filename), 'w') as outfile:
    # Tab-separated, no header row; the index (contig name) is written as the
    # first column.
    pCtg_df.to_csv(outfile, sep='\t', header = False)