# In [71]:
import os
from Bio import SeqIO

# Directory and file names of the v01 primary-contig (p) and haplotig (h)
# FASTA files.
file_path = '/home/gamran/genome_analysis/Warrior/genome/'
p_file_name = 'DK_0911_v01_p_ctg.fa'
h_file_name = 'DK_0911_v01_h_ctg.fa'

# Full paths to the two input files.
h_ctg_file_loc = os.path.join(file_path, h_file_name)
p_ctg_file_loc = os.path.join(file_path, p_file_name)

# Output directory (currently the same directory as the inputs).
output_dir = '/home/gamran/genome_analysis/Warrior/genome/'

# Load every record into a list so records can later be moved between the
# two collections.
htgs = list(SeqIO.parse(h_ctg_file_loc, 'fasta'))
pCtgs = list(SeqIO.parse(p_ctg_file_loc, 'fasta'))

def getPCtgNum(ctg):
    '''Return the three-character primary contig number found in a contig id.

    The number is read from fixed character positions 8-10 of the id, i.e.
    directly after an 8-character prefix such as 'pcontig_' or 'hcontig_':
        'hcontig_003_048' -> '003'
        'pcontig_008'     -> '008'

    The result is a string (leading zeros are kept). Ids shorter than 11
    characters yield a shorter, possibly empty, string instead of an error.
    '''
    prefixLen = len('pcontig_')
    return ctg[prefixLen:prefixLen + 3]

def getPCtg(num, pCtgs):
    '''Look up a primary contig by its integer ID number.

    Scans pCtgs in order and returns the first record whose id carries the
    contig number num (compared as an integer, so leading zeros in the id
    are ignored). If no record matches, an error message is printed and
    None is returned.
    '''
    found = next(
        (rec for rec in pCtgs if int(getPCtgNum(rec.id)) == num),
        None,
    )
    if found is None:
        print('ERROR: primary contig of number: ' + str(num) + ' could not be found.')
    return found

def changeName(htg, pCtgNum):
    '''Build the haplotig-style id for a record that is being re-assigned.

    The new id is assembled from the record's current id: the first
    character is replaced by 'h', the last three characters (the record's
    own number) are kept as the trailing field, and pCtgNum, left-padded
    with zeros to at least three characters, is inserted before them.
    For example, id 'pcontig_049' with pCtgNum 5 gives 'hcontig_005_049'.

    Only htg.id is read; the record itself is not modified.
    '''
    paddedNum = str(pCtgNum).rjust(3, '0')
    oldId = htg.id
    middle = oldId[1:-3]   # everything between the first char and the own number
    ownNum = oldId[-3:]
    return 'h' + middle + paddedNum + '_' + ownNum

def assignPwohToPwh(pwohNum, pCtgNum, htgs, pCtgs):
    '''Move one primary contig into the haplotig list under a new id.

    The first record in pCtgs whose contig number equals pwohNum is removed
    from pCtgs, renamed with changeName so that it is paired with primary
    contig pCtgNum, and appended to htgs. For example, with pwohNum = 49 and
    pCtgNum = 5, pcontig_049 becomes hcontig_005_049.
    ("pwoh" presumably stands for "primary contig without haplotig" -
    confirm with the author.)

    Both lists are modified in place and also returned.

    Parameters:
        pwohNum: integer number of the primary contig to re-assign.
        pCtgNum: number of the primary contig it becomes a haplotig of.
        htgs:    list of haplotig records (receives the moved record).
        pCtgs:   list of primary contig records (loses the moved record).

    Returns:
        (pCtgs, htgs)

    Raises:
        ValueError: if no record in pCtgs has contig number pwohNum.
            Previously this case failed with an UnboundLocalError.
    '''
    for i, pCtg in enumerate(pCtgs):
        if int(getPCtgNum(pCtg.id)) == pwohNum:
            # Safe to pop while iterating because we leave the loop at once.
            htg = pCtgs.pop(i)
            break
    else:
        raise ValueError('primary contig of number: ' + str(pwohNum)
                         + ' could not be found.')

    # HACK: contig 103 is stored as its reverse complement before being
    # re-assigned. The reason is not recorded in this file.
    if getPCtgNum(htg.id) == '103':
        htg = htg.reverse_complement(id=htg.id, description='reverse complement')

    htg.id = changeName(htg, pCtgNum)
    htgs.append(htg)
    return pCtgs, htgs

def assignManyPwohToPwh(pairs, htgs, pCtgs):
    '''Apply assignPwohToPwh to each (pwohNum, pCtgNum) pair in order.

    pairs is an iterable of 2-tuples; for every pair the primary contig
    numbered pwohNum is turned into a haplotig of primary contig pCtgNum.
    Returns the resulting (pCtgs, htgs) after all pairs were processed.
    '''
    for sourceNum, targetNum in pairs:
        updated = assignPwohToPwh(sourceNum, targetNum, htgs, pCtgs)
        pCtgs, htgs = updated
    return pCtgs, htgs


# (pwohNum, pCtgNum) pairs: the primary contig numbered pwohNum is
# re-assigned as a haplotig of primary contig pCtgNum.
assignmentPairs = [
    (86, 39),
    (96, 33),
    (97, 39),
    (100, 33),
    (103, 74),
]

pCtgs, htgs = assignManyPwohToPwh(assignmentPairs, htgs, pCtgs)

# Write the updated primary contig and haplotig sets as the v03 files.
SeqIO.write(pCtgs, os.path.join(output_dir, 'DK_0911_v03_p_ctg.fa'), 'fasta')
SeqIO.write(htgs, os.path.join(output_dir, 'DK_0911_v03_h_ctg.fa'), 'fasta')

# Out: 1176  (notebook cell output; presumably the record count returned by the last SeqIO.write call)