In [64]:
import os
from Bio import SeqIO

# Input assembly: the directory plus the two FASTA file names
# (p_ctg is parsed as the primary contigs, h_ctg as the haplotigs).
file_path = '/home/gamran/genome_analysis/Warrior/genome/'
p_file_name, h_file_name = 'DK_0911_v01_p_ctg.fa', 'DK_0911_v01_h_ctg.fa'

# Full paths to both inputs. file_path already ends with '/', so join() adds
# no extra separator.
p_ctg_file_loc = os.path.join(file_path, p_file_name)
h_ctg_file_loc = os.path.join(file_path, h_file_name)

# Root directory under which the generated fasta files and bash scripts are written.
output_dir = '/home/gamran/genome_analysis/Warrior/Richard/output/'

def getPCtgNum(ctg):
    '''Extract the three-character primary contig number from a contig ID.

    Works for haplotig and primary contig IDs alike, because the number sits
    at the same offset in both:
    e.g. hcontig_003_048 will return 003
    e.g. pcontig_008 will return 008

    The result is a string slice (leading zeros are kept). An ID shorter than
    11 characters yields a shorter, possibly empty, string.'''
    start = len('pcontig_')  # 8 -- the 'hcontig_' prefix has the same length
    stop = start + 3         # the contig number is three characters wide
    return ctg[start:stop]

def isCtgPaired(pCtgNum, haplotigs):
    '''Report whether any haplotig belongs to the given primary contig.

    pCtgNum   -- primary contig number in the form returned by getPCtgNum
                 (a string such as '003')
    haplotigs -- iterable of records exposing an `id` attribute

    Returns True as soon as one haplotig ID maps to pCtgNum, otherwise False.'''
    return any(getPCtgNum(record.id) == pCtgNum for record in haplotigs)
    
def getPairedUnpairedContigs(PCtgs, htgs):
    '''Split primary contigs into those with and without an associated haplotig.

    PCtgs -- list of primary contig records (objects exposing an `id` attribute)
    htgs  -- list of haplotig records

    Returns a tuple (pwhs, pwohs): the primary contigs with haplotigs and the
    primary contigs without haplotigs, each in input order.

    While doing so, writes one fasta file per contig:
      * paired contigs   -> <output_dir>/pwh/<name>.fa
      * unpaired contigs -> <output_dir>/pwoh/<name>/<name>.fa
    where <name> is the primary file name without '.fa' plus '_<contig number>'.
    Missing output directories are created.'''
    pwhs = []
    pwohs = []

    # The pwoh branch creates its directories on demand; do the same for pwh/
    # so the first paired contig does not fail on a missing directory.
    pwh_dir = output_dir + 'pwh/'
    if not os.path.exists(pwh_dir):
        os.makedirs(pwh_dir)

    # Iterate over the PCtgs argument. The original looped over the notebook
    # global `pCtgs`, silently ignoring whatever the caller passed in.
    for pContig in PCtgs:
        pCtgNumber = getPCtgNum(pContig.id)
        pContigFileName = p_file_name[:-3] + "_" + pCtgNumber + ".fa"
        if isCtgPaired(pCtgNumber, htgs):
            pwhs.append(pContig)
            SeqIO.write(pContig, pwh_dir + pContigFileName, 'fasta')
        else:
            pwohs.append(pContig)
            # Each unpaired contig gets its own directory named after the file stem.
            directory = output_dir + 'pwoh/' + pContigFileName[:-3] + '/'
            if not os.path.exists(directory):
                os.makedirs(directory)
            SeqIO.write(pContig, directory + pContigFileName, 'fasta')

    # Every input contig must land in exactly one of the two groups.
    assert len(pwhs) + len(pwohs) == len(PCtgs)

    return pwhs, pwohs


def getPairedCtgsHtgs(pCtgs, htgs):
    '''Pair every primary contig with the haplotigs that belong to it.

    Returns a list of tuples [(pCtg, [htgs]), ...] in pCtgs order. Primary
    contigs without any haplotig are left out.

    N.B. the grouping is keyed on the contig number rather than on the pCtg
    itself, because a SeqRecord object is unhashable and cannot be a dict key.'''
    # Bucket the haplotigs under the number of the primary contig they belong to.
    htgsByNum = {}
    for htg in htgs:
        htgsByNum.setdefault(getPCtgNum(htg.id), []).append(htg)

    # Build the result with the pCtg <SeqRecord> itself, as opposed to its ID.
    pairs = []
    for pCtg in pCtgs:
        group = htgsByNum.get(getPCtgNum(pCtg.id))
        if group is not None:
            pairs.append((pCtg, group))
    return pairs

def assignHaplotig(pairsList, pwoh, pwhNum):
    '''Attach a pwoh as an extra haplotig of the primary contig numbered pwhNum.

    pairsList -- list of (pCtg, [htgs]) pairs; the haplotig list of every
                 matching pair is extended in place
    pwoh      -- the record to append
    pwhNum    -- integer number of the target primary contig (compared with
                 int() of the number parsed from each pCtg ID)

    Returns the same pairsList object. When no pair matches, nothing changes.'''
    targets = (entry for entry in pairsList
               if int(getPCtgNum(entry[0].id)) == pwhNum)
    for entry in targets:
        entry[1].append(pwoh)
    return pairsList

def assignHaplotigs(pairsList, assignmentPairs):
    '''Apply a batch of manual haplotig assignments to pairsList.

    assignmentPairs is a list [(pwoh, pwhNum), ...]; each pwoh is handed to
    assignHaplotig together with the integer number of its target primary contig.

    Returns the pairsList that results after the last assignment.'''
    updated = pairsList
    for assignment in assignmentPairs:
        pwoh, pwhNum = assignment
        updated = assignHaplotig(updated, pwoh, pwhNum)
    return updated

def getPCtg(num, pCtgs):
    '''Look up a primary contig by its integer ID number.

    Returns the first record in pCtgs whose parsed contig number equals num.
    When there is none, prints an error message and returns None.'''
    found = next((candidate for candidate in pCtgs
                  if int(getPCtgNum(candidate.id)) == num), None)
    if found is None:
        print('ERROR: primary contig of number: ' + str(num) + ' could not be found.')
    return found

def writeScript(in_loc1, in_loc2, out_loc, qFilter = True):
    '''Build the bash snippet that compares two fasta files.

    in_loc1, in_loc2 -- paths of the two .fa inputs, passed to nucmer in this order
    out_loc          -- directory the snippet changes into before running anything
    qFilter          -- truthy: filter the alignment with `delta-filter -q`;
                        falsy: filter with `delta-filter -g`

    Returns the snippet as one string of newline-terminated commands: cd, nucmer,
    delta-filter, show-coords and two mummerplot calls (plain and coverage).'''
    mummer_bin = '/usr/bin/'
    plot_bin = '/home/gamran/anaconda3/bin/'

    # The two variants differ only in this letter, which appears in the filter
    # flag and in every derived file name.
    flag = 'q' if qFilter else 'g'
    filtered = '_' + flag + 'filtered'

    commands = [
        'cd ' + out_loc,
        mummer_bin + 'nucmer ' + in_loc1 + ' ' + in_loc2 + ' > out.delta',
        mummer_bin + 'delta-filter -' + flag + ' out.delta > ' + filtered + '.delta',
        mummer_bin + 'show-coords -T ' + filtered + '.delta > _.' + flag + 'coords',
        plot_bin + 'mummerplot -p ' + filtered + ' --postscript ' + filtered + '.delta',
        plot_bin + 'mummerplot -c -p ' + filtered + '_cov --postscript ' + filtered + '.delta',
    ]
    return '\n'.join(commands) + '\n'

In [63]:
def generatePCtgHtgPairedFiles(pairsList, version = '1'):
    '''Write each primary contig and its paired haplotigs to per-contig fasta files.

    pairsList -- iterable of (pCtg, htgs) pairs
    version   -- string tag for the manual-assignment round, so that successive
                 rounds of manual haplotig assignment can be staged side by side

    For contig number NNN both files go to
    <output_dir>/manual_assignment_<version>/DK_0911_v01_p_ctg_NNN/ and are named
    DK_0911_v01_p_ctg_NNN_ma<version>.fa and DK_0911_v01_h_ctg_NNN_ma<version>.fa.
    The directory is created when it does not exist yet. Returns None.'''
    for pCtg, htgs in pairsList:
        pCtgNum = getPCtgNum(pCtg.id)
        pStem = "DK_0911_v01_p_ctg_" + pCtgNum
        hStem = "DK_0911_v01_h_ctg_" + pCtgNum
        suffix = '_ma' + version + '.fa'

        # One directory per primary contig, named after the primary contig stem.
        directory = output_dir + 'manual_assignment_' + version + '/' + pStem + '/'
        if not os.path.exists(directory):
            os.makedirs(directory)

        SeqIO.write(pCtg, directory + pStem + suffix, 'fasta')
        SeqIO.write(htgs, directory + hStem + suffix, 'fasta')

# Load every haplotig and primary contig record from the two fasta inputs.
htgs = list(SeqIO.parse(h_ctg_file_loc, 'fasta'))
pCtgs = list(SeqIO.parse(p_ctg_file_loc, 'fasta'))

# Group the haplotigs under the primary contig they belong to.
pairsList = getPairedCtgsHtgs(pCtgs, htgs)

# Manual assignments: each (pwohNum, pwhNum) entry hands primary contig number
# pwohNum over as an additional haplotig of primary contig number pwhNum.
assignmentPairs = [
    (getPCtg(pwohNum, pCtgs), pwhNum)
    for pwohNum, pwhNum in (
        (61, 17),
        (80, 17),
        (86, 39),
        (96, 33),
        (97, 39),
        (99, 60),
        (100, 33),
        (103, 74),
        (104, 21),
        (109, 3),
        (110, 68),
    )
]

pairsList = assignHaplotigs(pairsList, assignmentPairs)

# Write the per-contig fasta files for manual-assignment version '1'.
generatePCtgHtgPairedFiles(pairsList, '1')

def scriptPwhVsHtgs(pairsList, version = '1'):
    '''Write two bash scripts that compare every primary contig with its haplotigs.

    pairsList -- iterable of (pwh, htgs) pairs; only the pwh ID is read here, the
                 fasta files themselves are expected at the paths described below
    version   -- string tag of the manual-assignment round whose files are compared

    Creates pwh_ctg_qmapping.sh (delta-filter -q) and pwh_ctg_gmapping.sh
    (delta-filter -g) inside <output_dir>/manual_assignment_<version>/. For contig
    number NNN each script gets one writeScript section that runs in
    <output_dir>/manual_assignment_<version>/DK_0911_v01_p_ctg_NNN/ on
    DK_0911_v01_p_ctg_NNN_ma<version>.fa and DK_0911_v01_h_ctg_NNN_ma<version>.fa.
    Returns None.'''
    bash_script_q = "pwh_ctg_qmapping.sh"
    bash_script_g = "pwh_ctg_gmapping.sh"

    # Put the scripts in the directory of the requested version. The original
    # always used 'manual_assignment_1/' regardless of `version`.
    version_dir = output_dir + 'manual_assignment_' + version + '/'
    if not os.path.exists(version_dir):
        os.makedirs(version_dir)

    # `with` closes both scripts even if building a section raises.
    with open(version_dir + bash_script_q, 'w') as outfq, \
         open(version_dir + bash_script_g, 'w') as outfg:
        outfq.write('#!/bin/bash\n')
        outfg.write('#!/bin/bash\n')

        for pwh, _ in pairsList:
            pCtgNum = getPCtgNum(pwh.id)
            out_dir = version_dir + 'DK_0911_v01_p_ctg_' + pCtgNum + '/'
            pwh_loc = out_dir + 'DK_0911_v01_p_ctg_' + pCtgNum + '_ma' + version + '.fa'
            htgs_loc = out_dir + 'DK_0911_v01_h_ctg_' + pCtgNum + '_ma' + version + '.fa'

            outfq.write(writeScript(pwh_loc, htgs_loc, out_dir, True))
            outfg.write(writeScript(pwh_loc, htgs_loc, out_dir, False))

# Generate the q- and g-filter mapping scripts for manual-assignment version '1'.
scriptPwhVsHtgs(pairsList, '1')

In [50]:
# Load every haplotig and primary contig record from the two fasta inputs.
htgs = list(SeqIO.parse(h_ctg_file_loc, 'fasta'))
pCtgs = list(SeqIO.parse(p_ctg_file_loc, 'fasta'))

# Split the primary contigs by whether a haplotig exists for them; this call
# also writes the per-contig fasta files.
pwhs, pwohs = getPairedUnpairedContigs(pCtgs, htgs)

# One combined multi-record fasta file per group.
SeqIO.write(pwhs, '{}pwh/{}_pwh.fa'.format(output_dir, p_file_name[:-3]), 'fasta')
SeqIO.write(pwohs, '{}pwoh/{}_pwoh.fa'.format(output_dir, p_file_name[:-3]), 'fasta')

def scriptPwohVsPwhs(pwohs, pwhs):
    '''Write two bash scripts that compare each unpaired primary contig with all paired ones.

    pwohs -- iterable of primary contig records without haplotigs; only their IDs
             are read here
    pwhs  -- accepted for interface compatibility but not read: the comparison
             target is always the combined file
             <output_dir>/pwh/DK_0911_v01_p_ctg_pwh.fa

    Creates ph_ctg_qmapping.sh (delta-filter -q) and ph_ctg_gmapping.sh
    (delta-filter -g) in output_dir. For contig number NNN each script gets one
    writeScript section that runs in <output_dir>/pwoh/DK_0911_v01_p_ctg_NNN/ on
    DK_0911_v01_p_ctg_NNN.fa from that directory. Returns None.'''
    bash_script_q = "ph_ctg_qmapping.sh"
    bash_script_g = "ph_ctg_gmapping.sh"

    # Loop-invariant: every pwoh is compared against the same combined pwh file.
    pwhs_loc = output_dir + 'pwh/DK_0911_v01_p_ctg_pwh.fa'

    # `with` closes both scripts even if building a section raises.
    with open(output_dir + bash_script_q, 'w') as outfq, \
         open(output_dir + bash_script_g, 'w') as outfg:
        outfq.write('#!/bin/bash\n')
        outfg.write('#!/bin/bash\n')

        for pwoh in pwohs:
            pwoh_name = 'DK_0911_v01_p_ctg_' + getPCtgNum(pwoh.id)
            # The contig's fasta file lives in a directory of the same name.
            out_loc = output_dir + 'pwoh/' + pwoh_name + '/'
            pwoh_loc = out_loc + pwoh_name + '.fa'

            outfq.write(writeScript(pwoh_loc, pwhs_loc, out_loc, True))
            outfg.write(writeScript(pwoh_loc, pwhs_loc, out_loc, False))
    
# scriptPwohVsPwhs(pwohs, pwhs)