In [123]:
import os
import itertools
from Bio import SeqIO

# Input assembly: the primary contig (p) and haplotig (h) FASTA files sit in
# the same folder.
file_path = '/home/gamran/genome_analysis/Warrior/genome/'
p_file_name = 'DK_0911_v01_p_ctg.fa'
h_file_name = 'DK_0911_v01_h_ctg.fa'

# Full paths to the two FASTA files.
p_ctg_file_loc = os.path.join(file_path, p_file_name)
h_ctg_file_loc = os.path.join(file_path, h_file_name)

# Root folder for everything this notebook writes. The trailing slash matters:
# the code below builds sub-paths by plain string concatenation.
output_dir = '/home/gamran/genome_analysis/Warrior/Richard/output/'

def getPCtgNum(ctg):
    '''Extract the primary contig number embedded in a contig identifier.

    The number is read from fixed character positions 8, 9 and 10 of the
    identifier, i.e. the three characters after an 8-character prefix:
    e.g. 'hcontig_003_048' -> '003'
    e.g. 'pcontig_008'     -> '008'
    '''
    number_start = 8
    number_stop = 11
    return ctg[number_start:number_stop]

def isCtgPaired(pCtgNum, haplotigs):
    '''Tell whether a primary contig has at least one associated haplotig.

    pCtgNum   -- primary contig number, in the form returned by getPCtgNum
    haplotigs -- iterable of records exposing an `id` attribute

    Returns True as soon as one haplotig id maps to pCtgNum, False otherwise.'''
    return any(getPCtgNum(record.id) == pCtgNum for record in haplotigs)

def getPairedUnpairedContigs(p_ctgs, h_ctgs):
    '''Split primary contigs by whether they have an associated haplotig.

    p_ctgs -- list of <SeqRecord> primary contigs
    h_ctgs -- list of <SeqRecord> haplotigs

    Side effects: every primary contig is written to its own FASTA file,
      paired   -> <output_dir>pwh/<stem>_<num>.fa
      unpaired -> <output_dir>pwoh/<stem>_<num>/<stem>_<num>.fa
    where <stem> is p_file_name without its last three characters ('.fa')
    and <num> is getPCtgNum of the contig id. Target directories are created
    when missing.

    Returns a tuple (pwhs, pwohs): the primary contigs with and without
    haplotigs, each in input order.'''
    # Collect the primary contig numbers that own a haplotig once, so each
    # lookup below is a set membership test instead of a scan over h_ctgs.
    paired_numbers = {getPCtgNum(haplotig.id) for haplotig in h_ctgs}

    p_with_h = []
    p_without_h = []

    for pContig in p_ctgs:
        pCtgNumber = getPCtgNum(pContig.id)
        pContigName = p_file_name[:-3] + "_" + pCtgNumber
        pContigFileName = pContigName + ".fa"
        if pCtgNumber in paired_numbers:
            p_with_h.append(pContig)
            directory = output_dir + 'pwh/'
        else:
            p_without_h.append(pContig)
            directory = output_dir + 'pwoh/' + pContigName + '/'
        # Create the target directory on demand for BOTH groups; the pwh
        # directory used to be assumed to exist, which made the write fail
        # on a fresh output folder.
        os.makedirs(directory, exist_ok=True)
        SeqIO.write(pContig, directory + pContigFileName, 'fasta')

    # Sanity check: every primary contig landed in exactly one group.
    assert len(p_with_h) + len(p_without_h) == len(p_ctgs)

    return p_with_h, p_without_h

def writeScript(pwoh, pwhs, qFilter = True):
    '''Build the bash commands that align one primary contig without haplotig
    against the combined FASTA of primary contigs with haplotigs.

    pwoh    -- single <SeqRecord> primary contig without haplotig
    pwhs    -- list of <SeqRecord> primary contigs with haplotigs; not read
               here, the combined pwh FASTA under output_dir is referenced
               by path instead
    qFilter -- truthy: delta-filter -q, '_qfiltered' / '.qcoords' names;
               falsy:  delta-filter -g, '_gfiltered' / '.gcoords' names

    Returns the script section as a string, one command per line:
    cd, nucmer, delta-filter, show-coords and two mummerplot calls.'''
    conda_bin = '/home/gamran/anaconda3/bin/'
    system_bin = '/usr/bin/'

    name = 'DK_0911_v01_p_ctg_' + getPCtgNum(pwoh.id)
    work_dir = output_dir + 'pwoh/' + name + '/'
    reference = output_dir + 'pwh/DK_0911_v01_p_ctg_pwh.fa'
    query = work_dir + name + '.fa'

    # The q and g variants differ only in this single letter.
    mode = 'q' if qFilter else 'g'
    filtered = name + '_' + mode + 'filtered'
    delta = filtered + '.delta'

    commands = [
        'cd ' + work_dir,
        system_bin + 'nucmer ' + reference + ' ' + query + ' > out.delta',
        system_bin + 'delta-filter -' + mode + ' out.delta > ' + delta,
        system_bin + 'show-coords -T ' + delta + ' > ' + name + '.' + mode + 'coords',
        conda_bin + 'mummerplot -p ' + filtered + ' --postscript ' + delta,
        conda_bin + 'mummerplot -c -p ' + filtered + '_cov --postscript ' + delta,
    ]
    # Every command, including the last, is newline-terminated.
    return '\n'.join(commands) + '\n'

In [124]:
# Load every haplotig and primary contig record into memory.
h_ctgs = list(SeqIO.parse(h_ctg_file_loc, 'fasta'))
p_ctgs = list(SeqIO.parse(p_ctg_file_loc, 'fasta'))

# Partition the primary contigs (this call also writes one FASTA per contig).
pwhs, pwohs = getPairedUnpairedContigs(p_ctgs, h_ctgs)

# Write each group as one combined FASTA; the file names derive from the
# primary contig file name minus its '.fa' extension.
p_file_stem = p_file_name[:-3]
SeqIO.write(pwhs, output_dir + 'pwh/' + p_file_stem + '_pwh.fa', 'fasta')
SeqIO.write(pwohs, output_dir + 'pwoh/' + p_file_stem + '_pwoh.fa', 'fasta')

13

In [125]:
# Generate two bash scripts under output_dir, one per delta-filter mode:
# -q filtering goes to ph_ctg_qmapping.sh, -g filtering to ph_ctg_gmapping.sh.
bash_script_q="ph_ctg_qmapping.sh"
bash_script_g="ph_ctg_gmapping.sh"

# `with` closes both files even if writeScript raises part-way through.
with open(output_dir + bash_script_q, 'w') as outfq, \
        open(output_dir + bash_script_g, 'w') as outfg:
    outfq.write('#!/bin/bash\n')
    outfg.write('#!/bin/bash\n')

    # One section per primary contig without haplotig, in both scripts.
    # An extra write of pwohs[0] used to follow this loop; it repeated the
    # first contig's commands in the q script and raised IndexError when
    # pwohs was empty, so it has been dropped.
    for pwoh in pwohs:
        outfq.write(writeScript(pwoh, pwhs, True))
        outfg.write(writeScript(pwoh, pwhs, False))