In [72]:
import os
import itertools
from Bio import SeqIO

In [73]:
# Input assembly: directory plus the primary (p) and haplotig (h) contig FASTA names.
file_path = '/home/gamran/genome_analysis/Warrior/genome/'
p_file_name, h_file_name = 'DK_0911_v01_p_ctg.fa', 'DK_0911_v01_h_ctg.fa'

# Full paths to the two FASTA files (file_path already ends with a separator).
p_ctg_file_loc = os.path.join(file_path, p_file_name)
h_ctg_file_loc = os.path.join(file_path, h_file_name)

# Root directory under which every file produced by this notebook is written.
output_dir = '/home/gamran/genome_analysis/Warrior/Richard/output/'

In [74]:
def getPCtgNum(ctg):
    """Return the primary contig number embedded in a contig id.

    The number is read from a fixed position (characters 8 to 10 of the id),
    which works for both haplotig and primary contig ids:

    e.g. 'hcontig_003_048' returns '003'
    e.g. 'pcontig_008' returns '008'
    """
    number_field = slice(8, 11)
    return ctg[number_field]

In [75]:
# Read both assemblies fully into memory as lists of records so they can be
# iterated more than once by the cells below.
h_ctgs = list(SeqIO.parse(h_ctg_file_loc, 'fasta'))
p_ctgs = list(SeqIO.parse(p_ctg_file_loc, 'fasta'))

In [76]:
def isCtgPaired(pCtgNum, haplotigs):
    """Return True if any record in `haplotigs` belongs to primary contig `pCtgNum`.

    A haplotig belongs to a primary contig when getPCtgNum() of its id equals
    `pCtgNum`; an empty `haplotigs` collection gives False.
    """
    return any(getPCtgNum(record.id) == pCtgNum for record in haplotigs)

In [77]:
# Disabled sanity check, kept inside a bare string so it does not execute.
# When enabled it prints, for the first record of p_ctgs, its type, id,
# sequence repr, length and the primary contig number from getPCtgNum().
'''
for seq_record in itertools.islice(p_ctgs, 1):
    print(type(seq_record))
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    print(getPCtgNum(seq_record.id))
'''

<class 'Bio.SeqRecord.SeqRecord'>
pcontig_000
Seq('TGTTGTCTATTTTTACTCGTCATTCATGTAAAATGATGTATTTCTTCATTCCTG...gtt', SingleLetterAlphabet())
4476905
000


In [85]:
def getPairedUnpairedContigs(p_ctgs, h_ctgs):
    """Split primary contigs by whether they have a haplotig and write each to FASTA.

    A primary contig is "paired" when at least one record in `h_ctgs` carries
    the same primary contig number (see getPCtgNum).

    Files written (names derive from the module-level p_file_name / output_dir):
      * paired:   <output_dir>/pwh/<stem>_<num>.fa
      * unpaired: <output_dir>/pwoh/<stem>_<num>/<stem>_<num>.fa
    The pwh/ and pwoh/ directories are created if they are missing.

    Parameters:
        p_ctgs: sequence of primary contig records (each needs an `.id`).
        h_ctgs: iterable of haplotig records (each needs an `.id`).

    Returns:
        (p_with_h, p_without_h): two lists of primary contig records, in the
        order they appear in `p_ctgs`.
    """
    # Collect the haplotig owners once so each lookup below is O(1) instead
    # of rescanning every haplotig for every primary contig.
    paired_numbers = {getPCtgNum(haplotig.id) for haplotig in h_ctgs}

    pwh_dir = output_dir + 'pwh/'
    pwoh_dir = output_dir + 'pwoh/'
    # Both output roots must exist before anything is written into them.
    os.makedirs(pwh_dir, exist_ok=True)
    os.makedirs(pwoh_dir, exist_ok=True)

    p_with_h = []
    p_without_h = []

    for pContig in p_ctgs:
        pCtgNumber = getPCtgNum(pContig.id)
        pContigName = p_file_name[:-3] + "_" + pCtgNumber
        pContigFileName = pContigName + ".fa"
        if pCtgNumber in paired_numbers:
            p_with_h.append(pContig)
            SeqIO.write(pContig, pwh_dir + pContigFileName, 'fasta')
        else:
            p_without_h.append(pContig)
            # Each unpaired contig gets its own working directory.
            directory = pwoh_dir + pContigName + '/'
            os.makedirs(directory, exist_ok=True)
            SeqIO.write(pContig, directory + pContigFileName, 'fasta')

    # Every primary contig must land in exactly one of the two groups.
    assert len(p_with_h) + len(p_without_h) == len(p_ctgs)

    return p_with_h, p_without_h

# Split the primary contigs and write their per-contig FASTA files:
# pwhs = primary contigs with a haplotig, pwohs = primary contigs without one.
pwhs, pwohs = getPairedUnpairedContigs(p_ctgs, h_ctgs)

In [79]:
# Write one combined FASTA per group next to the per-contig files; SeqIO.write
# returns the number of records written, so the cell shows the pwoh count.
SeqIO.write(pwhs, '{}pwh/{}_pwh.fa'.format(output_dir, p_file_name[:-3]), 'fasta')
SeqIO.write(pwohs, '{}pwoh/{}_pwoh.fa'.format(output_dir, p_file_name[:-3]), 'fasta')

13

In [118]:
def writeScript(pwoh, pwhs, qFilter = True):
    """Build the shell commands that align one unpaired contig against the paired set.

    The commands change into the contig's own output directory, run nucmer
    with the combined pwh FASTA as reference and the contig as query, filter
    the alignments with delta-filter, dump coordinates with show-coords and
    draw two mummerplot figures (plain and coverage).

    Parameters:
        pwoh: one primary contig without a haplotig (record with an `.id`).
        pwhs: primary contigs with haplotigs (list of records). Not read here;
            the script refers to the combined FASTA file on disk instead.
        qFilter: True uses `delta-filter -q` and 'q'-prefixed output names,
            False uses `delta-filter -g` and 'g'-prefixed output names.

    Returns:
        The commands as one newline-terminated string (no shebang line).
    """
    # The q and g variants differ only by this one letter.
    mode = 'q' if qFilter else 'g'

    # Derive every path from the notebook configuration so the script stays
    # in step with where the FASTA files were actually written.
    stem = p_file_name[:-3]
    pwhs_loc = output_dir + 'pwh/' + stem + '_pwh.fa'
    pwoh_name = stem + '_' + getPCtgNum(pwoh.id)
    specific_out_dir = output_dir + 'pwoh/' + pwoh_name + '/'
    pwoh_loc = specific_out_dir + pwoh_name + '.fa'

    base_path_0 = '/home/gamran/anaconda3/bin/'
    base_path_1 = '/usr/bin/'

    filtered = pwoh_name + '_' + mode + 'filtered'
    delta_file = filtered + '.delta'

    commands = [
        'cd ' + specific_out_dir,
        # NOTE(review): nucmer appears to write out.delta itself by default, so
        # the stdout redirect to the same file looks redundant; confirm
        # against the MUMmer manual before changing the emitted command.
        base_path_1 + 'nucmer ' + pwhs_loc + ' ' + pwoh_loc + ' > out.delta',
        base_path_1 + 'delta-filter -' + mode + ' out.delta > ' + delta_file,
        base_path_1 + 'show-coords -T ' + delta_file + ' > ' + pwoh_name + '.' + mode + 'coords',
        base_path_0 + 'mummerplot -p ' + filtered + ' --postscript ' + delta_file,
        base_path_0 + 'mummerplot -c -p ' + filtered + '_cov --postscript ' + delta_file,
    ]
    return '\n'.join(commands) + '\n'

# Preview the q-filtered commands (qFilter defaults to True) for the contig at
# index 1 of pwohs.
print(writeScript(pwohs[1], pwhs))

cd /home/gamran/genome_analysis/Warrior/Richard/output/pwoh/DK_0911_v01_p_ctg_077/
/usr/bin/nucmer /home/gamran/genome_analysis/Warrior/Richard/output/pwh/DK_0911_v01_p_ctg_pwh.fa /home/gamran/genome_analysis/Warrior/Richard/output/pwoh/DK_0911_v01_p_ctg_077/DK_0911_v01_p_ctg_077.fa > out.delta
/usr/bin/delta-filter -q out.delta > DK_0911_v01_p_ctg_077_qfiltered.delta
/usr/bin/show-coords -T DK_0911_v01_p_ctg_077_qfiltered.delta > DK_0911_v01_p_ctg_077.qcoords
/home/gamran/anaconda3/bin/mummerplot -p DK_0911_v01_p_ctg_077_qfiltered --postscript DK_0911_v01_p_ctg_077_qfiltered.delta
/home/gamran/anaconda3/bin/mummerplot -c -p DK_0911_v01_p_ctg_077_qfiltered_cov --postscript DK_0911_v01_p_ctg_077_qfiltered.delta



In [119]:
# Write two bash scripts into output_dir: one running the q-filtered mapping
# and one running the g-filtered mapping for every primary contig without a
# haplotig.
bash_script_q="ph_ctg_qmapping.sh"
bash_script_g="ph_ctg_gmapping.sh"

# `with` closes both files even if writeScript raises part-way through.
with open(output_dir + bash_script_q, 'w') as outfq, \
        open(output_dir + bash_script_g, 'w') as outfg:
    outfq.write('#!/bin/bash\n')
    outfg.write('#!/bin/bash\n')

    # Each contig contributes exactly one command block to each script.
    for pwoh in pwohs:
        outfq.write(writeScript(pwoh, pwhs, True))
        outfg.write(writeScript(pwoh, pwhs, False))