In [21]:
from Bio import SeqIO

# Directory and file names of the assembly FASTA files:
# primary contigs (p_ctg) and haplotigs (h_ctg).
FILE_PATH = '/home/gamran/genome_analysis/Warrior/genome/'
P_FILE_NAME = 'DK_0911_v03_p_ctg.fa'
H_FILE_NAME = 'DK_0911_v03_h_ctg.fa'

# FILE_PATH already ends with '/', so plain concatenation yields the full path.
P_CTG_FILE_LOC = FILE_PATH + P_FILE_NAME
H_CTG_FILE_LOC = FILE_PATH + H_FILE_NAME

# Read every record of each FASTA file into an in-memory list.
htgs = list(SeqIO.parse(H_CTG_FILE_LOC, 'fasta'))
pCtgs = list(SeqIO.parse(P_CTG_FILE_LOC, 'fasta'))

def getPCtgNum(ctg):
    '''Extract the primary contig number from a contig id string.

    The number is read from fixed positions: the three characters that
    directly follow an eight-character prefix such as 'pcontig_' or 'hcontig_'.
        'hcontig_003_048' -> '003'
        'pcontig_008'     -> '008'
    The result is a string, so leading zeros are kept.'''
    prefixLen = len('pcontig_')  # 8; 'hcontig_' has the same length
    numLen = 3
    return ctg[prefixLen:prefixLen + numLen]

def getPairedContigs(pCtgs, htgs):
    '''Split primary contigs into those with and those without haplotigs.

    A primary contig counts as having haplotigs when at least one record in
    htgs yields the same primary contig number (via getPCtgNum on its id).

    pCtgs -- iterable of records exposing an .id attribute (primary contigs)
    htgs  -- iterable of records exposing an .id attribute (haplotigs)

    Returns a tuple (pwhs, pwohs): two lists with the primary contigs that
    have haplotigs and those that do not, each in the order found in pCtgs.'''
    # A set de-duplicates the haplotig-derived numbers and makes the
    # membership test below constant-time instead of a list scan.
    unmatchedNums = {getPCtgNum(htg.id) for htg in htgs}

    pwhs = []
    pwohs = []
    for pCtg in pCtgs:
        pCtgNum = getPCtgNum(pCtg.id)
        if pCtgNum in unmatchedNums:
            pwhs.append(pCtg)
            # Each number is matched at most once; a later primary contig
            # yielding the same number would be placed in pwohs.
            unmatchedNums.remove(pCtgNum)
        else:
            pwohs.append(pCtg)
    return pwhs, pwohs

# Partition the primary contigs by whether any haplotig shares their number.
pwhs, pwohs = getPairedContigs(pCtgs, htgs)

# Total sequence length of each group of records.
sumHtgs = sum(len(rec.seq) for rec in htgs)
sumPCtgs = sum(len(rec.seq) for rec in pCtgs)
sumPwhs = sum(len(rec.seq) for rec in pwhs)
sumPwohs = sum(len(rec.seq) for rec in pwohs)

# Adjacent string literals are joined into one template before '%' is applied.
print(
    'Total number of haplotigs: %i\n'
    'Total haplotig length: %i\n\n'
    'Total number of primary contigs: %i\n'
    'Total primary contig length: %i\n\n'
    'Total number of primary contigs with haplotigs: %i\n'
    'Total primary contigs with haplotigs length: %i\n\n'
    'Total number of primary contigs without haplotigs: %i\n'
    'Total primary contigs without haplotigs length: %i'
    % (
        len(htgs), sumHtgs,
        len(pCtgs), sumPCtgs,
        len(pwhs), sumPwhs,
        len(pwohs), sumPwohs,
    )
)

Total number of haplotigs: 1176
Total haplotig length: 52126201

Total number of primary contigs: 94
Total primary contig length: 74427617

Total number of primary contigs with haplotigs: 86
Total primary contigs with haplotigs length: 73945211

Total number of primary contigs without haplotigs: 8
Total primary contigs without haplotigs length: 482406


In [18]:
# Shell cross-check of the primary contig file: drop the FASTA header lines
# (those starting with '>') and let wc print line, word and byte counts.
# The byte count includes one newline per line, so subtract the line count to
# compare with the Biopython total: 75668122 - 1240505 = 74427617.
!grep -v '^>' {P_CTG_FILE_LOC} | wc

1240505 1240505 75668122


In [19]:
# Shell cross-check of the haplotig file: drop the FASTA header lines
# (those starting with '>') and let wc print line, word and byte counts.
# The byte count includes one newline per line, so subtract the line count to
# compare with the Biopython total: 52995559 - 869358 = 52126201.
!grep -v '^>' {H_CTG_FILE_LOC} | wc

 869358  869358 52995559
