   - 5(i) `DK_0911_contig_lengths`
     - Inputs: .*fasta* files
     - Programs: N/A
     - Purpose: calculate genomic information pertaining to number and size of contigs. `PWH_SIZE` is required for `DK_0911_assemblytics_analysis`.
     
NB: this notebook, as it is written here, is meant to be run from another notebook, `DK_0911_assemblytics_analysis` (using command `%run DK_0911_contig_lengths.ipynb`).

In [None]:
from Bio import SeqIO


# IF NOT IMPORTING FROM ANOTHER IPYNB, SET GENOME_VERSION IN THIS NOTEBOOK

# GENOME_VERSION must already exist in the namespace (e.g. set by the notebook
# that %run's this one); stop early with an explicit message when it does not.
try:
    GENOME_VERSION
except NameError:
    raise Exception('Please define GENOME_VERSION in the local ipynb to use DK_0911_contig_lengths.')

# Directory holding the assembly output for this genome version.
FILE_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s/' % GENOME_VERSION

# FASTA file names for the primary contigs (p_ctg) and haplotigs (h_ctg).
P_FILE_NAME = 'DK_0911_%s_p_ctg.fa' % GENOME_VERSION
H_FILE_NAME = 'DK_0911_%s_h_ctg.fa' % GENOME_VERSION

H_CTG_FILE_LOC = FILE_PATH + H_FILE_NAME
P_CTG_FILE_LOC = FILE_PATH + P_FILE_NAME

# Materialise every record as a list: the records are iterated several times
# below, and SeqIO.parse only yields them once.
htgs = list(SeqIO.parse(H_CTG_FILE_LOC, 'fasta'))
pCtgs = list(SeqIO.parse(P_CTG_FILE_LOC, 'fasta'))

def getPCtgNum(ctg):
    '''Extract the primary contig number from a contig id string.

    The number is read from a fixed position: the three characters that
    follow the 8-character prefix ("hcontig_" or "pcontig_").
    e.g. "hcontig_003_048" -> "003"
    e.g. "pcontig_008"     -> "008"

    ctg: contig id as a string.
    Returns the three-character slice as a string (shorter, or empty, when
    the id is too short to contain it).'''
    numStart = 8            # length of the "hcontig_" / "pcontig_" prefix
    numEnd = numStart + 3   # the contig number is three characters wide
    return ctg[numStart:numEnd]

def getPairedContigs(pCtgs, htgs):
    '''Split primary contigs into those with and those without haplotigs.

    pCtgs: iterable of records for the primary contigs; each must expose
           an `id` attribute understood by getPCtgNum.
    htgs:  iterable of records for the haplotigs, with the same requirement.

    Returns a tuple (pwhs, pwohs):
      pwhs  - list of primary contigs whose number matches at least one haplotig
      pwohs - list of the remaining primary contigs
    Both lists keep the order in which the contigs appear in pCtgs.'''

    # Primary contig numbers that own at least one haplotig. A set gives
    # constant-time membership tests and removes duplicates for free.
    unmatchedNums = {getPCtgNum(htg.id) for htg in htgs}

    pwhs = []
    pwohs = []

    for pCtg in pCtgs:
        pCtgNum = getPCtgNum(pCtg.id)
        if pCtgNum in unmatchedNums:
            pwhs.append(pCtg)
            # Each number is consumed by the first primary contig that claims
            # it, so a later primary contig with the same number is classed
            # as having no haplotigs.
            unmatchedNums.discard(pCtgNum)
        else:
            pwohs.append(pCtg)
    return pwhs, pwohs

# Partition the primary contigs by whether any haplotig is paired with them.
pwhs, pwohs = getPairedContigs(pCtgs, htgs)

# Total sequence length of all haplotigs and of all primary contigs.
sumHtgs = sum(len(record.seq) for record in htgs)
sumPCtgs = sum(len(record.seq) for record in pCtgs)

# Total sequence length of primary contigs with / without haplotigs.
sumPwhs = sum(len(record.seq) for record in pwhs)
sumPwohs = sum(len(record.seq) for record in pwohs)

def getPwhSize():
    '''Return sumPwhs, the summed sequence length of all primary contigs
    that have haplotigs (as computed at module level).'''
    return sumPwhs

# Report the count and total length for each contig category.
print(
    (
        'Total number of haplotigs: %i\n'
        'Total haplotig length: %i\n\n'
        'Total number of primary contigs: %i\n'
        'Total primary contig length: %i\n\n'
        'Total number of primary contigs with haplotigs: %i\n'
        'Total primary contigs with haplotigs length: %i\n\n'
        'Total number of primary contigs without haplotigs: %i\n'
        'Total primary contigs without haplotigs length: %i'
    )
    % (
        len(htgs), sumHtgs,
        len(pCtgs), sumPCtgs,
        len(pwhs), sumPwhs,
        len(pwohs), sumPwohs,
    )
)

In [None]:
# !grep -v '^>' {P_CTG_FILE_LOC} | wc

In [None]:
# !grep -v '^>' {H_CTG_FILE_LOC} | wc