# Fasta File Reassignment

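- Inputs: original *fasta* files (`*_p_ctg.fa` primary contigs and `*_h_ctg.fa` haplotigs), list of (pwoh number, primary contig number) reassignment pairs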
- Programs: N/A
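- Purpose: reassign selected pwoh (primary contigs without haplotigs) as haplotigs of another primary contig, renaming them and moving them from the primary-contig *fasta* file to the haplotig *fasta* file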

In [2]:
import os
from Bio import SeqIO

# Version tags of the assembly that is read in and of the one that is written out.
GENOME_IN_VERSION = 'v01'
GENOME_OUT_VERSION = 'v032'

# Input directory (note the trailing slash: file names are appended directly)
# and output directory (no trailing slash: joined with os.path.join later).
GENOME_PATH_IN = '/home/gamran/genome_analysis/Warrior/genome_%s/' % GENOME_IN_VERSION
GENOME_PATH_OUT = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s' % GENOME_OUT_VERSION

# File-name stems shared by the primary-contig and haplotig fasta files.
GENOME_IN = 'DK_0911_%s' % GENOME_IN_VERSION
GENOME_OUT = 'DK_0911_%s' % GENOME_OUT_VERSION

# Full paths of the haplotig (h) and primary contig (p) input files.
H_FILE_LOC = ''.join((GENOME_PATH_IN, GENOME_IN, '_h_ctg.fa'))
P_FILE_LOC = ''.join((GENOME_PATH_IN, GENOME_IN, '_p_ctg.fa'))

# Read every record into memory so the lists can be edited before writing.
htgs = list(SeqIO.parse(H_FILE_LOC, 'fasta'))
pCtgs = list(SeqIO.parse(P_FILE_LOC, 'fasta'))

def getPCtgNum(ctg):
    '''Extract the primary contig number from a contig ID string.

    The number is taken from fixed character positions 8-10 of the ID, which
    is where it sits in both naming schemes:
    e.g. hcontig_003_048 will return 003
    e.g. pcontig_008 will return 008

    The result is returned as a string (leading zeros are kept); IDs shorter
    than the expected layout yield a shorter or empty string.'''
    numberField = slice(8, 11)
    return ctg[numberField]

def getPCtg(num, pCtgs):
    '''Look up a primary contig by its ID number.

    num   -- contig number as an int (compared against int(getPCtgNum(id)))
    pCtgs -- iterable of records exposing an ``id`` attribute

    Returns the first matching record. If none matches, an error message is
    printed and None is returned.'''
    matching = (candidate for candidate in pCtgs
                if int(getPCtgNum(candidate.id)) == num)
    found = next(matching, None)
    if found is not None:
        return found
    print('ERROR: primary contig of number: ' + str(num) + ' could not be found.')
    return None

def changeName(htg, pCtgNum):
    '''Build the haplotig ID for a record that is being attached to a primary contig.

    htg     -- record whose ``id`` is rewritten (the record itself is not modified)
    pCtgNum -- number of the primary contig the record is assigned to; it is
               converted to a string and left-padded with zeros to three characters

    The new ID is 'h' + the old ID without its first character and without its
    last three characters + the padded number + '_' + the old ID's last three
    characters, e.g. pcontig_049 with pCtgNum 5 gives hcontig_005_049.'''
    paddedNum = str(pCtgNum).rjust(3, '0')
    oldId = htg.id
    stem = oldId[1:-3]      # old ID minus leading letter and trailing number
    ownNum = oldId[-3:]     # the record's own three-character number
    return ''.join(['h', stem, paddedNum, '_', ownNum])

def assignPwohToPwh(pwohNum, pCtgNum, htgs, pCtgs):
    '''Takes a pwoh, changes its id and removes it from pCtgs
    and adds it to htgs. Returns list of pCtgs and htgs.
    For example, if pwoh.id = 049 and pCtgNum = 005,
    pcontig_049 will become hcontig_005_049.

    pwohNum -- int number of the primary contig to convert into a haplotig
    pCtgNum -- number of the primary contig the new haplotig is assigned to
    htgs    -- list of haplotig records; modified in place (record appended)
    pCtgs   -- list of primary contig records; modified in place (record removed)

    Returns the tuple (pCtgs, htgs), i.e. the same two list objects.

    Raises ValueError if no record in pCtgs has the number pwohNum. Without
    this the function would return None and the caller's tuple unpacking
    would fail with an unrelated TypeError.

    NB: if, for example, hcontig_005_049 already exists,
    this is not accounted for in this function and may cause an error.'''

    for i, pCtg in enumerate(pCtgs):
        if int(getPCtgNum(pCtg.id)) == pwohNum:
            # Safe to pop while iterating: we return right after the removal.
            htg = pCtgs.pop(i)

            newId = changeName(htg, pCtgNum)
            htg.id = newId
            # Presumably keeps the old name out of the written fasta header;
            # verify against the SeqIO fasta writer.
            htg.description = newId

            htgs.append(htg)
            return pCtgs, htgs

    raise ValueError('primary contig of number: ' + str(pwohNum) + ' could not be found.')

def assignManyPwohToPwh(pairs, htgs, pCtgs):
    '''Apply a series of pwoh reassignments, one after another.

    pairs -- iterable of (pwohNum, pCtgNum) pairs; each pwoh is converted into
             a haplotig paired with pCtgNum via assignPwohToPwh
    htgs  -- list of haplotig records
    pCtgs -- list of primary contig records

    Returns the tuple (pCtgs, htgs) as returned by the last reassignment, or
    the arguments unchanged when pairs is empty.'''
    for pair in pairs:
        sourceNum, targetNum = pair
        pCtgs, htgs = assignPwohToPwh(sourceNum, targetNum, htgs, pCtgs)
    return pCtgs, htgs


# (pwohNum, pCtgNum) pairs: the primary contig numbered pwohNum is renamed and
# moved to the haplotig list as a haplotig of primary contig pCtgNum.
assignmentPairs = [
    (86, 39),
    (96, 33),
    (97, 39),
    (100, 33),
    (103, 74),
]

pCtgs, htgs = assignManyPwohToPwh(assignmentPairs, htgs, pCtgs)

# makedirs also creates missing parent directories, and exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs(GENOME_PATH_OUT, exist_ok=True)
SeqIO.write(pCtgs, os.path.join(GENOME_PATH_OUT, GENOME_OUT + '_p_ctg.fa'), 'fasta')
SeqIO.write(htgs, os.path.join(GENOME_PATH_OUT, GENOME_OUT + '_h_ctg.fa'), 'fasta')

1176