In [None]:
# default_exp synteny

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Synteny module

> Provides functionality to construct a special synteny graph from a pangenome annotation (and, optionally, the actual sequences).

In [None]:
#hide
# nbdev housekeeping cell: running notebook2script() printed the
# "Converted <notebook>.ipynb." lines shown in this cell's output.
from nbdev.showdoc import *
from nbdev.export import notebook2script
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
#exporti
import os
import glob
import json
import itertools

import pandas as pd

from skbio.io import read as skbio_read
from skbio.metadata import IntervalMetadata
from skbio.sequence import DNA

from dnasim.IO import writeFASTA
from dnasim.simulation import inverseSequence
from pangraph_constructor.utils import bidict

In [None]:
#export
def readTransMap(transMapFile,ATaccessionName='araport'):
    """Load a tab-separated orthogroup table.

    The file is read with pandas, missing cells become empty strings, the
    `Orthogroup:` column is renamed to `orthogroup`, trailing colons are
    stripped from its values and it becomes the index.

    Parameters
    ----------
    transMapFile : path or file-like object accepted by `pd.read_csv`.
    ATaccessionName : name of the column to turn into a mapping, or `None`
        to get the whole table back.

    Returns
    -------
    The indexed DataFrame when `ATaccessionName` is `None`; otherwise a
    `bidict` mapping each orthogroup to the list obtained by splitting that
    column's cell on `', '`.
    """
    table = pd.read_csv(transMapFile, delimiter='\t').fillna('')
    table = table.rename(columns={'Orthogroup:': 'orthogroup'})
    table['orthogroup'] = table['orthogroup'].str.rstrip(':')
    table = table.set_index('orthogroup')

    if ATaccessionName is None:
        return table

    refColumn = table[ATaccessionName]
    return bidict({og: refColumn[og].split(', ') for og in refColumn.index})

In [None]:
#export
def generateOrder(files, priorityAccession='TIAR10'):
    """Return the indices of `files`, with the priority accession moved first.

    The first entry of `files` that contains `priorityAccession` as a
    substring is placed at the front; all other indices keep their relative
    order. With `priorityAccession=None` the natural order is returned.
    An IndexError is raised when a priority accession is requested but no
    entry contains it.
    """
    order = list(range(len(files)))
    if priorityAccession is None:
        return order

    matches = [pos for pos, fileName in enumerate(files) if priorityAccession in fileName]
    first = matches[0]
    return [first] + order[:first] + order[first + 1:]

In [None]:
#export
def getIDs(iterator):
    """Collect the `metadata['ID']` value of every interval yielded by `iterator`."""
    return [interval.metadata['ID'] for interval in iterator]

In [None]:
#| export
def getChromosomeFromSeqID(seqID):
    """Return the second underscore-separated field of `seqID`.

    Adapt this function if the chromosome is encoded differently in the
    sequence IDs of the gff3 file. Raises IndexError when `seqID` contains
    no underscore.
    """
    # Only the first two separators matter for picking field number two.
    fields = seqID.split('_', 2)
    return fields[1]

In [None]:
#| export
def processAccession(annotationFile, sequenceFile=None,
                     ATmap=None, isRef=False, accID=None):
    """Read one accession's GFF3 annotation (and optionally its FASTA) into a gene table.

    Parameters
    ----------
    annotationFile : path of the GFF3 file, read through `skbio_read`.
    sequenceFile : optional FASTA path. When given, every record is loaded
        into a dict keyed by the record's `id` metadata.
    ATmap : optional bidict-like object mapping orthogroup -> list of
        reference gene names; its `.inverse` is used for reference accessions.
    isRef : when True, the orthogroup of each gene is looked up through
        `ATmap.inverse` by gene ID; otherwise it is taken from the gene's
        `OG` metadata attribute.
    accID : accession identifier; defaults to the annotation file's base
        name without extension.

    Returns
    -------
    (accessionID, genes, sequenceDict) where `genes` is a DataFrame sorted by
    `sequenceID` and `start` with columns geneID, orthogroup, sequenceID,
    accessionID, chromosome, forward, start, end, AT_str, geneSeq and
    overlapGenes, and `sequenceDict` is `None` when no FASTA was given.

    Raises
    ------
    ValueError
        If `isRef` is True but `ATmap` is None (raised when the first gene
        is processed).
    """
    if accID is None:
        accessionID = os.path.splitext(os.path.basename(annotationFile))[0]
    else:
        accessionID = accID
    annotationGen = skbio_read(annotationFile, format='gff3')

    sequenceDict = None

    if sequenceFile is not None:
        sequenceDict = {}
        sequenceGen = skbio_read(sequenceFile,format='fasta')
        for seq in sequenceGen:
            sequenceDict[seq.metadata['id']] = bytearray(seq.values).decode()

    genes = []
    for seqID,annotation in annotationGen:
        geneInts = annotation.query(metadata={'type':'gene'})

        for gene in geneInts:
            geneID = gene.metadata['ID'].split('.')[-1]
            if isRef and ATmap is not None:
                orthogroup = ATmap.inverse.get(geneID,[None])[0]
            elif isRef:
                raise ValueError("If reference is provided, then the TransMap (bidict) with bidirectional relation between reference and annotation IDs should be provided.")
            else:
                orthogroup = gene.metadata['OG']

            # Genes without an orthogroup are not part of the graph.
            if orthogroup is None:
                continue

            if ATmap is not None:
                atNamesStr = ATmap.get(orthogroup,[''])
            else:
                # Fixed: this branch used to assign a misspelled name
                # (`atNameStr`), leaving `atNamesStr` undefined below.
                atNamesStr = ['']
            forward = gene.metadata['strand']=='+'
            # Interval bounds are 0-based and end-exclusive (scikit-bio
            # Interval convention), so `start+1`/`end` below are the
            # 1-based inclusive coordinates stored in the table.
            start,end = gene.bounds[0]

            if sequenceDict is not None:
                # Half-open slice; the previous `end+1` added one extra base
                # and disagreed with the `[start-1:end]` slice used in
                # generatePathsLinks for the stored coordinates.
                geneSeq = sequenceDict[seqID][start:end]
            else:
                geneSeq = ''
            overlaps = getIDs(annotation.query(bounds=[(start,end)],metadata={'type':'gene'}))
            genes.append([geneID,orthogroup,seqID,accessionID,getChromosomeFromSeqID(seqID),forward,start+1,end,atNamesStr,geneSeq,overlaps])

    genes = pd.DataFrame(genes,columns=['geneID','orthogroup','sequenceID','accessionID','chromosome','forward','start','end','AT_str','geneSeq','overlapGenes'])
    genes.sort_values(by=['sequenceID','start'],inplace=True)

    return accessionID,genes,sequenceDict

In [None]:
#export
def recordSegment(name,segmentIDs,segmentIDToNumDict,sequence=None,gfaFile=None,segmentData=None):
    """Register a new segment and optionally emit its GFA `S` line.

    `name` is appended to `segmentIDs`, and `segmentIDToNumDict[name]` is set
    to its 0-based position in that list. If both `segmentData` and
    `sequence` are given, the sequence is appended to `segmentData`. If
    `gfaFile` is given, an `S` record is written whose last field is the
    sequence, or the segment name when `sequence` is None.

    Returns the 1-based segment ID (the length of `segmentIDs` after the
    append).
    """
    segmentIDs.append(name)
    segID = len(segmentIDs)
    segmentIDToNumDict[name] = segID - 1

    hasSequence = sequence is not None
    if hasSequence and segmentData is not None:
        segmentData.append(sequence)

    if gfaFile is not None:
        payload = sequence if hasSequence else name
        gfaFile.write(f'S\t{segID}\t{payload}\n')

    return segID

In [None]:
#| export
def recordAnnotation(nodeID,accessionID,sequenceID,chrID,start,end,og,atList,sequence,nodesMetadata):
    """Attach position and annotation metadata of one accession to a node.

    `nodesMetadata` is a list indexed by `nodeID-1`; a fresh dict is appended
    when `nodeID` is exactly one past its current length. For the given
    accession the function appends `{'chr', 'genomePosition'}` to `genPos`
    and stores, under `annotation`, the span `(0, length-1)` for `og` and for
    every name in `atList`, where length is `len(sequence)` or 1 for an empty
    sequence. `sequenceID` is accepted but not used.
    """
    nodeIndex = nodeID - 1
    if len(nodesMetadata) == nodeIndex:
        nodesMetadata.append({})

    # An empty sequence still gets a span of length one.
    lastPos = (len(sequence) or 1) - 1

    accessionEntry = nodesMetadata[nodeIndex].setdefault(accessionID, {})
    accessionEntry.setdefault('genPos', []).append({'chr':chrID, 'genomePosition':[start,end]})

    annotation = accessionEntry.setdefault('annotation', {})
    annotation[og] = [(0, lastPos)]
    for at in atList:
        annotation[at] = [(0, lastPos)]

In [None]:
#| export
def recordAltChr(nodeID,accessionID,chrID,start,end,nodesMetadata):
    """Append an alternative position of a node to the accession's `altChrGenPos` list.

    The accession entry must already exist in `nodesMetadata[nodeID-1]`.
    """
    accessionEntry = nodesMetadata[nodeID-1][accessionID]
    altPositions = accessionEntry.setdefault('altChrGenPos',[])
    altPositions.append({'chr':chrID, 'genomePosition':[start,end]})

In [None]:
#| export
def addLink(links,prevPathSegment,name,forward):
    '''
    Build the oriented segment string for `name` and link it to its predecessor.

    `links`: mutable; when `prevPathSegment` is not None, the new oriented
    segment is added to the set stored at `links[prevPathSegment]`.
    `prevPathSegment`: oriented string of the previous path segment, or None.

    Returns the oriented segment string `<name>\\t+` or `<name>\\t-`.
    '''
    orientation = "+" if forward else "-"
    segment = f'{name}\t{orientation}'
    if prevPathSegment is not None:
        links[prevPathSegment].add(segment)
    return segment

In [None]:
#| export
def generatePathsLinks(genesAll,sequenceID,accessionID,
                       sequences,OGList,segmentIDs,
                       nodesMetadata,
                       segmentIDToNumDict,links,usCounter,
                       doUS=True,segmentData=None,gfaFile=None):
    '''
    Walk the genes of one sequence in table order and build its graph path.

    For every gene row of `genesAll` whose `sequenceID` matches, the gene's
    orthogroup becomes (or reuses) a segment, its position/annotation is
    recorded in `nodesMetadata`, occurrences of the same orthogroup on other
    sequences are recorded as alternative positions, and a link from the
    previous path segment is added. When `doUS` is true and `sequences` is
    given, non-empty stretches between genes (and before the first / after
    the last gene) become their own `US#######` segments.

    `genesAll`: DataFrame with the columns produced by `processAccession`
    `sequences`: dict sequenceID -> sequence string, or None
    `gfaFile`: file handle to write segments to GFA file
    `OGList`: mutable; orthogroups that already have a segment
    `segmentIDs`, `segmentIDToNumDict`, `nodesMetadata`, `segmentData`: mutable
    `links`: mutable; oriented segment -> set of oriented successors
    `usCounter`: running number of US blocks; the updated value is returned

    Returns `(path, cigar, usCounter)` where `path` holds `<segID><+|->`
    strings and `cigar` holds `0M` strings.

    Note: `start`/`end` in `genesAll` are treated as 1-based inclusive, so a
    region `a..b` corresponds to the Python slice `[a-1:b]`.
    '''
    genes = genesAll.loc[genesAll.sequenceID == sequenceID]
    path = []
    cigar = []
    # 1-based coordinate of the first base not yet covered by a gene.
    # Fixed: this used to start at 0, which made the first intergenic slice
    # begin at index -1 and silently dropped the leading region.
    prevEnd = 1
    prevPathSegment = None
    curSeqID = ''
    geneChr = None
    for _,gene in genes.iterrows():
        og = gene.orthogroup

        geneSeqID = gene.sequenceID
        curSeqID = geneSeqID

        atStr = gene.AT_str
        # A single empty name means "no reference gene names".
        if len(atStr[0])>0:
            atList = atStr
        else:
            atList = []

        geneChr = gene.chromosome
        geneStart = gene.start
        geneEnd = gene.end
        geneForward = gene.forward

        if sequences is not None:
            if geneForward:
                geneSeq = sequences[geneSeqID][geneStart-1:geneEnd]
            else:
                geneSeq = inverseSequence(sequences[geneSeqID][geneStart-1:geneEnd])
        else:
            geneSeq = ''

        isUS = False
        if doUS:

            if sequences is not None:
                # Region prevEnd..geneStart-1 (1-based inclusive).
                # Fixed: the slice used to end at `geneStart`, which pulled
                # the first base of the gene into the intergenic block and
                # disagreed with the coordinates recorded just below.
                usSeq = sequences[geneSeqID][prevEnd-1:geneStart-1]
            else:
                usSeq = ''

            if len(usSeq)>0:
                isUS = True
                us = f'US{usCounter:07d}'
                usCounter += 1
                usID = recordSegment(us,segmentIDs,segmentIDToNumDict,usSeq,gfaFile=gfaFile,segmentData=segmentData)
                recordAnnotation(usID,accessionID,geneSeqID,geneChr,prevEnd,geneStart-1,us,[],usSeq,nodesMetadata)

        if og not in OGList:
            ogID = recordSegment(og,segmentIDs,segmentIDToNumDict,geneSeq,gfaFile=gfaFile,segmentData=segmentData)
            OGList.append(og)
        else:
            ogID = segmentIDs.index(og)+1

        recordAnnotation(ogID,accessionID,geneSeqID,geneChr,geneStart,geneEnd,og,atList,geneSeq,nodesMetadata)
        # Same orthogroup on other sequences of this table -> alternative positions.
        altPos = genesAll.loc[(genesAll.orthogroup == og) & (genesAll.sequenceID != sequenceID)]
        for _,altChrOG in altPos.iterrows():
            recordAltChr(ogID,accessionID,altChrOG.chromosome,altChrOG.start,altChrOG.end,nodesMetadata)

        pathAdd = [f'{ogID}{"+" if geneForward else "-"}']
        if isUS:
            pathAdd.insert(0,f'{usID}+')

        path.extend(pathAdd)

        if len(cigar)>0 and isUS:
            cigar.extend(['0M','0M']) # with previous block and between two current blocks
        else:
            cigar.append('0M') # only between current blocks or between previous and current gene
                               # without unrelated sequence (intergenic) block.

        if isUS:
            prevPathSegment = addLink(links,prevPathSegment,usID,True)
            links[prevPathSegment] = set()

        prevPathSegment = addLink(links,prevPathSegment,ogID,geneForward)
        if prevPathSegment not in links:
            links[prevPathSegment] = set()

        prevEnd = geneEnd+1

    if doUS:
        # NOTE(review): when no gene row matched `sequenceID`, `curSeqID` is
        # still '' here and the lookup below fails if `sequences` is given.
        if sequences is not None:
            usSeq = sequences[curSeqID][prevEnd-1:]
        else:
            usSeq = ''

        if len(usSeq)>0:
            us = f'US{usCounter:07d}'
            usID = recordSegment(us,segmentIDs,segmentIDToNumDict,usSeq,gfaFile=gfaFile,segmentData=segmentData)
            # Fixed: this call used to omit the chromosome and the sequence
            # arguments (8 positional arguments instead of 10) and raised a
            # TypeError for every sequence with a trailing intergenic region.
            recordAnnotation(usID,accessionID,curSeqID,geneChr,prevEnd,len(sequences[curSeqID]),us,[],usSeq,nodesMetadata)
            usCounter += 1
            path.append(f'{usID}+')
            cigar.append('0M')
            prevPathSegment = addLink(links,prevPathSegment,usID,True)

    return path,cigar,usCounter

In [None]:
#export
def writeLinks(gfaFile,links,doCigars=True):
    """Write one GFA `L` record per (source, target) pair stored in `links`.

    `links` maps an oriented source segment string to an iterable of
    oriented target strings. The overlap field is `0M` when `doCigars` is
    true and `*` otherwise.
    """
    overlap = '0M' if doCigars else '*'
    for source, targets in links.items():
        for target in targets:
            gfaFile.write(f'L\t{source}\t{target}\t{overlap}\n')
                
def writePath(gfaFile,AccessionID,path,cigar,doCigars):
    """Write a single GFA `P` record for `AccessionID`.

    The segment field is `path` joined by commas; the overlap field is
    `cigar` joined by commas when `doCigars` is true, otherwise `*`.
    """
    overlaps = ",".join(cigar) if doCigars else "*"
    segments = ",".join(path)
    gfaFile.write(f'P\t{AccessionID}\t{segments}\t{overlaps}\n')
    
def writeSegmentIDs(path,segmentIDs):
    """Serialise `segmentIDs` as JSON into the file at `path` (overwriting it)."""
    with open(path,'w') as handle:
        json.dump(segmentIDs,handle)
        
def readSegmentIDs(path):
    """Return the object decoded from the JSON file at `path`."""
    with open(path,'r') as handle:
        content = handle.read()
    return json.loads(content)