In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

### code profiling help

https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html

`%time`: Time the execution of a single statement  
`%timeit`: Time repeated execution of a single statement for more accuracy  
`%prun`: Run code with the profiler  
`%lprun`: Run code with the line-by-line profiler  
`%memit`: Measure the memory use of a single statement  
`%mprun`: Run code with the line-by-line memory profiler  


## Install

Change into the library's directory and run:

`pip install .`

## How to use

In [None]:
from nbdev import nbdev_export
nbdev_export()

In [None]:
from redis import Redis

In [None]:
import cProfile
import pstats
from pstats import SortKey

In [None]:
import pdb

import os
import glob
import re
import time
import pdb
from random import shuffle
from copy import deepcopy
import joblib

import numpy as np
import networkx as nx

from pangraph_constructor.graph import GenomeGraph
from pangraph_constructor.synteny import generateOrder,readTransMap
from pangraph_constructor.tree import TremauxTree
from pangraph_constructor.utils import pathFileToPathDict,resetDB
from pangraph_constructor.utils import iset_add,iset_score

In [None]:
resetDB()

0

In [None]:
# Flush the Redis server (FLUSHALL) and release the connection afterwards.
redisConn = Redis(host='redis',port=6379)
try:
    redisConn.flushall()
finally:
    # Close the connection and drop the name even if the flush fails, so a
    # failed run does not leave an open connection behind in the kernel.
    redisConn.close()
    del redisConn

In [None]:
from pangraph_constructor.utils import adjustZoomLevels,pathConvert
from pangraph_constructor.exportDev import exportToPantograph

In [None]:
import warnings
# Silence every warning category by default ...
warnings.simplefilter('ignore')
# ... but always show RuntimeWarnings. New filters are inserted at the front of
# the filter list, so this call must come after the blanket 'ignore' above to
# take precedence; do not reorder the two calls.
warnings.filterwarnings('always',category=RuntimeWarning)

# Generating graphs

## Generating from annotation

In [None]:
datadir = '../../1001G/annotations/freeze2.1'
gfadir = '../../1001G/pantograph/data'

In [None]:
# Locate the input files below `datadir`.
# os.path.join is used for every component so that separators are never
# hard-coded or mixed (the reference paths previously combined os.path.sep
# with a literal '/').
annotationFiles = sorted(glob.glob(os.path.join(datadir, '*.gff')))
sequenceFiles = sorted(glob.glob(os.path.join(datadir, 'sequences', '*.fasta')))
transMapFile = os.path.join(datadir, 'TransMap', 'TransMap.map')
refAnnotationFile = os.path.join(datadir, 'outgroups', 'araport.gff')
refSequenceFile = os.path.join(datadir, 'outgroups', 'araport.fasta')

In [None]:
refAnnotationFile,refSequenceFile,annotationFiles,sequenceFiles,transMapFile

('../../1001G/annotations/freeze2.1/outgroups/araport.gff',
 '../../1001G/annotations/freeze2.1/outgroups/araport.fasta',
 ['../../1001G/annotations/freeze2.1/10002.gff',
  '../../1001G/annotations/freeze2.1/10015.gff',
  '../../1001G/annotations/freeze2.1/10024.gff',
  '../../1001G/annotations/freeze2.1/1741.gff',
  '../../1001G/annotations/freeze2.1/22001.gff',
  '../../1001G/annotations/freeze2.1/22002.gff',
  '../../1001G/annotations/freeze2.1/22003.gff',
  '../../1001G/annotations/freeze2.1/22004.gff',
  '../../1001G/annotations/freeze2.1/22005.gff',
  '../../1001G/annotations/freeze2.1/22006.gff',
  '../../1001G/annotations/freeze2.1/22007.gff',
  '../../1001G/annotations/freeze2.1/6024.gff',
  '../../1001G/annotations/freeze2.1/6069.gff',
  '../../1001G/annotations/freeze2.1/6124.gff',
  '../../1001G/annotations/freeze2.1/6244.gff',
  '../../1001G/annotations/freeze2.1/6909.gff',
  '../../1001G/annotations/freeze2.1/6966.gff',
  '../../1001G/annotations/freeze2.1/8236.gff',
  '.

In [None]:
ATmap = readTransMap(transMapFile)

In [None]:
fileOrder = generateOrder(annotationFiles,priorityAccession=None)
fileOrder

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26]

In [None]:
# GenomeGraph(annotationFiles=annotationFiles,
#                         sequenceFiles=None,
#                         fileOrder=fileOrder,
#                         doUS=doUS,
#                         seqSuffix=seqSuffix,
#                         refAnnotationFile=refAnnotationFile,
#                         refAccession='TAIR10',
#                         transMap=ATmap)

In [None]:
# def __init__(self,gfaPath=None,doOverlapCleaning=True,
#                  paths=None,
#                  nodes=None,nodesData=None,links=None,
#                  pathsDict=None,
#                  sequenceFiles=None,annotationFiles=None,
#                  doBack=False,**kwargs)
self._graphFromAnnotation(annotationFiles,sequenceFiles,**kwargs)

In [None]:
# Display the values that would be passed on to graph construction.
# The translation map is stored as `ATmap` (not `ATMap`).
# NOTE(review): doUS, seqSuffix and refAccession are not assigned in any cell
# above this one - confirm they exist in the kernel before running.
annotationFiles,fileOrder,doUS,seqSuffix,ATmap,refAnnotationFile,refAccession#'TAIR10'

In [None]:
# _graphFromAnnotation(self,annotationFiles,sequenceFiles=None,**kwargs):
# Scratch copy of the method body named above, kept for step-by-step debugging.
# NOTE(review): `self` and `kwargs` are not defined at notebook level in any
# cell shown above - this cell presumably only runs inside a debugging session;
# confirm before executing.

# Reset the node-name lookup.
self.nodeNameToID = {}

# Default processing order: the annotation files in their listed order.
fileOrder = kwargs.get('fileOrder',list(range(len(annotationFiles))))

doUS = kwargs.get('doUS',False)
self.usCounter = 0
self.OGList = []
# Work on the set form of the links; converted back to a dict at the end.
links = self._linksDictToSet(self.forwardLinks)

# The reference annotation, when given, is processed first with isRef=True.
if 'refAnnotationFile' in kwargs:
    links = self._processAnnotation(kwargs['refAnnotationFile'], links,
                                    ATmap=kwargs.get('transMap', None),
                                    seqFile=kwargs.get('refSequenceFile',None),
                                    seqSuffix=kwargs.get('seqSuffix', None),
                                    doUS=doUS,
                                    isRef=True,
                                    accID=kwargs.get('refAccession',None))

# Then every accession, in `fileOrder`; `links` is threaded through each call.
for fileNum in fileOrder:
    # A sequence file is used only when a list of sequence files was supplied.
    if sequenceFiles is not None:
        seqFile = sequenceFiles[fileNum]
    else:
        seqFile = None
    # Inputs per accession: its annotation file and the shared transMap.
    links = self._processAnnotation(annotationFiles[fileNum], links,
                                    ATmap=kwargs.get('transMap', None),
                                    seqFile=seqFile,
                                    seqSuffix=kwargs.get('seqSuffix', None),
                                    doUS=doUS)


# Store the accumulated links back in dict form.
self.forwardLinks = self._linksSetToDict(links)

In [None]:
# _processAnnotation(self, annotationFile, links, ATmap=None, seqFile=None, seqSuffix=None, doUS=False, isRef=False, accID=None):

In [None]:
annotationFiles[2]

'../../1001G/annotations/freeze2.1/10024.gff'

In [None]:
from pangraph_constructor.synteny import processAccession,generatePathsLinks
from skbio import read as skbio_read
from skbio.metadata import IntervalMetadata

In [None]:
ann = skbio_read(annotationFiles[0], format='gff3', into=IntervalMetadata, seq_id='10002_Chr1')

In [None]:
q = ann.query(bounds=((252670,252680),),metadata={'type':'gene'})
list(q)

[Interval(interval_metadata=<140264681975920>, bounds=[(252677, 255684)], fuzzy=[(False, False)], metadata={'source': 'EVM', 'type': 'gene', 'score': '.', 'strand': '+', 'ID': 'evm.TU.10002_Chr1.68', 'Name': 'EVM%20prediction%2010002_Chr1.68', 'OG': 'OG0001856', 'AT': 'AT1G01690,AT1G01695'})]

In [None]:
accessionID, genes, sequences = \
        processAccession(annotationFiles[2],
                         None,
                         ATmap=ATmap,
                         isRef=False,
                         accID=None)


In [None]:
genes.loc[genes.orthogroup=='OG0000490']

Unnamed: 0,geneID,orthogroup,sequenceID,accessionID,chromosome,forward,start,end,AT_str,geneSeq,overlapGenes
13382,42,OG0000490,10024_Chr1,10024,Chr1,False,153796,155238,"[AT1G01390, AT1G01420, AT4G01070]",,[evm.TU.10024_Chr1.42]
13385,45,OG0000490,10024_Chr1,10024,Chr1,False,160034,161479,"[AT1G01390, AT1G01420, AT4G01070]",,[evm.TU.10024_Chr1.45]
130,123,OG0000490,10024_Chr4,10024,Chr4,False,456556,457998,"[AT1G01390, AT1G01420, AT4G01070]",,[evm.TU.10024_Chr4.123]


In [None]:
genes.loc[(genes.orthogroup=='OG0000490') & (genes.sequenceID!='10024_Chr1')]

Unnamed: 0,geneID,orthogroup,sequenceID,accessionID,chromosome,forward,start,end,AT_str,geneSeq,overlapGenes
130,123,OG0000490,10024_Chr4,10024,Chr4,False,456556,457998,"[AT1G01390, AT1G01420, AT4G01070]",,[evm.TU.10024_Chr4.123]


In [None]:
# Scratch fragment that appears to be lifted from `_processAnnotation`; the
# pasted indentation was inconsistent (IndentationError) and is normalised here.
# NOTE(review): relies on `self`, `links`, `isRef`, `seqSuffix` and `doUS`
# existing in the kernel - confirm before running.

# Without a suffix, process every sequence of the accession in sorted order;
# otherwise only the single sequence named <accessionID><seqSuffix>.
if seqSuffix is None:
    seqList = genes.sequenceID.unique().tolist()
    seqList.sort()
else:
    seqList = [f'{accessionID}{seqSuffix}']

# Concatenate the per-sequence paths into one path for the accession.
path = []
for seqID in seqList:

    p, cigar, usCounter = generatePathsLinks(genes.loc[genes.sequenceID == seqID], seqID, accessionID, sequences, self.OGList,
                                             self.nodes, self.nodesMetadata, self.nodeNameToID, links,
                                             self.usCounter, doUS=doUS, segmentData=self.nodesData)
    path = path + p

# The reference accession goes first; all others are appended.
if isRef:
    self.paths.insert(0,path)
    self.accessions.insert(0,accessionID)
else:
    self.paths.append(path)
    self.accessions.append(accessionID)

In [None]:
nbdev_export()

In [None]:
doUS = False

# One graph per chromosome, Chr1 through Chr5: build, sort, verify, export.
for chrnum in range(1,6):
    seqID = f'Chr{chrnum}'
    seqSuffix = f'_{seqID}'
    gfaFilename = f'AT_{seqID}_OGOnly_2.1.gfa'

    print(f'\nProcessing {seqID}\n============')

    # Stage 1: build the graph from the annotation files (timed).
    curtst = time.time()
    graph = GenomeGraph(
        annotationFiles=annotationFiles,
        sequenceFiles=None,
        fileOrder=fileOrder,
        doUS=doUS,
        seqSuffix=seqSuffix,
        refAnnotationFile=refAnnotationFile,
        refAccession='TAIR10',
        transMap=ATmap,
    )
    elapsed = time.time() - curtst
    print(f'Generating graph for {seqID} took {elapsed} seconds')

    # Stage 2: sort the graph (timed).
    curtst = time.time()
    graph.treeSort()
    elapsed = time.time() - curtst
    print(f'Sorting graph for {seqID} took {elapsed} seconds')

    # The order must account for every node before the graph is written out.
    assert len(graph.nodes)==len(graph.order)

    # Stage 3: write the graph to GFA without sequences.
    gfaTarget = f'{gfadir}{os.path.sep}{gfaFilename}'
    graph.toGFA(gfaTarget,doSeq=False)
    # graph.toGFA(f'{gfadir}{os.path.sep}unordered_{gfaFilename}',doSeq=False)


Processing Chr1
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Node 7455 inverted
Generating graph for Chr1 took 2385.88067817688 seconds
Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 7454/7455
Sorting graph for Chr1 took 138.0292296409607 seconds

Processing Chr2
Calculating nodes length...
Processing node 4877/4877
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Node 4873 inverted
Generating graph for Chr2 took 1497.942242860794 seconds
Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 4876/4877
Sorting graph for Chr2 took 21.346941709518433 seconds

Processing Chr3
Calculating nodes length...
Processing node 8442/8442
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28

In [None]:
curT = time.localtime()
# Build the text from adjacent literals. A backslash continuation inside the
# string keeps the next line's leading indentation, which put a long run of
# spaces in the middle of the notification.
message = ("Generating gene graphs for all chromosome finished at "
           f"{curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!")
!ntfy send "{message}"

## Loading Pathfile to graph

In [None]:
# `notebook2script` is never imported in this notebook; use `nbdev_export`,
# which the "How to use" cell imports from nbdev.
nbdev_export()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
# For v1
pathfileDir = '../../1001G/coreGraph'

# One path file per chromosome number 1..5.
for seqNum in range(1,6):
    pathsfile = f'paths{seqNum:d}.txt'

    # Load the paths, then rebuild the dict so that accessions are inserted
    # in sorted order.
    _paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True,'TAIR10')
    paths = {}
    for accession in sorted(_paths):
        paths[accession] = _paths[accession]
    del _paths

    coregraph = GenomeGraph(pathsDict=paths)

    # On undirected coregraph sorting is not optimal! Check sorting!!!
    coregraph.treeSort()

    # The order must account for every node before exporting.
    assert len(coregraph.nodes)==len(coregraph.order)

    # Write the sorted graph next to the path files.
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{seqNum:d}.gfa'
    coregraph.toGFA(coreGFApath,False)

Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 34/35
Calculating nodes length...
Processing node 19/19
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 18/19
Calculating nodes length...
Processing node 26/26
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 25/26
Calculating nodes length...
Processing node 15/15
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
P

In [None]:
# For v2
pathfileDir = '../../1001G/coreGraph/new_Nov2022/'

pathsfile = 'paths.txt'

# os.path.join copes with the trailing separator in pathfileDir, so the
# assembled paths no longer contain a doubled '/'.
paths = pathFileToPathDict(os.path.join(pathfileDir,pathsfile),True,'TAIR10',True)

# One core graph per key of `paths`; the key is used as the chromosome label
# in the output file name.
for seqNum in paths:
    coregraph = GenomeGraph(pathsDict=paths[seqNum])

    # On undirected coregraph sorting is not optimal! Check sorting!!!
    coregraph.treeSort()

    # The order must account for every node before exporting.
    assert len(coregraph.nodes)==len(coregraph.order)

    coreGFApath = os.path.join(pathfileDir,f'coregraph_v2_Chr{seqNum}.gfa')

    coregraph.toGFA(coreGFApath,False)

Calculating nodes length...
Processing node 33/33
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 32/33
Calculating nodes length...
Processing node 19/19
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 18/19
Calculating nodes length...
Processing node 24/24
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 23/24
Calculating nodes length...
Processing node 14/14
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
P

In [None]:
curT = time.localtime()
# Build the text from adjacent literals. A backslash continuation inside the
# string keeps the next line's leading indentation, which put a long run of
# spaces in the middle of the notification.
message = ("Generating chain graphs for all chromosome finished at "
           f"{curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!")
!ntfy send "{message}"

## Loading test sorting graph

In [None]:
pathfileDir = '../../1001G/pantograph/data'
pathsfile = 'testCollapse_path.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True)

In [None]:
coregraph = GenomeGraph(pathsDict=paths,nodeNameLength=4)

In [None]:
coregraph.treeSort()

In [None]:
coregraph.tremauxTree.draw()

In [None]:
len(coregraph.nodes),len(coregraph.order)

In [None]:
coregraph.nodes,coregraph.order

In [None]:
[coregraph.nodes[i-1] for i in coregraph.order]

In [None]:
coreGFApath = f'{pathfileDir}{os.path.sep}testCollapse.gfa'

In [None]:
coregraph.toGFA(coreGFApath,False)

# Loading graph from GFA

## Large graph

In [None]:
# `notebook2script` is never imported in this notebook; use `nbdev_export`,
# which the "How to use" cell imports from nbdev.
nbdev_export()

In [None]:
f'{gfadir}{os.path.sep}{gfaFilename}'

In [None]:
graph_new = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}{gfaFilename}',isGFASeq=False)

In [None]:
graph_new.treeSort()

In [None]:
assert len(graph_new.nodes)==len(graph_new.order)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename

In [None]:
gfadir

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=False)

In [None]:
!ntfy send "Sorting gene graph finished!"

## Large graph with unrelated sequences and real sequences

In [None]:
graph_new2 = GenomeGraph(gfaPath='./test_long_graph_full_seq.gfa',isGFASeq=True)

In [None]:
len(graph_new2.accessions)

In [None]:
graph_new2.treeSort()

In [None]:
len(graph_new2.order),len(graph_new2.nodes),len(graph_new2.tremauxTree),len(graph_new2.tremauxTree.originalGraph)

In [None]:
gfaFilename

In [None]:
gfadir

In [None]:
gfaFilename = 'AT_Chr1_OGOnly_Seq.gfa'

In [None]:
graph_new2.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

## Christian's test graph

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/graphSorting/graphs/sixRef.panGenome.Chroms.pggb.Chr2.5mb.gfa',isGFASeq=True)

In [None]:
len(graph_new.accessions)

In [None]:
cProfile.run('graph_new.treeSort()','treesort_profile')

In [None]:
p = pstats.Stats('treesort_profile')
p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats()

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename='testChristian_new.gfa'

In [None]:
gfadir

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

In [None]:
shuffle(graph_new.order)

In [None]:
gfaFilename='testChristian_shuffled.gfa'

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/pantograph/data/testChristian_shuffled.gfa',isGFASeq=True)

## Christian's full graph (Chr1)

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa',
                        isGFASeq=True,
                       doOverlapCleaning=False,
                       accessionsToRemove=['Consensus'])

In [None]:
len(graph_new.accessions)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
graph_new.treeSort()

In [None]:
assert len(graph_new.nodes)==len(graph_new.order)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename='Christian_chr1.gfa'

In [None]:
gfadir='../../1001G/chrisGraph/'

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

In [None]:
!ntfy send "Sorting nucleotide graph finished!"

## Small simple graph

In [None]:
graph2 = GenomeGraph(gfaPath='./shorttest.gfa',isGFASeq=False)

In [None]:
graph2.forwardLinks

In [None]:
graph2.treeSort()

In [None]:
graph2.tremauxTree.draw_original()

In [None]:
graph2.tremauxTree.draw()

In [None]:
graph2.order

In [None]:
graph2.toGFA('../../1001G/pantograph/data/shorttest1.gfa',doSeq=False)

## Small graph with complex loops

In [None]:
graph3 = GenomeGraph(gfaPath='./shorttest2.gfa',isGFASeq=False)

In [None]:
graph3.paths

In [None]:
graph3.treeSort()

In [None]:
pdb.pm()

In [None]:
graph3.tremauxTree.draw_original()

In [None]:
graph3.tremauxTree.draw()

In [None]:
len(graph3.order),len(graph3.nodes)

In [None]:
graph3.toGFA('../../1001G/pantograph/data/shorttest2.gfa',doSeq=False)

## Small graph with very complex relationships

In [None]:
graph4 = GenomeGraph(gfaPath='./shorttest3.gfa',isGFASeq=False)

In [None]:
graph4.paths

In [None]:
graph4.treeSort()

In [None]:
graph4.tremauxTree.draw_original()

In [None]:
graph4.tremauxTree.draw()

In [None]:
graph4.order

In [None]:
graph4.toGFA('../../1001G/pantograph/data/shorttest3.gfa',doSeq=False)

## Small graph with self-loops

In [None]:
graph5 = GenomeGraph(gfaPath='./shorttest_loop.gfa',isGFASeq=False)

In [None]:
graph5.paths

In [None]:
graph5.treeSort()

In [None]:
graph5.tremauxTree.draw_original()

In [None]:
graph5.tremauxTree.draw()

In [None]:
graph5.order

In [None]:
graph5.toGFA('../../1001G/pantograph/data/shorttest4.gfa',doSeq=False)

# Exporting to Front-end

### Exporting presentation graph

In [None]:
pathfileDir = '../../Meetings/1001G+_20220518/'
# pathfileDir already ends with a separator; os.path.join avoids the doubled
# '/' that appending os.path.sep produced.
coreGFApath = os.path.join(pathfileDir,'paths_presentation.gfa')

In [None]:
genome = GenomeGraph(coreGFApath,isGFASeq=True)

Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths



In [None]:
genome.nodesData

['ATGCC', 'GCACGTTC', 'ATTTGCCCAA', 'AG', 'CTGAA', 'ATCCCA', 'GCATTCA']

In [None]:
# Reload the graph, replace every annotation's interval list with a single
# interval running from 0 to the node's last index, and write the graph back
# to the same GFA file.
genome = GenomeGraph(coreGFApath,isGFASeq=True)

for nodeID,node in enumerate(genome.nodesAnnotation):
    for seqName,seqDict in node.items():
        for annText in seqDict.keys():
            # Last valid index of this node's data.
            lastPos = len(genome.nodesData[nodeID])-1
            genome.nodesAnnotation[nodeID][seqName][annText] = [(0,lastPos)]

# Overwrites the input file in place.
genome.toGFA(coreGFApath,doSeq=True)

Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths



In [None]:
# genome = GenomeGraph(coreGFApath,isGFASeq=True)
genome.nodesAnnotation

[{}, {}, {}, {}, {}, {}, {}]

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,2,4,8,16]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../Meetings/1001G+_20220518', 'paths_presentation_new', [1, 2, 4, 8, 16])

In [None]:
redisConn=None

In [None]:
# `notebook2script` is never imported in this notebook; use `nbdev_export`,
# which the "How to use" cell imports from nbdev.
nbdev_export()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:

#%%capture output
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
#                 invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
runTime = time.time() - startTime

Loading Genome
Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Found nodeNames file ../../Meetings/1001G+_20220518/nodeNames_paths_presentation.json, loading names.
Found node annotation file ../../Meetings/1001G+_20220518/annotation_paths_presentation.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to ../../Meetings/1001G+_20220518/paths_presentation_new
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 2/2
Preprocessing interconnected links finished.

Processing path breaks fin

In [None]:
import pdb
pdb.pm()

> [0;32m/data/YandexDisk/Kew/src/graphConstruction/pangraph_constructor/exportDev.py[0m(3902)[0;36mexportToPantograph[0;34m()[0m
[0;32m   3900 [0;31m    [0;32mif[0m [0mfillZoomLevels[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3901 [0;31m        [0mmaxLinkLength[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mlinkLengths[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3902 [0;31m        [0mmaxRearrangementLength[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mblockEdges[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3903 [0;31m        [0mmaxBlock[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mmaxLinkLength[0m[0;34m,[0m[0mmaxRearrangementLength[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3904 [0;31m[0;34m[0m[0m
[0m


ipdb>  blockEdges


{}


ipdb>  linkLengths


{25: {(3, 1), (4, 5), (1, 4)}, 36: {(6, 1), (5, 6)}}


ipdb>  q


### Testing removable elements identification

In [None]:
from pangraph_constructor.exportDev import getRemovableStructures

In [None]:
import IPython.display as ipd
import joblib

In [None]:
# Load the stored reference results when the file exists; otherwise start
# with an empty collection.
# NOTE(review): joblib files are pickle-based - only load files you trust.
resultsFile = './tests/breakIdentify.dat'
correctResults = joblib.load(resultsFile) if os.path.exists(resultsFile) else {}

In [None]:
# `notebook2script` is never imported in this notebook; use `nbdev_export`,
# which the "How to use" cell imports from nbdev.
nbdev_export()

In [None]:
path = '../../1001G/GraphCollapsing/TestGraphs'
filePrefix = 'test'

In [None]:
# Regression test: recompute the removable structures for every stored case
# and compare them with the reference values kept in `correctResults`.

for filename,resDict in correctResults.items():
    print(f'\n####### Testing on case {filename} ########')
    graph = GenomeGraph(f'{path}/{filename}',isGFASeq=False)
    linksLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)
    # Name the case and the structure in each message so a failure can be
    # traced without re-running the loop.
    assert linksLengths==resDict['linksLengths'], f'{filename}: linksLengths differ from the stored reference'
    assert pairedLinks==resDict['pairedLinks'], f'{filename}: pairedLinks differ from the stored reference'
    assert blockEdges==resDict['blockEdges'], f'{filename}: blockEdges differ from the stored reference'

In [None]:
caseNum = 17
filename = f'{filePrefix}{caseNum:02d}.gfa'


print('############')
print(f'Graph from file {filename}')
coreGFApath = f'{path}/{filename}'
graph = GenomeGraph(coreGFApath,isGFASeq=False)
print('Graph Paths:')
ipd.display(graph.paths)
linkLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)

print('Link-Lengths associations:')
ipd.display(linkLengths)

print('PairedLinks:')
ipd.display(pairedLinks)

print('Rearrangemenet block edges:')
ipd.display(blockEdges)

In [None]:
correctResults[filename]

In [None]:
correctResults[filename] = {'linksLengths':linkLengths,'pairedLinks':pairedLinks,'blockEdges':blockEdges}

In [None]:
# Persist the reference results. Create ./tests first so the dump does not
# fail when the directory is missing (the load cell already tolerates a
# missing file).
os.makedirs('./tests',exist_ok=True)
joblib.dump(correctResults,'./tests/breakIdentify.dat')

### Exporting test collapse graph

In [None]:
pathfileDir = '../../1001G/pantograph/data'
coreGFApath = f'{pathfileDir}{os.path.sep}testCollapse.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
redisConn=None

In [None]:
#%%capture output
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
[zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
                invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold,
                                returnDebugData=True)
runTime = time.time() - startTime

In [None]:
print(f'Executed in {runTime} seconds')

In [None]:
!ntfy send "Exporting test collapse graph finished. Overall time = {runTime} seconds"

### Exporting coregraph

In [None]:
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr1.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4,8,16]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../1001G/coreGraph', 'coregraph_Chr1_new', [1, 2, 4, 8, 16])

In [None]:
# dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
# print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=0)

In [None]:
# `notebook2script` is never imported in this notebook; use `nbdev_export`,
# which the "How to use" cell imports from nbdev.
nbdev_export()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
#%%capture output
# Time the export of the graph at `coreGFApath`; the call's return value is
# not kept in this cell.
startTime = time.time()
# Unpacking target for the debug return values (only relevant when returnDebugData=True):
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels, 
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
# Wall-clock duration in seconds; reused by the notification cell below.
runTime = time.time() - startTime
print(runTime)

Loading Genome
Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/coreGraph/coregraph_Chr1_new
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 16/16
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 34/34
Conversion finished.
Reformating links to block lengths associations 34/34
Refo

In [None]:
!ntfy send "Exporting coregraph finished. Overall time = {runTime} seconds"



❗️❗️❗️ TODO: Next, test the API. When it works, change the front end to fetch the JSONs without annotation and then load the annotation from the API!

In [None]:
graph = GenomeGraph(coreGFApath,isGFASeq=False)

Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths



In [None]:
from pangraph_constructor.utils import iset_get,iset_score,iset_add

In [None]:
redisConn = Redis(host='redis',port=6379,db=0)

In [None]:
iset_get(redisConn,'test')

{'a_0': (1.0, 3.0), 'b_0': (1.0, 5.0)}

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.1.10024.Pos',22)
# coregraph_Chr1_new/10024/1/22

['23']

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.9543.Gene',50,50)

['3370',
 'AT1G15940',
 'AT1G15950',
 'AT1G16000',
 'AT1G16010',
 'AT1G16022',
 'AT1G62830',
 'AT1G62840',
 'OG0001502',
 'OG0004404',
 'OG0007340',
 'OG0007341',
 'OG0007343',
 'OG0008435',
 'OG0015400',
 'OG0017985',
 'OG0022841',
 'OG0024046',
 'OG0024055',
 'OG0025924',
 'OG0027925',
 'OG0030893']

In [None]:
iset_get(redisConn,'coregraph_Chr1_new.9543.Gene','3370_0')

{'3370_0': (48.0, 51.0)}

### Exporting coregraph with genes

In [None]:
# Export the notebooks to the library with nbdev_export(), the function imported
# from nbdev at the top of this notebook.
nbdev_export()

In [None]:
pathfileDir = '../../1001G/coreGraph'

In [None]:
# Settings passed to exportToPantograph() by the core-graph export loops below.
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
# Base zoom levels; the next cell passes them through adjustZoomLevels() before use.
zoomLevels = [1,3,9]# + [9*2**i for i in range(9)]
# zoomLevels = [2**i for i in range(12)]#  [1,2,4,8,16,32,9*16,9*32,9*128,9*256]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
# %%capture output
# Export the core graph with genes for chromosomes 1-5, one Redis db per case.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_Chr{chrNum}.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    # The Redis db id is looked up from the case name (outputName).
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)

    try:
        startTime = time.time()
        # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = True,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)
        #                                 returnDebugata=True)
        runTime = time.time() - startTime
    finally:
        # Release the per-chromosome connection even when the export fails.
        redisConn.close()

    # Report the per-chromosome time like the other export loops in this notebook do.
    print(f'Exporting core graph with genes for Chr{chrNum} took {runTime} seconds')

In [None]:
curT = time.localtime(time.time()+3600)
message = f"Exporting core graph with genes for all chromosomes finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n"
!ntfy send "{message}"

In [None]:
# Export the notebooks to the library with nbdev_export(), the function imported
# from nbdev at the top of this notebook.
nbdev_export()

In [None]:
zoomLevels = [1,2,4]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
#%%capture output2
# Export the core graph (without genes) for chromosomes 1-5, one Redis db per case.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{chrNum}.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    # The Redis db id is looked up from the case name (outputName).
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)

    try:
        startTime = time.time()
        # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = True,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)
        #                                 returnDebugata=True)
        runTime = time.time() - startTime
    finally:
        # Release the per-chromosome connection even when the export fails.
        redisConn.close()

    print(f'Exporting core graph for Chr{chrNum} took {runTime} seconds')

In [None]:
curT = time.localtime(time.time()+3600)
message = f"Exporting core graph with genes for all chromosomes finished at \
 {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n"
!ntfy send "{message}"

In [None]:
# NOTE(review): `runTime` is reassigned on every iteration of the export loop above, so
# the value sent here is the time of the last chromosome only, not an overall time.
# The loop above also exports the graphs without genes — confirm the intended wording.
!ntfy send "Exporting coregraph with genes finished. Overall time = {runTime} seconds"

### Exporting gene graphs for all chromosomes

In [None]:
pathfileDir = '../../1001G/pantograph/data'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1]#,2,4]# + [9*2**i for i in range(12)]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)

In [None]:
nbdev_export()

In [None]:
# Export the gene graph of every chromosome into Redis db 0.
for seqID in ['Chr1','Chr2','Chr3','Chr4','Chr5']:#'Chr1','Chr2',

    coreGFApath = f'{pathfileDir}{os.path.sep}AT_{seqID}_OGOnly_2.1.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    print()
    print(f'Processing case {outputName}')
    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=0)

    try:
        #%%capture output
        startTime = time.time()
        # initialLinkLengths, initialPairedLinks, initialInterconnectedLinks, initialBlockEdges, \
        # zoomNodeToComponent,zoomComponentToNodes,zoomComponents,\
        # zoomFromComponentLinks, zoomToComponentLinks, zoomLinkLengths, zoomPairedLinks, zoomInterconnectedLinks, \
        # zoomOldToNewRemoval, zoomNewToOldRemoval, \
        # zoomLinkLengthsRemoval, zoomPairedLinksRemoval, zoomInterconnectedLinksRemoval, zoomBlockEdgesRemoval, \
        # zoomFromComponentLinksRemoval, zoomToComponentLinksRemoval, \
        # graph = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = False,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)#,
                           # returnDebugData=True)
        runTime = time.time() - startTime

        print(f'Exporting gene graph for {seqID} took {runTime} seconds')
    finally:
        # Close the connection even when the export raises, so a failed chromosome
        # does not leave a dangling Redis connection behind.
        redisConn.close()


Processing case AT_Chr1_OGOnly_2.1_new
Loading Genome
Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1_new
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Identifying rearrangement blocks
Processing node 7455/7455
Identifying rearrangement blocks finished.

Zoom level 1
Processing node 7455/7455
Processing component links 3869/3869


In [None]:
curT = time.localtime()
message = f"Exporting gene graph for all chromosomes finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"

ERROR: Failed to send notification using default
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/ntfy/backends/default.py", line 20, in notify
    module.notify(title=title, message=message, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/ntfy/backends/linux.py", line 32, in notify
    bus = dbus.SessionBus()
  File "/usr/lib/python3/dist-packages/dbus/_dbus.py", line 212, in __new__
    return Bus.__new__(cls, Bus.TYPE_SESSION, private=private,
  File "/usr/lib/python3/dist-packages/dbus/_dbus.py", line 102, in __new__
    bus = BusConnection.__new__(subclass, bus_type, mainloop=mainloop)
  File "/usr/lib/python3/dist-packages/dbus/bus.py", line 124, in __new__
    bus = cls._new_for_bus(address_or_type, mainloop=mainloop)
dbus.exceptions.DBusException: org.freedesktop.DBus.Error.NotSupported: Unable to autolaunch a dbus-daemon without a $DISPLAY for X11

During handling of the above exception, another exception occurred:

Traceback (most rec

In [None]:
graph = GenomeGraph(gfaPath='../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa', isGFASeq=False)

Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths



In [None]:
graph.nodesMetadata[37]

{'TAIR10': {'genPos': [{'chr': 'Chr1', 'genomePosition': [148013, 149848]},
   {'chr': 'Chr1', 'genomePosition': [154367, 156178]}],
  'annotation': {'OG0000490': [(0, 0)],
   'AT1G01390': [(0, 0)],
   'AT1G01420': [(0, 0)],
   'AT4G01070': [(0, 0)]}},
 '10002': {'genPos': [{'chr': 'Chr1', 'genomePosition': [150525, 151967]}],
  'annotation': {'OG0000490': [(0, 0)],
   'AT1G01390': [(0, 0)],
   'AT1G01420': [(0, 0)],
   'AT4G01070': [(0, 0)]}},
 '10015': {'genPos': [{'chr': 'Chr1', 'genomePosition': [149434, 150876]},
   {'chr': 'Chr1', 'genomePosition': [155667, 157112]}],
  'annotation': {'OG0000490': [(0, 0)],
   'AT1G01390': [(0, 0)],
   'AT1G01420': [(0, 0)],
   'AT4G01070': [(0, 0)]}},
 '10024': {'genPos': [{'chr': 'Chr1', 'genomePosition': [153796, 155238]},
   {'chr': 'Chr1', 'genomePosition': [160034, 161479]}],
  'annotation': {'OG0000490': [(0, 0)],
   'AT1G01390': [(0, 0)],
   'AT1G01420': [(0, 0)],
   'AT4G01070': [(0, 0)]}},
 '1741': {'genPos': [{'chr': 'Chr1', 'genomePos

### Comments

The processing time per node increases significantly with the overall number of nodes. This is wrong and should be investigated.

# Adding nucleotide data

In [None]:
from copy import deepcopy
import numpy as np

In [None]:
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coreGraph_f2.1_Ref_v04.gfa'

In [None]:
# Load the core graph from its GFA file (isGFASeq=False).
coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
# Independent copy: the cells below write per-node data and gene intervals into
# `coregraph_genes`, while `coregraph` itself only receives extra annotation keys.
coregraph_genes = deepcopy(coregraph)

In [None]:
fullGraphPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa'
fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

In [None]:
# Parse the chain -> gene list file: one "<chain name>: <gene>,<gene>,..." entry per line.
chainToGenesFile = 'chain2gene_f2.1_Ref_v04.txt'
# Chain names are zero-padded to the width of the first node name of the core graph,
# so they can be looked up with an equally padded node name.
maxChainLength = len(coregraph.nodes[0])
chainToListDict = {}
with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
    for line in f:
        # A blank line has no ':' and would otherwise break the unpacking below.
        if not line.strip():
            continue
        # Split on the first ':' only, so a ':' later in the line cannot break the unpacking.
        chainName, geneList = line.split(':', 1)
        geneList = geneList.strip().split(',')
        chainToListDict[chainName.zfill(maxChainLength)] = geneList

In [None]:
# For every core-graph node: store the concatenated names of its genes as node data in
# `coregraph_genes` and copy the gene annotations into both graphs.
for nodeIdx,nodeName in enumerate(coregraph.nodes):
    print(f'\nNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
    # Nodes missing from the mapping get a single synthetic 'ch…' placeholder name.
    geneList = chainToListDict.get(nodeName.zfill(maxChainLength), [f'ch{nodeName.zfill(7)}'])
    geneIds = []
    if geneList[0][:2]!='ch':
        # Mapping entries are 1-based positions in fullgraph.nodes, optionally followed by '+'.
        geneIds = [int(gene.rstrip('+'))-1 for gene in geneList]
        geneList = [fullgraph.nodes[geneid] for geneid in geneIds]
    coregraph_genes.nodesData[nodeIdx] = ''.join(geneList)

    # Start offsets of every gene name inside the concatenated node string. They depend
    # on the node only, so they are computed once instead of once per accession.
    geneCumLengths = np.hstack((0, np.cumsum([len(gene) for gene in geneList])))

    for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
        interval = chainDict[nodeName]
        # Annotation names of each member gene for this accession; a gene without an
        # entry for the accession contributes its own node name instead.
        annotationNames = [
            list(fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys())
            for geneid in geneIds
        ]

        # coregraph: every gene annotation gets the interval of the whole node.
        coregraph.nodesAnnotation[nodeIdx][accession].\
            update({geneAnnotation:interval \
                    for names in annotationNames \
                        for geneAnnotation in names})

        # coregraph_genes: every gene annotation covers the span of its gene's name
        # inside the concatenated node string.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].\
            update({geneAnnotation:[(geneCumLengths[i], geneCumLengths[i+1]-1)] \
                    for i,names in enumerate(annotationNames) \
                        for geneAnnotation in names})

        # The node's own name spans the full concatenated string.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].\
            update({nodeName:[(geneCumLengths[0], geneCumLengths[-1]-1)]})
print('')

In [None]:
# Write both graphs next to the input file: `coregraph` with doSeq=False and the
# `coregraph_genes` copy with doSeq=True.
# NOTE(review): the first output name (`coregraph_f2.1_Ref_v04.gfa`) differs from the
# input file (`coreGraph_f2.1_Ref_v04.gfa`) only by letter case; on a case-insensitive
# filesystem this would overwrite the input — confirm this is intended.
coregraph.toGFA(f'{pathfileDir}{os.path.sep}coregraph_f2.1_Ref_v04.gfa',doSeq=False)
coregraph_genes.toGFA(f'{pathfileDir}{os.path.sep}coregraph_genes_f2.1_Ref_v04.gfa',doSeq=True)

In [None]:
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_f2.1_Ref_v04.gfa'
# coreGFApath = f'{pathfileDir}{os.path.sep}coreGraph.gfa'

In [None]:
# Settings for exporting the core graph with genes written above.
# zoomLevels = [1,10,20,100,500,1000,5000,10000,50000,100000,500000,1000000]
zoomLevels = [1,3,9,45,90,450,900,4500,9000]
# zoomLevels = [1,3,9,18]
# zoomLevels = [4,8,16]

isSeq = True

maxLengthComponent = 100
maxLengthChunk = 16
# NOTE(review): spelled "invertion" here, while the other settings cells and
# exportToPantograph() calls in this notebook use `inversionThreshold`.
invertionThreshold = 0.5
# inputPath = '../../1001G/pantograph/data/shorttest2.gfa'
# inputPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly.gfa'
# inputPath = '../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa'
# inputPath = '../../1001G/pantograph/data/shorttest_seq.gfa'
inputPath = coreGFApath

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(inputPath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=dbid)

In [None]:
# Export the graph at `inputPath` to Pantograph format; 'Consensus' is passed as an
# accession to remove when the graph is loaded.
# zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides = \
exportToPantograph(inputPath=inputPath,
                   outputName=outputName,
                   outputPath=outputPath,
                   isSeq=isSeq,
                   redisConn=redisConn,
                   GenomeGraphParams={'accessionsToRemove':['Consensus']},
                   zoomLevels=zoomLevels,
                   maxLengthChunk=maxLengthChunk,
                   maxLengthComponent=maxLengthComponent,
                   # Every other exportToPantograph() call in this notebook spells the
                   # keyword `inversionThreshold`; only the local variable defined in the
                   # settings cell keeps the old spelling.
                   inversionThreshold=invertionThreshold,)
#                                              debug=True,returnDebugData=True)

In [None]:
!ntfy send "Pantograph data generation for coregraph finished."

# Adding gene data: mass processing of several chromosomes

In [None]:
# Export the notebooks to the library with nbdev_export(), the function imported
# from nbdev at the top of this notebook.
nbdev_export()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
def checkNodeLengthsFile(GFAPath):
    """Load the node-lengths file stored next to a GFA file, if there is one.

    The file is looked up as ``nodeLengths_<GFA base name>.dat`` in the same
    directory as `GFAPath`.

    Parameters
    ----------
    GFAPath : str
        Path to the GFA file; it may be a bare file name without a directory.

    Returns
    -------
    object or None
        Whatever ``joblib.load`` returns for the node-lengths file, or ``None``
        when no such file exists.
    """
    directory, filename = os.path.split(GFAPath)
    filebase = os.path.splitext(filename)[0]
    # os.path.join drops an empty directory component, so a bare file name is resolved
    # relative to the current directory rather than the filesystem root
    # ('' + os.path.sep + name would give '/name').
    nodeLenPath = os.path.join(directory, f'nodeLengths_{filebase}.dat')

    if not os.path.exists(nodeLenPath):
        return None
    # NOTE: joblib.load unpickles the file, so it must only be used on trusted files.
    return joblib.load(nodeLenPath)

In [None]:
doCreateCoreGenes = True

In [None]:
# Settings for the per-chromosome core-graph build and export loop below.
pathfileDir = '../../1001G/coreGraph/new_Nov2022'
maxLengthComponent = 100
maxLengthChunk = 16
# The export calls below read `inversionThreshold`; defining it here keeps the cell from
# depending on a value left over from an earlier cell. The old misspelled name is kept
# as an alias in case other cells still refer to it.
inversionThreshold = 0.5
invertionThreshold = inversionThreshold
zoomLevels = [1,3,9]
zoomLevels = adjustZoomLevels(zoomLevels)

# For every chromosome: optionally rebuild the core graph with gene annotation and its
# gene-level copy, then export both graphs to Pantograph format using Redis db 0.
if doCreateCoreGenes:
    # The chain -> gene list file does not depend on the chromosome, so it is parsed
    # once here instead of once per loop iteration.
    chainToGenesFile = 'chain2gene.txt'
    chainToListDict = {}
    with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
        for line in f:
            chainName, geneList = line.split(':')
            geneList = geneList.lstrip().rstrip().split(',')
            chainToListDict[chainName] = geneList

for seqNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_Chr{seqNum:d}.gfa'
    coreGeneGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_genes_Chr{seqNum:d}.gfa'

    if doCreateCoreGenes:
        coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
        coregraph_genes = deepcopy(coregraph)

        fullGraphPath = f'../../1001G/pantograph/data/AT_Chr{seqNum:d}_OGOnly_2.1.gfa'
        fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

        # One entry per core-graph node: the number of genes listed for it (at least 1).
        nodeLengths = []

        for nodeIdx,nodeName in enumerate(coregraph.nodes):
            print(f'\rNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
            geneList = chainToListDict.get(nodeName, [])
            # Gene names may carry trailing '+' characters, which are stripped before lookup.
            geneList = [gene.rstrip('+') for gene in geneList]
            geneIds = [fullgraph.nodes.index(genename) for genename in geneList]
            geneNum = max(1,len(geneList))
            nodeLengths.append(geneNum)

            for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
                interval = chainDict[nodeName]

                # coregraph: replace the node's own entry by its genes' annotations,
                # all carrying the interval of the whole node.
                coregraph.nodesAnnotation[nodeIdx][accession].pop(nodeName,None)

                coregraph.nodesAnnotation[nodeIdx][accession].\
                    update({geneAnnotation:interval \
                            for geneid in geneIds \
                                for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

                # coregraph_genes: every gene occupies one position (i,i) inside the node.
                coregraph_genes.nodesAnnotation[nodeIdx][accession].\
                    update({geneAnnotation:[(i,i)] \
                                    for i,geneid in enumerate(geneIds) \
                                        for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

                # coregraph_genes.nodesAnnotation[nodeIdx][accession].\
                #     update({nodeName:[(0, geneNum-1)]})
        print('')

        coreGeneNodeLengthsPath = f'{pathfileDir}{os.path.sep}nodeLengths_coregraph_v2_genes_Chr{seqNum:d}.dat'

        coregraph.toGFA(coreGFApath,doSeq=False)
        coregraph_genes.toGFA(coreGeneGFApath,doSeq=True)
        joblib.dump(nodeLengths,coreGeneNodeLengthsPath)

    isSeq = False

    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    # Both exports write to db 0, so one connection serves both and is always closed.
    redisConn = Redis(host='redis',port = 6379,db=0)
    try:
        #Exporting chain graph with annotation only
        inputPath = coreGFApath

        outputPath,outputName = pathConvert(inputPath,suffix='_new')

        exportToPantograph(inputPath=inputPath,
                           outputName=outputName,
                           outputPath=outputPath,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           GenomeGraphParams={'accessionsToRemove':['Consensus']},
                           zoomLevels=zoomLevels,
                           fillZoomLevels=True,
                           maxLengthChunk=maxLengthChunk,
                           maxLengthComponent=maxLengthComponent,
                           inversionThreshold=inversionThreshold,)

        #Exporting chain graph with genes
        inputPath = coreGeneGFApath

        outputPath,outputName = pathConvert(inputPath,suffix='_new')

        exportToPantograph(inputPath=inputPath,
                           outputName=outputName,
                           outputPath=outputPath,
                           isSeq=isSeq,
                           # Node lengths saved next to the gene GFA (None if the file is absent).
                           nodeLengths=checkNodeLengthsFile(inputPath),
                           redisConn=redisConn,
                           GenomeGraphParams={'accessionsToRemove':['Consensus']},
                           zoomLevels=zoomLevels,
                           fillZoomLevels=True,
                           maxLengthChunk=maxLengthChunk,
                           maxLengthComponent=maxLengthComponent,
                           inversionThreshold=inversionThreshold,)
    finally:
        redisConn.close()

Loading graph from ../../1001G/coreGraph/new_Nov2022/coregraph_v2_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/new_Nov2022/annotation_coregraph_v2_Chr1.dat, loading associations.
Loading segment 33/33
Loading segments finished.
Loading link 68/68
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 33/33
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing

In [None]:
curT = time.localtime()
message = f"Adding genes and exporting of chain graphs for all chromosome finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"



In [None]:
red = Redis(host='redis')

In [None]:
from pangraph_constructor.utils import iset_get

In [None]:
# KEYS walks the whole keyspace in one blocking call and dumps every key into the cell
# output; show a small sample through the incremental SCAN iterator instead.
from itertools import islice
list(islice(red.scan_iter(count=1000), 20))

[b'AT_Chr1_OGOnly_2.1_new.4.6124.PosStart',
 b'coregraph_v2_Chr1_new.3.9905.PosStart',
 b'coregraph_v2_Chr5_new.1.9905.PosEnd',
 b'AT_Chr2_OGOnly_2.1_new.8192.22002.PosStart',
 b'coregraph_v2_genes_Chr4_new.1.TAIR10.PosStart',
 b'AT_Chr3_OGOnly_2.1_new.512.6124.PosStart',
 b'AT_Chr2_OGOnly_2.1_new.1.9728.PosStart',
 b'coregraph_v2_genes_Chr4_new.9.9537.PosStart',
 b'coregraph_Chr1_new.6244.GeneStart',
 b'coregraph_v2_genes_Chr3_new.72.6124.PosEnd',
 b'coregraph_v2_genes_Chr2_new.36.10002.PosEnd',
 b'coregraph_v2_Chr2_new.3.22005.PosEnd',
 b'AT_Chr2_OGOnly_2.1_new.32.6124.PosEnd',
 b'AT_Chr2_OGOnly_2.1_new.64.9537.PosEnd',
 b'coregraph_v2_genes_Chr3_new.36.9888.PosEnd',
 b'coregraph_v2_Chr2_new.1.22004.PosStart',
 b'AT_Chr2_OGOnly_2.1_new.1.10024.PosStart',
 b'AT_Chr3_OGOnly_2.1_new.1024.10015.PosEnd',
 b'AT_Chr2_OGOnly_2.1_new.32.22007.PosEnd',
 b'coregraph_v2_Chr1_new.1.9543.PosStart',
 b'coregraph_v2_Chr5_new.9.TAIR10.PosStart',
 b'coregraph_v2_Chr5_new.6244.GeneStart',
 b'AT_Chr3_OG

In [None]:
iset_get(red,'coregraph_v2_genes_Chr1_new.10002.Gene','OG0008080_0')

{'OG0008080_0': (1777.0, 1777.0)}

In [None]:
iset_get(red,'coregraph_v2_genes_Chr3_new.9981.Gene')

{'AT3G06020_0': (0.0, 0.0),
 'OG0010622_0': (0.0, 0.0),
 'AT3G06030_0': (1.0, 1.0),
 'OG0005087_0': (1.0, 1.0),
 'AT3G06035_0': (2.0, 2.0),
 'OG0019490_0': (2.0, 2.0),
 'AT3G06070_0': (3.0, 3.0),
 'OG0010623_0': (3.0, 3.0),
 'AT3G06080_0': (4.0, 4.0),
 'OG0010624_0': (4.0, 4.0),
 'AT3G06100_0': (5.0, 5.0),
 'OG0010625_0': (5.0, 5.0),
 'AT3G06110_0': (6.0, 6.0),
 'OG0010626_0': (6.0, 6.0),
 'AT3G06120_0': (7.0, 7.0),
 'OG0010627_0': (7.0, 7.0),
 'AT3G06130_0': (8.0, 8.0),
 'OG0010628_0': (8.0, 8.0),
 'AT3G06140_0': (9.0, 9.0),
 'OG0010629_0': (9.0, 9.0),
 'AT3G06145_0': (10.0, 10.0),
 'OG0010630_0': (10.0, 10.0),
 'AT3G06150_0': (11.0, 11.0),
 'OG0005088_0': (11.0, 11.0),
 'AT3G06160_0': (12.0, 12.0),
 'OG0005089_0': (12.0, 12.0),
 'AT3G06170_0': (13.0, 13.0),
 'OG0005090_0': (13.0, 13.0),
 'AT3G06180_0': (14.0, 14.0),
 'OG0010631_0': (14.0, 14.0),
 'AT3G06190_0': (15.0, 15.0),
 'OG0010632_0': (15.0, 15.0),
 'AT3G06200_0': (16.0, 16.0),
 'OG0010633_0': (16.0, 16.0),
 'AT3G06210_0': (17.