In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

## Install

Change into the library's root directory and run:

`pip install .`

 and for development use

`pip install -e .`

## How to use

In [None]:
from nbdev import nbdev_export
nbdev_export()

In [None]:
import os
import glob
import re
import time

from pygengraph.graph import GenomeGraph
from pygengraph.utils import pathFileToPathDict
from pygengraph.export import exportProject

In [None]:
import warnings
# Silence every warning category by default ...
warnings.simplefilter('ignore')
# ... but always show RuntimeWarning: this later filter is inserted in front of the blanket 'ignore' rule.
warnings.filterwarnings('always',category=RuntimeWarning)

# Generating graphs

## Generating from annotation

### Preparing list of files

In [None]:
#| eval: false
refdir = '/path/to/reference/'
annotationdir = '/path/to/annotation'
gfadir = '/path/to/graphs'

In [None]:
#| eval: false
# Collect the input annotation files; sorting keeps the file order deterministic.
# NOTE(review): the '*.gff' pattern also matches the '*pangen.gff' files collected on the next line —
# confirm whether pangenome files are meant to be part of `annotationFiles`.
annotationFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*.gff'))
pangenomeFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*pangen.gff'))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}sequences{os.path.sep}*.fasta'))
# Annotation of the reference genome.
refAnnotationFile = f'{refdir}{os.path.sep}reference.gff'
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = f'{refdir}{os.path.sep}reference.fasta'

In [None]:
#| eval: false
refdir = '../../1001G/annotations/freeze2.1/outgroups'
annotationdir = '../../1001G/annotations/freeze2.1'
gfadir = '../../1001G/annotations/graphs'

In [None]:
#| eval: false
annotationFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*.gff'))
# pangenomeFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*pangen.gff'))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}sequences{os.path.sep}*.fasta'))
refAnnotationFile = f'{refdir}{os.path.sep}araport.gff'
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = f'{refdir}{os.path.sep}reference.fasta'

In [None]:
annotationFiles

['../../1001G/annotations/freeze2.1/10002.gff',
 '../../1001G/annotations/freeze2.1/10015.gff',
 '../../1001G/annotations/freeze2.1/10024.gff',
 '../../1001G/annotations/freeze2.1/1741.gff',
 '../../1001G/annotations/freeze2.1/22001.gff',
 '../../1001G/annotations/freeze2.1/22002.gff',
 '../../1001G/annotations/freeze2.1/22003.gff',
 '../../1001G/annotations/freeze2.1/22004.gff',
 '../../1001G/annotations/freeze2.1/22005.gff',
 '../../1001G/annotations/freeze2.1/22006.gff',
 '../../1001G/annotations/freeze2.1/22007.gff',
 '../../1001G/annotations/freeze2.1/6024.gff',
 '../../1001G/annotations/freeze2.1/6069.gff',
 '../../1001G/annotations/freeze2.1/6124.gff',
 '../../1001G/annotations/freeze2.1/6244.gff',
 '../../1001G/annotations/freeze2.1/6909.gff',
 '../../1001G/annotations/freeze2.1/6966.gff',
 '../../1001G/annotations/freeze2.1/8236.gff',
 '../../1001G/annotations/freeze2.1/9075.gff',
 '../../1001G/annotations/freeze2.1/9537.gff',
 '../../1001G/annotations/freeze2.1/9543.gff',
 '.

In [None]:
refAnnotationFile,'../../1001G/annotations/freeze2.1/outgroups/araport.gff'

('../../1001G/annotations/freeze2.1/outgroups/araport.gff',
 '../../1001G/annotations/freeze2.1/outgroups/araport.gff')

### Generation of gene graph

In [None]:
nbdev_export()

In [None]:
#| eval: false
# Build, sort and save a gene graph for each chromosome Chr1..Chr<n>.
doUS = False
n = 1  # here n is number of chromosomes.
for chrnum in range(1, n + 1):
    chromosome = f'Chr{chrnum}'

    print(f'\nProcessing {chromosome}\n============')

    # Construct the graph from the annotation files only (no pangenome files, no sequence files).
    curtst = time.time()
    graph = GenomeGraph(
        annotationFiles=annotationFiles,
        pangenomeFiles=None,
        sequenceFilesDict=None,
        doUS=doUS,
        chromosome=chromosome,
        refAnnotationFile=refAnnotationFile,
        refAccession='TAIR10',
    )
    print(f'Generating graph for {chromosome} took {time.time() - curtst} seconds')

    curtst = time.time()
    graph.treeSort()
    print(f'Sorting graph for {chromosome} took {time.time() - curtst} seconds')

    if len(graph.nodes) == len(graph.order):
        gfaFilename = f'Gene_{chromosome}_simOnly.gfa'
    else:
        # Not every node ended up in the order: fall back to the plain 1..N order
        # and mark the output file as unordered.
        print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
        gfaFilename = f'Gene_{chromosome}_simOnly_unordered.gfa'
        graph.order = list(range(1, len(graph.nodes) + 1))

    graph.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}', doSeq=False)
    


Processing Chr1
Processing file ../../1001G/annotations/freeze2.1/10002.gff...
Annotation file ../../1001G/annotations/freeze2.1/10002.gff read.
Processing file ../../1001G/annotations/freeze2.1/10002.gff finished in 28.673924207687378 seconds.
Processing file ../../1001G/annotations/freeze2.1/10015.gff...
Annotation file ../../1001G/annotations/freeze2.1/10015.gff read.
Processing file ../../1001G/annotations/freeze2.1/10015.gff finished in 24.959019660949707 seconds.
Processing file ../../1001G/annotations/freeze2.1/10024.gff...
Annotation file ../../1001G/annotations/freeze2.1/10024.gff read.
Processing file ../../1001G/annotations/freeze2.1/10024.gff finished in 23.587310552597046 seconds.
Processing file ../../1001G/annotations/freeze2.1/1741.gff...
Annotation file ../../1001G/annotations/freeze2.1/1741.gff read.
Processing file ../../1001G/annotations/freeze2.1/1741.gff finished in 24.746922492980957 seconds.
Processing file ../../1001G/annotations/freeze2.1/22001.gff...
Annotat

AttributeError: 'NoneType' object has no attribute 'items'

## Loading Pathfile to graph

In [None]:
# For path file v1
# Build a graph from a v1 path file, sort it and save it as GFA in the working directory.
pathfileDir = 'examples/gene_graph'

pathsfile = 'paths_genegraph.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}', True, True)

graph = GenomeGraph(pathsDict=paths)

graph.treeSort()

if len(graph.nodes)!=len(graph.order):
    # Not every node was placed in the order: save with the plain 1..N order under an '_unordered' name.
    print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
    output = 'paths_genegraph_unordered.gfa'
    graph.order = list(range(1,len(graph.nodes)+1))
    graph.toGFA(output,doSeq=False)
else:
    coreGFApath = 'paths_genegraph.gfa'
    # Save the graph built in this cell (`coregraph` does not exist at this point of the notebook).
    graph.toGFA(coreGFApath,doSeq=False)

In [None]:
#| eval: false
# For v2
# This is example, no v2 file currently available for demonstration.
pathfileDir = '/path/to/file'

pathsfile = 'paths.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True,'reference',True)

# One graph is built, sorted and saved for every key of `paths`.
for seqNum, seqPaths in paths.items():

    graph = GenomeGraph(pathsDict=seqPaths)

    # On undirected coregraph sorting is not optimal! Check sorting!!!

    graph.treeSort()

    if len(graph.nodes) == len(graph.order):
        coreGFApath = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}.gfa'
        graph.toGFA(coreGFApath, doSeq=False)
    else:
        # Not every node was placed in the order: save with the plain 1..N order under an '_unordered' name.
        print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
        output = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}_unordered.gfa'
        graph.order = list(range(1, len(graph.nodes) + 1))
        graph.toGFA(output, doSeq=False)


# Loading graph from GFA and sorting it

In [None]:
gfadir = 'examples/nucleotide_graph'

# It is nucleotide graph. If it is not nucleotide graph, then `isSeq` variable should be changed to False.
gfafilename = 'paths_presentation.gfa'
isSeq = True

graph = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}{gfafilename}',isGFASeq=isSeq)

# Sort the graph that was just loaded.
graph.treeSort()

# Every node must have been placed in the order.
assert len(graph.nodes)==len(graph.order)

# os.path.splitext keeps the leading dot in `ext` ('.gfa'), so no extra '.' is added when rebuilding the name.
basename,ext = os.path.splitext(gfafilename)

graph.toGFA(f'{gfadir}{os.path.sep}{basename}_ordered{ext}',doSeq=isSeq)

### Exporting presentation graph

In [None]:
# Directory and GFA file of the presentation graph.
pathfileDir = '../../Meetings/1001G+_20220518/'
# os.path.join does not add a second separator after the directory's trailing '/'.
coreGFApath = os.path.join(pathfileDir, 'paths_presentation.gfa')

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,2,4,8,16]#,8,16,32]

In [None]:
# NOTE(review): `adjustZoomLevels` and `pathConvert` are not imported by the import cell near the top of this
# notebook — confirm where they come from, otherwise this cell fails on a fresh kernel.
zoomLevels = adjustZoomLevels(zoomLevels)
# Derive the output directory and output name from the GFA path; the name carries the '_new' suffix
# (see the tuple displayed below this cell).
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../Meetings/1001G+_20220518', 'paths_presentation_new', [1, 2, 4, 8, 16])

In [None]:
redisConn=None

In [None]:

#%%capture output
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
#                 invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
runTime = time.time() - startTime

Loading Genome
Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Found nodeNames file ../../Meetings/1001G+_20220518/nodeNames_paths_presentation.json, loading names.
Found node annotation file ../../Meetings/1001G+_20220518/annotation_paths_presentation.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to ../../Meetings/1001G+_20220518/paths_presentation_new
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 2/2
Preprocessing interconnected links finished.

Processing path breaks fin

### Exporting tutorial gene graph

In [None]:
pathfileDir = '../../1001G/pantograph'
coreGFApath = f'{pathfileDir}{os.path.sep}genegraph_tutorial.gfa'

In [None]:
graph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)

Loading graph from ../../1001G/pantograph/genegraph_tutorial.gfa
Found node annotation file ../../1001G/pantograph/annotation_genegraph_tutorial.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 13/13
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths



In [None]:
graph.paths

[['1+', '2+', '3+', '4+'],
 ['1+', '5+', '6+', '7+', '1+'],
 ['4+', '1+', '2+', '3+'],
 ['2+', '3+', '4+', '1+', '5-', '4+', '7+'],
 ['6+', '7-', '1+', '2+', '3+']]

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='')
outputPath,outputName,zoomLevels

('../../1001G/pantograph', 'genegraph_tutorial', [1])

In [None]:
# dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
# print(f'Opening Redis connection for db {dbid}')
# NOTE(review): `Redis` is not imported in any visible cell — presumably `from redis import Redis`; confirm.
# Client for database 0 on host 'redis', port 6379.
redisConn = Redis(host = 'redis',port = 6379, db = 0)

In [None]:

#%%capture output
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
#                 invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = False,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
runTime = time.time() - startTime

Loading Genome
Loading graph from ../../1001G/pantograph/genegraph_tutorial.gfa
Found node annotation file ../../1001G/pantograph/annotation_genegraph_tutorial.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 13/13
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to ../../1001G/pantograph/genegraph_tutorial
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Identifying rearrangement blocks
Processing node 7/7
Identifying rearrangement blocks finished.

Zoom level 1
Processing node 7/7
Processing component links 6/6
Recording component 6/6
Recording zoom level 1 finished.


### Testing removable elements identification

In [None]:
# NOTE(review): this imports from `pangraph_constructor`, while the import cell at the top of the notebook uses
# `pygengraph` — confirm which package name is current.
from pangraph_constructor.exportDev import getRemovableStructures

In [None]:
import IPython.display as ipd
import joblib

In [None]:
# Load the previously stored reference results if the file exists; otherwise start with an empty dict.
# NOTE(review): joblib.load unpickles the file — only load files from a trusted source.
correctResults = (
    joblib.load('./tests/breakIdentify.dat')
    if os.path.exists('./tests/breakIdentify.dat')
    else {}
)

In [None]:
# Export the notebooks with `nbdev_export`, the function this notebook imports from nbdev;
# `notebook2script` is not imported anywhere in this notebook.
nbdev_export()

In [None]:
path = '../../1001G/GraphCollapsing/TestGraphs'
filePrefix = 'test'

In [None]:
#test
# Regression check: recompute the removable structures for every stored case
# and compare them with the saved reference results.
for filename,resDict in correctResults.items():
    print(f'\n####### Testing on case {filename} ########')
    graph = GenomeGraph(f'{path}/{filename}',isGFASeq=False)
    linksLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)
    # Compare each computed structure with the value stored under the same key.
    for key, computed in (('linksLengths', linksLengths),
                          ('pairedLinks', pairedLinks),
                          ('blockEdges', blockEdges)):
        assert computed == resDict[key]

In [None]:
# Inspect a single test graph: load it, compute its removable structures and display them.
caseNum = 17
# The case number is zero-padded to two digits in the file name.
filename = f'{filePrefix}{caseNum:02d}.gfa'


print('############')
print(f'Graph from file {filename}')
coreGFApath = f'{path}/{filename}'
graph = GenomeGraph(coreGFApath,isGFASeq=False)
print('Graph Paths:')
ipd.display(graph.paths)
# The fourth returned value is not used here.
# Note: this cell names the first result `linkLengths`, while the regression loop above uses `linksLengths`.
linkLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)

print('Link-Lengths associations:')
ipd.display(linkLengths)

print('PairedLinks:')
ipd.display(pairedLinks)

print('Rearrangemenet block edges:')
ipd.display(blockEdges)

In [None]:
correctResults[filename]

In [None]:
correctResults[filename] = {'linksLengths':linkLengths,'pairedLinks':pairedLinks,'blockEdges':blockEdges}

In [None]:
joblib.dump(correctResults,'./tests/breakIdentify.dat')

### Exporting test collapse graph

In [None]:
pathfileDir = '../../1001G/pantograph/data'
coreGFApath = f'{pathfileDir}{os.path.sep}testCollapse.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
redisConn=None

In [None]:
#%%capture output
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
[zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
                invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold,
                                returnDebugData=True)
runTime = time.time() - startTime

In [None]:
print(f'Executed in {runTime} seconds')

In [None]:
!ntfy send "Exporting test collapse graph finished. Overall time = {runTime} seconds"

### Exporting coregraph

In [None]:
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr1.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4,8,16]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../1001G/coreGraph', 'coregraph_Chr1_new', [1, 2, 4, 8, 16])

In [None]:
# dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
# print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=0)

In [None]:
# Export the notebooks with `nbdev_export`, the function this notebook imports from nbdev;
# `notebook2script` is not imported anywhere in this notebook.
nbdev_export()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
#%%capture output
# Run exportToPantograph on the core graph and measure the wall-clock time of the call.
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels, 
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
# Elapsed time in seconds.
runTime = time.time() - startTime
print(runTime)

Loading Genome
Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/coreGraph/coregraph_Chr1_new
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 16/16
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 34/34
Conversion finished.
Reformating links to block lengths associations 34/34
Refo

In [None]:
!ntfy send "Exporting coregraph finished. Overall time = {runTime} seconds"



❗️❗️❗️ TODO: Test the API next. When it works, change the front end to fetch the JSONs without annotation and then load the annotation from the API!

In [None]:
graph = GenomeGraph(coreGFApath,isGFASeq=False)

Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths



In [None]:
# NOTE(review): this imports from `pangraph_constructor`, while the import cell at the top of the notebook uses
# `pygengraph` — confirm which package name is current.
from pangraph_constructor.utils import iset_get,iset_score,iset_add

In [None]:
redisConn = Redis(host='redis',port=6379,db=0)

In [None]:
iset_get(redisConn,'test')

{'a_0': (1.0, 3.0), 'b_0': (1.0, 5.0)}

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.1.10024.Pos',22)
# coregraph_Chr1_new/10024/1/22

['23']

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.9543.Gene',50,50)

['3370',
 'AT1G15940',
 'AT1G15950',
 'AT1G16000',
 'AT1G16010',
 'AT1G16022',
 'AT1G62830',
 'AT1G62840',
 'OG0001502',
 'OG0004404',
 'OG0007340',
 'OG0007341',
 'OG0007343',
 'OG0008435',
 'OG0015400',
 'OG0017985',
 'OG0022841',
 'OG0024046',
 'OG0024055',
 'OG0025924',
 'OG0027925',
 'OG0030893']

In [None]:
iset_get(redisConn,'coregraph_Chr1_new.9543.Gene','3370_0')

{'3370_0': (48.0, 51.0)}

### Exporting coregraph with genes

In [None]:
# Export the notebooks with `nbdev_export`, the function this notebook imports from nbdev;
# `notebook2script` is not imported anywhere in this notebook.
nbdev_export()

In [None]:
pathfileDir = '../../1001G/coreGraph'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,3,9]# + [9*2**i for i in range(9)]
# zoomLevels = [2**i for i in range(12)]#  [1,2,4,8,16,32,9*16,9*32,9*128,9*256]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
# %%capture output
# Export `coregraph_genes_Chr<N>.gfa` for N = 1..5, each into the Redis database returned by getDBID.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_Chr{chrNum}.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    # Database id to use for this output name.
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)

    try:
        startTime = time.time()
        # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = True,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)
        #                  returnDebugData=True)
        runTime = time.time() - startTime
    finally:
        # A new connection is opened on every iteration: close it even when the export fails,
        # as the gene-graph export loop further down in this notebook does.
        redisConn.close()

In [None]:
# Timestamp used in the notification text.
# NOTE(review): the +3600 s offset presumably shifts the clock by one hour (timezone difference) — confirm.
curT = time.localtime(time.time()+3600)
# Adjacent string literals are joined into one string, so the source indentation does not end up in the message.
message = ("Exporting core graph with genes for all chromosomes finished at "
           f"{curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n")
!ntfy send "{message}"

In [None]:
# Export the notebooks with `nbdev_export`, the function this notebook imports from nbdev;
# `notebook2script` is not imported anywhere in this notebook.
nbdev_export()

In [None]:
zoomLevels = [1,2,4]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
#%%capture output2
# Export `coregraph_Chr<N>.gfa` for N = 1..5, each into the Redis database returned by getDBID.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{chrNum}.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    # Database id to use for this output name.
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)

    try:
        startTime = time.time()
        # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = True,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)
        #                  returnDebugData=True)
        runTime = time.time() - startTime

        print(f'Exporting core graph for Chr{chrNum} took {runTime} seconds')
    finally:
        # A new connection is opened on every iteration: close it even when the export fails,
        # as the gene-graph export loop further down in this notebook does.
        redisConn.close()

In [None]:
# Timestamp used in the notification text.
# NOTE(review): the +3600 s offset presumably shifts the clock by one hour (timezone difference) — confirm.
curT = time.localtime(time.time()+3600)
# Adjacent string literals are joined into one string, so no stray leading space ends up after "finished at".
message = ("Exporting core graph with genes for all chromosomes finished at "
           f"{curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n")
!ntfy send "{message}"

In [None]:
!ntfy send "Exporting coregraph with genes finished. Overall time = {runTime} seconds"

### Exporting gene graphs for all chromosomes

In [None]:
pathfileDir = '../../1001G/pantograph/data'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1]#,2,4]# + [9*2**i for i in range(12)]
fillZoomLevel = False

In [None]:
nbdev_export()

In [None]:
# Map every chromosome name (Chr1..Chr5) to the file name of its gene-graph GFA.
caseDict = {f'Chr{i}': f'AT_Chr{i}_OGOnly_2.1.gfa' for i in range(1,6)}

{'Chr1': 'AT_Chr1_OGOnly_2.1.gfa',
 'Chr2': 'AT_Chr2_OGOnly_2.1.gfa',
 'Chr3': 'AT_Chr3_OGOnly_2.1.gfa',
 'Chr4': 'AT_Chr4_OGOnly_2.1.gfa',
 'Chr5': 'AT_Chr5_OGOnly_2.1.gfa'}

In [None]:
exportProject(projectID = 'AT_OGOnly_2.1', projectName = 'A. thaliana 27 genomes + TAIR10 gene graph 2.1', 
              caseDict = caseDict, pathToIndex = pathfileDir, pathToGraphs = pathfileDir,
              redisHost='redis', redisPort = 6379, redisDB = 0,
              suffix = 'pr',
              maxLengthComponent = maxLengthComponent, maxLengthChunk = maxLengthChunk,
              inversionThreshold = inversionThreshold,
              isSeq = isSeq, zoomLevels = zoomLevels, fillZoomLevel = fillZoomLevel)


Processing case AT_OGOnly_2.1_Chr1pr
Loading Genome
Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/pantograph/data/AT_OGOnly_2.1/AT_OGOnly_2.1_Chr1pr
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Identifying rearrangement blocks
Processing node 7455/7455
Identifying rearrangement blocks finished.

Zoom level 1
Processing node 7455/7455
Processing component links 

In [None]:
# Export the gene graph of every chromosome with the '_viscol' suffix; all cases use Redis database 0.
for seqID in ['Chr1','Chr2','Chr3','Chr4','Chr5']:#'Chr1','Chr2',

    coreGFApath = f'{pathfileDir}{os.path.sep}AT_{seqID}_OGOnly_2.1.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_viscol')

    print()
    print(f'Processing case {outputName}')
    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=0)

    try:
        #%%capture output
        startTime = time.time()
        # With returnDebugData=True the call can be unpacked as:
        # initialLinkLengths, initialPairedLinks, initialInterconnectedLinks, initialBlockEdges, \
        # zoomNodeToComponent,zoomComponentToNodes,zoomComponents,\
        # zoomFromComponentLinks, zoomToComponentLinks, zoomLinkLengths, zoomPairedLinks, zoomInterconnectedLinks, \
        # zoomOldToNewRemoval, zoomNewToOldRemoval, \
        # zoomLinkLengthsRemoval, zoomPairedLinksRemoval, zoomInterconnectedLinksRemoval, zoomBlockEdgesRemoval, \
        # zoomFromComponentLinksRemoval, zoomToComponentLinksRemoval, \
        # graph = \
        exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={},
                           outputPath=outputPath, outputName=outputName,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           zoomLevels=zoomLevels,
                           fillZoomLevels = False,
                           maxLengthComponent=maxLengthComponent,
                           maxLengthChunk=maxLengthChunk,
                           inversionThreshold=inversionThreshold)#,
                           # returnDebugData=True)
        runTime = time.time() - startTime

        print(f'Exporting gene graph for {seqID} took {runTime} seconds')
    finally:
        # Close the connection even when the export raises, so a failed case does not leave it open.
        redisConn.close()


Processing case AT_Chr1_OGOnly_2.1_viscol
Loading Genome
Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1_viscol
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Identifying rearrangement blocks
Processing node 7455/7455
Identifying rearrangement blocks finished.

Zoom level 1
Processing node 7455/7455
Processing component links 3869

In [None]:
# Local time used in the notification text.
curT = time.localtime()
# Adjacent string literals are joined into one string, so the source indentation does not end up in the message.
message = ("Exporting gene graph for all chromosomes finished at "
           f"{curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!")
!ntfy send "{message}"

In [None]:
coreGFApath = f'{pathfileDir}{os.path.sep}AT_Chr1_OGOnly_2.1.gfa'
graph = GenomeGraph(gfaPath=coreGFApath, isGFASeq=False)

Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths



In [None]:
graph.nodesMetadata[1361]['22006']

{'genPos': [{'chr': 'Chr1', 'genomePosition': [19020282, 19020893]},
  {'chr': 'Chr1', 'genomePosition': [19022280, 19023284]}],
 'annotation': {'OG0001585': [(0, 0)],
  'AT1G55230': [(0, 0)],
  'AT1G55240': [(0, 0)]}}

### Comments

The processing time per node increases significantly with the overall number of nodes. This is wrong and should be investigated.

In [None]:
from copy import deepcopy
import numpy as np

In [None]:
# Directory holding the core graph files, and the GFA that is loaded from it.
pathfileDir = '../../1001G/coreGraph'
coreGFApath = os.path.sep.join([pathfileDir, 'coreGraph_f2.1_Ref_v04.gfa'])

In [None]:
# Load the core graph, then take an independent copy that is modified
# separately from the original further down.
coregraph = GenomeGraph(isGFASeq=False, gfaPath=coreGFApath)
coregraph_genes = deepcopy(coregraph)

In [None]:
# Load the Chr1 gene graph whose nodes and annotations are looked up by index later.
fullGraphPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa'
fullgraph = GenomeGraph(isGFASeq=False, gfaPath=fullGraphPath)

In [None]:
# Parse the chain -> gene-list file; each line has the form "<chain>: <gene>,<gene>,...".
chainToGenesFile = 'chain2gene_f2.1_Ref_v04.txt'
# Chain names are zero-padded to the width of the first core-graph node name,
# so that they can be matched against (equally padded) node names later.
maxChainLength = len(coregraph.nodes[0])
chainToListDict = {}
with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
    for line in f:
        # A blank (e.g. trailing) line has no ':' and would break the unpacking below.
        if not line.strip():
            continue
        # Split on the first ':' only, in case the gene list itself contains one.
        chainName, geneList = line.split(':', 1)
        geneList = geneList.strip().split(',')
        chainToListDict[chainName.zfill(maxChainLength)] = geneList

In [None]:
# For every core-graph node (chain), copy the annotation of its member genes from
# `fullgraph` into both `coregraph` and `coregraph_genes`.
for nodeIdx,nodeName in enumerate(coregraph.nodes):
    # '\r' rewrites the same output line instead of printing one line per node.
    print(f'\rNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
    # Chains without an entry in the mapping get a single placeholder name 'ch<padded name>'.
    geneList = chainToListDict.get(nodeName.zfill(maxChainLength), [f'ch{nodeName.zfill(7)}'])
    geneIds = []
    if geneList[0][:2]!='ch':
        # Entries are 1-based node numbers with an optional trailing '+'.
        geneIds = [int(gene.rstrip('+'))-1 for gene in geneList]
        geneList = [fullgraph.nodes[geneid] for geneid in geneIds]
    # The node "sequence" of the gene-level graph is the concatenation of the gene names.
    coregraph_genes.nodesData[nodeIdx] = ''.join(geneList)

    # Start offset of each gene name inside the concatenated string (plus the total length).
    # It depends only on the node, so it is computed once rather than once per accession.
    geneCumLengths = np.hstack((0, np.cumsum([len(gene) for gene in geneList])))

    for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
        interval = chainDict[nodeName]

        # Annotation names of each member gene for this accession; a gene that has no
        # annotation for the accession falls back to its own node name.
        annotationNames = [
            list(fullgraph.nodesAnnotation[geneid].get(accession, {fullgraph.nodes[geneid]: None}))
            for geneid in geneIds
        ]

        # In the chain graph every gene annotation inherits the interval of the whole chain.
        coregraph.nodesAnnotation[nodeIdx][accession].update(
            {geneAnnotation: interval
             for names in annotationNames
             for geneAnnotation in names})

        # In the gene-level graph each gene annotation covers the span of its own name.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].update(
            {geneAnnotation: [(geneCumLengths[i], geneCumLengths[i+1]-1)]
             for i, names in enumerate(annotationNames)
             for geneAnnotation in names})

        # The chain itself spans the whole concatenated string.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].\
            update({nodeName:[(geneCumLengths[0], geneCumLengths[-1]-1)]})
print('')

In [None]:
# Write both graphs into `pathfileDir`: the chain graph with doSeq=False and the
# gene-level copy with doSeq=True.
chainGraphOut = os.path.sep.join([pathfileDir, 'coregraph_f2.1_Ref_v04.gfa'])
coregraph.toGFA(chainGraphOut, doSeq=False)
geneGraphOut = os.path.sep.join([pathfileDir, 'coregraph_genes_f2.1_Ref_v04.gfa'])
coregraph_genes.toGFA(geneGraphOut, doSeq=True)

In [None]:
# Point `coreGFApath` at the gene-level core graph file in `pathfileDir`.
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_f2.1_Ref_v04.gfa'
# Alternative input kept for reference:
# coreGFApath = f'{pathfileDir}{os.path.sep}coreGraph.gfa'

In [None]:
# Settings used by the Pantograph export cells that follow.
# Alternative zoom level sets are kept commented out for reference.
# zoomLevels = [1,10,20,100,500,1000,5000,10000,50000,100000,500000,1000000]
zoomLevels = [1,3,9,45,90,450,900,4500,9000]
# zoomLevels = [1,3,9,18]
# zoomLevels = [4,8,16]

# Passed as `isSeq` to the export call.
isSeq = True

# Passed to the export call under the same names.
maxLengthComponent = 100
maxLengthChunk = 16
invertionThreshold = 0.5
# Other inputs used previously:
# inputPath = '../../1001G/pantograph/data/shorttest2.gfa'
# inputPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly.gfa'
# inputPath = '../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa'
# inputPath = '../../1001G/pantograph/data/shorttest_seq.gfa'
# The graph to export is the one selected in `coreGFApath`.
inputPath = coreGFApath

In [None]:
# Run the requested zoom levels through adjustZoomLevels and derive the output
# path and name from the input path (with the '_new' suffix).
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(inputPath,suffix='_new')
# Show the resulting values.
outputPath,outputName,zoomLevels

In [None]:
# Obtain the Redis database number for `outputName` from the caseToDBID file
# (presumably a case-name -> db-id mapping; verify against getDBID), then
# connect to the Redis server reachable under the host name 'redis'.
dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=dbid)

In [None]:
# Export the graph at `inputPath` to Pantograph data using the settings defined above.
# The commented-out fragments show how the call was used to obtain debug data.
# zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides = \
exportToPantograph(inputPath=inputPath,
                   outputName=outputName,
                   outputPath=outputPath,
                   isSeq=isSeq,
                   redisConn=redisConn,
                   GenomeGraphParams={'accessionsToRemove':['Consensus']},
                   zoomLevels=zoomLevels,
                   maxLengthChunk=maxLengthChunk,
                   maxLengthComponent=maxLengthComponent,
                   # Keyword spelled as in the other export calls of this notebook
                   # (`inversionThreshold`); the local variable keeps its original name.
                   # NOTE(review): confirm against the exportToPantograph signature.
                   inversionThreshold=invertionThreshold,)
#                                              debug=True,returnDebugData=True)

In [None]:
# Send a notification through the `ntfy` command-line tool once the export has finished.
!ntfy send "Pantograph data generation for coregraph finished."

# Exporting to Pantograph visualisation

In [None]:
# Export the example gene graph as a Pantograph project.
projectID = 'paths_genegraph'
projectName = 'Example gene graph'
pathToGraphs = 'examples/gene_graph'
caseDict = {'Main': 'paths_genegraph.gfa'}
pathToIndex = 'examples/Visdata'

# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0

suffix = ''

maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1]
fillZoomLevel = True

# The call is a plain statement: a trailing ':' after it is a SyntaxError.
exportProject(projectID, projectName, caseDict, pathToIndex, pathToGraphs,
              redisHost = redisHost, redisPort = redisPort, redisDB = redisDB,
              suffix = suffix,
              maxLengthComponent = maxLengthComponent, maxLengthChunk = maxLengthChunk,
              inversionThreshold = inversionThreshold,
              isSeq = isSeq,
              zoomLevels = zoomLevels, fillZoomLevel = fillZoomLevel)

In [None]:
# Export the example nucleotide graph as a Pantograph project.
projectID = 'tutorial_graph'
projectName = 'Example nucleotide graph'
pathToGraphs = 'examples/nucleotide_graph'
caseDict = {'Main': 'paths_presentation.gfa'}
pathToIndex = 'examples/Visdata'

# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0

suffix = ''

maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
# The assignment had no value. True is assumed because this is the nucleotide
# (sequence) example, in contrast to the gene graph example which uses False.
# NOTE(review): confirm that paths_presentation.gfa carries sequences.
isSeq = True
zoomLevels = [1]
fillZoomLevel = True

# The call is a plain statement: a trailing ':' after it is a SyntaxError.
exportProject(projectID, projectName, caseDict, pathToIndex, pathToGraphs,
              redisHost = redisHost, redisPort = redisPort, redisDB = redisDB,
              suffix = suffix,
              maxLengthComponent = maxLengthComponent, maxLengthChunk = maxLengthChunk,
              inversionThreshold = inversionThreshold,
              isSeq = isSeq,
              zoomLevels = zoomLevels, fillZoomLevel = fillZoomLevel)

#| hide
# Adding gene data: batch processing of several chromosomes

#| hide

This is an old version of the procedure for adding subunits to a graph of larger units (e.g. taking a graph of conserved gene blocks and adding information about the individual genes to it).

In [None]:
#| hide
from copy import deepcopy
import joblib

from pangraph_constructor.utils import checkNodeLengthsFile, pathConvert


In [None]:
#| hide
# When True, the processing loop rebuilds and saves the per-chromosome core graphs
# before exporting them; when False only the export of existing files is run.
doCreateCoreGenes = True

In [None]:
#| hide
# For chromosomes 1-5: optionally rebuild the core (chain) graph with gene annotation
# and its gene-level copy, then export both graphs to Pantograph.
pathfileDir = '../../1001G/coreGraph/new_Nov2022'
maxLengthComponent = 100
maxLengthChunk = 16
# The export calls below read `inversionThreshold`; defining only the misspelled
# name made this cell depend on a value left over from another cell.
inversionThreshold = 0.5
invertionThreshold = inversionThreshold  # legacy spelling, kept for cells that still use it
zoomLevels = [1,3,9]
zoomLevels = adjustZoomLevels(zoomLevels)

if doCreateCoreGenes:
    # The chain -> gene-list file ("<chain>: <gene>,<gene>,...") is the same for every
    # chromosome, so it is parsed once instead of once per loop iteration.
    chainToGenesFile = 'chain2gene.txt'
    chainToListDict = {}
    with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
        for line in f:
            # A blank (e.g. trailing) line has no ':' and would break the unpacking below.
            if not line.strip():
                continue
            chainName, geneList = line.split(':', 1)
            chainToListDict[chainName] = geneList.strip().split(',')

for seqNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_Chr{seqNum:d}.gfa'
    coreGeneGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_genes_Chr{seqNum:d}.gfa'

    if doCreateCoreGenes:
        coreGeneNodeLengthsPath = f'{pathfileDir}{os.path.sep}nodeLengths_coregraph_v2_genes_Chr{seqNum:d}.dat'

        coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
        coregraph_genes = deepcopy(coregraph)

        fullGraphPath = f'../../1001G/pantograph/data/AT_Chr{seqNum:d}_OGOnly_2.1.gfa'
        fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

        # Name -> index of the first node with that name (what list.index() returns),
        # built once so that each gene lookup is O(1) instead of a scan over all nodes.
        # A gene missing from the full graph now raises KeyError instead of ValueError.
        geneNameToId = {}
        for geneId, geneName in enumerate(fullgraph.nodes):
            geneNameToId.setdefault(geneName, geneId)

        # One entry per core-graph node: number of member genes, at least 1.
        nodeLengths = []

        for nodeIdx,nodeName in enumerate(coregraph.nodes):
            print(f'\rNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
            # Gene names may carry a trailing '+', which is dropped before the lookup.
            geneList = [gene.rstrip('+') for gene in chainToListDict.get(nodeName, [])]
            geneIds = [geneNameToId[genename] for genename in geneList]
            nodeLengths.append(max(1,len(geneList)))

            for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
                interval = chainDict[nodeName]

                # Annotation names of each member gene for this accession; a gene that has
                # no annotation for the accession falls back to its own node name.
                annotationNames = [
                    list(fullgraph.nodesAnnotation[geneid].get(accession, {fullgraph.nodes[geneid]: None}))
                    for geneid in geneIds
                ]

                # In the chain graph the chain's own entry is replaced by its genes,
                # each of which inherits the interval of the whole chain.
                coregraph.nodesAnnotation[nodeIdx][accession].pop(nodeName,None)
                coregraph.nodesAnnotation[nodeIdx][accession].update(
                    {geneAnnotation: interval
                     for names in annotationNames
                     for geneAnnotation in names})

                # In the gene-level graph the i-th gene of the chain occupies position i.
                coregraph_genes.nodesAnnotation[nodeIdx][accession].update(
                    {geneAnnotation: [(i,i)]
                     for i, names in enumerate(annotationNames)
                     for geneAnnotation in names})
        print('')

        coregraph.toGFA(coreGFApath,doSeq=False)
        coregraph_genes.toGFA(coreGeneGFApath,doSeq=True)
        joblib.dump(nodeLengths,coreGeneNodeLengthsPath)

    #Exporting chain graph with annotation only
    isSeq = False
    inputPath = coreGFApath

    outputPath,outputName = pathConvert(inputPath,suffix='_new')

    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=0)

    exportToPantograph(inputPath=inputPath,
                       outputName=outputName,
                       outputPath=outputPath,
                       isSeq=isSeq,
                       redisConn=redisConn,
                       GenomeGraphParams={'accessionsToRemove':['Consensus']},
                       zoomLevels=zoomLevels,
                       fillZoomLevels=True,
                       maxLengthChunk=maxLengthChunk,
                       maxLengthComponent=maxLengthComponent,
                       inversionThreshold=inversionThreshold,)

    #Exporting chain graph with genes
    isSeq = False
    inputPath = coreGeneGFApath

    outputPath,outputName = pathConvert(inputPath,suffix='_new')

    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=0)

    # Node lengths for the gene-level graph are obtained via checkNodeLengthsFile.
    exportToPantograph(inputPath=inputPath,
                       outputName=outputName,
                       outputPath=outputPath,
                       isSeq=isSeq,
                       nodeLengths=checkNodeLengthsFile(inputPath),
                       redisConn=redisConn,
                       GenomeGraphParams={'accessionsToRemove':['Consensus']},
                       zoomLevels=zoomLevels,
                       fillZoomLevels=True,
                       maxLengthChunk=maxLengthChunk,
                       maxLengthComponent=maxLengthComponent,
                       inversionThreshold=inversionThreshold,)