In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
%load_ext line_profiler
%load_ext memory_profiler

### code profiling help

https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html

`%time`: Time the execution of a single statement  
`%timeit`: Time repeated execution of a single statement for more accuracy  
`%prun`: Run code with the profiler  
`%lprun`: Run code with the line-by-line profiler  
`%memit`: Measure the memory use of a single statement  
`%mprun`: Run code with the line-by-line memory profiler  


# pantograph_constructor

> This package provides all functionality from graph construction (currently from annotation; in the future, from raw assembled sequences) to graph processing (sorting, grouping, adjusting).

> It also provides a universal graph data model class, `GenomeGraph`, which implements graph I/O through the GFA v1 format (and, in the future, v2) as well as all operations on the graph.

## Install

Change into the library directory and run:

`pip install .`

## How to use

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
from redis import Redis

In [None]:
import cProfile
import pstats
from pstats import SortKey

In [None]:
import pdb

import os
import glob
import re
import time
import pdb
from random import shuffle
from copy import deepcopy
import joblib

import numpy as np
import networkx as nx

from pangraph_constructor.graph import GenomeGraph
from pangraph_constructor.synteny import generateOrder,readTransMap
from pangraph_constructor.tree import TremauxTree
from pangraph_constructor.utils import pathFileToPathDict,getDBID,resetDB
from pangraph_constructor.utils import iset_add,iset_score



In [None]:
from pangraph_constructor.utils import adjustZoomLevels,pathConvert
from pangraph_constructor.exportDev import exportToPantograph

In [None]:
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('always',category=RuntimeWarning)

# Generating graphs

## Generating from annotation

In [None]:
datadir = '../../1001G/annotations/freeze2.1'
gfadir = '../../1001G/pantograph/data'

In [None]:
annotationFiles = sorted(glob.glob(f'{datadir}{os.path.sep}*.gff'))
sequenceFiles = sorted(glob.glob(f'{datadir}{os.path.sep}sequences{os.path.sep}*.fasta'))
transMapFile = f'{datadir}{os.path.sep}TransMap{os.path.sep}TransMap.map'
refAnnotationFile = f'{datadir}{os.path.sep}outgroups/araport.gff'
refSequenceFile = f'{datadir}{os.path.sep}outgroups/araport.fasta'

In [None]:
refAnnotationFile,refSequenceFile,annotationFiles,sequenceFiles,transMapFile

In [None]:
ATmap = readTransMap(transMapFile)

In [None]:
fileOrder = generateOrder(annotationFiles,priorityAccession=None)
fileOrder

In [None]:
notebook2script()

In [None]:
seqID = 'Chr1'
gfaFilename = f'AT_{seqID}_OGOnly_2.1.gfa'
seqSuffix = f'_{seqID}'

In [None]:
# Load the unordered graph of the chromosome selected in the previous cell
# (file name prefixed with `unordered_`).
graph = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}unordered_{gfaFilename}',isGFASeq=False)
# Time only the sorting step.
curtst = time.time()
graph.treeSort()
print(f'Sorting graph took {time.time() - curtst} seconds')

# Sanity check: the computed order must have exactly one entry per node.
assert len(graph.nodes)==len(graph.order)

# Save the sorted graph under the un-prefixed file name in the same directory.
# NOTE(review): doSeq=False presumably skips writing sequences — confirm in GenomeGraph.toGFA.
graph.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=False)

In [None]:
# Sort the previously generated (unordered) gene graph of every chromosome
# and write the sorted graph next to it.
doUS = False  # only read by the disabled graph-generation code below

for seqID in ['Chr1','Chr2','Chr3','Chr4','Chr5']:
    
    print(f'\nProcessing {seqID}\n============')
    
    gfaFilename = f'AT_{seqID}_OGOnly_2.1.gfa'
    seqSuffix = f'_{seqID}'
    
    # Disabled: generating the graph from annotation. Kept as a record of how
    # the `unordered_*.gfa` files loaded below were produced.
#     curtst = time.time()
#     graph = GenomeGraph(annotationFiles=annotationFiles,
#                         sequenceFiles=None,
#                         fileOrder=fileOrder,
#                         doUS=doUS,
#                         seqSuffix=seqSuffix,
#                         refAnnotationFile=refAnnotationFile,
#                         refAccession='TAIR10',
#                         transMap=ATmap)
#     print(f'Generating graph took {time.time() - curtst} seconds')
    
#     graph.toGFA(f'{gfadir}{os.path.sep}unordered_{gfaFilename}',doSeq=False)
    
    # Load the unordered graph from disk instead of regenerating it.
    graph = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}unordered_{gfaFilename}',isGFASeq=False)
    
    # The timer covers treeSort() only, so the message reports sorting time
    # (it previously said "Generating graph", copied from the block above).
    curtst = time.time()
    graph.treeSort()
    print(f'Sorting graph took {time.time() - curtst} seconds')
    # Sanity check: the computed order must have exactly one entry per node.
    assert len(graph.nodes)==len(graph.order)
    
    
    graph.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=False)

In [None]:
curT = time.localtime()
message = f"Generating gene graphs for all chromosome finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"

## Loading Pathfile to graph

In [None]:
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
pathfileDir = '../../1001G/coreGraph'

# Build, sort and save one core graph per path file (paths1.txt ... paths5.txt).
for seqNum in range(1,6):

    pathsfile = f'paths{seqNum:d}.txt'
    pathsByAccession = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True,'TAIR10')

    # Re-insert the paths in sorted accession order, then drop the unsorted dict.
    paths = {accession: pathsByAccession[accession] for accession in sorted(pathsByAccession)}
    del pathsByAccession

    coregraph = GenomeGraph(pathsDict=paths)

    # TODO: sorting is not optimal on an undirected coregraph — check the resulting order!
    coregraph.treeSort()

    # Sanity check: the computed order must have exactly one entry per node.
    assert len(coregraph.order)==len(coregraph.nodes)

    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{seqNum:d}.gfa'
    coregraph.toGFA(coreGFApath,False)

Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 34/35
Calculating nodes length...
Processing node 19/19
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 18/19
Calculating nodes length...
Processing node 26/26
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 25/26
Calculating nodes length...
Processing node 15/15
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Constructing Tremaux tree
P

In [None]:
curT = time.localtime()
message = f"Generating chain graphs for all chromosome finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"

## Loading test sorting graph

In [None]:
pathfileDir = '../../1001G/pantograph/data'
pathsfile = 'testCollapse_path.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True)

In [None]:
coregraph = GenomeGraph(pathsDict=paths,nodeNameLength=4)

In [None]:
coregraph.treeSort()

In [None]:
coregraph.tremauxTree.draw()

In [None]:
len(coregraph.nodes),len(coregraph.order)

In [None]:
coregraph.nodes,coregraph.order

In [None]:
[coregraph.nodes[i-1] for i in coregraph.order]

In [None]:
coreGFApath = f'{pathfileDir}{os.path.sep}testCollapse.gfa'

In [None]:
coregraph.toGFA(coreGFApath,False)

# Loading graph from GFA

## Large graph

In [None]:
notebook2script()

In [None]:
f'{gfadir}{os.path.sep}{gfaFilename}'

In [None]:
graph_new = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}{gfaFilename}',isGFASeq=False)

In [None]:
graph_new.treeSort()

In [None]:
assert len(graph_new.nodes)==len(graph_new.order)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename

In [None]:
gfadir

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=False)

In [None]:
!ntfy send "Sorting gene graph finished!"

## Large graph with unrelated sequences and real sequences

In [None]:
graph_new2 = GenomeGraph(gfaPath='./test_long_graph_full_seq.gfa',isGFASeq=True)

In [None]:
len(graph_new2.accessions)

In [None]:
graph_new2.treeSort()

In [None]:
len(graph_new2.order),len(graph_new2.nodes),len(graph_new2.tremauxTree),len(graph_new2.tremauxTree.originalGraph)

In [None]:
gfaFilename

In [None]:
gfadir

In [None]:
gfaFilename = 'AT_Chr1_OGOnly_Seq.gfa'

In [None]:
graph_new2.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

## Christian's test graph

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/graphSorting/graphs/sixRef.panGenome.Chroms.pggb.Chr2.5mb.gfa',isGFASeq=True)

In [None]:
len(graph_new.accessions)

In [None]:
cProfile.run('graph_new.treeSort()','treesort_profile')

In [None]:
p = pstats.Stats('treesort_profile')
p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats()

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename='testChristian_new.gfa'

In [None]:
gfadir

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

In [None]:
shuffle(graph_new.order)

In [None]:
gfaFilename='testChristian_shuffled.gfa'

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/pantograph/data/testChristian_shuffled.gfa',isGFASeq=True)

## Christian's full graph (Chr1)

In [None]:
graph_new = GenomeGraph(gfaPath='../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa',
                        isGFASeq=True,
                       doOverlapCleaning=False,
                       accessionsToRemove=['Consensus'])

In [None]:
len(graph_new.accessions)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
graph_new.treeSort()

In [None]:
assert len(graph_new.nodes)==len(graph_new.order)

In [None]:
len(graph_new.order),len(graph_new.nodes),len(graph_new.tremauxTree),len(graph_new.tremauxTree.originalGraph)

In [None]:
gfaFilename='Christian_chr1.gfa'

In [None]:
gfadir='../../1001G/chrisGraph/'

In [None]:
graph_new.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=True)

In [None]:
!ntfy send "Sorting nucleotide graph finished!"

## Small simple graph

In [None]:
graph2 = GenomeGraph(gfaPath='./shorttest.gfa',isGFASeq=False)

In [None]:
graph2.forwardLinks

In [None]:
graph2.treeSort()

In [None]:
graph2.tremauxTree.draw_original()

In [None]:
graph2.tremauxTree.draw()

In [None]:
graph2.order

In [None]:
graph2.toGFA('../../1001G/pantograph/data/shorttest1.gfa',doSeq=False)

## Small graph with complex loops

In [None]:
graph3 = GenomeGraph(gfaPath='./shorttest2.gfa',isGFASeq=False)

In [None]:
graph3.paths

In [None]:
graph3.treeSort()

In [None]:
pdb.pm()

In [None]:
graph3.tremauxTree.draw_original()

In [None]:
graph3.tremauxTree.draw()

In [None]:
len(graph3.order),len(graph3.nodes)

In [None]:
graph3.toGFA('../../1001G/pantograph/data/shorttest2.gfa',doSeq=False)

## Small graph with very complex relationships

In [None]:
graph4 = GenomeGraph(gfaPath='./shorttest3.gfa',isGFASeq=False)

In [None]:
graph4.paths

In [None]:
graph4.treeSort()

In [None]:
graph4.tremauxTree.draw_original()

In [None]:
graph4.tremauxTree.draw()

In [None]:
graph4.order

In [None]:
graph4.toGFA('../../1001G/pantograph/data/shorttest3.gfa',doSeq=False)

## Small graph with self-loops

In [None]:
graph5 = GenomeGraph(gfaPath='./shorttest_loop.gfa',isGFASeq=False)

In [None]:
graph5.paths

In [None]:
graph5.treeSort()

In [None]:
graph5.tremauxTree.draw_original()

In [None]:
graph5.tremauxTree.draw()

In [None]:
graph5.order

In [None]:
graph5.toGFA('../../1001G/pantograph/data/shorttest4.gfa',doSeq=False)

# Exporting to Front-end

In [None]:
notebook2script()

### Exporting presentation graph

In [None]:
pathfileDir = '../../Meetings/1001G+_20220518/'
# pathfileDir already ends with '/', so appending os.path.sep produced a
# doubled separator ('...20220518//paths_presentation.gfa'). os.path.join
# only inserts a separator when one is missing.
coreGFApath = os.path.join(pathfileDir,'paths_presentation.gfa')

In [None]:
genome = GenomeGraph(coreGFApath,isGFASeq=True)

Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths



In [None]:
genome.nodesData

['ATGCC', 'GCACGTTC', 'ATTTGCCCAA', 'AG', 'CTGAA', 'ATCCCA', 'GCATTCA']

In [None]:
# Reload the presentation graph and make every existing annotation entry
# cover its whole node, i.e. the single interval (0, node length - 1).
genome = GenomeGraph(coreGFApath,isGFASeq=True)

for nodeID,nodeAnnotation in enumerate(genome.nodesAnnotation):
    for seqDict in nodeAnnotation.values():
        # Only existing keys are reassigned, so the dict size is unchanged
        # while it is being iterated.
        for annText in seqDict:
            seqDict[annText] = [(0,len(genome.nodesData[nodeID])-1)]

# Write the result back to the same GFA file the graph was loaded from.
genome.toGFA(coreGFApath,doSeq=True)

Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths



In [None]:
# genome = GenomeGraph(coreGFApath,isGFASeq=True)
genome.nodesAnnotation

[{}, {}, {}, {}, {}, {}, {}]

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,2,4,8,16]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../Meetings/1001G+_20220518', 'paths_presentation_new', [1, 2, 4, 8, 16])

In [None]:
redisConn=None

In [None]:
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:

#%%capture output
# Export the presentation graph and measure the wall-clock time of the call.
# The return value is discarded; the commented-out lines below are the
# unpacking targets used when returnDebugData=True is passed.
# NOTE(review): the saved post-mortem output that follows this cell shows the
# call stopping inside exportToPantograph at `max(blockEdges.values())` with
# `blockEdges == {}` — the empty case needs handling in exportDev.py.
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
#                 invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
# If the call raises, runTime is not updated and keeps any earlier value.
runTime = time.time() - startTime

Loading Genome
Loading graph from ../../Meetings/1001G+_20220518//paths_presentation.gfa
Found nodeNames file ../../Meetings/1001G+_20220518/nodeNames_paths_presentation.json, loading names.
Found node annotation file ../../Meetings/1001G+_20220518/annotation_paths_presentation.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to ../../Meetings/1001G+_20220518/paths_presentation_new
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 2/2
Preprocessing interconnected links finished.

Processing path breaks fin

In [None]:
import pdb
pdb.pm()

> [0;32m/data/YandexDisk/Kew/src/graphConstruction/pangraph_constructor/exportDev.py[0m(3902)[0;36mexportToPantograph[0;34m()[0m
[0;32m   3900 [0;31m    [0;32mif[0m [0mfillZoomLevels[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3901 [0;31m        [0mmaxLinkLength[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mlinkLengths[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3902 [0;31m        [0mmaxRearrangementLength[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mblockEdges[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3903 [0;31m        [0mmaxBlock[0m [0;34m=[0m [0mmax[0m[0;34m([0m[0mmaxLinkLength[0m[0;34m,[0m[0mmaxRearrangementLength[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3904 [0;31m[0;34m[0m[0m
[0m


ipdb>  blockEdges


{}


ipdb>  linkLengths


{25: {(3, 1), (4, 5), (1, 4)}, 36: {(6, 1), (5, 6)}}


ipdb>  q


### Testing removable elements identification

In [None]:
from pangraph_constructor.exportDev import getRemovableStructures

In [None]:
import IPython.display as ipd
import joblib

In [None]:
# Reference results of the removable-structure tests: loaded from disk when
# the file exists, otherwise an empty dict.
correctResults = (
    joblib.load('./tests/breakIdentify.dat')
    if os.path.exists('./tests/breakIdentify.dat')
    else {}
)

In [None]:
notebook2script()

In [None]:
path = '../../1001G/GraphCollapsing/TestGraphs'
filePrefix = 'test'

In [None]:
#test

# Regression check: for every stored test case, reload the graph, recompute
# the removable structures and compare them with the stored reference values.
for filename,resDict in correctResults.items():
    print(f'\n####### Testing on case {filename} ########')
    graph = GenomeGraph(f'{path}/{filename}',isGFASeq=False)
    # The fourth returned value is not compared.
    linksLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)
    # A failing assert stops the loop at the first mismatching case; the
    # banner printed above identifies which file it was.
    assert linksLengths==resDict['linksLengths']
    assert pairedLinks==resDict['pairedLinks']
    assert blockEdges==resDict['blockEdges']

In [None]:
# Inspect one test graph: show its paths and the structures returned by
# getRemovableStructures.
caseNum = 17
filename = f'{filePrefix}{caseNum:02d}.gfa'
coreGFApath = f'{path}/{filename}'

print('############')
print(f'Graph from file {filename}')
graph = GenomeGraph(coreGFApath,isGFASeq=False)
print('Graph Paths:')
ipd.display(graph.paths)

# The fourth returned value is not used here.
linkLengths, pairedLinks, blockEdges, _ = getRemovableStructures(graph=graph)

# Print each heading followed by the rich display of the matching result.
for heading,result in (('Link-Lengths associations:',linkLengths),
                       ('PairedLinks:',pairedLinks),
                       ('Rearrangemenet block edges:',blockEdges)):
    print(heading)
    ipd.display(result)

In [None]:
correctResults[filename]

In [None]:
correctResults[filename] = {'linksLengths':linkLengths,'pairedLinks':pairedLinks,'blockEdges':blockEdges}

In [None]:
joblib.dump(correctResults,'./tests/breakIdentify.dat')

### Exporting test collapse graph

In [None]:
pathfileDir = '../../1001G/pantograph/data'
coreGFApath = f'{pathfileDir}{os.path.sep}testCollapse.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4]#,8,16,32]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
redisConn=None

In [None]:
#%%capture output
# Export the test collapse graph with returnDebugData=True and unpack the 16
# returned debug structures into notebook globals; the call is timed.
# Note that `graph` is rebound by this unpacking.
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
[zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,zoomAccStarts,zoomAccEnds, 
                invertedStarts,invertedEnds,toComponentLinks,fromComponentLinks,collapsibleBlocks,fromLinks,toLinks,graph,rootStruct] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels,
                                fillZoomLevels = True,
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold,
                                returnDebugData=True)
runTime = time.time() - startTime

In [None]:
print(f'Executed in {runTime} seconds')

In [None]:
!ntfy send "Exporting test collapse graph finished. Overall time = {runTime} seconds"

### Exporting coregraph

In [None]:
pathfileDir = '../../1001G/coreGraph'
coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr1.gfa'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4,8,16]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
outputPath,outputName,zoomLevels

('../../1001G/coreGraph', 'coregraph_Chr1_new', [1, 2, 4, 8, 16])

In [None]:
# dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
# print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=0)

In [None]:
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
#%%capture output
# Export the Chr1 coregraph using the Redis connection opened above and print
# the elapsed wall-clock time in seconds. The return value is discarded.
# Unlike the other export cells, fillZoomLevels is not passed here.
startTime = time.time()
# [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                outputPath=outputPath, outputName=outputName,
                                isSeq=isSeq,
                                redisConn=redisConn,
                                zoomLevels=zoomLevels, 
                                maxLengthComponent=maxLengthComponent, 
                                maxLengthChunk=maxLengthChunk, 
                                inversionThreshold=inversionThreshold)
#                                 returnDebugData=True)
runTime = time.time() - startTime
print(runTime)

Loading Genome
Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/coreGraph/coregraph_Chr1_new
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 16/16
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 34/34
Conversion finished.
Reformating links to block lengths associations 34/34
Refo

In [None]:
!ntfy send "Exporting coregraph finished. Overall time = {runTime} seconds"



❗️❗️❗️ TODO: Next, test the API. When it works, change the front end to fetch the JSONs without annotation and then load the annotation from the API!

In [None]:
graph = GenomeGraph(coreGFApath,isGFASeq=False)

Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths



In [None]:
graph.nodesAnnotation[0]

{'10002': {'0001': [(0, 3)],
  'OG0002930': [(0, 3)],
  'AT1G01010': [(0, 3)],
  'OG0002309': [(0, 3)],
  'AT1G01020': [(0, 3)],
  'OG0003917': [(0, 3)],
  'AT1G01040': [(0, 3)],
  'OG0003918': [(0, 3)],
  'AT1G01050': [(0, 3)],
  'OG0002310': [(0, 3)],
  'AT1G01060': [(0, 3)],
  'OG0002931': [(0, 3)],
  'AT1G01080': [(0, 3)],
  'OG0003919': [(0, 3)],
  'AT1G01090': [(0, 3)],
  'OG0000234': [(0, 3)],
  'AT4G00810': [(0, 3)],
  'AT5G24510': [(0, 3)],
  'AT5G47700': [(0, 3)],
  'AT1G01100': [(0, 3)],
  'OG0003920': [(0, 3)],
  'AT1G01110': [(0, 3)],
  'OG0003921': [(0, 3)],
  'AT1G01120': [(0, 3)],
  'OG0000849': [(0, 3)],
  'AT5G47170': [(0, 3)],
  'AT1G01130': [(0, 3)],
  'OG0002932': [(0, 3)],
  'AT1G01140': [(0, 3)],
  'OG0006742': [(0, 3)],
  'AT1G01150': [(0, 3)],
  'OG0006743': [(0, 3)],
  'AT1G01160': [(0, 3)],
  'OG0000850': [(0, 3)],
  'AT4G00860': [(0, 3)],
  'AT1G01170': [(0, 3)],
  'OG0006744': [(0, 3)],
  'AT1G01180': [(0, 3)],
  'OG0006745': [(0, 3)],
  'AT1G01190': [(0, 3

In [None]:
from pangraph_constructor.utils import iset_get,iset_score,iset_add

In [None]:
redisConn = Redis(host='redis',port=6379,db=0)

In [None]:
iset_get(redisConn,'test')

{'a_0': (1.0, 3.0), 'b_0': (1.0, 5.0)}

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.1.10024.Pos',22)
# coregraph_Chr1_new/10024/1/22

['23']

In [None]:
iset_score(redisConn,'coregraph_Chr1_new.9543.Gene',50,50)

['3370',
 'AT1G15940',
 'AT1G15950',
 'AT1G16000',
 'AT1G16010',
 'AT1G16022',
 'AT1G62830',
 'AT1G62840',
 'OG0001502',
 'OG0004404',
 'OG0007340',
 'OG0007341',
 'OG0007343',
 'OG0008435',
 'OG0015400',
 'OG0017985',
 'OG0022841',
 'OG0024046',
 'OG0024055',
 'OG0025924',
 'OG0027925',
 'OG0030893']

In [None]:
iset_get(redisConn,'coregraph_Chr1_new.9543.Gene','3370_0')

{'3370_0': (48.0, 51.0)}

### Exporting coregraph with genes

In [None]:
notebook2script()

In [None]:
pathfileDir = '../../1001G/coreGraph'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,3,9]# + [9*2**i for i in range(9)]
# zoomLevels = [2**i for i in range(12)]#  [1,2,4,8,16,32,9*16,9*32,9*128,9*256]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
# %%capture output
# Export the gene-annotated coregraph of each chromosome (Chr1..Chr5), each
# into the Redis database returned by getDBID for its output name.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_Chr{chrNum}.gfa'
    
    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
    
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)
    
    startTime = time.time()
    # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
    exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                    outputPath=outputPath, outputName=outputName,
                                    isSeq=isSeq,
                                    redisConn=redisConn,
                                    zoomLevels=zoomLevels,
                                    fillZoomLevels = True,
                                    maxLengthComponent=maxLengthComponent, 
                                    maxLengthChunk=maxLengthChunk, 
                                    inversionThreshold=inversionThreshold)
    #                                 returnDebugData=True)
    runTime = time.time() - startTime
    
    # Report the per-chromosome time; runTime was previously overwritten on
    # every iteration without being shown (the coregraph loop without genes
    # already prints it).
    print(f'Exporting core graph with genes for Chr{chrNum} took {runTime} seconds')

In [None]:
curT = time.localtime(time.time()+3600)
message = f"Exporting core graph with genes for all chromosomes finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n"
!ntfy send "{message}"

In [None]:
notebook2script()

In [None]:
zoomLevels = [1,2,4]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)
# outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
# outputPath,outputName,zoomLevels

In [None]:
#%%capture output2
# Export the coregraph (file names without `genes`) of each chromosome
# (Chr1..Chr5), each into the Redis database returned by getDBID for its
# output name, and print the time taken per chromosome.
# isSeq, maxLengthComponent, maxLengthChunk and inversionThreshold are not set
# in this section; they keep whatever values earlier cells assigned.
for chrNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{chrNum}.gfa'
    
    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')
    
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)
    
    startTime = time.time()
    # [zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toComponentLinks,fromComponentLinks,graph] = \
    exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                    outputPath=outputPath, outputName=outputName,
                                    isSeq=isSeq,
                                    redisConn=redisConn,
                                    zoomLevels=zoomLevels,
                                    fillZoomLevels = True,
                                    maxLengthComponent=maxLengthComponent, 
                                    maxLengthChunk=maxLengthChunk, 
                                    inversionThreshold=inversionThreshold)
    #                                 returnDebugData=True)
    runTime = time.time() - startTime
    
    print(f'Exporting core graph for Chr{chrNum} took {runTime} seconds')

In [None]:
curT = time.localtime(time.time()+3600)
message = f"Exporting core graph with genes for all chromosomes finished at \
 {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!\n"
!ntfy send "{message}"

In [None]:
!ntfy send "Exporting coregraph with genes finished. Overall time = {runTime} seconds"

### Exporting gene graphs for all chromosomes

In [None]:
pathfileDir = '../../1001G/pantograph/data'

In [None]:
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,3,9]# + [9*2**i for i in range(12)]

In [None]:
zoomLevels = adjustZoomLevels(zoomLevels)

In [None]:
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
# Export the sorted gene graph of each listed chromosome (currently only Chr1;
# the others are commented out in the list) and print the time taken.
# NOTE(review): later cells read zoomLinkLengths, zoomFromComponentLinks,
# initialLinkLengths, etc. No assignment of those names is visible in this
# part of the notebook other than the commented-out unpacking below, so on a
# fresh kernel they are undefined unless returnDebugData=True and the
# unpacking are re-enabled.
for seqID in ['Chr1']:#,'Chr2','Chr3','Chr4','Chr5']:

    coreGFApath = f'{pathfileDir}{os.path.sep}AT_{seqID}_OGOnly_2.1.gfa'

    outputPath,outputName = pathConvert(coreGFApath,suffix='_new')

    # Each output name is exported into the Redis database returned by getDBID.
    dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=dbid)

    #%%capture output
    startTime = time.time()
    # initialLinkLengths, initialPairedLinks, initialInterconnectedLinks, initialBlockEdges, \
    # zoomNodeToComponent,zoomComponentToNodes,zoomComponents,\
    # zoomFromComponentLinks, zoomToComponentLinks, zoomLinkLengths, zoomPairedLinks, zoomInterconnectedLinks, \
    # zoomOldToNewRemoval, zoomNewToOldRemoval, \
    # zoomLinkLengthsRemoval, zoomPairedLinksRemoval, zoomInterconnectedLinksRemoval, zoomBlockEdgesRemoval, \
    # zoomFromComponentLinksRemoval, zoomToComponentLinksRemoval, \
    # graph = \
    exportToPantograph(inputPath=coreGFApath, GenomeGraphParams={}, 
                                    outputPath=outputPath, outputName=outputName,
                                    isSeq=isSeq,
                                    redisConn=redisConn,
                                    zoomLevels=zoomLevels, 
                                    fillZoomLevels = True,
                                    maxLengthComponent=maxLengthComponent, 
                                    maxLengthChunk=maxLengthChunk, 
                                    inversionThreshold=inversionThreshold)#,
                                    # returnDebugData=True)
    runTime = time.time() - startTime
    
    print(f'Exporting gene graph for {seqID} took {runTime} seconds')

Opening Redis connection for db 0
Loading Genome
Loading graph from ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa
Found node annotation file ../../1001G/pantograph/data/annotation_AT_Chr1_OGOnly_2.1.dat, loading associations.
Loading segment 7455/7455
Loading segments finished.
Loading link 14001/14001
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1_new
Calculating nodes length...
Processing node 7455/7455
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 746/746
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 6



Converting rearrangement blocks

Recording component 2543/2543
Recording zoom level 9216 finished.
Removing links according to collapsible blocks
All links associated with collapsibleComponents <18432 were removed.     21 components were deleted as isolated.

Zoom level 18432
Processing component 2522/2522
Processing component finished.
Converting link to block lengths associations
Converting paired links
Converting interconnected links
Converting rearrangement blocks

Recording component 2475/2475
Recording zoom level 18432 finished.
Removing links according to collapsible blocks
All links associated with collapsibleComponents <36864 were removed.     83 components were deleted as isolated.

Zoom level 36864
Processing component 2392/2392
Processing component finished.
Converting link to block lengths associations
Converting paired links
Converting interconnected links




Converting rearrangement blocks

Recording component 2254/2254
Recording zoom level 36864 finished.
Removing links according to collapsible blocks
All links associated with collapsibleComponents <73728 were removed.     2200 components were deleted as isolated.

Zoom level 73728
Processing component 54/54
Processing component finished.
Converting link to block lengths associations
Converting paired links
Converting interconnected links
Converting rearrangement blocks

Recording component 7/7
Recording zoom level 73728 finished.
Exporting gene graph for Chr1 took 3748.841618537903 seconds


In [None]:
zoomLinkLengths[4608][13509]

{(150, 795), (151, 148)}

In [None]:
zoomFromComponentLinks[4608][151]

{'+': {152: {'+': {0,
    1,
    2,
    3,
    4,
    5,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27}},
  970: {'+': {6}}}}

In [None]:
curT = time.localtime()
message = f"Exporting gene graph for all chromosomes finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"



In [None]:
import joblib

In [None]:
# Snapshot every intermediate structure of the export run so that the debugging cells
# can be re-run later without repeating the export.
# joblib.dump does not create missing directories, so make sure the target exists first.
os.makedirs('dumps', exist_ok=True)
joblib.dump(initialLinkLengths,'dumps/initialLinkLengths.dump')
joblib.dump(initialPairedLinks,'dumps/initialPairedLinks.dump')
joblib.dump(initialInterconnectedLinks,'dumps/initialInterconnectedLinks.dump')
joblib.dump(initialBlockEdges,'dumps/initialBlockEdges.dump')
joblib.dump(zoomNodeToComponent,'dumps/zoomNodeToComponent.dump')
joblib.dump(zoomComponentToNodes,'dumps/zoomComponentToNodes.dump')
joblib.dump(zoomComponents,'dumps/zoomComponents.dump')
joblib.dump(zoomFromComponentLinks,'dumps/zoomFromComponentLinks.dump')
joblib.dump(zoomToComponentLinks,'dumps/zoomToComponentLinks.dump')
joblib.dump(zoomLinkLengths,'dumps/zoomLinkLengths.dump')
joblib.dump(zoomPairedLinks,'dumps/zoomPairedLinks.dump')
joblib.dump(zoomInterconnectedLinks,'dumps/zoomInterconnectedLinks.dump')
joblib.dump(zoomOldToNewRemoval,'dumps/zoomOldToNewRemoval.dump')
joblib.dump(zoomNewToOldRemoval,'dumps/zoomNewToOldRemoval.dump')
joblib.dump(zoomLinkLengthsRemoval,'dumps/zoomLinkLengthsRemoval.dump')
joblib.dump(zoomPairedLinksRemoval,'dumps/zoomPairedLinksRemoval.dump')
joblib.dump(zoomInterconnectedLinksRemoval,'dumps/zoomInterconnectedLinksRemoval.dump')
joblib.dump(zoomBlockEdgesRemoval,'dumps/zoomBlockEdgesRemoval.dump')
joblib.dump(zoomFromComponentLinksRemoval,'dumps/zoomFromComponentLinksRemoval.dump')
joblib.dump(zoomToComponentLinksRemoval,'dumps/zoomToComponentLinksRemoval.dump')
# Last expression of the cell: its return value (list of written files) is displayed.
joblib.dump(graph,'dumps/graph.dump')

['dumps/graph.dump']

In [None]:
# Reload the debugging snapshots from the dumps/ directory, one structure per cell,
# so that individual structures can be restored independently.
# NOTE(review): joblib.load unpickles its input - only load dump files produced locally.
initialLinkLengths = joblib.load('dumps/initialLinkLengths.dump')

In [None]:
initialPairedLinks = joblib.load('dumps/initialPairedLinks.dump')

In [None]:
initialInterconnectedLinks = joblib.load('dumps/initialInterconnectedLinks.dump')

In [None]:
initialBlockEdges = joblib.load('dumps/initialBlockEdges.dump')

In [None]:
zoomNodeToComponent = joblib.load('dumps/zoomNodeToComponent.dump')

In [None]:
zoomComponentToNodes = joblib.load('dumps/zoomComponentToNodes.dump')

In [None]:
zoomComponents = joblib.load('dumps/zoomComponents.dump')

In [None]:
zoomFromComponentLinks = joblib.load('dumps/zoomFromComponentLinks.dump')

In [None]:
zoomToComponentLinks = joblib.load('dumps/zoomToComponentLinks.dump')

In [None]:
zoomLinkLengths = joblib.load('dumps/zoomLinkLengths.dump')

In [None]:
zoomPairedLinks = joblib.load('dumps/zoomPairedLinks.dump')

In [None]:
zoomInterconnectedLinks = joblib.load('dumps/zoomInterconnectedLinks.dump')

In [None]:
zoomOldToNewRemoval = joblib.load('dumps/zoomOldToNewRemoval.dump')

In [None]:
zoomNewToOldRemoval = joblib.load('dumps/zoomNewToOldRemoval.dump')

In [None]:
zoomLinkLengthsRemoval = joblib.load('dumps/zoomLinkLengthsRemoval.dump')

In [None]:
zoomPairedLinksRemoval = joblib.load('dumps/zoomPairedLinksRemoval.dump')

In [None]:
zoomInterconnectedLinksRemoval = joblib.load('dumps/zoomInterconnectedLinksRemoval.dump')

In [None]:
zoomBlockEdgesRemoval = joblib.load('dumps/zoomBlockEdgesRemoval.dump')

In [None]:
zoomFromComponentLinksRemoval = joblib.load('dumps/zoomFromComponentLinksRemoval.dump')

In [None]:
zoomToComponentLinksRemoval = joblib.load('dumps/zoomToComponentLinksRemoval.dump')

In [None]:
graph = joblib.load('dumps/graph.dump')

In [None]:
# The problematic link under investigation, given as a pair of IDs at the highest zoom level.
# The following cells overwrite probLink as the link is traced down through the zoom levels.
probLink = (151,148)

In [None]:
# Iterating a mapping yields its keys, so the zoom levels can be taken from the dicts directly.
# The highest level comes from the "removal" table, the full descending list from the main table.
highestZoomLevel = max(zoomLinkLengthsRemoval)
otherZoomLevels = sorted(zoomLinkLengths, reverse=True)

In [None]:
# Header for the first step of the trace (highest zoom level, after link removal).
print(f'Zoom level {highestZoomLevel}')
print(f'After removal {probLink} associated with:')

Zoom level 9216
After removal (151, 148) associated with:


In [None]:
# Print every block length whose link collection contains the traced link
# (highest zoom level, post-removal table).
for blLen in (length
              for length, links in zoomLinkLengthsRemoval[highestZoomLevel].items()
              if probLink in links):
    print(blLen)

13509


In [None]:
# comp zl = 36
# Translate both ends of the link through zoomNewToOldRemoval of the highest zoom level.
# Each end is shifted by -1 before the lookup and by +1 afterwards
# (presumably 1-based IDs in probLink vs 0-based lookup - TODO confirm).
probLink = tuple(zoomNewToOldRemoval[highestZoomLevel][linkEnd-1]+1 for linkEnd in probLink)

In [None]:
# Follow the traced link through the zoom levels in descending order (the last one is
# skipped). At each level, print the block lengths it is filed under before and after
# link removal, translating the link's IDs between the two tables and on to the next level.
for zl in otherZoomLevels[:-1]:
    print(f'Zoom level {zl}')
    print(f'After update {probLink} associated with:')
    for blLen in (length for length, links in zoomLinkLengths[zl].items() if probLink in links):
        print(blLen)

    # Replace each end by (first entry of zoomComponentToNodes for that end) + 1.
    probLink = tuple(zoomComponentToNodes[zl][linkEnd-1][0]+1 for linkEnd in probLink)

    print(f'After Removal {probLink} associated with:')
    for blLen in (length for length, links in zoomLinkLengthsRemoval[zl].items() if probLink in links):
        print(blLen)

    # An empty new-to-old table leaves the IDs as they are for this level.
    if len(zoomNewToOldRemoval[zl])>0:
        probLink = tuple(zoomNewToOldRemoval[zl][linkEnd-1]+1 for linkEnd in probLink)

Zoom level 4608
After update (151, 148) associated with:
13509
After Removal (152, 149) associated with:
13509
Zoom level 2304
After update (152, 149) associated with:
13509
After Removal (155, 152) associated with:
13509
Zoom level 1152
After update (155, 152) associated with:
13509
After Removal (161, 158) associated with:
13509
Zoom level 576
After update (161, 158) associated with:
13509
After Removal (163, 160) associated with:
13509
Zoom level 288
After update (163, 160) associated with:
13509
After Removal (169, 166) associated with:
13509
Zoom level 144
After update (169, 166) associated with:
13509
After Removal (171, 168) associated with:
13509
Zoom level 72
After update (171, 168) associated with:
13509
After Removal (171, 168) associated with:
13509
Zoom level 36
After update (171, 168) associated with:
13509
After Removal (178, 175) associated with:
13509
Zoom level 18
After update (178, 175) associated with:
13509
After Removal (206, 202) associated with:
13509
Zoom level

In [None]:
# Links filed under key 13509 at zoom level 4608 (the traced link should be among them).
zoomLinkLengths[4608][13509]

{(150, 795), (151, 148)}

In [None]:
# Links recorded as interconnected with link (151, 148) at zoom level 4608.
zoomInterconnectedLinks[4608][(151,148)]

{(150, 795)}

In [None]:
# Entry 969 of the component-to-nodes table at zoom level 4608.
zoomComponentToNodes[4608][969]

[979]

In [None]:
# New-to-old ID translation (removal step) for entry 979 at zoom level 4608.
zoomNewToOldRemoval[4608][979]

982

In [None]:
# Entry 982 of the node-to-component table at zoom level 4608.
zoomNodeToComponent[4608][982]

[972]

In [None]:
# Show the "to" and "from" link records of a single component at a single zoom level.
zl, compID = 144, 169
try:
    incomingLinks = zoomToComponentLinks[zl][compID]
except KeyError:
    # No "to" record for this zoom level / component.
    print('Not found')
else:
    print(incomingLinks)

print(zoomFromComponentLinks[zl][compID])

{'-': {170: {'+': {7}}}}
{'-': {166: {'+': {7}}}, '+': {170: {'+': {0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}}}}


In [None]:
# "To" link records of component 206 at zoom level 9.
zoomToComponentLinks[9][206]

{'+': {202: {'+': {0,
    1,
    2,
    3,
    4,
    5,
    6,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27}}},
 '-': {207: {'+': {7}}}}

In [None]:
# "From" link records of component 206 at zoom level 9.
zoomFromComponentLinks[9][206]

{'-': {202: {'+': {7}}},
 '+': {207: {'+': {0,
    1,
    2,
    3,
    4,
    5,
    6,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27}}}}

In [None]:
# Header for the last step of the trace: zoom level 1, using probLink as left by the loop above.
print(f'Zoom level 1')
print(f'After update {probLink} associated with:')

Zoom level 1
After update (213, 208) associated with:


In [None]:
# Print every block length whose link collection contains the traced link at zoom level 1.
for blLen in (length for length, links in zoomLinkLengths[1].items() if probLink in links):
    print(blLen)

13509


In [None]:
# Nodes
# Replace each end of the link by the first entry of its zoom-level-1 component
# (no +1 here, unlike the component-level translations).
probLink = tuple(zoomComponentToNodes[1][linkEnd-1][0] for linkEnd in probLink)

In [None]:
# Report the traced link after it has been translated from component IDs to node values.
print(f'Link in nodes is {probLink}')

Link in nodes is (504, 491)


In [None]:
# Forward links stored in the graph for the first end of the traced link.
graph.forwardLinks[probLink[0]]

{'+': [(505, '+')], '-': [(491, '+')]}

In [None]:
# Header for the node-level lookup in the next cell.
print('Node link associated with')

Node link associated with


In [None]:
# Print every block length whose link collection in the initial (node-level) table
# contains the traced link.
for blLen in (length for length, links in initialLinkLengths.items() if probLink in links):
    print(blLen)

13509


❗❗❗Find where the links are diverted. It must be happening on level 18, as level 9 is still unchangeable (only long components are merged there).
So, it must have happened while updating removable elements after zoom level generation.

In [None]:
curT = time.localtime()
message = f"Exporting gene graph for all chromosomes finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"

## Old export testing

In [None]:
# Directory holding the core-graph files, and the GFA file used by the old export test.
pathfileDir = '../../1001G/coreGraph'
coreGFApath = os.path.join(pathfileDir, 'coregraph_v3.gfa')
# coreGFApath = f'{pathfileDir}{os.path.sep}coreGraph.gfa'

In [None]:
# Name of the GFA file to export; it is combined with `gfadir` in the configuration cell below.
gfaFilename = 'AT_Chr1_OGOnly_2.1_noRef.gfa'

In [None]:
# Export configuration for the old export test. The commented-out lines are alternative
# zoom-level sets and input graphs kept for quick switching.
# zoomLevels = [1,10,20,100,500,1000,5000,10000,50000,100000,500000,1000000]
zoomLevels = [1,3,9,45,90,450,900,4500,9000]
# zoomLevels = [1,3,9,18]
# zoomLevels = [4,8,16,32]
# The three values below are passed unchanged to exportToPantograph.
maxLengthComponent = 100
maxLengthChunk = 6
invertionThreshold = 0.5
# inputPath = '../../1001G/pantograph/data/shorttest2.gfa'
# inputPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly.gfa'
# inputPath = '../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa'
# inputPath = '../../1001G/pantograph/data/shorttest_seq.gfa'
# NOTE(review): `gfadir` is not assigned in this section - it must already exist in the kernel.
inputPath = f'{gfadir}{os.path.sep}{gfaFilename}'


In [None]:
# Pass the requested zoom levels through adjustZoomLevels and derive the output location
# from the input path (suffix '_new'); the last line displays the results.
# NOTE(review): zoomLevels is overwritten in place, so re-running this cell feeds the
# already adjusted levels back into adjustZoomLevels.
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(inputPath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
# Look up the Redis database ID for this case name and open a connection to it.
# The host name 'redis' and port 6379 are hard-coded here.
dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=dbid)

In [None]:
# Re-export the project notebooks to library modules (nbdev) before running the export.
from nbdev.export import notebook2script
notebook2script()

In [None]:
# Run the export with the configuration from the cells above. isSeq is False for this
# graph, and the 'Consensus' accession is removed when the genome graph is loaded.
# The commented-out first and last lines are the debug variant, which returns the
# intermediate structures.
# zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides,toLinks,fromLinks,graph = \
exportToPantograph(inputPath=inputPath,
                   outputName=outputName,
                   outputPath=outputPath,
                   isSeq=False,
                   redisConn=redisConn,
                   GenomeGraphParams={'accessionsToRemove':['Consensus']},
                   zoomLevels=zoomLevels,
                   maxLengthChunk=maxLengthChunk,
                   maxLengthComponent=maxLengthComponent,
                   invertionThreshold=invertionThreshold,)
#                    debug=True,returnDebugData=True)

In [None]:
# Send a completion notification through the `ntfy` command-line tool.
!ntfy send "Exporting gene graph finished!"

### Comments

The per-node processing time increases significantly with the overall number of nodes. This is wrong and should be investigated.

# Adding nucleotide data

In [None]:
from copy import deepcopy
import numpy as np

In [None]:
# Directory holding the core-graph files, and the chain-level GFA to be enriched with gene data.
pathfileDir = '../../1001G/coreGraph'
coreGFApath = os.path.join(pathfileDir, 'coreGraph_f2.1_Ref_v04.gfa')

In [None]:
# Load the chain-level core graph (isGFASeq=False) and keep an independent deep copy.
# Gene names are written into the copy's node data, and both graphs get extra annotations.
coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
coregraph_genes = deepcopy(coregraph)

In [None]:
# Load the gene-level graph; its node names and annotations are looked up by gene index below.
fullGraphPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly_2.1.gfa'
fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

In [None]:
# Read the chain -> genes table. Every line has the form "<chain name>:<comma-separated genes>".
chainToGenesFile = 'chain2gene_f2.1_Ref_v04.txt'
# Chain names are zero-padded to the width of the first core-graph node name so that they
# can be matched against the (equally padded) node names later.
maxChainLength = len(coregraph.nodes[0])
chainToListDict = {}
with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as chainFile:
    for record in chainFile:
        chainName, geneList = record.split(':')
        # Drop surrounding whitespace (including the newline) before splitting the gene list.
        geneList = geneList.strip().split(',')
        chainToListDict[chainName.zfill(maxChainLength)] = geneList

In [None]:
# For every chain node: store the concatenated gene names as node data of coregraph_genes
# and extend the per-accession annotations of both graphs with the genes of the chain.
for nodeIdx,nodeName in enumerate(coregraph.nodes):
    print(f'\nNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
    # Chains missing from the table fall back to a single placeholder name 'ch<padded name>'.
    geneList = chainToListDict.get(nodeName.zfill(maxChainLength), [f'ch{nodeName.zfill(7)}'])
    geneIds = []
    if geneList[0][:2]!='ch':
        # Table entries are gene numbers with an optional trailing '+'. Shifted by -1 they
        # index fullgraph.nodes, which supplies the gene names.
        geneIds = [int(gene.rstrip('+'))-1 for gene in geneList]
        geneList = [fullgraph.nodes[geneid] for geneid in geneIds]
    coregraph_genes.nodesData[nodeIdx] = ''.join(geneList)

    # Start offsets of each gene name inside the joined node data (plus the total length).
    # This depends only on the node, so it is computed once here instead of per accession.
    geneCumLengths = np.hstack((0, np.cumsum([len(gene) for gene in geneList])))

    for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
        interval = chainDict[nodeName]

        # Chain graph: every gene annotation inherits the interval of the whole chain.
        coregraph.nodesAnnotation[nodeIdx][accession].\
            update({geneAnnotation:interval \
                    for geneid in geneIds \
                        for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

        # Gene graph: every gene annotation gets the span of its own name in the node data.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].\
            update({geneAnnotation:[(geneCumLengths[i], geneCumLengths[i+1]-1)] \
                            for i,geneid in enumerate(geneIds) \
                                for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

        # Gene graph: the chain itself spans the complete node data.
        coregraph_genes.nodesAnnotation[nodeIdx][accession].\
            update({nodeName:[(geneCumLengths[0], geneCumLengths[-1]-1)]})
print('')

In [None]:
# Write both graphs back to GFA: the chain graph with doSeq=False and the gene graph
# with doSeq=True.
# NOTE(review): the first output name differs from the input file 'coreGraph_f2.1_Ref_v04.gfa'
# only in letter case - on a case-insensitive filesystem this would overwrite the input.
coregraph.toGFA(f'{pathfileDir}{os.path.sep}coregraph_f2.1_Ref_v04.gfa',doSeq=False)
coregraph_genes.toGFA(f'{pathfileDir}{os.path.sep}coregraph_genes_f2.1_Ref_v04.gfa',doSeq=True)

In [None]:
# Point the export at the gene-enriched core graph written in the previous step.
pathfileDir = '../../1001G/coreGraph'
coreGFApath = os.path.join(pathfileDir, 'coregraph_genes_f2.1_Ref_v04.gfa')
# coreGFApath = f'{pathfileDir}{os.path.sep}coreGraph.gfa'

In [None]:
# Export configuration for the gene-enriched core graph. The commented-out lines are
# alternative zoom-level sets and input graphs kept for quick switching.
# zoomLevels = [1,10,20,100,500,1000,5000,10000,50000,100000,500000,1000000]
zoomLevels = [1,3,9,45,90,450,900,4500,9000]
# zoomLevels = [1,3,9,18]
# zoomLevels = [4,8,16]

# Passed to exportToPantograph as isSeq.
isSeq = True

# The three values below are passed unchanged to exportToPantograph.
maxLengthComponent = 100
maxLengthChunk = 16
invertionThreshold = 0.5
# inputPath = '../../1001G/pantograph/data/shorttest2.gfa'
# inputPath = '../../1001G/pantograph/data/AT_Chr1_OGOnly.gfa'
# inputPath = '../../1001G/chrisGraph/chr1.wfmash.n20.a90.s10000.p1,19,39,3,81,1.seqwish.sort.smooth.sort.gfa'
# inputPath = '../../1001G/pantograph/data/shorttest_seq.gfa'
inputPath = coreGFApath


In [None]:
# Pass the requested zoom levels through adjustZoomLevels and derive the output location
# from the input path (suffix '_new'); the last line displays the results.
# NOTE(review): zoomLevels is overwritten in place, so re-running this cell feeds the
# already adjusted levels back into adjustZoomLevels.
zoomLevels = adjustZoomLevels(zoomLevels)
outputPath,outputName = pathConvert(inputPath,suffix='_new')
outputPath,outputName,zoomLevels

In [None]:
# Look up the Redis database ID for this case name and open a connection to it.
# The host name 'redis' and port 6379 are hard-coded here.
dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
print(f'Opening Redis connection for db {dbid}')
redisConn = Redis(host='redis',port = 6379,db=dbid)

In [None]:
# Run the export for the gene-enriched core graph with the configuration from the cells
# above. The 'Consensus' accession is removed when the genome graph is loaded.
# The commented-out first and last lines are the debug variant, which returns the
# intermediate structures.
# zoomComponentLengths,zoomNodeToComponent,zoomComponentToNodes,zoomComponents,zoomCompNucleotides = \
exportToPantograph(inputPath=inputPath,
                   outputName=outputName,
                   outputPath=outputPath,
                   isSeq=isSeq,
                   redisConn=redisConn,
                   GenomeGraphParams={'accessionsToRemove':['Consensus']},
                   zoomLevels=zoomLevels,
                   maxLengthChunk=maxLengthChunk,
                   maxLengthComponent=maxLengthComponent,
                   invertionThreshold=invertionThreshold,)
#                                              debug=True,returnDebugData=True)

In [None]:
# Send a completion notification through the `ntfy` command-line tool.
!ntfy send "Pantograph data generation for coregraph finished."

# Adding gene data: mass processing of several chromosomes

In [None]:
# Re-export the project notebooks to library modules (nbdev) so that the code below
# runs against the current notebook sources.
notebook2script()

Converted 00_init.ipynb.
Converted 01_graph.ipynb.
Converted 02_tree.ipynb.
Converted 03_synteny.ipynb.
Converted 04_utils.ipynb.
Converted 05_export.ipynb.
Converted 05_exportDev.ipynb.
Converted deBruijnGraphProcessing.ipynb.
Converted dev.ipynb.
Converted graphTesting.ipynb.
Converted index.ipynb.


In [None]:
def checkNodeLengthsFile(GFAPath):
    """
    Load the node-lengths file that accompanies a GFA file, if it exists.

    The companion file is looked up in the directory of `GFAPath` under the name
    `nodeLengths_<GFA file name without extension>.dat`.

    Parameters
    ----------
    GFAPath : str
        Path to the GFA file (a bare file name refers to the current directory).

    Returns
    -------
    object or None
        Whatever `joblib.load` returns for the companion file, or `None` if there is
        no such file.
    """
    directory = os.path.dirname(GFAPath)
    filebase = os.path.splitext(os.path.basename(GFAPath))[0]
    # os.path.join copes with an empty `directory` (bare file name). Gluing the parts with
    # os.path.sep produced '/nodeLengths_...' in that case, i.e. a lookup in the root directory.
    nodeLenPath = os.path.join(directory, f'nodeLengths_{filebase}.dat')

    if not os.path.exists(nodeLenPath):
        return None

    # NOTE(review): joblib.load unpickles the file - only use it on trusted data.
    return joblib.load(nodeLenPath)

In [None]:
# When True, the loop below rebuilds the gene-annotated chain graphs and the node-lengths
# file before exporting; when False, it only exports the files already on disk.
doCreateCoreGenes = False

In [None]:
# Configuration shared by all chromosomes processed in the loop below.
pathfileDir = '../../1001G/coreGraph'
maxLengthComponent = 100
maxLengthChunk = 16
# The export calls in this cell read `inversionThreshold`. Only the misspelled
# `invertionThreshold` used to be assigned here, so the cell depended on a value
# left in the kernel by some other cell.
inversionThreshold = 0.5
invertionThreshold = inversionThreshold  # legacy spelling, kept for cells that still read it
zoomLevels = [1,3,9]
zoomLevels = adjustZoomLevels(zoomLevels)

# Per chromosome: optionally (doCreateCoreGenes) rebuild the gene-annotated chain graphs and
# the node-lengths file, then export the chain graph and the gene-level chain graph.
# `inversionThreshold` must be defined before this loop runs.
# range(1,2) yields only seqNum == 1; widen the range to process more chromosomes.
for seqNum in range(1,2):
    if doCreateCoreGenes:
        coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{seqNum:d}.gfa'

        # Chain-level graph plus an independent copy that receives gene-level annotations.
        coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
        coregraph_genes = deepcopy(coregraph)

        fullGraphPath = f'../../1001G/pantograph/data/AT_Chr{seqNum:d}_OGOnly_2.1.gfa'
        fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

        # Read the chain -> genes table; every line is "<chain name>:<comma-separated genes>".
        chainToGenesFile = f'chain2gene{seqNum:d}.txt'
        # maxChainLength = len(coregraph.nodes[0])
        chainToListDict = {}
        nodeLengths = []
        with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
            for line in f:
                chainName, geneList = line.split(':')
                geneList = geneList.lstrip().rstrip().split(',')
                chainToListDict[chainName] = geneList

        for nodeIdx,nodeName in enumerate(coregraph.nodes):
            print(f'\nNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
            # Chains missing from the table get an empty gene list.
            geneList = chainToListDict.get(nodeName, [])
            geneIds = []
            # if geneList[0][:2]!='ch':
            # Table entries are gene numbers with an optional trailing '+'; shifted by -1
            # they index fullgraph.nodes, which supplies the gene names.
            geneIds = [int(gene.rstrip('+'))-1 for gene in geneList]
            geneList = [fullgraph.nodes[geneid] for geneid in geneIds]
            # coregraph_genes.nodesData[nodeIdx] = ''.join(geneList)
            # Node length is the number of genes in the chain, but at least 1.
            geneNum = max(1,len(geneList))
            nodeLengths.append(geneNum)

            for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
                interval = chainDict[nodeName]
                # geneCumLengths = np.hstack((0, np.cumsum([len(gene) for gene in geneList])))

                # Chain graph: every gene annotation inherits the interval of the whole chain.
                coregraph.nodesAnnotation[nodeIdx][accession].\
                    update({geneAnnotation:interval \
                            for i,geneid in enumerate(geneIds) \
                                for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

                # Gene graph: the i-th gene of the chain occupies position i of the node.
                coregraph_genes.nodesAnnotation[nodeIdx][accession].\
                    update({geneAnnotation:[(i,i)] \
                                    for i,geneid in enumerate(geneIds) \
                                        for geneAnnotation in fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys()})

                # Gene graph: the chain itself spans all positions of the node.
                coregraph_genes.nodesAnnotation[nodeIdx][accession].\
                    update({nodeName:[(0, geneNum-1)]})
        print('')

    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_Chr{seqNum:d}.gfa'
    coreGeneGFApath = f'{pathfileDir}{os.path.sep}coregraph_genes_Chr{seqNum:d}.gfa'
    
    if doCreateCoreGenes:
        # The file name follows the pattern checkNodeLengthsFile looks for next to the gene GFA.
        coreGeneNodeLengthsPath = f'{pathfileDir}{os.path.sep}nodeLengths_coregraph_genes_Chr{seqNum:d}.dat'

        coregraph.toGFA(coreGFApath,doSeq=False)
        coregraph_genes.toGFA(coreGeneGFApath,doSeq=True)
        joblib.dump(nodeLengths,coreGeneNodeLengthsPath)

    #Exporting chain graph with annotation only
    isSeq = False
    inputPath = coreGFApath
    
    outputPath,outputName = pathConvert(inputPath,suffix='_new')

    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    # Redis database 0 is used directly instead of a per-case database ID.
    redisConn = Redis(host='redis',port = 6379,db=0)

    exportToPantograph(inputPath=inputPath,
                       outputName=outputName,
                       outputPath=outputPath,
                       isSeq=isSeq,
                       redisConn=redisConn,
                       GenomeGraphParams={'accessionsToRemove':['Consensus']},
                       zoomLevels=zoomLevels,
                       fillZoomLevels=True,
                       maxLengthChunk=maxLengthChunk,
                       maxLengthComponent=maxLengthComponent,
                       inversionThreshold=inversionThreshold,)

    #Exporting chain graph with genes
    # isSeq stays False; node lengths are supplied from the node-lengths file instead.
    isSeq = False
    inputPath = coreGeneGFApath
    
    outputPath,outputName = pathConvert(inputPath,suffix='_new')

    # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
    # print(f'Opening Redis connection for db {dbid}')
    redisConn = Redis(host='redis',port = 6379,db=0)

    exportToPantograph(inputPath=inputPath,
                       outputName=outputName,
                       outputPath=outputPath,
                       isSeq=isSeq,
                       nodeLengths=checkNodeLengthsFile(inputPath),
                       redisConn=redisConn,
                       GenomeGraphParams={'accessionsToRemove':['Consensus']},
                       zoomLevels=zoomLevels,
                       fillZoomLevels=True,
                       maxLengthChunk=maxLengthChunk,
                       maxLengthComponent=maxLengthComponent,
                       inversionThreshold=inversionThreshold,)


Loading Genome
Loading graph from ../../1001G/coreGraph/coregraph_Chr1.gfa
Found node annotation file ../../1001G/coreGraph/annotation_coregraph_Chr1.dat, loading associations.
Loading segment 35/35
Loading segments finished.
Loading link 72/72
Loading links finished
Loading path 28/28
Loading paths finished. 28 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths

Recording Pantograph data to ../../1001G/coreGraph/coregraph_Chr1_new
Calculating nodes length...
Processing node 35/35
Finished calculating nodes lengths
Preprocessing paths...
Processing path 28/28
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 16/16
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 34/34
Conversion finished.
Reformating links to block lengths associations 34/34
Refo

In [None]:
curT = time.localtime()
message = f"Adding genes and exporting of chain graphs for all chromosome finished at \
            {curT.tm_hour:02d}:{curT.tm_min:02d} on {curT.tm_mday:02d}/{curT.tm_mon:02d}!"
!ntfy send "{message}"

In [None]:
# Send a completion notification through the `ntfy` command-line tool.
!ntfy send "Pantograph data generation for coregraph finished for all chromosomes."