In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Install

Change into the library directory and run:

`pip install .`

and, for development, use

`pip install -e .`

## How to use

### Use command line interface `pantograph`

After this package is installed, a command line tool `pantograph` becomes available.

It provides the following functions:

#### Converting annotations to a gene graph (see the user manual for what a gene graph is).

This is done by a command 
```bash
$ pantograph annotation2graph [-h] [-g] setting_file.yaml
```

It requires a path to a YAML file with all settings for the process. If used with the `-g` option, a sample file is generated at that path for you to edit before running the command.
The sample file has extensive comments explaining every parameter.

So, it is best to do the following:

```bash
$ pantograph annotation2graph -g setting.yaml
```

then, using your favourite text editor, edit the generated file and then run

```bash
$ pantograph annotation2graph setting.yaml
```

#### Converting file(s) with paths into a gene/block graph

This is done by a command 
```bash
$ pantograph paths2graph [-h] [-g] setting_file.yaml
```

It requires a path to a YAML file with all settings for the process. If used with the `-g` option, a sample file is generated at that path for you to edit before running the command.
The sample file has extensive comments explaining every parameter.

So, it is best to do the following:

```bash
$ pantograph paths2graph -g setting.yaml
```

then, using your favourite text editor, edit the generated file and then run

```bash
$ pantograph paths2graph setting.yaml
```

#### Sorting a graph

To sort a graph, it must be stored in a GFA v1 file. Run the sorting with the following command:

```bash
$ pantograph sort-graph [-h] [--quiet] [--isseq] [--output OUTPUT] input
```
with the following parameters

positional arguments:

`input`                 Relative (to current directory) or absolute path to the GFA file with the graph to be sorted.

optional arguments:

`-h`, `--help`            show this help message and exit

`--quiet`, `-q`           Suppress most of the output. False (i.e. verbose) if not set.

`--isseq`, `-s`           Whether the graph contains nucleotide sequences. False if not set.

`--output OUTPUT`, `-o OUTPUT`
                        File path where to save sorted graph. If not set, the input will be overwritten.

#### Exporting a graph into the visualisation data structure used by the Pantograph visualisation tool.

This is done by a command 
```bash
$ pantograph export-vis [-h] [-g] setting_file.yaml
```

It requires a path to a YAML file with all settings for the process. If used with the `-g` option, a sample file is generated at that path for you to edit before running the command.
The sample file has extensive comments explaining every parameter.

So, it is best to do the following:

```bash
$ pantograph export-vis -g setting.yaml
```

then, using your favourite text editor, edit the generated file and then run

```bash
$ pantograph export-vis setting.yaml
```

### Use python package

The rest of this file describes some of the uses of the `pygengraph` package. There are more ways to use it, but more detailed documentation is needed to describe all use cases.
More work is also required for this package to become truly universal.

In [None]:
#| hide
# Maintenance step: run the nbdev export. The cell is hidden so that it does
# not show up in the rendered "Use python package" documentation.
from nbdev import nbdev_export
nbdev_export()

In [None]:
import os
import glob
import re
import time

from pygengraph.graph import GenomeGraph
from pygengraph.utils import pathFileToPathDict
from pygengraph.export import exportProject

In [None]:
import warnings

# Quiet baseline: ignore warnings of every category...
warnings.simplefilter(action='ignore')
# ...then add a rule for RuntimeWarning so that category is always shown.
warnings.filterwarnings(action='always', category=RuntimeWarning)

# Generating graphs

## Generating from annotation

### Preparing list of files

In [None]:
#| eval: false
# Placeholder locations - replace them with your own directories.
# None of them ends with a path separator: the cells that use them add the
# separator themselves, so a trailing one would produce a doubled separator.
refdir = '/path/to/reference'
annotationdir = '/path/to/annotation'
gfadir = '/path/to/graphs'

In [None]:
#| eval: false
# Every GFF file found directly in the annotation directory, in sorted order.
annotationFiles = sorted(glob.glob(os.path.join(annotationdir, '*.gff')))
# GFF files whose names end in `pangen.gff`.
# NOTE(review): these names also match `*.gff`, so the same files are listed in
# annotationFiles as well - confirm that this overlap is intended.
pangenomeFiles = sorted(glob.glob(os.path.join(annotationdir, '*pangen.gff')))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(os.path.join(annotationdir, 'sequences', '*.fasta')))
# os.path.join avoids a doubled separator when a directory already ends with one.
refAnnotationFile = os.path.join(refdir, 'reference.gff')
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = os.path.join(refdir, 'reference.fasta')

In [None]:
#| eval: false
# Locations of the 1001G data set, all relative to the notebook directory and
# derived from one common annotations root.
_annotationRoot = '../../1001G/annotations'
annotationdir = f'{_annotationRoot}/freeze2.1'
refdir = f'{annotationdir}/outgroups'
gfadir = f'{_annotationRoot}/graphs'

In [None]:
#| eval: false
# Every GFF file found directly in the annotation directory, in sorted order.
annotationFiles = sorted(glob.glob(os.path.join(annotationdir, '*.gff')))
# pangenomeFiles = sorted(glob.glob(os.path.join(annotationdir, '*pangen.gff')))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(os.path.join(annotationdir, 'sequences', '*.fasta')))
# os.path.join avoids a doubled separator when a directory already ends with one.
refAnnotationFile = os.path.join(refdir, 'araport.gff')
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = os.path.join(refdir, 'reference.fasta')

### Generation of gene graph

In [None]:
#| hide
# Run the nbdev export again at this point; `nbdev_export` comes from the
# `from nbdev import nbdev_export` statement in an earlier cell.
nbdev_export()

In [None]:
#| eval: false
#| output: false
doUS = False
n = 1  # number of chromosomes to process
for chrnum in range(1, n + 1):
    chromosome = f'Chr{chrnum}'

    print(f'\nProcessing {chromosome}\n============')

    # Step 1: build the graph for this chromosome from the annotation files.
    curtst = time.time()
    graph = GenomeGraph(
        annotationFiles=annotationFiles,
        pangenomeFiles=None,
        sequenceFilesDict=None,
        doUS=doUS,
        chromosome=chromosome,
        refAnnotationFile=refAnnotationFile,
        refAccession='TAIR10',
    )
    print(f'Generating graph for {chromosome} took {time.time() - curtst} seconds')

    # Step 2: sort the graph.
    curtst = time.time()
    graph.treeSort()
    print(f'Sorting graph for {chromosome} took {time.time() - curtst} seconds')

    # Step 3: choose the output name. If the order does not cover every node,
    # fall back to the plain 1..N order and mark the file as unordered.
    if len(graph.nodes) == len(graph.order):
        gfaFilename = f'Gene_{chromosome}_simOnly.gfa'
    else:
        print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
        gfaFilename = f'Gene_{chromosome}_simOnly_unordered.gfa'
        graph.order = list(range(1, len(graph.nodes) + 1))

    graph.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}', doSeq=False)
    

## Loading Pathfile to graph

In [None]:
#| output: false
# For path file v1
pathfileDir = 'examples/gene_graph'

pathsfile = 'paths_genegraph.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}', True, True, False)

graph = GenomeGraph(pathsDict=paths)

graph.treeSort()

if len(graph.nodes)!=len(graph.order):
    print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
    # Write the fallback next to the other example files (same directory as the
    # sorted output below), not into the current working directory.
    output = f'{pathfileDir}{os.path.sep}paths_genegraph_unordered.gfa'
    # Fall back to the plain 1..N node order before saving.
    graph.order = list(range(1,len(graph.nodes)+1))
    graph.toGFA(output,doSeq=False)
else:
    coreGFApath = f'{pathfileDir}{os.path.sep}paths_genegraph.gfa'
    graph.toGFA(coreGFApath,doSeq=False)

Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 6/7


In [None]:
#| eval: false
# For v2
# This is example, no v2 file currently available for demonstration.
pathfileDir = '/path/to/file'

pathsfile = 'paths.txt'

paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}', True, 'reference', True)

# One graph is built and saved for every key of the returned dictionary.
for seqNum, seqPaths in paths.items():

    graph = GenomeGraph(pathsDict=seqPaths)

    # On undirected coregraph sorting is not optimal! Check sorting!!!
    graph.treeSort()

    if len(graph.nodes) == len(graph.order):
        coreGFApath = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}.gfa'
        graph.toGFA(coreGFApath, doSeq=False)
    else:
        # The order does not cover every node: save with the plain 1..N order.
        print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
        output = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}_unordered.gfa'
        graph.order = list(range(1, len(graph.nodes) + 1))
        graph.toGFA(output, doSeq=False)


# Loading graph from GFA and sorting it

In [None]:
#| output: false
gfadir = 'examples/nucleotide_graph'

# It is nucleotide graph. If it is not nucleotide graph, then `isSeq` variable should be changed to False.
gfafilename = 'paths_presentation.gfa'
isSeq = True

# Load the graph from the GFA file and sort it.
graph = GenomeGraph(gfaPath=os.path.join(gfadir, gfafilename), isGFASeq=isSeq)
graph.treeSort()

# The order must cover every node before the graph is written back.
assert len(graph.order) == len(graph.nodes)

# Save next to the input as `<name>_ordered<ext>`.
basename, ext = os.path.splitext(gfafilename)
graph.toGFA(os.path.join(gfadir, f'{basename}_ordered{ext}'), doSeq=isSeq)

Loading graph from examples/nucleotide_graph/paths_presentation.gfa
Found nodeNames file examples/nucleotide_graph/nodeNames_paths_presentation.json, loading names.
Found node annotation file examples/nucleotide_graph/annotation_paths_presentation.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Constructing Tremaux tree
Preprocessing tree 100%
Done!
Getting root nodes
Start Loop...
Nodes in order: 6/7


# Exporting to Pantograph visualisation

In [None]:
#| output: false
# --- Project identity and file locations ---
projectID = 'paths_genegraph'
projectName = 'Example gene graph'
pathToGraphs = 'examples/gene_graph'
pathToIndex = 'examples/Visdata'
caseDict = {'Main': 'paths_genegraph.gfa'}
suffix = ''

# --- Redis connection ---
# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0

# --- Export parameters ---
isSeq = False
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
zoomLevels = [1,2,4]
fillZoomLevel = True

exportProject(
    projectID,
    projectName,
    caseDict,
    pathToIndex,
    pathToGraphs,
    redisHost=redisHost,
    redisPort=redisPort,
    redisDB=redisDB,
    suffix=suffix,
    maxLengthComponent=maxLengthComponent,
    maxLengthChunk=maxLengthChunk,
    inversionThreshold=inversionThreshold,
    isSeq=isSeq,
    zoomLevels=zoomLevels,
    fillZoomLevel=fillZoomLevel,
)


Processing case paths_genegraph_Main
Loading Genome
Loading graph from examples/gene_graph/paths_genegraph.gfa
Found node annotation file examples/gene_graph/annotation_paths_genegraph.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 13/13
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to examples/Visdata/paths_genegraph/paths_genegraph_Main
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Processing path breaks...
Postprocessing interconnected links 2/2
Preprocessing interconnected links finished.

Processing path breaks finished.
Converting blocks to block lengths 6/6
Conversion finished.
Reformating links to block

In [None]:
#| output: false
# --- Project identity and file locations ---
projectID = 'tutorial_graph'
projectName = 'Example nucleotide graph'
pathToGraphs = 'examples/nucleotide_graph'
pathToIndex = 'examples/Visdata'
caseDict = {'Main': 'paths_presentation_ordered.gfa'}
suffix = ''

# --- Redis connection ---
# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0

# --- Export parameters ---
isSeq = True
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
zoomLevels = [1,2,4]
fillZoomLevel = True

exportProject(
    projectID,
    projectName,
    caseDict,
    pathToIndex,
    pathToGraphs,
    redisHost=redisHost,
    redisPort=redisPort,
    redisDB=redisDB,
    suffix=suffix,
    maxLengthComponent=maxLengthComponent,
    maxLengthChunk=maxLengthChunk,
    inversionThreshold=inversionThreshold,
    isSeq=isSeq,
    zoomLevels=zoomLevels,
    fillZoomLevel=fillZoomLevel,
)


Processing case tutorial_graph_Main
Using Redis DB server at redis:6379 with db number 0.
Loading Genome
Loading graph from examples/nucleotide_graph/paths_presentation_ordered.gfa
Found nodeNames file examples/nucleotide_graph/nodeNames_paths_presentation_ordered.json, loading names.
Found node annotation file examples/nucleotide_graph/annotation_paths_presentation_ordered.dat, loading associations.
Loading segment 7/7
Loading segments finished.
Loading link 12/12
Loading links finished
Loading path 5/5
Loading paths finished. 5 paths added, 0 paths ignored.
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths

Recording Pantograph data to examples/Visdata/tutorial_graph/tutorial_graph_Main
Calculating nodes length...
Processing node 7/7
Finished calculating nodes lengths
Preprocessing paths...
Processing path 5/5
Finished preprocessing paths
Processing path breaks...
Postprocessing 

#| hide
# Adding gene data mass processing several chromosomes

#| hide

This is an old version of adding subunits to a graph that contains units (e.g. adding information about individual genes to a graph of conserved gene blocks).

In [None]:
#| hide
from copy import deepcopy
import joblib

from pangraph_constructor.utils import checkNodeLengthsFile, pathConvert


In [None]:
#| hide
# Switch read by the batch-processing cell: when True, that cell loads the core
# and full graphs, adds the gene annotation, and rewrites the GFA and
# node-length files before exporting; when False it only runs the exports.
doCreateCoreGenes = True

In [None]:
#| hide
# Legacy batch job. For chromosomes 1-5 it optionally attaches gene-level
# annotation to every node of the core (chain) graph, saves the updated graphs,
# and then exports both the chain graph and the chain+genes graph.
#
# NOTE(review): `adjustZoomLevels`, `Redis` and `exportToPantograph` are not
# imported by any cell of this notebook, so this cell still stops with a
# NameError until their imports are restored.
pathfileDir = '../../1001G/coreGraph/new_Nov2022'
maxLengthComponent = 100
maxLengthChunk = 16
# This name was misspelled `invertionThreshold`, while the export calls below
# read `inversionThreshold`: the cell either raised NameError or silently used
# a value left over from another cell.
inversionThreshold = 0.5
zoomLevels = [1,3,9]
zoomLevels = adjustZoomLevels(zoomLevels)

if doCreateCoreGenes:
    # The chain -> genes table does not depend on the chromosome, so it is
    # parsed once here instead of once per loop iteration.
    # Each line has the form `<chain name>:<gene>,<gene>,...`.
    chainToGenesFile = 'chain2gene.txt'
    chainToListDict = {}
    with open(f'{pathfileDir}{os.path.sep}{chainToGenesFile}') as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. at the end of the file) carries no mapping.
                continue
            chainName, geneList = line.split(':')
            chainToListDict[chainName] = geneList.strip().split(',')

for seqNum in range(1,6):
    coreGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_Chr{seqNum:d}.gfa'
    coreGeneGFApath = f'{pathfileDir}{os.path.sep}coregraph_v2_genes_Chr{seqNum:d}.gfa'

    if doCreateCoreGenes:
        coregraph = GenomeGraph(gfaPath=coreGFApath,isGFASeq=False)
        # Independent copy taken before `coregraph` is modified below.
        coregraph_genes = deepcopy(coregraph)

        fullGraphPath = f'../../1001G/pantograph/data/AT_Chr{seqNum:d}_OGOnly_2.1.gfa'
        fullgraph = GenomeGraph(gfaPath=fullGraphPath,isGFASeq=False)

        # One entry per core-graph node: number of genes in the chain (at least 1).
        nodeLengths = []

        for nodeIdx,nodeName in enumerate(coregraph.nodes):
            print(f'\rNode {nodeIdx+1}/{len(coregraph.nodes)}',end='')
            # Gene names as listed for this chain, with trailing '+' characters
            # removed (presumably orientation marks - TODO confirm).
            geneList = [gene.rstrip('+') for gene in chainToListDict.get(nodeName, [])]
            geneIds = [fullgraph.nodes.index(genename) for genename in geneList]
            nodeLengths.append(max(1,len(geneList)))

            for accession, chainDict in coregraph.nodesAnnotation[nodeIdx].items():
                interval = chainDict[nodeName]

                # Annotation names of every gene of the chain for this accession;
                # when the accession has none, the gene's own node name is used.
                geneNamesByPos = [
                    list(fullgraph.nodesAnnotation[geneid].get(accession,{fullgraph.nodes[geneid]:None}).keys())
                    for geneid in geneIds
                ]

                # Chain graph: replace the chain entry with one entry per gene
                # annotation, each keeping the interval of the chain.
                chainDict.pop(nodeName,None)
                chainDict.update({geneAnnotation:interval
                                  for geneNames in geneNamesByPos
                                  for geneAnnotation in geneNames})

                # Chain+genes graph: every gene annotation is placed at the
                # position of its gene inside the chain.
                coregraph_genes.nodesAnnotation[nodeIdx][accession].\
                    update({geneAnnotation:[(genePos,genePos)]
                            for genePos,geneNames in enumerate(geneNamesByPos)
                            for geneAnnotation in geneNames})
        print('')

        coreGeneNodeLengthsPath = f'{pathfileDir}{os.path.sep}nodeLengths_coregraph_v2_genes_Chr{seqNum:d}.dat'

        # Note: `coreGFApath` is also the file the core graph was loaded from,
        # so the input is overwritten with the updated annotation.
        coregraph.toGFA(coreGFApath,doSeq=False)
        coregraph_genes.toGFA(coreGeneGFApath,doSeq=True)
        joblib.dump(nodeLengths,coreGeneNodeLengthsPath)

    # Two exports per chromosome: the chain graph with annotation only, then the
    # chain graph with genes. Only the second one passes node lengths.
    isSeq = False
    for inputPath, withNodeLengths in ((coreGFApath, False), (coreGeneGFApath, True)):
        outputPath,outputName = pathConvert(inputPath,suffix='_new')

        # dbid = getDBID('../pantograph_API/data/caseToDBID.dict',outputName)
        # print(f'Opening Redis connection for db {dbid}')
        redisConn = Redis(host='redis',port = 6379,db=0)

        extraParams = {'nodeLengths': checkNodeLengthsFile(inputPath)} if withNodeLengths else {}

        exportToPantograph(inputPath=inputPath,
                           outputName=outputName,
                           outputPath=outputPath,
                           isSeq=isSeq,
                           redisConn=redisConn,
                           GenomeGraphParams={'accessionsToRemove':['Consensus']},
                           zoomLevels=zoomLevels,
                           fillZoomLevels=True,
                           maxLengthChunk=maxLengthChunk,
                           maxLengthComponent=maxLengthComponent,
                           inversionThreshold=inversionThreshold,
                           **extraParams)

NameError: name 'adjustZoomLevels' is not defined