In [4]:
import pandas as pd
import json
import networkx as nx
from annoq_tree_gen import DTYPES

# Annotation tree: one row per node, indexed by its own "id" while keeping
# "id" available as a regular column as well (drop=False).
annotations = pd.read_csv(
    "./../../annoq-data/tree.csv", sep=',', dtype=DTYPES
).set_index("id", drop=False)

# Short names for the term-table columns that are renamed below.
term_cols = {
    'ID': 'id',
    'LABEL': 'label',
    'SubClass Of': 'parents',
    'hasOBONamespace': 'aspect',
}

# Term table: rename the columns first, then index by the renamed "id"
# column (again kept as a regular column too).
terms = (
    pd.read_csv("./../../annoq-data/slim/terms.tsv", sep='\t')
    .rename(columns=term_cols)
    .set_index("id", drop=False)
)
terms

Unnamed: 0_level_0,label,id,parents,definition,aspect
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GO:0044208,'de novo' AMP biosynthetic process,GO:0044208,GO:0006167,The chemical reactions and pathways resulting ...,biological_process
GO:0044210,'de novo' CTP biosynthetic process,GO:0044210,GO:0006241,The chemical reactions and pathways resulting ...,biological_process
GO:0042351,'de novo' GDP-L-fucose biosynthetic process,GO:0042351,GO:0034654|GO:0046368|GO:1901137,The chemical reactions and pathways resulting ...,biological_process
GO:0006189,'de novo' IMP biosynthetic process,GO:0006189,GO:0006188,The chemical reactions and pathways resulting ...,biological_process
GO:0034627,'de novo' NAD biosynthetic process,GO:0034627,GO:0009435,The chemical reactions and pathways resulting ...,biological_process
...,...,...,...,...,...
GO:0035375,zymogen binding,GO:0035375,GO:0019899,Interacting selectively and non-covalently wit...,molecular_function
GO:0042588,zymogen granule,GO:0042588,GO:0030141,"A membrane-bounded, cytoplasmic secretory gran...",cellular_component
GO:0070625,zymogen granule exocytosis,GO:0070625,GO:0017156,The release of intracellular molecules contain...,biological_process
GO:0042589,zymogen granule membrane,GO:0042589,GO:0030667,The lipid bilayer surrounding a zymogen granule.,cellular_component


In [17]:
def is_leaf(G, node):
    """Tell whether ``node`` is a leaf of the directed graph ``G``.

    A node counts as a leaf when it has no outgoing edges and exactly one
    incoming edge.

    Args:
        G: graph object exposing ``out_degree(node)`` and ``in_degree(node)``.
        node: the node to inspect.

    Returns:
        True if the node is a leaf, False otherwise.
    """
    # Any outgoing edge disqualifies the node before in-degree is consulted.
    if G.out_degree(node) != 0:
        return False
    return G.in_degree(node) == 1

def add_value_type(name):
    """Classify an annotation name by the kind of GO value it holds.

    Args:
        name: annotation name to classify.

    Returns:
        'GO_id' when the name contains both 'GO_' and 'list_id',
        'GO_label' when it contains 'GO_' and 'list' (but not 'list_id'),
        and None for every other name.
    """
    # Both classifications require the 'GO_' marker.
    if 'GO_' not in name:
        return None
    # 'list_id' must be tested first because it also contains 'list'.
    if 'list_id' in name:
        return 'GO_id'
    return 'GO_label' if 'list' in name else None


# Tag every annotation with its GO value type (None for non-GO rows).
# Only the 'name' column is needed, so map over that Series directly instead
# of materialising each row with DataFrame.apply(axis=1); this is also
# well-defined when the frame is empty.
annotations['value_type'] = annotations['name'].map(add_value_type)
annotations

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,value_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,,False,root,Annotation,,,,0.0,
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,
...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,


In [42]:
# Directed graph with one edge per annotation row, pointing from
# 'parent_id' to 'id'.
g = nx.from_pandas_edgelist(
    annotations,
    source='parent_id',
    target='id',
    create_using=nx.DiGraph(),
)

# Names of all leaf annotations found beneath node '26'.
roots = [
    annotations.loc[node, 'name']
    for node in nx.descendants(g, '26')
    if is_leaf(g, node)
]

def leaf_count(G, source):
    """Count the leaf nodes reachable from ``source`` in ``G``.

    Args:
        G: directed graph to search.
        source: node whose descendants are examined.

    Returns:
        Number of descendants of ``source`` for which ``is_leaf`` is true.
    """
    # Test leaf-ness against the graph that was passed in (G), not the
    # notebook-level ``g``, so the function works for any graph.
    return sum(1 for node in nx.descendants(G, source) if is_leaf(G, node))


def descendants_count(G, source):
    """Return how many nodes are reachable from ``source`` in ``G``.

    Args:
        G: directed graph to search.
        source: node whose descendants are counted.

    Returns:
        Size of the descendant set reported by ``nx.descendants``.
    """
    reachable = nx.descendants(G, source)
    return len(reachable)

def parents_count(G, source):
    """Count the non-leaf nodes reachable from ``source`` in ``G``.

    Args:
        G: directed graph to search.
        source: node whose descendants are examined.

    Returns:
        Number of descendants of ``source`` for which ``is_leaf`` is false.
    """
    # Test leaf-ness against the graph that was passed in (G), not the
    # notebook-level ``g``, so the function works for any graph.
    return sum(1 for node in nx.descendants(G, source) if not is_leaf(G, node))

# Sanity check: number of leaf annotations beneath node '26'.
# NOTE(review): this cell originally called `child_count`, which no cell in
# this notebook defines (NameError on Restart & Run All). `leaf_count` is
# presumably its successor — confirm.
leaf_count(g, '26')

56

In [43]:
# Per-node subtree statistics, computed from each row's 'id':
#   children_count - all descendants
#   category_count - descendants that are not leaves
#   leaf_count     - descendants that are leaves
# Only the 'id' column is needed, so map over that Series directly instead of
# materialising each row with DataFrame.apply(axis=1).
# NOTE(review): the recorded output of this cell also shows a 'count' column
# that no visible cell creates — confirm whether it is still wanted.
annotations['children_count'] = annotations['id'].map(lambda node_id: descendants_count(g, node_id))
annotations['category_count'] = annotations['id'].map(lambda node_id: parents_count(g, node_id))
annotations['leaf_count'] = annotations['id'].map(lambda node_id: leaf_count(g, node_id))
annotations

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,value_type,count,children_count,category_count,leaf_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,,False,root,Annotation,,,,0.0,,607,625,18,607
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,5,5,0,5
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,56,56,0,56
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,79,79,0,79
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,75,75,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,0,0,0,0
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,0,0,0,0


In [45]:
# Persist the annotated tree, including the derived columns added above.
# NOTE(review): the frame is indexed by "id" and also keeps "id" as a column
# (set_index(..., drop=False)), and to_csv writes the index by default, so
# the file will contain the id values twice — confirm this is intended.
annotations.to_csv(path_or_buf='./../../annoq-data/tree-with-count.csv')