In [2]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from annoq_tree_gen import DTYPES
from base import load_json

# Load the annotation tree with the column dtypes supplied by annoq_tree_gen, then
# index the rows by "id" while keeping "id" available as a regular column.
tree_df = pd.read_csv(
    "./../../annoq_data/input/annotation_tree.csv",
    sep=',',
    dtype=DTYPES,
).set_index("id", drop=False)
#mappings = load_json("./../../annoq_data/mappings.json")

In [3]:
# Drop every node whose name ends with '_list'. The explicit .copy() makes the
# result an independent frame, so the column assignments made in later cells do
# not trigger pandas' SettingWithCopyWarning.
tree_df = tree_df[~tree_df['name'].str.endswith('_list')].copy()
tree_df

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,root_url,sample_url,field_type,keyword_searchable
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,,False,root,Annotation,,,,0.0,,,,
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,,,
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,,,
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,,,
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,,text,True
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,,float,
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,,float,
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,,text,True


In [4]:
def is_leaf(G, node):
    """Tell whether `node` is a leaf of `G`.

    A node counts as a leaf when it has no outgoing edges and exactly one
    incoming edge.

    Args:
        G: Graph object exposing `out_degree(node)` and `in_degree(node)`.
        node: Node to inspect.

    Returns:
        True when the out-degree is 0 and the in-degree is 1, otherwise False.
    """
    # Any outgoing edge rules the node out; the in-degree is not consulted then.
    if G.out_degree(node) != 0:
        return False
    return G.in_degree(node) == 1

def add_value_type(name):
    """Return the value type implied by a node name.

    Args:
        name: Node name; it is tested with the `in` operator.

    Returns:
        'term_id' when `name` contains '_list_id', otherwise None.
    """
    return 'term_id' if '_list_id' in name else None



# value_type depends on the name column only, so map over that column directly.
# Series.map calls add_value_type once per name and avoids building a full row
# Series for every record, which is what DataFrame.apply(axis=1) does.
tree_df['value_type'] = tree_df['name'].map(add_value_type)
#tree_df[tree_df['value_type']=='term_id']
tree_df

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,root_url,sample_url,field_type,keyword_searchable,value_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,,False,root,Annotation,,,,0.0,,,,,
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,,,,
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,,,,
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,,,,
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,,text,True,
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,,float,,
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,,float,,
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,,text,True,


In [5]:
# Directed graph with one edge per row of tree_df, pointing from parent_id to id.
g = nx.from_pandas_edgelist(
    tree_df,
    source='parent_id',
    target='id',
    create_using=nx.DiGraph(),
)

# Names of the leaf nodes found among the descendants of node '26'.
# NOTE(review): despite the variable name, this list holds leaf names, not roots.
roots = [
    tree_df.loc[node_id, 'name']
    for node_id in nx.descendants(g, '26')
    if is_leaf(g, node_id)
]

#len(roots)

def leaf_count(G, source):
    """Count the descendants of `source` in `G` that `is_leaf` reports as leaves.

    Args:
        G: Graph passed to `nx.descendants` and to `is_leaf`.
        source: Node whose descendants are examined.

    Returns:
        int: Number of descendants `n` of `source` for which `is_leaf(G, n)`
        is true.
    """
    # Check each node against the graph that was passed in (G) rather than the
    # notebook-level `g`, so the result is correct for whichever graph is given.
    return sum(1 for n in nx.descendants(G, source) if is_leaf(G, n))


def descendants_count(G, source):
    """Return how many nodes `nx.descendants` reports for `source` in `G`.

    Args:
        G: Graph passed to `nx.descendants`.
        source: Node whose descendants are counted.

    Returns:
        int: Size of the descendant collection of `source`.
    """
    return len(nx.descendants(G, source))

def parents_count(G, source):
    """Count the descendants of `source` in `G` that are not leaves.

    Args:
        G: Graph passed to `nx.descendants` and to `is_leaf`.
        source: Node whose descendants are examined.

    Returns:
        int: Number of descendants `n` of `source` for which `is_leaf(G, n)`
        is false.
    """
    # Check each node against the graph that was passed in (G) rather than the
    # notebook-level `g`, so the result is correct for whichever graph is given.
    return sum(1 for n in nx.descendants(G, source) if not is_leaf(G, n))

        
    
#get_type(mappings, 'ref')
#child_count(g, '26')
#nx.descendants(g,'26')

In [7]:
# Every count depends only on the node id, so map over the "id" column instead of
# building a row Series per record with DataFrame.apply(axis=1).
# children_count: all descendants of the node (not just its direct children).
tree_df['children_count'] = tree_df['id'].map(lambda node_id: descendants_count(g, node_id))
# category_count: descendants that are not leaves.
tree_df['category_count'] = tree_df['id'].map(lambda node_id: parents_count(g, node_id))
# leaf_count: descendants that are leaves.
tree_df['leaf_count'] = tree_df['id'].map(lambda node_id: leaf_count(g, node_id))
tree_df

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,root_url,sample_url,field_type,keyword_searchable,value_type,children_count,category_count,leaf_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,,False,root,Annotation,,,,0.0,,,,,,545,18,527
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,,,,,5,0,5
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,,,,,40,0,40
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,,,,,63,0,63
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,,,,,59,0,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,,text,True,,0,0,0
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,,float,,,0,0,0
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,,float,,,0,0,0
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,,text,True,,0,0,0


In [None]:
# Write the enriched tree to CSV. The frame built throughout this notebook is
# `tree_df`; the cells shown never define an `annotations` variable, so referring
# to it fails with NameError on a fresh kernel.
# index=False does not lose the ids: "id" was kept as a regular column when the
# index was set (drop=False).
# NOTE(review): the output directory is 'annoq-data' while the input was read from
# 'annoq_data' — confirm the hyphenated path is intended.
tree_df.to_csv('./../../annoq-data/tree-with-types.csv', index=False)