In [30]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from annoq_tree_gen import DTYPES
from base import load_json

# Read the three inputs: the annotation tree (CSV), the terms table (TSV)
# and the mappings document (JSON).
annotations = pd.read_csv("./../../annoq-data/revised_tree.csv", sep=',', dtype=DTYPES)
terms = pd.read_csv("./../../annoq-data/slim/terms.tsv", sep='\t')
mappings = load_json("./../../annoq-data/mappings.json")

# Index the annotations by `id` while keeping `id` available as a column.
annotations = annotations.set_index("id", drop=False)

# Original terms headers -> short column names used in this notebook.
term_cols = {
    'ID': 'id',
    'LABEL': 'label',
    'SubClass Of': 'parents',
    'hasOBONamespace': 'aspect',
}

# Apply the short names, then index the terms by `id` (column kept as well).
terms = terms.rename(columns=term_cols)
terms = terms.set_index("id", drop=False)

In [31]:
def is_leaf(G, node):
    """Tell whether `node` is a leaf of the directed graph `G`.

    A node counts as a leaf when it has no outgoing edges and exactly one
    incoming edge.
    """
    if G.out_degree(node) != 0:
        # Anything with outgoing edges cannot be a leaf.
        return False
    return G.in_degree(node) == 1

def add_value_type(name):
    """Classify a field name as a GO id list, a GO label list, or neither.

    Returns 'GO_id' when `name` contains both 'GO_' and 'list_id',
    'GO_label' when it contains 'GO_' and 'list' (but not 'list_id'),
    and None otherwise.
    """
    if 'GO_' not in name:
        return None

    # 'list_id' must be tested first: every 'list_id' name also contains 'list'.
    if 'list_id' in name:
        return 'GO_id'
    if 'list' in name:
        return 'GO_label'

    return None


# Tag every annotation with its GO value type ('GO_id', 'GO_label' or None),
# derived from the field name alone. Only the `name` column is needed, so map
# over that column rather than building a Series for each row with a
# row-wise DataFrame.apply.
annotations['value_type'] = annotations['name'].map(add_value_type)
annotations

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,value_type,count,children_count,category_count,leaf_count,root URL,sample URL
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,,False,root,Annotation,,,,0.0,,607,625,18,607,,
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,5,5,0,5,,
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,56,56,0,56,,
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,79,79,0,79,,
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,75,75,0,75,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,0,0,0,0,,
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0,,
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0,,
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,0,0,0,0,,


In [32]:
# Directed graph with one edge per annotation row, pointing parent_id -> id.
g = nx.from_pandas_edgelist(
    annotations,
    source='parent_id',
    target='id',
    create_using=nx.DiGraph(),
)

# Names of every leaf annotation found among the descendants of node '26'.
roots = [
    annotations.loc[node]['name']
    for node in nx.descendants(g, '26')
    if is_leaf(g, node)
]

def leaf_count(G, source):
    """Count the leaf nodes reachable from `source` in the directed graph `G`.

    Args:
        G: directed graph to search.
        source: node whose descendants are inspected.

    Returns:
        Number of descendants of `source` for which `is_leaf` is true.
    """
    # The leaf test must run against the graph that was passed in, not a
    # notebook-level global, so the function works for any graph.
    return sum(1 for node in nx.descendants(G, source) if is_leaf(G, node))


def descendants_count(G, source):
    """Return how many nodes are reachable from `source` in `G`."""
    reachable = nx.descendants(G, source)
    return len(reachable)

def parents_count(G, source):
    """Count the non-leaf nodes reachable from `source` in the directed graph `G`.

    Args:
        G: directed graph to search.
        source: node whose descendants are inspected.

    Returns:
        Number of descendants of `source` for which `is_leaf` is false.
    """
    # The leaf test must run against the graph that was passed in, not a
    # notebook-level global, so the function works for any graph.
    return sum(1 for node in nx.descendants(G, source) if not is_leaf(G, node))

def get_type(mapping_dict, field):
    """Look up the declared type of `field` in the mappings document.

    The type is read from
    mapping_dict["annoq-test"]["mappings"]["properties"][field]["type"].

    Args:
        mapping_dict: nested dict holding the mappings document.
        field: name of the field to look up.

    Returns:
        The value stored under the field's 'type' key, or np.nan when any key
        along the path is missing or the field's entry is None.
    """
    try:
        field_mapping = mapping_dict["annoq-test"]["mappings"]["properties"][field]

        if field_mapping is not None:
            return field_mapping['type']
    except KeyError:
        # Missing index, field, or 'type' key: fall through to the
        # missing-value result below.
        pass

    # Single missing-value marker for every "no type available" case, so callers
    # never get a mix of None and NaN.
    return np.nan
        
    
# Spot-check the lookup on a single field; the result is shown as the cell output.
get_type(mappings, 'ref')

'text'

In [33]:
# Per-node tree statistics and field types. Each helper needs only one column
# (`id` or `name`), so map over that column rather than building a Series for
# each row with a row-wise DataFrame.apply.
annotations['children_count'] = annotations['id'].map(lambda node_id: descendants_count(g, node_id))
annotations['category_count'] = annotations['id'].map(lambda node_id: parents_count(g, node_id))
annotations['leaf_count'] = annotations['id'].map(lambda node_id: leaf_count(g, node_id))
# Field type declared in the mappings document (NaN when the field is not mapped).
annotations['field_type'] = annotations['name'].map(lambda field_name: get_type(mappings, field_name))
annotations

Unnamed: 0_level_0,id,parent_id,leaf,name,label,detail,link,pmid,sort,value_type,count,children_count,category_count,leaf_count,root URL,sample URL,field_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,,False,root,Annotation,,,,0.0,,607,625,18,607,,,
1,1,0,False,Basic Info,,"Basic information about the variant, such as c...",,,1.0,,5,5,0,5,,,
26,26,0,False,ANNOVAR,,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,2.0,,56,56,0,56,,,
208,208,0,False,SnpEff,,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,3.0,,79,79,0,79,,,
132,132,0,False,VEP,,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,4.0,,75,75,0,75,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,495,True,sno_miRNA_type,,the type of snoRNA or miRNA (from miRBase/snoR...,,,,,0,0,0,0,,,text
622,622,495,True,splicing_consensus_ada_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0,,,float
623,623,495,True,splicing_consensus_rf_score,,splicing-change prediction for splicing consen...,,,,,0,0,0,0,,,float
624,624,495,True,target_gene,,"target gene (for promoter, enhancer, etc.) bas...",,,,,0,0,0,0,,,text


In [35]:
# Write the enriched annotation table to CSV, leaving out the index
# (`id` is still present as a regular column).
annotations.to_csv(
    path_or_buf='./../../annoq-data/tree-with-types.csv',
    index=False,
)