# Import Packages

In [1]:
# built-in
import json
from pathlib import Path

# third-party (pip install required)
import networkx as nx
import pandas as pd

In [2]:
# Root data directory (relative to this notebook) and the annotation sub-directory beneath it.
DATA_PATH = Path('../../data/')
ANNOTATION_PATH = DATA_PATH / 'annotation'

# Load Existing Gene Table

In [4]:
# Read the existing gene annotation table, using the first CSV column as the index,
# then preview the first rows.
gene_info_csv = ANNOTATION_PATH / 'gene_info.csv'
gene_table = pd.read_csv(gene_info_csv, index_col=0)
gene_table.head()

Unnamed: 0_level_0,gene_name,synonyms,gene_product,COG,regulator,uniprot,start,end,strand,essential,...,y_ome,k_eff,schmidt_prot_med,heckmann_prot_med,proteomics,p1k,p1k_ctrl_log_tpm,p1k_median_log_tpm,p1k_mad_log_tpm,in_iM
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0001,thrL,thrL,thr operon leader peptide,No COG annotation,,P0AD86,190,255,+,False,...,False,False,,,False,False,10.372833,8.862343,1.936952,False
b0002,thrA,thrA;thrA1;thrA2,fused aspartate kinase/homoserine dehydrogenase 1,Amino acid transport and metabolism,"RpoD,Ile-tRNA,Thr-tRNA,DksA,Fnr,ArcA,ppGpp",P00561,337,2799,+,False,...,False,True,4101.5,4434.0,True,True,10.314562,9.605329,1.042143,True
b0003,thrB,thrB,homoserine kinase,Nucleotide transport and metabolism,"RpoD,Ile-tRNA,Thr-tRNA,DksA,ppGpp",P00547,2801,3733,+,False,...,False,True,442.0,1177.0,True,True,9.982251,9.232803,1.035654,True
b0004,thrC,thrC,threonine synthase,Amino acid transport and metabolism,"RpoD,ppGpp,Ile-tRNA,Thr-tRNA,DksA",P00934,3734,5020,+,False,...,False,True,8818.5,3516.0,True,True,9.973666,9.274271,0.906287,True
b0005,yaaX,yaaX,DUF2502 domain-containing protein YaaX,Function unknown,Lrp,P75616,5234,5530,+,False,...,True,False,,,False,True,7.132482,6.283661,1.093146,True


# KEGG Mapping

## Load the KEGG Orthology into a Graph

KEGG annotations downloaded from: https://www.genome.jp/brite/eco00001

In [47]:
# Parse the KEGG hierarchy JSON into nested dicts. The encoding is pinned to UTF-8
# (the standard JSON encoding) so the read does not depend on the platform's default locale.
with open(Path(ANNOTATION_PATH, 'kegg_map.json'), 'r', encoding='utf-8') as kegg_file:
    kegg_map = json.load(kegg_file)

# Node names that get_edgelist treats as excluded when walking the hierarchy.
excludes = [
    '09180 Brite Hierarchies',
    '09181 Protein families: metabolism',
    '09182 Protein families: genetic information processing',
]
    
def get_edgelist(kegg_dict, exclude_names=None):
    """Flatten a nested KEGG hierarchy dict into a list of (parent, child) edges.

    Each node is a dict with a 'name' key and, optionally, a 'children' key
    holding a list of nodes of the same shape. Edges are emitted in depth-first
    pre-order: the edge to a child comes before the edges of that child's subtree.

    Excluded nodes are dropped entirely: neither the edge leading to an excluded
    node nor anything beneath it is returned. (Previously the edge from the
    parent to an excluded child was still emitted, so excluded categories
    leaked into the graph as leaf nodes.)

    Parameters
    ----------
    kegg_dict : dict
        Root node of the (sub)hierarchy to flatten.
    exclude_names : iterable of str, optional
        Node names to exclude. Defaults to the module-level ``excludes`` list,
        so existing ``get_edgelist(kegg_map)`` calls keep working.

    Returns
    -------
    list of tuple
        ``(parent_name, child_name)`` pairs; empty if the root itself is
        excluded or has no children.
    """
    if exclude_names is None:
        exclude_names = excludes
    # Build the lookup set once instead of scanning a list at every node.
    excluded = frozenset(exclude_names)

    edgelist = []

    def _collect(node):
        # Append edges for every non-excluded child, then descend into it.
        for child in node.get('children', ()):
            if child['name'] in excluded:
                continue
            edgelist.append((node['name'], child['name']))
            _collect(child)

    if kegg_dict['name'] not in excluded:
        _collect(kegg_dict)
    return edgelist

# Flatten the parsed hierarchy into (parent, child) pairs and load them into a
# directed graph whose edges point from parent to child.
kegg_edgelist = get_edgelist(kegg_map)

kegg_graph = nx.DiGraph()
kegg_graph.add_edges_from(kegg_edgelist)

## Graph Curation

KEGG allows a gene to appear under multiple categories, but for our purposes we want to flatten the hierarchy into a tree so that each gene has a single, unambiguous path from a top-level category.

In [51]:
# Genes are the leaf nodes of the graph (no outgoing edges) whose name starts with 'b'.
genes = [
    node
    for node, n_children in kegg_graph.out_degree()
    if n_children == 0 and node[0] == 'b'
]

# A gene with a single path to the root has exactly 4 ancestors (3 category levels
# plus the root node), so more than 4 ancestors means the gene is ambiguous.
ambi_genes = [
    gene
    for gene in genes
    if len(nx.ancestors(kegg_graph, gene)) > 4
]

In [None]:
# Hand-written mapping keyed by bare locus tag (e.g. 'b3730') with a KEGG numeric ID
# string as the value. Presumably this records which category to keep for each
# ambiguous gene -- TODO confirm; nothing in the visible cells consumes it yet.
# NOTE(review): graph node names are full strings (e.g. 'b3730 glmU; ...'), so these
# bare-tag keys will need matching against the node-name prefix when applied.
ambi_cat = {
    'b2388': '00010',
    'b4025': '00010',
    'b3916': '00010',
    'b1723': '00010',
    'b4232': '00010',
    'b3925': '00010',
    'b2930': '00010',
    'b2097': '00010',
    'b2925': '00010',
    'b3919': '00010',
    'b0755': '00010',
    'b4395': '00010',
    'b3612': '00010',
    'b2779': '00010',
    'b1676': '00010',
    'b1854': '00010',
    'b1702': '00010',
    'b0114': '00620',
    'b0115': '00620',
    'b0116': '00620',
    'b1378': '00620',
    'b0356': '00071',
    'b1241': '00650',
    'b1478': '00071',
    'b3589': '00350',
    'b4269': '00620',
    'b0325': '00620',
    'b2453': '00620',
    'b3588': '00620',
    'b4069': '00620',
    'b0756': '00052',
    'b3879': '00052',
    'b0688': '00500',
    'b3403': '00010',
    'b1734': '00500',
    'b2716': '00500',
    'b2901': '00500',
    'b3721': '00500',
    'b2417': '02060',
    'b1101': '02060',
    'b1621': '02060',
    'b2715': '02060',
    'b0720': '00020',
    'b1276': '00020',
    'b0771': '00020',
    'b0118': '00020',
    'b1136': '00020',
    'b0727': '00020',
    'b0729': '00020',
    'b0728': '00020',
    'b0723': '00020',
    'b0724': '00020',
    'b0721': '00020',
    'b0722': '00020',
    'b4154': '00190',
    'b4153': '00190',
    'b4152': '00190',
    'b4151': '00190',
    'b4122': '00020',
    'b1612': '00020',
    'b1611': '00020',
    'b1675': '00020',
    'b2929': '00020',
    'b3236': '00020',
    'b2210': '00020',
    'b1852': '00030',
    'b2029': '00030',
    'b3386': '00030',
    'b2465': '00030',
    'b2935': '00030',
    'b4090': '00030',
    'b4383': '00030',
    'b3380': '00030',
    'b1207': '00030',
    'b1850': '00030',
    'b3553': '00630',
    'b0772': '00040',
    'b1617': '00040',
    'b2028': '00541',
    'b1236': '00541',
    'b2042': '00541',
    'b3575': '00053',
    'b3580': '00053',
    'b4196': '00053',
    'b3581': '00053',
    'b4197': '00053',
    'b3582': '00053',
    'b4198': '00053',
    'b0061': '00040',
    'b3583': '00053',
    'b3565': '00040',
    'b3904': '00040',
    'b3902': '00040',
    'b1774': '00040',
    'b2802': '00040',
    'b2803': '00040',
    'b2800': '00040',
    'b0394': '00051',
    'b1613': '00051',
    'b2048': '00051',
    'b2049': '00051',
    'b2053': '00051',
    'b2052': '00541',
    'b2168': '00051',
    'b2167': '02060',
    'b1817': '02060',
    'b1818': '02060',
    'b1819': '02060',
    'b2934': '02060',
    'b2933': '02060',
    'b3599': '02060',
    'b2704': '02060',
    'b2703': '02060',
    'b2702': '02060',
    'b0757': '00052',
    'b0758': '00052',
    'b0759': '00541',
    'b0344': '00052',
    'b3076': '00052',
    'b3077': '00052',
    'b2036': '00541',
    'b4119': '00052',
    'b4302': '02060',
    'b2094': '02060',
    'b2093': '02060',
    'b4565': '02060',
    'b4304': '02060',
    'b0403': '00500',
    'b3133': '02060',
    'b3138': '02060',
    'b3139': '02060',
    'b3140': '02060',
    'b4195': '02060',
    'b4194': '02060',
    'b4193': '02060',
    'b2132': '00500',
    'b3533': '02026',
    'b1736': '02060',
    'b1738': '02060',
    'b1737': '02060',
    'b3430': '00500',
    'b3429': '00500',
    'b3417': '00500',
    'b3428': '00500',
    'b3682': '02060',
    'b3683': '02060',
    'b4240': '02060',
    'b1107': '00520',
    '02060': '00520',  # NOTE(review): key is not a 'b…' locus tag like every other key -- looks like a typo; confirm the intended gene
    'b3729': '00520',
    'b3730': '09107',  # NOTE(review): '09107' is a category-level node in the b3730 ancestors output, unlike the other values -- confirm
}

In [255]:
# Spot-check one ambiguous gene: print its node name, then display all of its ancestors.
test_index = 150
gene_to_test = ambi_genes[test_index]
print(gene_to_test)

nx.ancestors(kegg_graph, gene_to_test)

b3730 glmU; fused N-acetylglucosamine-1-phosphate uridyltransferase and glucosamine-1-phosphate acetyltransferase	K04042 glmU; bifunctional UDP-N-acetylglucosamine pyrophosphorylase / glucosamine-1-phosphate N-acetyltransferase [EC:2.7.7.23 2.3.1.157]


{'00520 Amino sugar and nucleotide sugar metabolism [PATH:eco00520]',
 '00541 O-Antigen nucleotide sugar biosynthesis [PATH:eco00541]',
 '09100 Metabolism',
 '09101 Carbohydrate metabolism',
 '09107 Glycan biosynthesis and metabolism',
 'eco00001'}

In [79]:
# Display every ancestor of one category node.
category_name = '09105 Amino acid metabolism'
nx.ancestors(kegg_graph, category_name)

{'09100 Metabolism', 'eco00001'}