In [1]:
from Bio.KEGG.KGML.KGML_parser import read, parse
import pandas as pd
import requests
import re

**Load the KEGG KGML (XML) pathway file**

In [2]:
# Parse the KGML file into a pathway object. The context manager closes the
# file handle once parsing is done instead of leaving it open.
with open("hiv_raw_data/hsa04012.xml", 'r') as kgml_handle:
    pathway = read(kgml_handle)
print(pathway)

Pathway: ErbB signaling pathway
KEGG ID: path:hsa04012
Image file: https://www.kegg.jp/kegg/pathway/hsa/hsa04012.png
Organism: hsa
Entries: 80
Entry types:
	gene: 60
	group: 6
	compound: 4
	map: 10



**Function to extract relations (i.e. edges) from KGML file**

In [3]:
def extract_relations(kgml_file):
    """Read a KGML file and return its relations as edge tuples.

    Parameters
    ----------
    kgml_file
        Path of the KGML file; it is passed to ``open()`` in text mode.

    Returns
    -------
    list of tuple
        One ``(entry1_id, entry2_id, relation_type)`` tuple per relation,
        in the order in which ``pathway.relations`` yields them.
    """
    with open(kgml_file, 'r') as handle:
        kgml_pathway = read(handle)

    # Each relation becomes one edge; the third field is the type of
    # interaction (e.g., PPrel, GErel, etc.).
    return [
        (rel.entry1.id, rel.entry2.id, rel.type)
        for rel in kgml_pathway.relations
    ]

**Parse KGML file to extract relations**

In [4]:
# Extract every relation (edge) of the pathway as (entry1, entry2, type) tuples.
kgml_file_path = "hiv_raw_data/hsa04012.xml"
relations = extract_relations(kgml_file_path)

# Number of relations found in the file.
print(len(relations))

85


In [5]:
# Edge table: one row per relation tuple
# (first entry ID, second entry ID, relation type).
relations_df = pd.DataFrame(
    data=relations,
    columns=['Node1', 'Node2', 'RelationType'],
)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType
0,70,10,PPrel
1,8,38,PPrel
2,177,55,PPrel
3,38,64,PPrel
4,177,41,PPrel


**Get entries (i.e. nodes) from the KGML file**

In [6]:
# Node table: one record per pathway entry.
entries_data = [
    {
        'id': entry.id,
        'name': entry.name,
        'type': entry.type,
        'link': entry.link,
        'reaction': entry.reaction,
    }
    for entry in pathway.entries.values()
]
entries_df = pd.DataFrame(entries_data)
len(entries_df)

80

In [7]:
# Preview the first five entries of the node table.
entries_df.head(n=5)

Unnamed: 0,id,name,type,link,reaction
0,7,path:hsa05223,map,https://www.kegg.jp/dbget-bin/www_bget?hsa05223,
1,8,hsa:2885,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2885,
2,9,hsa:2002,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2002,
3,10,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,
4,11,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,


In [8]:
# Distinct entry types present in the node table.
set(entries_df['type'].unique())

{'compound', 'gene', 'group', 'map'}

**Map the entry IDs in the relations to KEGG entry names (e.g. `hsa:2885`)**

In [10]:
# replace Node1 with Name
relations_df = relations_df.merge(entries_df[['id', 'name']], left_on='Node1', right_on='id', how='left').drop(columns=['id'])
relations_df.rename(columns={'name': 'Node1_Name'}, inplace=True)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name
0,70,10,PPrel,hsa:145957
1,8,38,PPrel,hsa:2885
2,177,55,PPrel,undefined
3,38,64,PPrel,hsa:2549
4,177,41,PPrel,undefined


In [None]:
# replace Node2 with Name
relations_df = relations_df.merge(entries_df[['id', 'name']], left_on='Node2', right_on='id', how='left').drop(columns=['id'])
relations_df.rename(columns={'name': 'Node2_Name'}, inplace=True)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name,Node2_Name
0,70,10,PPrel,hsa:145957,hsa:2066
1,8,38,PPrel,hsa:2885,hsa:2549
2,177,55,PPrel,undefined,hsa:5335 hsa:5336
3,38,64,PPrel,hsa:2549,hsa:110117499 hsa:5290 hsa:5291 hsa:5293 hsa:5...
4,177,41,PPrel,undefined,hsa:867 hsa:868


**TODO:**
- Expand one-to-many mappings into a Cartesian product of edges (an entry name can list several genes, e.g. `hsa:5335 hsa:5336`)
- Map KEGG `hsa:` gene IDs to UniProt entry names?
- Clean up the resulting edge/network dataframe