In [2]:
from Bio.KEGG.KGML.KGML_parser import read, parse
import pandas as pd
import requests
import itertools

**Load KEGG xml file**

In [2]:
# Parse the KGML file inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() call left it open).
with open("hiv_raw_data/hsa04012.xml", 'r') as kgml_handle:
    pathway = read(kgml_handle)
print(pathway)

Pathway: ErbB signaling pathway
KEGG ID: path:hsa04012
Image file: https://www.kegg.jp/kegg/pathway/hsa/hsa04012.png
Organism: hsa
Entries: 80
Entry types:
	gene: 60
	group: 6
	compound: 4
	map: 10



**Function to extract relations (i.e. edges) from KGML file**

In [3]:
def extract_relations(kgml_file):
    """Read a KGML file and list its relations (edges).

    Parameters
    ----------
    kgml_file : str
        Path of the KGML file to open.

    Returns
    -------
    list of tuple
        One ``(entry1 id, entry2 id, relation type)`` tuple per relation,
        in the order the parsed pathway yields them.
    """
    with open(kgml_file, 'r') as handle:
        parsed = read(handle)

    # The relation type is the interaction class string (e.g. PPrel, GErel).
    return [
        (rel.entry1.id, rel.entry2.id, rel.type)
        for rel in parsed.relations
    ]

**Parse KGML file to extract relations**

In [4]:
# Pull every relation (edge) out of the pathway KGML file.
kgml_file_path = "hiv_raw_data/hsa04012.xml"
relations = extract_relations(kgml_file_path)

# Report how many relations were parsed.
print(len(relations))

85


In [5]:
# Tabulate the relation tuples: source entry id, target entry id, relation type.
relations_df = pd.DataFrame(
    data=relations,
    columns=['Node1', 'Node2', 'RelationType'],
)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType
0,78,77,PPrel
1,45,10,PPrel
2,77,76,PPrel
3,44,178,PPrel
4,76,71,PPrel


**Get entries i.e. nodes from KGML file**

In [6]:
# One record per pathway entry (node), keeping the attributes used downstream.
entries_data = [
    {
        'id': entry.id,
        'name': entry.name,
        'type': entry.type,
        'link': entry.link,
        'reaction': entry.reaction,
    }
    for entry in pathway.entries.values()
]
entries_df = pd.DataFrame(entries_data)
len(entries_df)

80

In [7]:
# Preview the first five entries.
entries_df.head(n=5)

Unnamed: 0,id,name,type,link,reaction
0,7,path:hsa05223,map,https://www.kegg.jp/dbget-bin/www_bget?hsa05223,
1,8,hsa:2885,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2885,
2,9,hsa:2002,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2002,
3,10,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,
4,11,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,


In [8]:
# Distinct entry types present in the pathway.
set(entries_df['type'].unique())

{'compound', 'gene', 'group', 'map'}

**Map the entry IDs in the relations to NodeIDs**

In [9]:
# Attach the entry name of the source node (Node1) by joining on the entry id.
relations_df = (
    relations_df
    .merge(entries_df[['id', 'name']], left_on='Node1', right_on='id', how='left')
    .drop(columns=['id'])
    .rename(columns={'name': 'Node1_Name'})
)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name
0,78,77,PPrel,hsa:369 hsa:5894 hsa:673
1,45,10,PPrel,hsa:9542
2,77,76,PPrel,hsa:5604 hsa:5605
3,44,178,PPrel,hsa:10718
4,76,71,PPrel,hsa:5594 hsa:5595


In [10]:
# Attach the entry name of the target node (Node2) by joining on the entry id.
relations_df = (
    relations_df
    .merge(entries_df[['id', 'name']], left_on='Node2', right_on='id', how='left')
    .drop(columns=['id'])
    .rename(columns={'name': 'Node2_Name'})
)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name,Node2_Name
0,78,77,PPrel,hsa:369 hsa:5894 hsa:673,hsa:5604 hsa:5605
1,45,10,PPrel,hsa:9542,hsa:2066
2,77,76,PPrel,hsa:5604 hsa:5605,hsa:5594 hsa:5595
3,44,178,PPrel,hsa:10718,undefined
4,76,71,PPrel,hsa:5594 hsa:5595,hsa:4609


**Replace all `undefined` with ErbB-2 ID**

In [11]:
# Substitute the ErbB-2 KEGG ID for every 'undefined' name. With regex=True the
# pattern is applied to string cells in every column of the frame.
# NOTE(review): 'undefined' is presumably the name KGML gives group (complex)
# entries - confirm that mapping all of them to ErbB-2 alone is intended.
relations_df = relations_df.replace(to_replace='undefined', value='hsa:2064', regex=True)

**Get cartesian product of all proteins in each row - i.e. get all pairs**

In [16]:
# Spread the space-separated Node1 names into one column per KEGG gene ID.
split_node1_df = relations_df['Node1_Name'].str.split(pat=' ', expand=True)
print(split_node1_df.shape[0])
split_node1_df.head()

85


Unnamed: 0,0,1,2,3,4,5,6
0,hsa:369,hsa:5894,hsa:673,,,,
1,hsa:9542,,,,,,
2,hsa:5604,hsa:5605,,,,,
3,hsa:10718,,,,,,
4,hsa:5594,hsa:5595,,,,,


In [17]:
# Spread the space-separated Node2 names into one column per KEGG gene ID.
split_node2_df = relations_df['Node2_Name'].str.split(pat=' ', expand=True)
print(split_node2_df.shape[0])
split_node2_df.head()

85


Unnamed: 0,0,1,2,3,4,5,6
0,hsa:5604,hsa:5605,,,,,
1,hsa:2066,,,,,,
2,hsa:5594,hsa:5595,,,,,
3,hsa:2064,,,,,,
4,hsa:4609,,,,,,


In [18]:
def cartesian_product_rows(df1, df2):
    """Pair every value of a row in ``df1`` with every value of the same row in ``df2``.

    Rows are matched by position. Missing cells (``None`` or NaN) are ignored,
    so a row that is entirely missing on either side contributes no pairs.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with the same number of rows.

    Returns
    -------
    pandas.DataFrame
        Two columns labelled ``0`` and ``1`` with one row per pair; empty
        (but still two columns wide) when no pairs are produced.

    Raises
    ------
    ValueError
        If the two frames have a different number of rows.
    """
    # zip() would silently truncate to the shorter frame and misalign the edges.
    if len(df1) != len(df2):
        raise ValueError(
            f"df1 and df2 must have the same number of rows ({len(df1)} != {len(df2)})"
        )

    pairs = []
    for row1, row2 in zip(df1.values, df2.values):
        # pd.notna drops both None and NaN padding; `is not None` let NaN through.
        values1 = [x for x in row1 if pd.notna(x)]
        values2 = [x for x in row2 if pd.notna(x)]
        pairs.extend(itertools.product(values1, values2))

    # Explicit labels keep the frame two columns wide even when `pairs` is empty,
    # so callers can still assign two column names.
    return pd.DataFrame(pairs, columns=[0, 1])

In [19]:
# Expand each relation into every (source gene, target gene) combination.
all_reaction_pairs = cartesian_product_rows(split_node1_df, split_node2_df)
all_reaction_pairs = all_reaction_pairs.set_axis(['Node1', 'Node2'], axis=1)
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2
0,hsa:369,hsa:5604
1,hsa:369,hsa:5605
2,hsa:5894,hsa:5604
3,hsa:5894,hsa:5605
4,hsa:673,hsa:5604


**ID Mapping using LinkDB + keep only SwissProt reviewed proteins**

In [21]:
# Load the headerless, tab-separated KEGG-to-UniProt mapping and label its columns.
hsa_uniprot_df = pd.read_csv(
    "hiv_raw_data/hsa_uniprot_03122025.list",
    sep='\t',
    header=None,
).set_axis(['HSA', 'Uniprot', 'link_category'], axis=1)
hsa_uniprot_df.head()

Unnamed: 0,HSA,Uniprot,link_category
0,hsa:1,up:P04217,equivalent
1,hsa:1,up:V9HWD8,equivalent
2,hsa:10,up:A4Z6T7,equivalent
3,hsa:10,up:P11245,equivalent
4,hsa:100,up:A0A0S2Z381,equivalent


In [22]:
# Look up the UniProt accession(s) for Node1. The left join yields one row per
# matching accession and keeps pairs that have no match.
all_reaction_pairs = (
    all_reaction_pairs
    .merge(hsa_uniprot_df[['HSA', 'Uniprot']], left_on='Node1', right_on='HSA', how='left')
    .drop(columns=['HSA'])
    .rename(columns={'Uniprot': 'Node1_Uniprot'})
)
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2,Node1_Uniprot
0,hsa:369,hsa:5604,up:A0A024R178
1,hsa:369,hsa:5604,up:P10398
2,hsa:369,hsa:5605,up:A0A024R178
3,hsa:369,hsa:5605,up:P10398
4,hsa:5894,hsa:5604,up:L7RRS6


In [23]:
# Look up the UniProt accession(s) for Node2 (the original comment said Node1).
# The left join yields one row per matching accession and keeps pairs that
# have no match.
all_reaction_pairs = (
    all_reaction_pairs
    .merge(hsa_uniprot_df[['HSA', 'Uniprot']], left_on='Node2', right_on='HSA', how='left')
    .drop(columns=['HSA'])
    .rename(columns={'Uniprot': 'Node2_Uniprot'})
)
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2,Node1_Uniprot,Node2_Uniprot
0,hsa:369,hsa:5604,up:A0A024R178,up:A4QPA9
1,hsa:369,hsa:5604,up:A0A024R178,up:Q02750
2,hsa:369,hsa:5604,up:P10398,up:A4QPA9
3,hsa:369,hsa:5604,up:P10398,up:Q02750
4,hsa:369,hsa:5605,up:A0A024R178,up:P36507


In [24]:
# Keep only the UniProt columns by discarding the KEGG gene-ID columns.
all_reaction_pairs_uniprot = all_reaction_pairs.drop(['Node1', 'Node2'], axis='columns')
all_reaction_pairs_uniprot.head()

Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,up:A0A024R178,up:A4QPA9
1,up:A0A024R178,up:Q02750
2,up:P10398,up:A4QPA9
3,up:P10398,up:Q02750
4,up:A0A024R178,up:P36507


In [25]:
# Remove 'up:' from all strings; NaN cells are left untouched.
# DataFrame.applymap is deprecated, so use DataFrame.replace with regex=True,
# which substitutes inside string cells only.
all_reaction_pairs_uniprot = all_reaction_pairs_uniprot.replace(to_replace="up:", value="", regex=True)
all_reaction_pairs_uniprot.head()

  all_reaction_pairs_uniprot = all_reaction_pairs_uniprot.applymap(lambda x: x.replace("up:", "") if isinstance(x, str) else x)


Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,A0A024R178,A4QPA9
1,A0A024R178,Q02750
2,P10398,A4QPA9
3,P10398,Q02750
4,A0A024R178,P36507


In [26]:
def filter_dataframe(df, timeout=30):
    """Keep the rows whose two UniProt accessions both resolve as ``sp:`` entries on genome.jp.

    Rows with a missing value in any column are dropped first. Every distinct
    accession is then requested once from
    ``https://www.genome.jp/entry/sp:<accession>``; an accession is rejected
    when the returned page contains the text "No such data was found.".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Node1_Uniprot`` and ``Node2_Uniprot``.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, with a fresh 0..n-1 index.

    Raises
    ------
    requests.RequestException
        On a connection failure, a timeout, or an HTTP error status returned
        for a page that does not carry the "not found" text.
    """
    base_url = "https://www.genome.jp/entry/sp:"
    not_found_marker = "No such data was found."
    node_columns = ['Node1_Uniprot', 'Node2_Uniprot']

    df = df.dropna()

    # Query each distinct accession once instead of twice per row; the same
    # accession appears in many pairs.
    accessions = pd.unique(df[node_columns].to_numpy().ravel())

    found = set()
    with requests.Session() as session:
        for accession in accessions:
            response = session.get(f"{base_url}{accession}", timeout=timeout)
            if not_found_marker in response.text:
                continue
            # An error page without the marker must not be mistaken for a hit.
            response.raise_for_status()
            found.add(accession)

    # A row survives only when both of its accessions were found.
    keep = df['Node1_Uniprot'].isin(found) & df['Node2_Uniprot'].isin(found)
    return df[keep].reset_index(drop=True)

In [27]:
# Filter a copy of the UniProt pairs. The copy is passed inline rather than
# bound to a global named `copy`, which shares its name with the stdlib module.
all_reaction_pairs_swissprot = filter_dataframe(all_reaction_pairs_uniprot.copy())
all_reaction_pairs_swissprot.head()

Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,P10398,Q02750
1,P10398,P36507
2,P04049,Q02750
3,P04049,P36507
4,P15056,Q02750


**Add directionality and edge weights**

In [28]:
# Constant values written onto every edge.
# NOTE(review): 'D' presumably stands for "directed" - confirm against the consumer of the file.
directionality, edge_weight = 'D', 1.0

In [None]:
# Append the constant columns; 'Prize' is added before 'Directionality'.
all_reaction_pairs_swissprot = all_reaction_pairs_swissprot.assign(
    Prize=edge_weight,
    Directionality=directionality,
)

In [30]:
# Preview the first five edges.
all_reaction_pairs_swissprot.head(n=5)

Unnamed: 0,Node1_Uniprot,Node2_Uniprot,Directionality,Prize
0,P10398,Q02750,D,1.0
1,P10398,P36507,D,1.0
2,P04049,Q02750,D,1.0
3,P04049,P36507,D,1.0
4,P15056,Q02750,D,1.0


**Save file**

In [15]:
# Write the edges as tab-separated text with no header row and no index
# column; columns are emitted in the frame's current order.
# header=False is the documented way to omit the header (None only worked
# because it is falsy).
all_reaction_pairs_swissprot.to_csv(
    'erbb_edges_gold_standard.txt',
    sep='\t',
    header=False,
    index=False,
)