In [1]:
from Bio.KEGG.KGML.KGML_parser import read, parse
import pandas as pd
import requests
import itertools

**Load KEGG xml file**

In [2]:
# Parse the KGML file inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() passed to read() is never closed).
with open("hiv_raw_data/hsa04012.xml", 'r') as kgml_handle:
    pathway = read(kgml_handle)
print(pathway)

Pathway: ErbB signaling pathway
KEGG ID: path:hsa04012
Image file: https://www.kegg.jp/kegg/pathway/hsa/hsa04012.png
Organism: hsa
Entries: 80
Entry types:
	gene: 60
	group: 6
	compound: 4
	map: 10



**Function to extract relations (i.e. edges) from KGML file**

In [3]:
def extract_relations(kgml_file):
    """Return the relations (edges) of a KGML pathway file.

    Parameters
    ----------
    kgml_file : str
        Path of the KGML file to parse.

    Returns
    -------
    list of tuple
        One ``(entry1_id, entry2_id, relation_type)`` tuple per relation,
        in the order the parsed pathway exposes them.
    """
    with open(kgml_file, 'r') as handle:
        parsed = read(handle)

    # rel.type is the interaction class string (e.g. PPrel, GErel)
    return [
        (rel.entry1.id, rel.entry2.id, rel.type)
        for rel in parsed.relations
    ]

**Parse KGML file to extract relations**

In [4]:
# Path of the KGML file whose relations (edges) are extracted
kgml_file_path = "hiv_raw_data/hsa04012.xml"
relations = extract_relations(kgml_file_path)

# Each element is an (entry1_id, entry2_id, relation_type) tuple.
# Uncomment to inspect the individual edges:
# for node1, node2, rel_type in relations:
#     print(f"Relation: Node1={node1}, Node2={node2}, Type={rel_type}")

# Number of edges found in the pathway
print(len(relations))

85


In [5]:
# One row per relation; Node1/Node2 hold KGML entry ids (not gene identifiers yet)
relations_df = pd.DataFrame(relations, columns=['Node1', 'Node2', 'RelationType'])
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType
0,50,79,PPrel
1,51,50,PPrel
2,45,12,PPrel
3,79,78,PPrel
4,45,178,PPrel


**Get entries i.e. nodes from KGML file**

In [6]:
# Build one record per pathway entry (node) and load them into a table
entries_data = [
    {
        'id': node.id,
        'name': node.name,
        'type': node.type,
        'link': node.link,
        'reaction': node.reaction,
    }
    for node in pathway.entries.values()
]
entries_df = pd.DataFrame(entries_data)
len(entries_df)

80

In [7]:
# Preview the entry table: `name` holds the KEGG identifier(s) of each entry
entries_df.head()

Unnamed: 0,id,name,type,link,reaction
0,7,path:hsa05223,map,https://www.kegg.jp/dbget-bin/www_bget?hsa05223,
1,8,hsa:2885,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2885,
2,9,hsa:2002,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2002,
3,10,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,
4,11,hsa:2066,gene,https://www.kegg.jp/dbget-bin/www_bget?hsa:2066,


In [8]:
# Distinct entry types present in this pathway
set(entries_df['type'])

{'compound', 'gene', 'group', 'map'}

**Map the entry IDs in the relations to NodeIDs**

In [9]:
# replace Node1 with Name
relations_df = relations_df.merge(entries_df[['id', 'name']], left_on='Node1', right_on='id', how='left').drop(columns=['id'])
relations_df.rename(columns={'name': 'Node1_Name'}, inplace=True)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name
0,50,79,PPrel,hsa:6654 hsa:6655
1,51,50,PPrel,hsa:2885
2,45,12,PPrel,hsa:9542
3,79,78,PPrel,hsa:3265 hsa:3845 hsa:4893
4,45,178,PPrel,hsa:9542


In [10]:
# replace Node2 with Name
relations_df = relations_df.merge(entries_df[['id', 'name']], left_on='Node2', right_on='id', how='left').drop(columns=['id'])
relations_df.rename(columns={'name': 'Node2_Name'}, inplace=True)
relations_df.head()

Unnamed: 0,Node1,Node2,RelationType,Node1_Name,Node2_Name
0,50,79,PPrel,hsa:6654 hsa:6655,hsa:3265 hsa:3845 hsa:4893
1,51,50,PPrel,hsa:2885,hsa:6654 hsa:6655
2,45,12,PPrel,hsa:9542,hsa:2065
3,79,78,PPrel,hsa:3265 hsa:3845 hsa:4893,hsa:369 hsa:5894 hsa:673
4,45,178,PPrel,hsa:9542,undefined


**Get the Cartesian product of all proteins in each row, i.e. all (Node1, Node2) protein pairs**

In [11]:
# An entry name can list several space-separated KEGG gene ids;
# expand them into one column per id (shorter rows are padded with missing values)
split_node1_df = relations_df['Node1_Name'].str.split(' ', expand=True)
print(len(split_node1_df))
split_node1_df.head()

85


Unnamed: 0,0,1,2,3,4,5,6
0,hsa:6654,hsa:6655,,,,,
1,hsa:2885,,,,,,
2,hsa:9542,,,,,,
3,hsa:3265,hsa:3845,hsa:4893,,,,
4,hsa:9542,,,,,,


In [12]:
# Same expansion for the target side: one column per space-separated KEGG gene id
split_node2_df = relations_df['Node2_Name'].str.split(' ', expand=True)
print(len(split_node2_df))
split_node2_df.head()

85


Unnamed: 0,0,1,2,3,4,5,6
0,hsa:3265,hsa:3845,hsa:4893,,,,
1,hsa:6654,hsa:6655,,,,,
2,hsa:2065,,,,,,
3,hsa:369,hsa:5894,hsa:673,,,,
4,undefined,,,,,,


In [13]:
def cartesian_product_rows(df1, df2):
    """Pair every value of a row in ``df1`` with every value of the same row in ``df2``.

    Rows are matched by position. Missing cells are ignored on both sides,
    so a row that is entirely missing on either side contributes no pairs.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose rows are walked in parallel; iteration stops at the
        shorter of the two.

    Returns
    -------
    pandas.DataFrame
        Two-column frame (default integer column labels) with one row per
        generated pair; an empty frame when no pair is produced.
    """
    pairs = []
    for row1, row2 in zip(df1.values, df2.values):
        # pd.notna rejects None as well as NaN / pd.NA. An `is not None` check
        # would let NaN through (e.g. a name that was missing before the split)
        # and emit bogus pairs containing NaN.
        values1 = [x for x in row1 if pd.notna(x)]
        values2 = [x for x in row2 if pd.notna(x)]

        # all combinations of the two sides of this relation
        pairs.extend(itertools.product(values1, values2))

    return pd.DataFrame(pairs)

In [14]:
# Expand every relation into all protein-level (Node1, Node2) pairs
all_reaction_pairs = cartesian_product_rows(df1=split_node1_df, df2=split_node2_df)
# The helper returns default integer column labels; give them meaningful names
all_reaction_pairs.columns = ['Node1', 'Node2']
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2
0,hsa:6654,hsa:3265
1,hsa:6654,hsa:3845
2,hsa:6654,hsa:4893
3,hsa:6655,hsa:3265
4,hsa:6655,hsa:3845


In [15]:
# Count 'undefined' placeholders with a boolean mask: value_counts()['undefined']
# raises KeyError when a column contains no 'undefined' value at all.
print("Number of 'undefined' proteins in column 0: ", all_reaction_pairs['Node1'].eq('undefined').sum())
print("Number of 'undefined' proteins in column 1: ", all_reaction_pairs['Node2'].eq('undefined').sum())

Number of 'undefined' proteins in column 0:  47
Number of 'undefined' proteins in column 1:  15


**ID Mapping using LinkDB + keep only SwissProt reviewed proteins**

In [16]:
# Headerless, tab-separated mapping table: KEGG gene id -> UniProt accession
hsa_uniprot_df = pd.read_csv(
    "hiv_raw_data/hsa_uniprot_03122025.list",
    sep='\t',
    header=None,
    names=['HSA', 'Uniprot', 'link_category'],
)
hsa_uniprot_df.head()

Unnamed: 0,HSA,Uniprot,link_category
0,hsa:1,up:P04217,equivalent
1,hsa:1,up:V9HWD8,equivalent
2,hsa:10,up:A4Z6T7,equivalent
3,hsa:10,up:P11245,equivalent
4,hsa:100,up:A0A0S2Z381,equivalent


In [17]:
# replace Node1 with UniprotID
all_reaction_pairs = all_reaction_pairs.merge(hsa_uniprot_df[['HSA', 'Uniprot']], left_on='Node1', right_on='HSA', how='left').drop(columns=['HSA'])
all_reaction_pairs.rename(columns={'Uniprot': 'Node1_Uniprot'}, inplace=True)
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2,Node1_Uniprot
0,hsa:6654,hsa:3265,up:Q07889
1,hsa:6654,hsa:3845,up:Q07889
2,hsa:6654,hsa:4893,up:Q07889
3,hsa:6655,hsa:3265,up:Q07890
4,hsa:6655,hsa:3845,up:Q07890


In [18]:
# replace Node1 with UniprotID
all_reaction_pairs = all_reaction_pairs.merge(hsa_uniprot_df[['HSA', 'Uniprot']], left_on='Node2', right_on='HSA', how='left').drop(columns=['HSA'])
all_reaction_pairs.rename(columns={'Uniprot': 'Node2_Uniprot'}, inplace=True)
all_reaction_pairs.head()

Unnamed: 0,Node1,Node2,Node1_Uniprot,Node2_Uniprot
0,hsa:6654,hsa:3265,up:Q07889,up:P01112
1,hsa:6654,hsa:3265,up:Q07889,up:X5D945
2,hsa:6654,hsa:3845,up:Q07889,up:I1SRC5
3,hsa:6654,hsa:3845,up:Q07889,up:L7RSL8
4,hsa:6654,hsa:3845,up:Q07889,up:P01116


In [19]:
# Keep only the UniProt accession columns; the KEGG gene ids are no longer needed
all_reaction_pairs_uniprot = all_reaction_pairs.drop(columns=['Node1', 'Node2'])
all_reaction_pairs_uniprot.head()

Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,up:Q07889,up:P01112
1,up:Q07889,up:X5D945
2,up:Q07889,up:I1SRC5
3,up:Q07889,up:L7RSL8
4,up:Q07889,up:P01116


In [20]:
# Remove 'up:' from all strings excluding NaN
all_reaction_pairs_uniprot = all_reaction_pairs_uniprot.applymap(lambda x: x.replace("up:", "") if isinstance(x, str) else x)
all_reaction_pairs_uniprot.head()

  all_reaction_pairs_uniprot = all_reaction_pairs_uniprot.applymap(lambda x: x.replace("up:", "") if isinstance(x, str) else x)


Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,Q07889,P01112
1,Q07889,X5D945
2,Q07889,I1SRC5
3,Q07889,L7RSL8
4,Q07889,P01116


In [25]:
def filter_dataframe(df, timeout=30):
    """Keep only rows whose two accessions both have an entry at genome.jp ``sp:``.

    Rows containing any missing value are dropped first. Each distinct
    accession is then looked up exactly once (the same accession usually
    appears in many rows), and a row is kept only when neither lookup
    returns the "No such data was found." page.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Node1_Uniprot`` and ``Node2_Uniprot``.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        The retained rows, re-indexed from 0.

    Raises
    ------
    requests.RequestException
        On timeouts, connection failures, or an HTTP error status on a page
        that is not the "not found" page. Such responses must not be
        silently counted as existing entries.
    """
    base_url = "https://www.genome.jp/entry/sp:"
    not_found_marker = "No such data was found."

    df = df.dropna()

    # Distinct accessions in first-seen order, so each is requested only once
    accessions = list(dict.fromkeys([*df['Node1_Uniprot'], *df['Node2_Uniprot']]))

    found = set()
    if accessions:
        # One session reuses the underlying connection across all lookups
        with requests.Session() as session:
            for accession in accessions:
                response = session.get(f"{base_url}{accession}", timeout=timeout)
                if not_found_marker in response.text:
                    continue
                # An error page without the marker is a failed lookup, not a hit
                response.raise_for_status()
                found.add(accession)

    # filter the DataFrame to keep only rows where both accessions were found
    keep = df['Node1_Uniprot'].isin(found) & df['Node2_Uniprot'].isin(found)
    return df[keep].reset_index(drop=True)

In [26]:
# NOTE: filter_dataframe issues HTTP requests, so this cell needs network access and can be slow.
# NOTE(review): the name `copy` would shadow the stdlib `copy` module if it were ever imported here.
copy = all_reaction_pairs_uniprot.copy()
all_reaction_pairs_swissprot = filter_dataframe(copy)
all_reaction_pairs_swissprot.head()

Unnamed: 0,Node1_Uniprot,Node2_Uniprot
0,Q07889,P01112
1,Q07889,P01116
2,Q07889,P01111
3,Q07890,P01112
4,Q07890,P01116


In [27]:
# Write the filtered edge list as a tab-separated file.
# NOTE(review): the row index is written as an unnamed first column; pass index=False
# if downstream consumers expect only the two accession columns — confirm before changing.
all_reaction_pairs_swissprot.to_csv('erbb_edges_swissprot.tsv',sep='\t')