In [1]:
# pip install biopython

In [2]:
# pip install --upgrade biopython

In [3]:
from Bio.KEGG.KGML.KGML_parser import read
import pandas as pd
import requests
import re

### parse KGML file to find human proteins using Biopython KGML parser

In [4]:
# Parse the KGML file inside a context manager so the file handle is closed
# as soon as the parser has finished reading it (a bare open() would leave
# the handle open for the lifetime of the kernel).
with open("C:/Users/sumed/BDS-SROP-24/spras_raw_data/ko03250.xml", 'r') as kgml_file:
    pathway = read(kgml_file)

In [5]:
print(pathway)

Pathway: Viral life cycle - HIV-1
KEGG ID: path:ko03250
Image file: https://www.kegg.jp/kegg/pathway/ko/ko03250.png
Organism: ko
Entries: 85
Entry types:
	ortholog: 81
	map: 4



In [6]:
# Collect one record per pathway entry, keeping only the attributes that the
# later cells read, then turn the records into a DataFrame in a single step.
entries_data = [
    {
        'id': kgml_entry.id,
        'name': kgml_entry.name,
        'type': kgml_entry.type,
        'link': kgml_entry.link,
        'reaction': kgml_entry.reaction,
    }
    for kgml_entry in pathway.entries.values()
]
entries_df = pd.DataFrame(entries_data)
print(len(entries_df))

85


In [7]:
entries_df.head()

Unnamed: 0,id,name,type,link,reaction
0,308,ko:K19258,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K19258,
1,309,ko:K24802,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K24802,
2,310,ko:K22202,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K22202,
3,311,ko:K24803,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K24803,
4,312,ko:K22891,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K22891,


In [8]:
set(entries_df['type'])

{'map', 'ortholog'}

In [9]:
# Keep only the rows whose entry type is 'ortholog'.
proteins_df = entries_df.loc[entries_df['type'].eq('ortholog')]

In [10]:
proteins_df.head()

Unnamed: 0,id,name,type,link,reaction
0,308,ko:K19258,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K19258,
1,309,ko:K24802,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K24802,
2,310,ko:K22202,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K22202,
3,311,ko:K24803,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K24803,
4,312,ko:K22891,ortholog,https://www.kegg.jp/dbget-bin/www_bget?K22891,


In [11]:
# An entry name can hold several space-separated IDs; spread them over
# separate columns (shorter rows are padded with missing values).
split_proteins_df = proteins_df.loc[:, 'name'].str.split(pat=' ', expand=True)
split_proteins_df.head()

Unnamed: 0,0,1,2,3
0,ko:K19258,,,
1,ko:K24802,,,
2,ko:K22202,,,
3,ko:K24803,,,
4,ko:K22891,,,


In [12]:
# Flatten the split table into one sequence of IDs and drop the padding cells.
result = split_proteins_df.values.flatten()
# pd.notna rejects None as well as NaN / pd.NA. Which of these the padding
# uses depends on the dtype of the split columns, and a surviving NaN would
# break the string handling in the next cell.
proteins_list = [x for x in result if pd.notna(x)]

In [13]:
# Strip the 'ko:' database prefix so only the bare K-numbers remain.
orthology_ids = []
for ko_name in proteins_list:
    orthology_ids.append(ko_name.replace('ko:', ''))
print(orthology_ids)

['K19258', 'K24802', 'K22202', 'K24803', 'K22891', 'K25664', 'K22599', 'K22892', 'K25306', 'K25057', 'K24801', 'K24266', 'K12183', 'K12194', 'K12195', 'K12200', 'K22951', 'K22951', 'K24802', 'K24803', 'K25664', 'K22599', 'K25306', 'K25667', 'K25666', 'K22599', 'K04180', 'K04189', 'K06454', 'K12172', 'K14296', 'K15436', 'K14398', 'K24801', 'K10648', 'K18739', 'K25663', 'K10429', 'K14754', 'K03767', 'K25306', 'K24803', 'K11648', 'K09578', 'K25306', 'K24803', 'K15436', 'K14398', 'K11648', 'K06062', 'K19258', 'K22202', 'K22891', 'K14290', 'K25665', 'K19287', 'K24801', 'K24266', 'K01349', 'K15185', 'K15179', 'K15180', 'K15181', 'K15182', 'K15171', 'K15172', 'K22892', 'K18750', 'K06731', 'K22890', 'K12182', 'K12196', 'K15183', 'K02211', 'K15187', 'K15188', 'K25057', 'K03767', 'K04498', 'K07936', 'K18750', 'K22544', 'K22892', 'K25655', 'K25661', 'K22891', 'K27688']


### Building Orthology:HSA map through API calls

In [14]:
# URL pattern for the per-orthology link lookup; the `{}` placeholder is
# filled with a single orthology ID via str.format in the request loop.
url_template = "http://rest.genome.jp/link/ORTHOLOGY:{}"

In [15]:
# Orthology ID -> hsa ID pairs; populated by the request loop.
orthology_hsa_map = {}

The following cells build the Orthology:HSA map through API calls and save the resulting dataframe locally as a CSV. To skip the API calls, comment these cells out and load the previously saved CSV instead:

In [16]:
# Look up every distinct orthology ID once. orthology_ids contains repeats,
# and dict.fromkeys removes them while preserving first-seen order.
for item in dict.fromkeys(orthology_ids):
    # construct the API URL and make the API call; the timeout keeps a single
    # stalled connection from hanging the whole cell indefinitely
    url = url_template.format(item)
    response = requests.get(url, timeout=30)
    # report failed lookups instead of silently dropping them
    if response.status_code != 200:
        print(f"Request for {item} failed with HTTP status {response.status_code}; skipping")
        continue
    # each response line is expected to be tab-separated: source ID, target ID, ...
    for line in response.text.splitlines():
        parts = line.split('\t')
        # ignore lines that do not have at least a source and a target field
        if len(parts) > 1:
            # drop the database prefix from the source ID (text after the first ':')
            orthology_id = parts[0].split(":")[1]
            hsa_id = parts[1]
            # keep only targets that mention the human organism code "hsa"
            if re.search(r'\bhsa\b', hsa_id):
                # NOTE(review): this is a plain dict assignment, so when one
                # orthology has several hsa lines only the last one survives.
                orthology_hsa_map[orthology_id] = hsa_id

In [17]:
orthology_hsa_map

{'K25057': 'hsa:11168',
 'K12183': 'hsa:7251',
 'K12194': 'hsa:29082',
 'K12195': 'hsa:79643',
 'K12200': 'hsa:10015',
 'K04180': 'hsa:1234',
 'K04189': 'hsa:7852',
 'K06454': 'hsa:920',
 'K12172': 'hsa:5903',
 'K14296': 'hsa:9972',
 'K15436': 'hsa:23534',
 'K14398': 'hsa:79869',
 'K10648': 'hsa:85363',
 'K18739': 'hsa:636',
 'K25663': 'hsa:9638',
 'K10429': 'hsa:55201',
 'K14754': 'hsa:4600',
 'K03767': 'hsa:5478',
 'K11648': 'hsa:6598',
 'K09578': 'hsa:5300',
 'K06062': 'hsa:8850',
 'K14290': 'hsa:7514',
 'K01349': 'hsa:5045',
 'K15185': 'hsa:27125',
 'K15179': 'hsa:7469',
 'K15180': 'hsa:25920',
 'K15181': 'hsa:51497',
 'K15182': 'hsa:7936',
 'K15171': 'hsa:6827',
 'K15172': 'hsa:6829',
 'K18750': 'hsa:9582',
 'K06731': 'hsa:684',
 'K12182': 'hsa:9146',
 'K12196': 'hsa:9525',
 'K15183': 'hsa:8178',
 'K02211': 'hsa:1025',
 'K15187': 'hsa:4300',
 'K15188': 'hsa:905',
 'K04498': 'hsa:2033',
 'K07936': 'hsa:5901',
 'K22544': 'hsa:25939',
 'K25655': 'hsa:256987',
 'K25661': 'hsa:10955',


In [18]:
# Two-column table built from the (orthology ID, hsa ID) pairs of the map.
orthology_hsa_df = pd.DataFrame(
    list(orthology_hsa_map.items()),
    columns=['KEGG_Orthology', 'HSA'],
)
orthology_hsa_df.head()

Unnamed: 0,KEGG_Orthology,HSA
0,K25057,hsa:11168
1,K12183,hsa:7251
2,K12194,hsa:29082
3,K12195,hsa:79643
4,K12200,hsa:10015


In [19]:
# Write the orthology -> hsa table to CSV without the index column.
# NOTE(review): the destination is a hard-coded absolute path on one machine.
orthology_hsa_df.to_csv("C:/Users/sumed/BDS-SROP-24/spras_processed_data/orthology_hsa_mapping_ko03250.csv", index=False)

### Map Orthology to HSA to Uniprot

Loads the dataframe that was created above and saved as a CSV locally:

In [20]:
# Rebind orthology_hsa_df from the saved CSV, replacing the in-memory frame
# of the same name.
# NOTE(review): the source is a hard-coded absolute path on one machine.
orthology_hsa_df = pd.read_csv("C:/Users/sumed/BDS-SROP-24/spras_processed_data/orthology_hsa_mapping_ko03250.csv")
orthology_hsa_df.head()

Unnamed: 0,KEGG_Orthology,HSA
0,K25057,hsa:11168
1,K12183,hsa:7251
2,K12194,hsa:29082
3,K12195,hsa:79643
4,K12200,hsa:10015


In [21]:
# Compare the orthology IDs that received an hsa mapping against all IDs
# extracted from the pathway, and report the ones left without a mapping.
list1 = orthology_hsa_df['KEGG_Orthology'].tolist()
all_orthology_ids = set(orthology_ids)
print(len(set(list1)))
print(len(all_orthology_ids))
subtracted = all_orthology_ids.difference(list1)
print("Following proteins don't have hsa ID: ")
print(subtracted)

44
61
Following proteins don't have hsa ID: 
{'K22951', 'K25664', 'K24801', 'K19258', 'K24266', 'K22202', 'K19287', 'K25665', 'K25666', 'K25667', 'K22891', 'K22599', 'K24803', 'K22892', 'K25306', 'K24802', 'K22890'}


Load HSA to Uniprot database link list:

In [22]:
# Read the tab-separated, header-less link list and label its three columns.
hsa_uniprot_df = pd.read_csv(
    "C:/Users/sumed/BDS-SROP-24/spras_raw_data/hsa_uniprot.list",
    sep='\t',
    header=None,
).set_axis(['HSA', 'Uniprot', 'link_category'], axis=1)
hsa_uniprot_df.head()

Unnamed: 0,HSA,Uniprot,link_category
0,hsa:1,up:P04217,equivalent
1,hsa:1,up:V9HWD8,equivalent
2,hsa:10,up:A4Z6T7,equivalent
3,hsa:10,up:P11245,equivalent
4,hsa:100,up:A0A0S2Z381,equivalent


Dataframe merge on HSA ID to link Orthology -> HSA -> Uniprot

In [23]:
# Left join on the shared 'HSA' column: every orthology/hsa row is kept, and
# rows without a match in the link list get missing Uniprot values.
orthology_uniprot_df = pd.merge(
    orthology_hsa_df,
    hsa_uniprot_df,
    how='left',
    on='HSA',
)
orthology_uniprot_df.head(200)

Unnamed: 0,KEGG_Orthology,HSA,Uniprot,link_category
0,K25057,hsa:11168,up:O75475,equivalent
1,K12183,hsa:7251,up:Q99816,equivalent
2,K12194,hsa:29082,up:Q9BY43,equivalent
3,K12195,hsa:79643,up:Q96FZ7,equivalent
4,K12200,hsa:10015,up:Q8WUM4,equivalent
5,K04180,hsa:1234,up:P51681,equivalent
6,K04180,hsa:1234,up:Q38L21,equivalent
7,K04189,hsa:7852,up:P61073,equivalent
8,K06454,hsa:920,up:A0A4Y5UGE4,equivalent
9,K06454,hsa:920,up:B4DT49,equivalent


The following cell saves the mapping locally as a CSV (comment it out to skip writing the file):

In [24]:
# Write the orthology -> hsa -> Uniprot table to CSV without the index column.
# NOTE(review): the destination is a hard-coded absolute path on one machine.
orthology_uniprot_df.to_csv("C:/Users/sumed/BDS-SROP-24/spras_processed_data/kegg-orthology_hsa_uniprot_mapping_ko03250.csv", index=False)

### Selecting only Uniprot IDs that are also SwissProt IDs i.e. have been manually reviewed

In [26]:
def filter_dataframe(df, timeout=30):
    """Keep only the rows whose Uniprot accession has an `sp:` entry page.

    Rows containing any missing value are dropped first. For each remaining
    row, the text after the database prefix in the 'Uniprot' column
    (e.g. "up:P12345" -> "P12345") is looked up at
    ``https://www.genome.jp/entry/sp:<accession>``. A row is kept only when
    the request succeeds with HTTP 200 and the page does not contain
    "No such data was found.".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Uniprot' column of strings.
    timeout : float, optional
        Seconds to wait for each HTTP request before `requests` raises,
        so that a stalled connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        The retained rows in their original order, with a fresh 0..n-1 index.

    Notes
    -----
    * A non-200 response is reported and the row is excluded. Such a page
      does not contain the "not found" marker, so without this check it
      would be kept as if the accession had been verified.
    * Each distinct accession is requested only once, so its status message
      is printed once even when several rows share it.
    * Rows are selected by position, so repeated index labels in `df`
      cannot duplicate rows in the result.
    """
    base_url = "https://www.genome.jp/entry/sp:"
    df = df.dropna()
    # accession -> True/False, filled lazily so repeated accessions reuse the answer
    swissprot_status = {}
    # integer positions (not index labels) of the rows to keep
    positions_to_keep = []
    for position, uniprot in enumerate(df['Uniprot']):
        # strip the database prefix; a value without ':' is used unchanged
        uniprot_id = uniprot.split(':', 1)[-1]
        if uniprot_id not in swissprot_status:
            response = requests.get(f"{base_url}{uniprot_id}", timeout=timeout)
            if response.status_code != 200:
                print(f"Could not verify {uniprot_id} (HTTP {response.status_code}); excluding it")
                swissprot_status[uniprot_id] = False
            elif "No such data was found." in response.text:
                print("SwissProt DOES NOT exist for: ", uniprot_id)
                swissprot_status[uniprot_id] = False
            else:
                swissprot_status[uniprot_id] = True
        if swissprot_status[uniprot_id]:
            positions_to_keep.append(position)
    # filter the DataFrame to keep only the desired rows
    filtered_df = df.iloc[positions_to_keep].reset_index(drop=True)
    return filtered_df

The following cells filter the dataframe and save the Orthology -> HSA -> Swissprot mapping dataframe locally as a CSV (comment them out to skip the SwissProt lookups, which make one web request per Uniprot ID):

In [27]:
orthology_uniprot_df_copy = orthology_uniprot_df.copy()

In [28]:
orthology_swissprot_df = filter_dataframe(orthology_uniprot_df_copy)

SwissProt DOES NOT exist for:  Q38L21
SwissProt DOES NOT exist for:  A0A4Y5UGE4
SwissProt DOES NOT exist for:  B4DT49
SwissProt DOES NOT exist for:  B3KMX1
SwissProt DOES NOT exist for:  V9HWF5
SwissProt DOES NOT exist for:  B3KWD0
SwissProt DOES NOT exist for:  A0A0C4DFX9
SwissProt DOES NOT exist for:  H0UI80
SwissProt DOES NOT exist for:  A0A1U9X830
SwissProt DOES NOT exist for:  A0A024R7H5
SwissProt DOES NOT exist for:  A0A0S2Z4R4
SwissProt DOES NOT exist for:  A0A0S2Z448
SwissProt DOES NOT exist for:  B4DH21
SwissProt DOES NOT exist for:  Q7Z6C1
SwissProt DOES NOT exist for:  Q59H15


In [29]:
orthology_swissprot_df.head()

Unnamed: 0,KEGG_Orthology,HSA,Uniprot,link_category
0,K25057,hsa:11168,up:O75475,equivalent
1,K12183,hsa:7251,up:Q99816,equivalent
2,K12194,hsa:29082,up:Q9BY43,equivalent
3,K12195,hsa:79643,up:Q96FZ7,equivalent
4,K12200,hsa:10015,up:Q8WUM4,equivalent


In [30]:
len(orthology_swissprot_df)

44

In [31]:
# Write the SwissProt-filtered mapping to CSV without the index column.
# NOTE(review): the destination is a hard-coded absolute path on one machine.
orthology_swissprot_df.to_csv("C:/Users/sumed/BDS-SROP-24/spras_processed_data/kegg-orthology_hsa_swissprot_mapping_ko03250.csv", index=False)