In [1]:
import pandas as pd
import requests
import os

In [2]:
# Load the prepared dataset and preview its first rows.
dataset_path = '../Dataset/final_df.csv'
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0,Respondent ID,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],Fasting Glucose [mmol/L],Insulin [pmol/L],Triglyceride [mmol/L],LDL-Cholesterol(NIH2) [mmol/L],Vitamin C [umol/L],...,Non_HDL Ratio,LDL_HDL Ratio,Total Cholestrol_HDL Ratio,Triglyceride_HDL Ratio,Lymphocyte_Monocyte Ratio,Neutrophil_Lymphocyte Ratio,Platelet_Lymphocyte Ratio,Platelet_WBC Ratio,Lymphocyte_C-Reactive Protein Ratio,Systemic Immune‐inflammation Index (SII)
0,93703.0,0.0,2.0,17.5,66.912952,6.212328,85.453019,1.219948,2.820133,54.172769,...,126.497004,2.042228,3.368882,0.883438,4.166667,1.6,103.700053,35.11398,7.268033,414.800213
1,93704.0,1.0,2.0,15.7,74.4,6.212328,85.453019,1.219948,2.820133,54.172769,...,126.497004,2.042228,3.368882,0.883438,5.833333,0.914286,68.285714,32.297297,120.689655,218.514286
2,93705.0,0.0,66.0,31.7,89.9,6.212328,85.453019,1.219948,2.820133,73.8,...,97.061098,1.819441,2.619355,0.787064,5.666667,1.235294,90.882353,35.930233,12.5,381.705882
3,93706.0,1.0,18.0,21.5,53.8,6.212328,85.453019,1.219948,2.820133,63.6,...,100.928074,2.311585,3.139344,0.999958,2.5,2.466667,155.333333,38.196721,20.27027,574.733333
4,93707.0,1.0,13.0,18.1,58.2,6.212328,85.453019,1.219948,2.820133,27.4,...,121.03635,1.602348,2.778409,0.693153,6.0,1.452381,82.857143,31.071429,131.25,505.428571


In [3]:
# Next, the dataset is converted into a knowledge graph.
# A knowledge graph is a knowledge base that uses a graph-structured data
# model (topology) to integrate data.
# Advantages that matter for this use case:
#   * It is more flexible than a relational model, so data can be modified
#     more easily.
#   * Medical data is not static, so updates need to be cheap to apply.
#   * New components, or even new diseases, can be added later.

# To build the knowledge graph (using Neo4j), one CSV file is prepared for
# each node type and each relationship type.

# Person node: one row per respondent with demographic attributes.

person = (
    df.loc[:, ['Respondent ID', 'Gender', 'Age', 'BMI']]
    .assign(
        # Replace the numeric gender code with a readable label.
        Gender=lambda frame: frame['Gender'].map({0: 'Female', 1: 'Male'}),
        # Age is stored as a float in the CSV; keep it as an integer.
        Age=lambda frame: frame['Age'].astype(int),
    )
)
person

Unnamed: 0,Respondent ID,Gender,Age,BMI
0,93703.0,Female,2,17.5
1,93704.0,Male,2,15.7
2,93705.0,Female,66,31.7
3,93706.0,Male,18,21.5
4,93707.0,Male,13,18.1
...,...,...,...,...
9249,102952.0,Female,70,20.0
9250,102953.0,Male,42,35.8
9251,102954.0,Female,41,26.1
9252,102955.0,Female,14,45.6


In [4]:
# Blood biomarker node: one row per biomarker column of `df`.
# The biomarker columns sit at positions 4-78 and 97-107 of the dataset.

biomarker_columns = [*df.columns[4:79], *df.columns[97:108]]
# `Unit` starts out empty; it is filled in once the unit has been parsed
# out of the column label.
blood_biomarker = pd.DataFrame({'Name': biomarker_columns}).assign(Unit='')

In [5]:
# Split labels of the form "Name [unit]" into a clean name and its unit.
# Group 0 captures everything before the first '[', group 1 the text between
# that '[' and the next ']'. Labels without a complete "[...]" pair do not
# match and are left untouched (their Unit keeps its current value).
# This is done vectorised instead of writing into the frame while iterating
# over it with iterrows(), and only the Name column is parsed.
label_parts = blood_biomarker['Name'].str.extract(r'^([^\[]*)\[([^\]]*)\]')
has_unit = label_parts[1].notna()

blood_biomarker.loc[has_unit, 'Name'] = label_parts.loc[has_unit, 0].str.strip()
blood_biomarker.loc[has_unit, 'Unit'] = label_parts.loc[has_unit, 1]

blood_biomarker.head()

Unnamed: 0,Name,Unit
0,25-hydroxyvitamin D2 + D3,nmol/L
1,Fasting Glucose,mmol/L
2,Insulin,pmol/L
3,Triglyceride,mmol/L
4,LDL-Cholesterol(NIH2),mmol/L


In [6]:
# Protein sequences can be attached to the biomarkers that are proteins,
# since sequences provide useful insights and could even aid potential
# drug discovery.
# 57 columns do not have a protein sequence
# 18 columns have protein sequences

# The sequences are collected from the UniProt database.

# UniProt REST API URL that returns the 18 protein sequences in FASTA format
url = "https://rest.uniprot.org/uniprotkb/accessions?accessions=E7ETN1%2CO00299%2CP00338%2CP01308%2CP02647%2CP02741%2CP02768%2CP02786%2CP02787%2CP02794%2CP04114%2CP05186%2CP06732%2CP17174%2CP24298%2CP55157%2CP69905%2CQ6U841&format=fasta"

fasta_download_dir = "../KG"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(fasta_download_dir, exist_ok=True)
# Build the output path from the directory variable so the two cannot drift apart.
fasta_path = os.path.join(fasta_download_dir, "protein_sequences.fasta")

# Without a timeout requests waits indefinitely on an unresponsive server,
# which would hang the notebook.
response = requests.get(url, timeout=60)

# Save the sequences as FASTA content to a file
if response.status_code == 200:
    with open(fasta_path, "wb") as fasta_file:
        fasta_file.write(response.content)
    print('Downloaded Protein Sequences as .fasta file')
else:
    print("Failed to retrieve protein sequences.")

Downloaded Protein Sequences as .fasta file


In [7]:
# Parse the downloaded FASTA file with Biopython's SeqIO.

from Bio import SeqIO

fasta_file = '../KG/protein_sequences.fasta'

# One (identifier, sequence) pair per FASTA record, in file order.
parsed_records = [
    (entry.id, str(entry.seq))
    for entry in SeqIO.parse(fasta_file, 'fasta')
]
sequence_ids = [entry_id for entry_id, _ in parsed_records]
sequences = [residues for _, residues in parsed_records]

seqs = pd.DataFrame({'Sequence_ID': sequence_ids, 'Sequence': sequences})
seqs

Unnamed: 0,Sequence_ID,Sequence
0,tr|E7ETN1|E7ETN1_HUMAN,MKKKLVVLGLLAVVLVLVIVGLCLWLPSASKEPDNHVYTRAAVAAD...
1,sp|O00299|CLIC1_HUMAN,MAEEQPQVELFVKAGSDGAKIGNCPFSQRLFMVLWLKGVTFNVTTV...
2,sp|P00338|LDHA_HUMAN,MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD...
3,sp|P01308|INS_HUMAN,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...
4,sp|P02647|APOA1_HUMAN,MKAAVLTLAVLFLTGSQARHFWQQDEPPQSPWDRVKDLATVYVDVL...
5,sp|P02741|CRP_HUMAN,MEKLLCFLVLTSLSHAFGQTDMSRKAFVFPKESDTSYVSLKAPLTK...
6,sp|P02768|ALBU_HUMAN,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...
7,sp|P02786|TFR1_HUMAN,MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEE...
8,sp|P02787|TRFE_HUMAN,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...
9,sp|P02794|FRIH_HUMAN,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...


In [8]:
# Translate each sequence ID into the name of the biomarker column it belongs to.

# UniProt entry name -> biomarker name used in `blood_biomarker`.
id_to_protein = {
    'E7ETN1_HUMAN': 'Gamma Glutamyl Transferase',
    'CLIC1_HUMAN': 'Chloride',
    'LDHA_HUMAN': 'Lactate Dehydrogenase',
    'INS_HUMAN': 'Insulin',
    'APOA1_HUMAN': 'Direct HDL-Cholesterol',
    'CRP_HUMAN': 'C-Reactive Protein',
    'ALBU_HUMAN': 'Albumin',
    'TFR1_HUMAN': 'Transferrin receptor',
    'TRFE_HUMAN': 'Transferrin Saturation',
    'FRIH_HUMAN': 'Ferritin',
    'APOB_HUMAN': 'LDL-Cholesterol(NIH2)',
    'PPBT_HUMAN': 'Alkaline Phosphatase',
    'KCRM_HUMAN': 'Creatine Phosphokinase',
    'AATC_HUMAN': 'Aspartate Aminotransferase',
    'ALAT1_HUMAN': 'Alanine Aminotransferase',
    'MTP_HUMAN': 'Triglyceride',
    'HBA_HUMAN': 'Hemoglobin',
    'S4A10_HUMAN': 'Bicarbonate',
}

# The IDs are '|'-separated; the entry name is the last field.
entry_names = seqs['Sequence_ID'].str.rsplit('|', n=1).str[-1]
# Entry names without a mapping are kept unchanged by replace().
seqs['Sequence_ID'] = entry_names.replace(id_to_protein)
seqs.columns = ['Name', 'Protein_Sequence']
seqs

Unnamed: 0,Name,Protein_Sequence
0,Gamma Glutamyl Transferase,MKKKLVVLGLLAVVLVLVIVGLCLWLPSASKEPDNHVYTRAAVAAD...
1,Chloride,MAEEQPQVELFVKAGSDGAKIGNCPFSQRLFMVLWLKGVTFNVTTV...
2,Lactate Dehydrogenase,MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD...
3,Insulin,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...
4,Direct HDL-Cholesterol,MKAAVLTLAVLFLTGSQARHFWQQDEPPQSPWDRVKDLATVYVDVL...
5,C-Reactive Protein,MEKLLCFLVLTSLSHAFGQTDMSRKAFVFPKESDTSYVSLKAPLTK...
6,Albumin,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...
7,Transferrin receptor,MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEE...
8,Transferrin Saturation,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...
9,Ferritin,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...


In [9]:
# Finally, attach the protein sequences to the blood biomarker table.
# A left merge keeps every biomarker; those without a sequence get NaN.
# Any Protein_Sequence column left over from an earlier run of this cell is
# dropped first, so re-running does not create Protein_Sequence_x / _y.

blood_biomarker = (
    blood_biomarker
    .drop(columns=['Protein_Sequence'], errors='ignore')
    .merge(seqs, on='Name', how='left')
)
blood_biomarker.head()

Unnamed: 0,Name,Unit,Protein_Sequence
0,25-hydroxyvitamin D2 + D3,nmol/L,
1,Fasting Glucose,mmol/L,
2,Insulin,pmol/L,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...
3,Triglyceride,mmol/L,MILLAVLFLCFISSYSASVKGHTTGLSLNNDRLYKLTYSTEVLLDR...
4,LDL-Cholesterol(NIH2),mmol/L,MDPPRPALLALLALPALLLLLLAGARAEEEMLENVSLVCPKDATRF...
