In [1]:
import pandas as pd
import requests
import os

In [2]:
# Load the source dataset that every node table below is derived from.
dataset_path = '../Dataset/final_df.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Respondent ID,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],Fasting Glucose [mmol/L],Insulin [pmol/L],Triglyceride [mmol/L],LDL-Cholesterol(NIH2) [mmol/L],Vitamin C [umol/L],...,Non_HDL Ratio,LDL_HDL Ratio,Total Cholestrol_HDL Ratio,Triglyceride_HDL Ratio,Lymphocyte_Monocyte Ratio,Neutrophil_Lymphocyte Ratio,Platelet_Lymphocyte Ratio,Platelet_WBC Ratio,Lymphocyte_C-Reactive Protein Ratio,Systemic Immune‐inflammation Index (SII)
0,93703.0,0.0,2.0,17.5,66.912952,6.212328,85.453019,1.219948,2.820133,54.172769,...,126.497004,2.042228,3.368882,0.883438,4.166667,1.6,103.700053,35.11398,7.268033,414.800213
1,93704.0,1.0,2.0,15.7,74.4,6.212328,85.453019,1.219948,2.820133,54.172769,...,126.497004,2.042228,3.368882,0.883438,5.833333,0.914286,68.285714,32.297297,120.689655,218.514286
2,93705.0,0.0,66.0,31.7,89.9,6.212328,85.453019,1.219948,2.820133,73.8,...,97.061098,1.819441,2.619355,0.787064,5.666667,1.235294,90.882353,35.930233,12.5,381.705882
3,93706.0,1.0,18.0,21.5,53.8,6.212328,85.453019,1.219948,2.820133,63.6,...,100.928074,2.311585,3.139344,0.999958,2.5,2.466667,155.333333,38.196721,20.27027,574.733333
4,93707.0,1.0,13.0,18.1,58.2,6.212328,85.453019,1.219948,2.820133,27.4,...,121.03635,1.602348,2.778409,0.693153,6.0,1.452381,82.857143,31.071429,131.25,505.428571


In [3]:
# We now convert this dataset into a knowledge graph.
# A knowledge graph is a knowledge base that uses a graph-structured data model (topology) to integrate data.
# Knowledge graphs offer several advantages.
# The main advantage in this specific case is that knowledge graphs are more flexible:
# they allow us to modify data more easily than a relational model does.
# This is especially useful since medical data is not static and changes will have to be applied as they occur.
# It also makes it easier to add more components, or even new diseases, in the future.

# To build the knowledge graph (using Neo4j), we create one CSV file per node type and one per relationship type.

### Person Node

In [4]:
# Person node table: one row per respondent with demographic properties.
# Gender codes are turned into labels (0 -> 'Female', 1 -> 'Male') and Age is cast to int.

person = (
    df.loc[:, ['Respondent ID', 'Gender', 'Age', 'BMI']]
    .assign(
        Gender=lambda frame: frame['Gender'].map({0: 'Female', 1: 'Male'}),
        Age=lambda frame: frame['Age'].astype(int),
    )
)
person

Unnamed: 0,Respondent ID,Gender,Age,BMI
0,93703.0,Female,2,17.5
1,93704.0,Male,2,15.7
2,93705.0,Female,66,31.7
3,93706.0,Male,18,21.5
4,93707.0,Male,13,18.1
...,...,...,...,...
9249,102952.0,Female,70,20.0
9250,102953.0,Male,42,35.8
9251,102954.0,Female,41,26.1
9252,102955.0,Female,14,45.6


### Blood Biomarker Node

In [13]:
# Biomarker names come from two positional column ranges of `df` (4:79 and 97:108).
# 'Unit' starts out empty and is filled in by the next cell.
biomarker_columns = [*df.columns[4:79], *df.columns[97:108]]
blood_biomarker = pd.DataFrame({'Name': biomarker_columns, 'Unit': ''})

In [14]:
# Move the unit into its own column so that it becomes a node property.
# Column labels look like "Fasting Glucose [mmol/L]": everything before the first '['
# is the biomarker name, and the text between '[' and the following ']' is the unit.
# Labels without a bracketed unit (e.g. the ratio columns) are left untouched and keep
# an empty 'Unit'.
# This is done with vectorized string operations instead of a row-by-row loop.
name_and_unit = blood_biomarker['Name'].str.extract(r'^(?P<label>[^\[]*)\[(?P<unit>[^\]]*)\]')
has_unit = name_and_unit['unit'].notna()

blood_biomarker.loc[has_unit, 'Unit'] = name_and_unit.loc[has_unit, 'unit']
blood_biomarker.loc[has_unit, 'Name'] = name_and_unit.loc[has_unit, 'label'].str.strip()

blood_biomarker.head(10)

Unnamed: 0,Name,Unit
0,25-hydroxyvitamin D2 + D3,nmol/L
1,Fasting Glucose,mmol/L
2,Insulin,pmol/L
3,Triglyceride,mmol/L
4,LDL-Cholesterol(NIH2),mmol/L
5,Vitamin C,umol/L
6,Alanine Aminotransferase,U/L
7,Albumin,g/dL
8,Alkaline Phosphatase,IU/L
9,Aspartate Aminotransferase,U/L


In [7]:
# Some biomarkers are proteins. Attaching their protein sequence to the node provides
# useful insights and could even aid potential drug discovery.
# 18 biomarkers are associated with a UniProt entry (one accession each, listed in the
# URL below); the remaining biomarkers get no sequence.

# UniProt REST endpoint returning the 18 sequences as a single FASTA document.
url = "https://rest.uniprot.org/uniprotkb/accessions?accessions=E7ETN1%2CO00299%2CP00338%2CP01308%2CP02647%2CP02741%2CP02768%2CP02786%2CP02787%2CP02794%2CP04114%2CP05186%2CP06732%2CP17174%2CP24298%2CP55157%2CP69905%2CQ6U841&format=fasta"

fasta_download_dir = "../KG"
# Build the output path from the directory so the two cannot drift apart.
fasta_path = os.path.join(fasta_download_dir, "protein_sequences.fasta")
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(fasta_download_dir, exist_ok=True)

# A timeout keeps the cell from hanging forever if UniProt does not respond.
response = requests.get(url, timeout=60)

# Save the sequences as FASTA content to a file
if response.status_code == 200:
    with open(fasta_path, "wb") as fasta_handle:
        fasta_handle.write(response.content)
    print('Downloaded Protein Sequences as .fasta file')
else:
    print("Failed to retrieve protein sequences.")

Downloaded Protein Sequences as .fasta file


In [15]:
# Parse the downloaded FASTA file with Biopython's SeqIO and collect, per record,
# its ID, gene name, sequence string and sequence length.

from Bio import SeqIO

fasta_file = '../KG/protein_sequences.fasta'


def _gene_name_from_description(description):
    """Return the text after 'GN=' in the first such token of a FASTA description, or ''."""
    return next(
        (token[3:] for token in description.split() if token.startswith('GN=')),
        '',
    )


# Read every record once, then derive each column from the parsed records.
fasta_records = list(SeqIO.parse(fasta_file, 'fasta'))

seq_ids = [entry.id for entry in fasta_records]
seqs = [str(entry.seq) for entry in fasta_records]
seq_lengths = [len(entry) for entry in fasta_records]
gene_names = [_gene_name_from_description(entry.description) for entry in fasta_records]

sequences = pd.DataFrame({
    'Sequence_ID': seq_ids,
    'Gene_Name': gene_names,
    'Sequence': seqs,
    'Sequence_Length': seq_lengths,
})
sequences

Unnamed: 0,Sequence_ID,Gene_Name,Sequence,Sequence_Length
0,tr|E7ETN1|E7ETN1_HUMAN,GGT1,MKKKLVVLGLLAVVLVLVIVGLCLWLPSASKEPDNHVYTRAAVAAD...,97
1,sp|O00299|CLIC1_HUMAN,CLIC1,MAEEQPQVELFVKAGSDGAKIGNCPFSQRLFMVLWLKGVTFNVTTV...,241
2,sp|P00338|LDHA_HUMAN,LDHA,MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD...,332
3,sp|P01308|INS_HUMAN,INS,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,110
4,sp|P02647|APOA1_HUMAN,APOA1,MKAAVLTLAVLFLTGSQARHFWQQDEPPQSPWDRVKDLATVYVDVL...,267
5,sp|P02741|CRP_HUMAN,CRP,MEKLLCFLVLTSLSHAFGQTDMSRKAFVFPKESDTSYVSLKAPLTK...,224
6,sp|P02768|ALBU_HUMAN,ALB,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...,609
7,sp|P02786|TFR1_HUMAN,TFRC,MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEE...,760
8,sp|P02787|TRFE_HUMAN,TF,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...,698
9,sp|P02794|FRIH_HUMAN,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,183


In [16]:
# Map each UniProt entry name to the biomarker column it belongs to, so the sequence
# table can be joined to the biomarker table on 'Name'.
# Keys are the last '|'-separated field of the FASTA record ID
# (e.g. 'sp|P01308|INS_HUMAN' -> 'INS_HUMAN'); the trailing comments give the protein.

id_to_protein = {
    'E7ETN1_HUMAN': 'Gamma Glutamyl Transferase', # Gamma-glutamyltransferase 1
    'CLIC1_HUMAN': 'Chloride',                    # Chloride intracellular channel protein 1
    'LDHA_HUMAN': 'Lactate Dehydrogenase',        # L-lactate dehydrogenase A chain
    'INS_HUMAN': 'Insulin',                       # Insulin
    'APOA1_HUMAN': 'Direct HDL-Cholesterol',      # Apolipoprotein A-I
    'CRP_HUMAN': 'C-Reactive Protein',            # C-reactive protein
    'ALBU_HUMAN': 'Albumin',                      # Albumin
    'TFR1_HUMAN': 'Transferrin receptor',         # Transferrin receptor protein 1
    'TRFE_HUMAN': 'Transferrin Saturation',       # Serotransferrin
    'FRIH_HUMAN': 'Ferritin',                     # Ferritin heavy chain
    'APOB_HUMAN': 'LDL-Cholesterol(NIH2)',        # Apolipoprotein B-100
    'PPBT_HUMAN': 'Alkaline Phosphatase',         # Alkaline phosphatase
    'KCRM_HUMAN': 'Creatine Phosphokinase',       # Creatine kinase M-type
    'AATC_HUMAN': 'Aspartate Aminotransferase',   # Aspartate aminotransferase
    'ALAT1_HUMAN': 'Alanine Aminotransferase',    # Alanine aminotransferase 1
    'MTP_HUMAN': 'Triglyceride',                  # Microsomal triglyceride transfer protein large subunit
    'HBA_HUMAN': 'Hemoglobin',                    # Hemoglobin subunit alpha
    'S4A10_HUMAN': 'Bicarbonate',}                # Sodium-driven chloride bicarbonate exchanger

# Guarded so the cell can be re-run: after the first run 'Sequence_ID' has already been
# renamed to 'Name', and touching it again would raise a KeyError.
if 'Sequence_ID' in sequences.columns:
    entry_names = sequences['Sequence_ID'].str.split('|').str[-1]
    sequences = (
        sequences
        # Entry names missing from the mapping are kept unchanged by `replace`.
        .assign(Sequence_ID=entry_names.replace(id_to_protein))
        # Rename by column name rather than by position, so a change in column order
        # cannot silently mislabel the data.
        .rename(columns={
            'Sequence_ID': 'Name',
            'Gene_Name': 'Protein_Gene_Name',
            'Sequence': 'Protein_Sequence',
            'Sequence_Length': 'Protein_Sequence_Length',
        })
    )
sequences

Unnamed: 0,Name,Protein_Gene_Name,Protein_Sequence,Protein_Sequence_Length
0,Gamma Glutamyl Transferase,GGT1,MKKKLVVLGLLAVVLVLVIVGLCLWLPSASKEPDNHVYTRAAVAAD...,97
1,Chloride,CLIC1,MAEEQPQVELFVKAGSDGAKIGNCPFSQRLFMVLWLKGVTFNVTTV...,241
2,Lactate Dehydrogenase,LDHA,MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD...,332
3,Insulin,INS,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,110
4,Direct HDL-Cholesterol,APOA1,MKAAVLTLAVLFLTGSQARHFWQQDEPPQSPWDRVKDLATVYVDVL...,267
5,C-Reactive Protein,CRP,MEKLLCFLVLTSLSHAFGQTDMSRKAFVFPKESDTSYVSLKAPLTK...,224
6,Albumin,ALB,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...,609
7,Transferrin receptor,TFRC,MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEE...,760
8,Transferrin Saturation,TF,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...,698
9,Ferritin,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,183


In [17]:
# Finally we merge the protein sequences into the blood biomarker table.
# A left join keeps every biomarker; those without a sequence get NaN in the protein
# columns. `validate` makes the merge fail loudly if two sequences map to the same
# biomarker name, instead of silently duplicating biomarker rows.

blood_biomarker = blood_biomarker.merge(sequences, on='Name', how='left', validate='many_to_one')

# Adding a column to indicate the type of blood biomarker.
# Labels must be spelled identically to end up in the same category in the graph.
biomarker_type_mapping = {
    '25-hydroxyvitamin D2 + D3': 'Vitamin',
    'Fasting Glucose': 'Blood Sugar',
    'Insulin': 'Hormone (Protein)',
    'Triglyceride': 'Lipoprotein',
    'LDL-Cholesterol(NIH2)': 'Lipoprotein',
    'Vitamin C': 'Vitamin',
    'Alanine Aminotransferase': 'Enzyme (Protein)',
    'Albumin': 'Enzyme (Protein)',
    'Alkaline Phosphatase': 'Enzyme (Protein)',
    'Aspartate Aminotransferase': 'Enzyme (Protein)',
    'Bicarbonate': 'Metabolite',
    # Same spelling as 'Uric Acid' below; previously 'Waste byproduct', which created
    # a second, separate category.
    'Blood Urea Nitrogen': 'Waste Byproduct',
    'Creatinine': 'Chemical compound',
    'Chloride': 'Electrolytes',
    'Potassium': 'Electrolytes',
    'Sodium': 'Electrolytes',
    'Total Bilirubin': 'Pigment',
    'Total Calcium': 'Electrolytes',
    'Total Protein': 'Proteins',
    'Uric Acid': 'Waste Byproduct',
    'Ferritin': 'Protein',
    'Chromium': 'Mineral',
    'Cobalt': 'Mineral',
    'Cotinine': 'Metabolite',
    'Hydroxycotinine': 'Metabolite',
    'Total Cholesterol': 'Cholesterol',
    'Direct HDL-Cholesterol': 'Lipoprotein',
    'White blood cell count': 'Blood Cells',
    'Lymphocyte': 'Blood Cells',
    'Monocyte': 'Blood Cells',
    'Segmented neutrophils': 'Blood Cells',
    'Eosinophils': 'Blood Cells',
    'Basophils': 'Blood Cells',
    'RBC count': 'Blood Cells',
    'Hemoglobin': 'Blood Cells',
    'Hematocrit': 'Blood Cells',
    'Mean cell volume': 'Blood Cells',
    'Mean cell hemoglobin': 'Blood Cells',
    'Mean Cell Hgb Conc.': 'Blood Cells',
    'Red cell distribution width': 'Blood Cells',
    'Platelet count': 'Blood Cells',
    'Mean platelet volume': 'Blood Cells',
    'Nucleated RBCs': 'Blood Cells',
    'Iron': 'Iron Levels',
    'Iron frozen': 'Iron Levels',
    'UIBC': 'Iron Levels',
    'Total Iron Binding Capacity': 'Iron Levels',
    'Transferrin Saturation': 'Iron Levels',
    'C-Reactive Protein': 'Protein',
    'Alpha-carotene': 'Carotenoids',
    'Alpha-crypotoxanthin': 'Carotenoids',
    'Trans-beta-carotene': 'Carotenoids',
    'Cis-beta-carotene': 'Carotenoids',
    'Beta-cryptoxanthin': 'Carotenoids',
    'Gamma-tocopherol': 'Tocopherols',
    'Alpha-tocopherol': 'Tocopherols',
    'Lutein and zeaxanthin': 'Carotenoids and Retinoids',
    'Trans-lycopene': 'Carotenoids and Retinoids',
    'Retinyl palmitate': 'Carotenoids and Retinoids',
    'Retinyl stearate': 'Carotenoids and Retinoids',
    'Total Lycopene': 'Carotenoids and Retinoids',
    'Retinol': 'Carotenoids and Retinoids',
    'Blood lead': 'Heavy Metals',
    'Blood cadmium': 'Heavy Metals',
    'Blood mercury': 'Heavy Metals',
    'Blood selenium': 'Heavy Metals',
    'Blood manganese': 'Heavy Metals',
    # Transferrin receptor is a protein (it carries a protein sequence in this table),
    # not a heavy metal; the earlier 'Heavy Metals' label looked like a copy-paste
    # from the lines above.
    'Transferrin receptor': 'Protein',
    'Glycohemoglobin': 'Blood Sugar',
    'Creatine Phosphokinase': 'Enzyme (Protein)',
    'Globulin': 'Proteins',
    'Gamma Glutamyl Transferase': 'Enzyme (Protein)',
    'Lactate Dehydrogenase': 'Enzyme (Protein)',
    'Osmolality': 'Electrolyte Concentration',
    'Phosphorus': 'Mineral',
    'A/G Ratio': 'Composite Biomarkers',
    'Non_HDL Ratio': 'Composite Biomarkers',
    'LDL_HDL Ratio': 'Composite Biomarkers',
    'Total Cholestrol_HDL Ratio': 'Composite Biomarkers',
    'Triglyceride_HDL Ratio': 'Composite Biomarkers',
    'Lymphocyte_Monocyte Ratio': 'Composite Biomarkers',
    'Neutrophil_Lymphocyte Ratio': 'Composite Biomarkers',
    'Platelet_Lymphocyte Ratio': 'Composite Biomarkers',
    'Platelet_WBC Ratio': 'Composite Biomarkers',
    'Lymphocyte_C-Reactive Protein Ratio': 'Composite Biomarkers',
    'Systemic Immune‐inflammation Index (SII)': 'Composite Biomarkers'}

# Place 'Type' right after 'Name' and 'Unit'; names missing from the mapping get NaN.
blood_biomarker.insert(loc=2, column='Type', value=blood_biomarker['Name'].map(biomarker_type_mapping))

# Blood Biomarker Node table
blood_biomarker.head()

Unnamed: 0,Name,Unit,Type,Protein_Gene_Name,Protein_Sequence,Protein_Sequence_Length
0,25-hydroxyvitamin D2 + D3,nmol/L,Vitamin,,,
1,Fasting Glucose,mmol/L,Blood Sugar,,,
2,Insulin,pmol/L,Hormone (Protein),INS,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,110.0
3,Triglyceride,mmol/L,Lipoprotein,MTTP,MILLAVLFLCFISSYSASVKGHTTGLSLNNDRLYKLTYSTEVLLDR...,894.0
4,LDL-Cholesterol(NIH2),mmol/L,Lipoprotein,APOB,MDPPRPALLALLALPALLLLLLAGARAEEEMLENVSLVCPKDATRF...,4563.0


### Disease Node

In [18]:
# Disease indicators occupy the positional column range 81:97 of `df`.
# Work on a copy so the source frame stays untouched.
disease_df = df.iloc[:, 81:97].copy()
disease_df.head()

Unnamed: 0,Hepatitis C,Congestive_Heart_Failure,Coronary_Heart_Disease,Stroke,Thyroid_Problem,Liver_Condition,Jaundice,Cancer,Cancer_Type,Anemia,Heart_Attack,Weak/Failing kidneys,Gestational diabetes,Diabetes,Hepatitis A antibody,Hepatitis B core antibody
0,,,,,,,,,,2.0,,,,2.0,,
1,,,,,,,,,,2.0,,,,2.0,1.0,
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,1.0
3,2.0,,,,,,2.0,,,2.0,,,,2.0,2.0,2.0
4,2.0,,,,,,2.0,,,2.0,,,,2.0,2.0,2.0


In [19]:
# `Cancer_Type` is a single multiclass column holding a numeric code per cancer type.
# Expand it into one 0/1 indicator column per cancer type, then drop the coded column.

cancer_mapping = {
    10: 'Bladder Cancer',
    11: 'Blood Cancer',
    12: 'Bone Cancer',
    13: 'Brain Cancer',
    14: 'Breast Cancer',
    15: 'Cervical Cancer',
    16: 'Colon Cancer',
    17: 'Esophageal Cancer',
    18: 'Gallbladder Cancer',
    19: 'Kidney Cancer',
    20: 'Larynx/Windpipe Cancer',
    21: 'Leukemia',
    22: 'Liver Cancer',
    23: 'Lung Cancer',
    24: 'Lymphoma/Hodgkin Disease',
    25: 'Melanoma',
    26: 'Mouth/Tongue/Lip Cancer',
    27: 'Nervous System Cancer',
    28: 'Ovary/Ovarian Cancer',
    29: 'Pancreas/Pancreatic Cancer',
    30: 'Prostate Cancer',
    31: 'Rectum/Rectal Cancer',
    32: 'Skin (Non-Melanoma) Cancer',
    33: 'Skin (Unknown Type) Cancer',
    34: 'Soft Tissue (Muscle or Fat) Cancer',
    35: 'Stomach Cancer',
    36: 'Testis/Testicular Cancer',
    37: 'Thyroid Cancer',
    38: 'Uterus/Uterine Cancer',
}

# One indicator column per code: 1 where `Cancer_Type` equals that code, 0 everywhere
# else (including rows where `Cancer_Type` is missing).
cancer_flags = pd.DataFrame({
    cancer_name: disease_df['Cancer_Type'].eq(cancer_code).astype('int64')
    for cancer_code, cancer_name in cancer_mapping.items()
})

# Append the indicators after the existing disease columns, without the coded column.
disease_df = pd.concat([disease_df.drop(columns=['Cancer_Type']), cancer_flags], axis=1)
disease_df

Unnamed: 0,Hepatitis C,Congestive_Heart_Failure,Coronary_Heart_Disease,Stroke,Thyroid_Problem,Liver_Condition,Jaundice,Cancer,Anemia,Heart_Attack,...,Pancreas/Pancreatic Cancer,Prostate Cancer,Rectum/Rectal Cancer,Skin (Non-Melanoma) Cancer,Skin (Unknown Type) Cancer,Soft Tissue (Muscle or Fat) Cancer,Stomach Cancer,Testis/Testicular Cancer,Thyroid Cancer,Uterus/Uterine Cancer
0,,,,,,,,,2.0,,...,0,0,0,0,0,0,0,0,0,0
1,,,,,,,,,2.0,,...,0,0,0,0,0,0,0,0,0,0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,,,,,,2.0,,2.0,,...,0,0,0,0,0,0,0,0,0,0
4,2.0,,,,,,2.0,,2.0,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9249,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
9250,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
9251,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
9252,2.0,,,,,,2.0,,2.0,,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Recode every 2 (which stands for 'Negative') as 0 across all disease columns;
# missing values are left as they are.

disease_df = disease_df.replace(2, 0)

disease_df.head()

Unnamed: 0,Hepatitis C,Congestive_Heart_Failure,Coronary_Heart_Disease,Stroke,Thyroid_Problem,Liver_Condition,Jaundice,Cancer,Anemia,Heart_Attack,...,Pancreas/Pancreatic Cancer,Prostate Cancer,Rectum/Rectal Cancer,Skin (Non-Melanoma) Cancer,Skin (Unknown Type) Cancer,Soft Tissue (Muscle or Fat) Cancer,Stomach Cancer,Testis/Testicular Cancer,Thyroid Cancer,Uterus/Uterine Cancer
0,,,,,,,,,0.0,,...,0,0,0,0,0,0,0,0,0,0
1,,,,,,,,,0.0,,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,,,,,,0.0,,0.0,,...,0,0,0,0,0,0,0,0,0,0
4,0.0,,,,,,0.0,,0.0,,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Disease node table: one row per disease, named after the columns of `disease_df`.

disease = pd.DataFrame({'Name': disease_df.columns.tolist()})
disease.head()

Unnamed: 0,Name
0,Hepatitis C
1,Congestive_Heart_Failure
2,Coronary_Heart_Disease
3,Stroke
4,Thyroid_Problem


In [29]:
# Add a 'Type' column that groups each disease under a broader category.
# The categories are listed once, each with its member diseases, and then inverted
# into the name -> type lookup used for the mapping.
diseases_by_type = {
    'Liver Disease': [
        'Hepatitis C',
        'Liver_Condition',
        'Jaundice',
        'Hepatitis A antibody',
        'Hepatitis B core antibody',
    ],
    'Cardiovascular Disease': [
        'Congestive_Heart_Failure',
        'Coronary_Heart_Disease',
        'Stroke',
        'Heart_Attack',
    ],
    'Endocrine Disorder': ['Thyroid_Problem'],
    'Blood Disorder': ['Anemia'],
    'Kidney Disease': ['Weak/Failing kidneys'],
    'Diabetes': ['Gestational diabetes', 'Diabetes'],
    'Cancer': [
        'Cancer',
        'Bladder Cancer',
        'Blood Cancer',
        'Bone Cancer',
        'Brain Cancer',
        'Breast Cancer',
        'Cervical Cancer',
        'Colon Cancer',
        'Esophageal Cancer',
        'Gallbladder Cancer',
        'Kidney Cancer',
        'Larynx/Windpipe Cancer',
        'Leukemia',
        'Liver Cancer',
        'Lung Cancer',
        'Lymphoma/Hodgkin Disease',
        'Melanoma',
        'Mouth/Tongue/Lip Cancer',
        'Nervous System Cancer',
        'Ovary/Ovarian Cancer',
        'Pancreas/Pancreatic Cancer',
        'Prostate Cancer',
        'Rectum/Rectal Cancer',
        'Skin (Non-Melanoma) Cancer',
        'Skin (Unknown Type) Cancer',
        'Soft Tissue (Muscle or Fat) Cancer',
        'Stomach Cancer',
        'Testis/Testicular Cancer',
        'Thyroid Cancer',
        'Uterus/Uterine Cancer',
    ],
}

disease_type_mapping = {
    disease_name: disease_type
    for disease_type, disease_names in diseases_by_type.items()
    for disease_name in disease_names
}

# Look up the type of every row of the `disease` node table by its 'Name';
# names missing from the mapping get NaN.
disease = disease.assign(Type=disease['Name'].map(disease_type_mapping))
disease

Unnamed: 0,Name,Type
0,Hepatitis C,Liver Disease
1,Congestive_Heart_Failure,Cardiovascular Disease
2,Coronary_Heart_Disease,Cardiovascular Disease
3,Stroke,Cardiovascular Disease
4,Thyroid_Problem,Endocrine Disorder
5,Liver_Condition,Liver Disease
6,Jaundice,Liver Disease
7,Cancer,Cancer
8,Anemia,Blood Disorder
9,Heart_Attack,Cardiovascular Disease
