# #7 Model Training

In [1]:
import pandas as pd
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os
import numpy as np

In [2]:
# Neo4j connection details are read from an env file so that no credentials
# are hardcoded in the notebook.

load_dotenv('../KG/db.env')

uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

# Fail early with a readable message when the URI was not loaded (env file
# missing or incomplete) instead of handing None to the driver.
if not uri:
    raise RuntimeError(
        "NEO4J_URI is not set; check that '../KG/db.env' exists and defines it."
    )

# Creating a Neo4j Driver Instance (shared by the query helpers below)
driver = GraphDatabase.driver(uri, auth=(user, password))

# Verifying Connection: best effort, the error is reported but not re-raised
try:
    driver.verify_connectivity()
except Exception as err:
    print(f"Error: {err}, type: {type(err)}")

In [3]:
# Define a helper that runs a Cypher query against the knowledge graph and returns the resulting rows

def retrieve_data_from_graph(query, parameters=None):
    """Run a Cypher query on the knowledge graph and return its rows.

    Parameters:
        query (str): Cypher query text. It may contain ``$name`` placeholders.
        parameters (dict, optional): Values for the ``$name`` placeholders in
            ``query``. Prefer this over building the query text from values,
            so that names containing quotes cannot alter the query.

    Returns:
        list[dict]: One dict per returned record, keyed by the names used in
        the query's RETURN clause. Empty when the query yields no records.
    """
    with driver.session() as session:
        # Records must be consumed while the session is still open.
        return [dict(record) for record in session.run(query, parameters)]

In [4]:
# Function to retrieve all the data for a particular disease

def _fetch_disease_rows(query, disease, description):
    """Run ``query`` with ``disease`` bound to ``$disease`` and return the rows as a DataFrame."""
    with driver.session() as session:
        # Binding the name as a query parameter keeps it out of the Cypher
        # text, so quotes or other special characters cannot change the query.
        rows = [dict(record) for record in session.run(query, {"disease": disease})]

    if not rows:
        raise ValueError(f"No {description} found for disease {disease!r}")

    return pd.DataFrame(rows)


def _label_with_unit(names, units):
    """Return 'name [unit]' where a unit is present, otherwise just the name."""
    return np.where(units.notnull(), names + ' [' + units + ']', names)


def get_data_of_disease(disease):
    """Retrieve the respondent-level data for one disease from the knowledge graph.

    Only persons whose SUFFERS_FROM status for the disease is 'Positive' or
    'Negative' are considered.

    Parameters:
        disease (str): Name of the disease, matched against ``Disease.name``.

    Returns:
        pd.DataFrame: One row per respondent with the columns Respondent_ID,
        Gender (Female -> 0, Male -> 1), Age, BMI, one column per blood
        biomarker and per blood protein (labelled 'name [unit]' when a unit
        exists), and a final column named after the disease
        (Negative -> 0, Positive -> 1). Respondents missing from any of the
        three query results are dropped by the inner merges.

    Raises:
        ValueError: If any of the three queries returns no rows.
    """
    status_query = """
    MATCH (p:Person)-[s:SUFFERS_FROM]->(d:Disease)
    WHERE d.name = $disease AND s.status IN ['Positive','Negative']
    RETURN p.respondentId AS Respondent_ID,
    d.name AS DiseaseName,
    s.status AS DiseaseStatus;
    """

    status_df = _fetch_disease_rows(status_query, disease, "disease status records")
    status_data = status_df.pivot(index=['Respondent_ID'], columns='DiseaseName', values='DiseaseStatus')
    status_data = status_data.reset_index()
    # The pivoted column is named after d.name, which equals `disease`.
    status_data[disease] = status_data[disease].map({'Negative': 0, 'Positive': 1})

    biomarker_query = """
    MATCH (p:Person)-[s:SUFFERS_FROM]->(d:Disease)
    WHERE d.name = $disease AND s.status IN ['Positive','Negative']
    MATCH (p)-[h:HAS]->(b:BloodBiomarker)
    RETURN p.respondentId AS Respondent_ID,
    p.gender AS Gender,
    p.age AS Age,
    p.bmi AS BMI,
    b.name AS BiomarkerName,
    b.unit AS BiomarkerUnit,
    h.value AS BiomarkerValue;
    """

    biomarker_df = _fetch_disease_rows(biomarker_query, disease, "blood biomarker records")
    biomarker_df['BloodBiomarker'] = _label_with_unit(biomarker_df['BiomarkerName'],
                                                      biomarker_df['BiomarkerUnit'])
    # Long -> wide: one column per biomarker, one row per respondent.
    biomarker_data = biomarker_df.pivot(index=['Respondent_ID', 'Gender', 'Age', 'BMI'],
                                        columns='BloodBiomarker',
                                        values='BiomarkerValue')
    biomarker_data = biomarker_data.reset_index()
    biomarker_data['Gender'] = biomarker_data['Gender'].map({'Female': 0, 'Male': 1})

    protein_query = """
    MATCH (p:Person)-[s:SUFFERS_FROM]->(d:Disease)
    WHERE d.name = $disease AND s.status IN ['Positive','Negative']
    MATCH (p)-[h:HAS]->(b:BloodProtein)
    RETURN p.respondentId AS Respondent_ID,
    b.name AS ProteinName,
    b.unit AS ProteinUnit,
    h.value AS ProteinValue;
    """

    protein_df = _fetch_disease_rows(protein_query, disease, "blood protein records")
    protein_df['BloodProtein'] = _label_with_unit(protein_df['ProteinName'],
                                                  protein_df['ProteinUnit'])
    # Long -> wide: one column per protein, one row per respondent.
    protein_data = protein_df.pivot(index=['Respondent_ID'],
                                    columns='BloodProtein',
                                    values='ProteinValue')
    protein_data = protein_data.reset_index()

    # Column order of the result: person attributes + biomarkers, proteins, label.
    features = biomarker_data.merge(protein_data, on='Respondent_ID', how='inner')
    return features.merge(status_data, on='Respondent_ID', how='inner')

# Diabetes

In [5]:
# Load the respondent-level feature table for Diabetes from the knowledge graph

diabetes_df = get_data_of_disease('Diabetes')

# Preview the first rows only instead of dumping the whole frame
diabetes_df.head()

Unnamed: 0,Respondent_ID,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],A/G Ratio,Alpha-carotene [umol/L],Alpha-crypotoxanthin [umol/L],Alpha-tocopherol [umol/L],Basophils [%],...,Ferritin [ng/mL],Gamma Glutamyl Transferase [IU/L],Hemoglobin [g/dL],Insulin [pmol/L],LDL-Cholesterol(NIH2) [mmol/L],Lactate Dehydrogenase [IU/L],Transferrin Saturation [%],Transferrin receptor [nmol/L],Triglyceride [mmol/L],Diabetes
0,93708,0,66,23.7,116.0,1.21875,0.192,0.127,,0.7,...,109.0,22.0,13.2,58.32,2.87,247.0,32.0,30.1,0.655,1
1,93709,0,75,38.9,72.8,1.121212,0.009,0.017,28.561,0.6,...,129.0,31.0,15.4,,,,21.0,46.5,,0
2,93711,1,56,21.3,165.0,1.290323,0.173,0.079,31.115,0.4,...,40.6,19.0,14.7,31.68,4.086,139.0,16.0,34.8,0.542,0
3,93712,1,18,19.7,59.9,1.454545,0.035,0.1,23.917,0.8,...,74.1,11.0,15.5,,,144.0,51.0,40.7,,0
4,93713,1,67,23.5,63.5,2.047619,0.028,0.028,18.344,0.6,...,238.0,26.0,14.2,,,123.0,27.0,28.0,,0


## Pre-processing

In [6]:
# Separate the label column from the attributes.
# Respondent_ID is only an identifier, so it is not kept as a feature.

y = diabetes_df['Diabetes'].copy()
X = diabetes_df.drop(columns=['Respondent_ID', 'Diabetes'])

In [7]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [8]:
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training split only, so nothing from the
# test split influences the values used to fill the gaps.
imputer = SimpleImputer(strategy="mean").fit(X_train)

# Mean of each attribute, i.e. the value that will replace its missing entries
imputer.statistics_

array([7.50000000e-01, 5.17500000e+01, 2.20500000e+01, 1.01100000e+02,
       1.50280927e+00, 1.07000000e-01, 8.35000000e-02, 2.44586667e+01,
       6.25000000e-01, 3.22500000e-01, 1.65000000e+01, 4.40250000e+00,
       9.00000000e-02, 1.82200000e+02, 2.01600000e+01, 2.46000000e+00,
       5.58000000e+00, 5.77500000e-02, 3.05666667e+00, 5.03670000e+01,
       9.02500000e-01, 2.17500000e+00, 6.35500000e+00, 3.23050000e+00,
       2.92500000e+00, 5.72500000e+00, 4.31250000e+01, 7.15350000e+00,
       1.14000000e+02, 1.99250000e+01, 1.72777306e+00, 6.36500000e-01,
       3.29500000e+01, 2.73157680e+01, 4.39583333e+00, 3.34000000e+01,
       3.05250000e+01, 9.13250000e+01, 8.75000000e+00, 7.70000000e+00,
       1.69999908e+00, 1.39211137e+02, 1.00000000e-01, 2.82500000e+02,
       3.87500000e+00, 2.35500000e+02, 1.11156644e+02, 3.73052803e+01,
       4.42500000e+00, 4.72000000e+00, 1.39500000e+01, 2.10600000e+00,
       4.15000000e-02, 1.70000000e-02, 5.67250000e+01, 1.41000000e+02,
      

In [9]:
# Fill the missing training values with the learned means and wrap the result
# back into a DataFrame carrying the original feature names. The rebuilt frame
# has a fresh 0..n-1 index; the row labels of the split are not carried over.
X_train = pd.DataFrame(imputer.transform(X_train), columns=X.columns)

# Data with null values imputed with mean
X_train

Unnamed: 0,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],A/G Ratio,Alpha-carotene [umol/L],Alpha-crypotoxanthin [umol/L],Alpha-tocopherol [umol/L],Basophils [%],Beta-cryptoxanthin [umol/L],...,Direct HDL-Cholesterol [mmol/L],Ferritin [ng/mL],Gamma Glutamyl Transferase [IU/L],Hemoglobin [g/dL],Insulin [pmol/L],LDL-Cholesterol(NIH2) [mmol/L],Lactate Dehydrogenase [IU/L],Transferrin Saturation [%],Transferrin receptor [nmol/L],Triglyceride [mmol/L]
0,1.0,67.0,23.5,63.5,2.047619,0.028,0.028,18.344,0.6,0.075,...,1.24,238.0,26.0,14.2,45.0,3.478,123.0,27.0,28.0,0.5985
1,1.0,56.0,21.3,165.0,1.290323,0.173,0.079,31.115,0.4,0.614,...,1.86,40.6,19.0,14.7,31.68,4.086,139.0,16.0,34.8,0.542
2,0.0,66.0,23.7,116.0,1.21875,0.192,0.127,24.458667,0.7,0.246,...,2.28,109.0,22.0,13.2,58.32,2.87,247.0,32.0,30.1,0.655
3,1.0,18.0,19.7,59.9,1.454545,0.035,0.1,23.917,0.8,0.355,...,1.24,74.1,11.0,15.5,45.0,3.478,144.0,51.0,40.7,0.5985


In [10]:
# Apply the means learned on the training split to the test split (no refit)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_train.columns)

In [11]:
# Training labels from the split; they still carry the original row labels of
# diabetes_df rather than a 0..n-1 index.
y_train

4    0
2    0
0    1
3    0
Name: Diabetes, dtype: int64

In [12]:
# Standardise the numeric features; Gender is a 0/1 indicator and is left unscaled

from sklearn.preprocessing import StandardScaler

# Set Gender aside and remove it from the frames that go through the scaler
X_train_gender = X_train['Gender']
X_test_gender = X_test['Gender']
X_train = X_train.drop(columns='Gender')
X_test = X_test.drop(columns='Gender')
cols = X_train.columns

# Fit on the training split only, then apply the same scaling to both splits
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=cols)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols)

# Put Gender back as the first column
X_train.insert(0, "Gender", X_train_gender)
X_test.insert(0, "Gender", X_test_gender)

In [13]:
# Training features after imputation and scaling (Gender kept as 0/1)
X_train

Unnamed: 0,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],A/G Ratio,Alpha-carotene [umol/L],Alpha-crypotoxanthin [umol/L],Alpha-tocopherol [umol/L],Basophils [%],Beta-cryptoxanthin [umol/L],...,Direct HDL-Cholesterol [mmol/L],Ferritin [ng/mL],Gamma Glutamyl Transferase [IU/L],Hemoglobin [g/dL],Insulin [pmol/L],LDL-Cholesterol(NIH2) [mmol/L],Lactate Dehydrogenase [IU/L],Transferrin Saturation [%],Transferrin receptor [nmol/L],Triglyceride [mmol/L]
0,1.0,0.764233,0.877982,-0.873214,1.671425,-1.041685,-1.529762,-1.350592,-0.169031,-1.264926,...,-0.941542,1.638986,1.181818,-0.239904,0.0,0.0,-0.82191,-0.355479,-1.106311,0.0
1,1.0,0.212983,-0.454129,1.483999,-0.651889,0.870269,-0.124035,1.470234,-1.521278,1.489802,...,0.465099,-1.000507,-0.090909,0.359856,-1.414214,1.414214,-0.495188,-1.224426,0.286821,-1.414214
2,0.0,0.71412,0.999083,0.346034,-0.871467,1.120801,1.199002,0.0,0.507093,-0.390977,...,1.417985,-0.085911,0.454545,-1.439424,1.414214,-1.414214,1.710186,0.039498,-0.676079,1.414214
3,1.0,-1.691336,-1.422936,-0.95682,-0.148069,-0.949384,0.454794,-0.119642,1.183216,0.166101,...,-0.941542,-0.552568,-1.545455,1.319472,0.0,0.0,-0.393087,1.540407,1.495569,0.0


In [14]:
# Training labels; the imputation and scaling steps above only touched the features
y_train

4    0
2    0
0    1
3    0
Name: Diabetes, dtype: int64

### Function for Pre-processing

In [15]:
# We can define a function to do all the preprocessing and the splitting of data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd

def preprocess_and_split(df, label, impute=True, test_size=0.2, random_state=42,
                         id_col='Respondent_ID', categorical_cols=('Gender',)):
    """Split ``df`` into train/test sets and preprocess the feature columns.

    Numeric columns are (optionally) mean-imputed and then standardised;
    categorical columns are (optionally) imputed with the most frequent value
    and otherwise left as they are. All transformers are fitted on the
    training split only and then applied to the test split.

    Parameters:
        df (DataFrame): The input dataframe.
        label (str): The column name for the target variable.
        impute (bool): Whether to impute missing values.
        test_size (float): The proportion of the dataset to include in the test split.
        random_state (int): Random seed for reproducibility.
        id_col (str or None): Identifier column removed from the features.
            Pass None when ``df`` has no such column.
        categorical_cols (sequence of str): Feature columns treated as
            categorical (not scaled). They come first in the output.

    Returns:
        X_train (DataFrame): The training feature matrix.
        X_test (DataFrame): The testing feature matrix.
        y_train (Series): The training target values.
        y_test (Series): The testing target values.

        The feature frames keep the row index of their split, so each one
        stays aligned with its target Series.
    """
    categorical_cols = list(categorical_cols)
    drop_cols = [label] if id_col is None else [id_col, label]

    X = df.drop(columns=drop_cols)
    y = df[label].copy()

    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Everything that is not declared categorical is treated as numeric
    numeric_cols = [col for col in X.columns if col not in categorical_cols]

    # Preprocessing for numerical data: optional mean imputation, then scaling
    numeric_steps = [('scaler', StandardScaler())]
    if impute:
        numeric_steps.insert(0, ('imputer', SimpleImputer(strategy='mean')))
    numeric_transformer = Pipeline(steps=numeric_steps)

    # Preprocessing for categorical data: optional most-frequent imputation
    if impute:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent'))])
    else:
        categorical_transformer = 'passthrough'

    # Bundle preprocessing; the output columns follow this order (cat, then num)
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            ('num', numeric_transformer, numeric_cols)])

    # Remember the row labels: the transformers return plain arrays, and
    # without these the features would no longer line up with y by index.
    train_index, test_index = X_train.index, X_test.index
    feature_cols = categorical_cols + numeric_cols

    # NOTE(review): SimpleImputer by default drops features that are entirely
    # missing in the training split, which would make the column labels below
    # mismatch the array width. Confirm no such columns exist in the data.
    X_train = pd.DataFrame(preprocessor.fit_transform(X_train),
                           columns=feature_cols, index=train_index)
    X_test = pd.DataFrame(preprocessor.transform(X_test),
                          columns=feature_cols, index=test_index)

    return X_train, X_test, y_train, y_test

In [16]:
# Attributes and labels rebuilt with the helper; this replaces the
# X_train / X_test / y_train / y_test produced step by step above.

X_train, X_test, y_train, y_test = preprocess_and_split(
    df=diabetes_df,
    label='Diabetes',
    impute=True,
)

In [17]:
# Training features returned by preprocess_and_split; the values shown match
# the frame produced by the manual imputation and scaling cells earlier.
X_train

Unnamed: 0,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],A/G Ratio,Alpha-carotene [umol/L],Alpha-crypotoxanthin [umol/L],Alpha-tocopherol [umol/L],Basophils [%],Beta-cryptoxanthin [umol/L],...,Direct HDL-Cholesterol [mmol/L],Ferritin [ng/mL],Gamma Glutamyl Transferase [IU/L],Hemoglobin [g/dL],Insulin [pmol/L],LDL-Cholesterol(NIH2) [mmol/L],Lactate Dehydrogenase [IU/L],Transferrin Saturation [%],Transferrin receptor [nmol/L],Triglyceride [mmol/L]
0,1.0,0.764233,0.877982,-0.873214,1.671425,-1.041685,-1.529762,-1.350592,-0.169031,-1.264926,...,-0.941542,1.638986,1.181818,-0.239904,0.0,0.0,-0.82191,-0.355479,-1.106311,0.0
1,1.0,0.212983,-0.454129,1.483999,-0.651889,0.870269,-0.124035,1.470234,-1.521278,1.489802,...,0.465099,-1.000507,-0.090909,0.359856,-1.414214,1.414214,-0.495188,-1.224426,0.286821,-1.414214
2,0.0,0.71412,0.999083,0.346034,-0.871467,1.120801,1.199002,0.0,0.507093,-0.390977,...,1.417985,-0.085911,0.454545,-1.439424,1.414214,-1.414214,1.710186,0.039498,-0.676079,1.414214
3,1.0,-1.691336,-1.422936,-0.95682,-0.148069,-0.949384,0.454794,-0.119642,1.183216,0.166101,...,-0.941542,-0.552568,-1.545455,1.319472,0.0,0.0,-0.393087,1.540407,1.495569,0.0


In [18]:
# Training labels returned by preprocess_and_split; they keep the original row
# labels of diabetes_df.
y_train

4    0
2    0
0    1
3    0
Name: Diabetes, dtype: int64