In [1]:
import pandas as pd
import numpy as np   
from sklearn.model_selection import train_test_split
import math

In [2]:
def dataset_splitting(dataset, testset_size, rnd, class_stratification, output_dir="data"):
    """
    Function for dataset splitting into training and test set.
    The two splits are written to '<output_dir>/trainingset_experiment.xlsx' and
    '<output_dir>/testset_experiment.xlsx' (without the index) and also returned.
    Input:
       - dataset:              (pandas DataFrame) dataset to split
       - testset_size:         (float) proportion of samples to keep for the test set
       - rnd:                  (int) seed for reproducible output
       - class_stratification: (pandas Series) dataset's column indicating the variable (phenotypes) to obtain a stratified splitting
       - output_dir:           (str) directory where the two Excel files are written; created if missing (default: "data")
    Output:
       - (training_set, test_set): (tuple of pandas DataFrames) the two splits.
         In a notebook, assign the result (or end the call with ';') to avoid echoing it.
    """
    # Local import: the function stays self-contained without touching the notebook's import cell.
    import os

    training_set, test_set = train_test_split(
        dataset,
        shuffle=True,
        test_size=testset_size,
        random_state=rnd,
        stratify=class_stratification,
    )

    # Make sure the destination exists so a fresh checkout does not fail on the first write.
    os.makedirs(output_dir, exist_ok=True)
    training_set.to_excel(os.path.join(output_dir, "trainingset_experiment.xlsx"), index=False)
    test_set.to_excel(os.path.join(output_dir, "testset_experiment.xlsx"), index=False)

    return training_set, test_set




def compute_CUORE_10yCV_riskscore(dataset):
    """
    Function for the computation of the CUORE 10 Year sex-specific Cardiovascular (CV) risk score to develop major cardiovascular events.
    For every row the score is 1 - s ** exp(lp - g), where lp is the sum of the sex-specific
    coefficients applied to age, systolic blood pressure, total and HDL cholesterol, plus the
    coefficients for smoking, diabetes and antihypertensive treatment when the flag equals 1.
    Rows with Sex == 0 use the male parameters; any other value uses the female parameters.
    Ex-smokers (Smoking_habits == 2) are scored as smokers.
    The input dataframe is NOT modified.
    Coefficients from:  L. Palmieri, R. Rielli, L. Dematte, C Donfrancesco, P. Ciccarelli, P. De Sanctis Caiola, F. Dima, C. Lo Noce, A. Cuffari O. Brignoli, and S. Giampaoli.
    Cuore project: implementation of the 10-year risk score. European Journal of Cardiovascular Prevention & Rehabilitation, 18 (4):642–649, 2011. doi: 10.1177/1741826710389925.
    Input:
       - dataset: (pandas DataFrame) patients dataset for which the CV score is computed. Required columns:
                  'PATIENT_ID', 'Sex', 'Age', 'Systolic_blood_pressure', 'Total_cholesterol',
                  'HDL_cholesterol', 'Diabetes_mellitus', 'Smoking_habits', 'Antihypertensives'
    Output:
       - risk_df: (pandas DataFrame) new dataframe with columns 'PATIENT_ID' and 'CUORE_10yCV_riskscore',
                  one row per input row, in the input order
    """

    # Sex-specific model parameters. 's' is the base of the power and 'g' the constant
    # subtracted from the linear predictor (presumably baseline survival and centering
    # term of the published model -- confirm against the paper).
    male = {"s": 0.953, "g": 6.583, "age": 0.076, "sist": 0.013, "col_tot": 0.006,
            "col_hdl": -0.013, "diab": 0.462, "smok": 0.508, "anti": 0.49}
    female = {"s": 0.989, "g": 6.016, "age": 0.079, "sist": 0.016, "col_tot": 0.003,
              "col_hdl": -0.015, "diab": 0.339, "smok": 0.773, "anti": 0.59}

    # the CUORE model use Smoker/non Smoker, we include the ex smokers in the smokers.
    # replace() returns a new Series, so the caller's 'Smoking_habits' column keeps its original coding.
    smoking = dataset["Smoking_habits"].replace(to_replace=2, value=1)

    # Iterate column-wise instead of iterrows(): no per-row Series is built and
    # the PATIENT_ID values keep their original dtype.
    columns = (
        dataset["Sex"],
        dataset["Age"],
        dataset["Systolic_blood_pressure"],
        dataset["Total_cholesterol"],
        dataset["HDL_cholesterol"],
        dataset["Diabetes_mellitus"],
        smoking,
        dataset["Antihypertensives"],
    )

    risk_list = []
    for sex, age, sist, col_tot, col_hdl, diab, smok, anti in zip(*columns):
        coef = male if sex == 0 else female

        # Same summation order as the reference formula, so results are bit-for-bit stable.
        linear_predictor = (
            age * coef["age"]
            + sist * coef["sist"]
            + col_tot * coef["col_tot"]
            + col_hdl * coef["col_hdl"]
            + (coef["smok"] if smok == 1 else 0)
            + (coef["diab"] if diab == 1 else 0)
            + (coef["anti"] if anti == 1 else 0)
            - coef["g"]
        )
        risk_list.append(1 - math.pow(coef["s"], math.exp(linear_predictor)))

    risk_df = pd.DataFrame({
        "PATIENT_ID": dataset["PATIENT_ID"].tolist(),
        "CUORE_10yCV_riskscore": risk_list,
    })
    return risk_df

In [40]:
# Load the imputed dataset.
imputed_data = pd.read_excel("exp1/data/dataset_filtrato_Imputato.xlsx")

# Binary target {NOATH vs ATH1,2}.
# Edit this mapping to choose which initial phenotypes are collapsed into each binary class.
PHENOTYPE_TO_BINARY = {"NOATH": 0, "ATH1": 1, "ATH2": 1}
for phenotype, binary_label in PHENOTYPE_TO_BINARY.items():
    imputed_data["Class"] = imputed_data["Class"].replace(to_replace=phenotype, value=binary_label)

# Move the recoded labels into a new last column 'Y_class' and drop 'Class'.
imputed_data["Y_class"] = imputed_data.pop("Class")

# Compute the CUORE 10Y CV risk scores (one row per patient).
risk_df = compute_CUORE_10yCV_riskscore(imputed_data)

# Append the CV risk column to the imputed data by joining on PATIENT_ID.
imputed_data = imputed_data.merge(risk_df, on="PATIENT_ID")

# Save the enriched dataset.
imputed_data.to_excel("exp1/data/dataset_experiment.xlsx", index=False)

# Load dataset for experiment and split it into training and test set.
# NOTE(review): this reads 'dataset_exp1.xlsx', not the 'dataset_experiment.xlsx' written just above --
# confirm which file is intended.
data_exp = pd.read_excel("exp1/data/dataset_exp1.xlsx")

dataset_splitting(
    data_exp,
    testset_size=0.3,
    rnd=203,
    class_stratification=data_exp["Y_class"],
)
