In [1]:
import sys; sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from preprocessing_utils import calculate_feature, adjust_skewness_for_dataframe

#### Read in data and get variables needed

In [None]:
### Read in data ###
# Three RUN DMC exports are loaded. For each one we also keep the sorted array
# of unique patient identifiers (np.unique sorts) and report how many there are.

# Longitudinal file (patient identifier column: 'ID').
long_csv_path = "/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/RUN DMC/Handover_RUNDMC/run_long.csv"
run_dmc_long_data = pd.read_csv(long_csv_path, header=0)
long_patient_IDs = np.unique(run_dmc_long_data['ID'].to_numpy())
no_long_patients = len(long_patient_IDs)
print(f'There are {no_long_patients} patients')

# PSMD file (patient identifier column: 'ID').
psmd_csv_path = "/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/RUN DMC/Marco_Email_12_16_2021/RUN_dmc_all_year_psmd_geff_dseg.csv"
run_dmc_psmd_data = pd.read_csv(psmd_csv_path, header=0)
psmd_patient_IDs = np.unique(run_dmc_psmd_data['ID'].to_numpy())
no_psmd_patients = len(psmd_patient_IDs)
print(f'There are {no_psmd_patients} patients with psmd data')

# Cognition file (note the lower-case identifier column: 'id').
cog_csv_path = "/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/RUN DMC/Marco_Email_12_21_2021/4Marco_Cognition_all_year_RUN_DMC.csv"
run_dmc_cog_data = pd.read_csv(cog_csv_path, header=0)
cog_patient_IDs = np.unique(run_dmc_cog_data['id'].to_numpy())
no_cog_patients = len(cog_patient_IDs)
print(f'There are {no_cog_patients} patients with cognitive data')

### Sanity Check ###
# The cognitive and longitudinal files must describe exactly the same patients:
# the feature-building loop looks up every longitudinal patient in the cognitive
# file, so a one-directional subset check is not enough.
cog_id_set, long_id_set = set(cog_patient_IDs), set(long_patient_IDs)
assert cog_id_set == long_id_set, (
    'Cognitive and longitudinal files cover different patients: '
    '%i only in cognitive data, %i only in longitudinal data'
    % (len(cog_id_set - long_id_set), len(long_id_set - cog_id_set))
)

# Restrict both files to the 2006 rows once, outside the loop.
baseline_long_rows = run_dmc_long_data.loc[run_dmc_long_data['year']==2006]
baseline_psmd_rows = run_dmc_psmd_data.loc[run_dmc_psmd_data['year']==2006]

# For every PSMD patient: confirm the patient exists in the longitudinal data
# and that the 2006 age agrees between the two files. Only age is compared here.
for ID in psmd_patient_IDs:
    if ID not in long_id_set:
        print('Patient %i does not exist in run_dmc_long_data!' % ID)
        continue

    patient_long_rows = baseline_long_rows.loc[baseline_long_rows['ID']==ID]
    patient_psmd_rows = baseline_psmd_rows.loc[baseline_psmd_rows['ID']==ID]
    if patient_long_rows.empty or patient_psmd_rows.empty:
        # Without a 2006 row in both files there is nothing to compare.
        print('Patient %i has no 2006 row in both files; age not compared' % ID)
        continue

    long_age = patient_long_rows.iloc[0]['age']
    psmd_age = patient_psmd_rows.iloc[0]['age']

    # Absolute difference, so a mismatch in either direction is reported.
    # Written as `not (... < tol)` so that a NaN age is also reported.
    if not (abs(long_age - psmd_age) < 1e-6):
        print("Age does not match for patient %i"%ID)

In [3]:
# Add a z-scored copy ("<column>_standardized") of each baseline cognitive score
# (cognitive index, psychomotor speed, executive function) to run_dmc_cog_data.
# NOTE(review): the feature mapping defined later in this notebook reads the raw
# '...06' columns, not these standardized ones - confirm which is intended.
columns_to_standardize = ['cognitiveindex06', 'psychomotorspeed06', 'executivefunction06']
for score_column in columns_to_standardize:
    raw_scores = run_dmc_cog_data[[score_column]]
    centred_scores = raw_scores - raw_scores.mean()
    # .std() is the sample standard deviation (N-1 in the denominator).
    run_dmc_cog_data[score_column+'_standardized'] = centred_scores / raw_scores.std()

In [4]:
# Output feature name -> list describing where the value comes from.
# A one-element list names the source column to copy. A longer list is read by
# the feature-building loop as [operation, source column, source column, ...]
# and handed to calculate_feature. Every entry below is the one-element form.
numeric_feature_dict = {
    # Imaging markers
    "WMH_vol_ml": ["WMH_ICV_ml"],
    "WM_vol_ml": ["WMvolume"],
    "GM_vol_ml": ["GMvolume"],
    "num_lacunes": ["lacnumb"],
    "num_mb": ["mbnumb"],
    "TBV_ml": ["TBvolume"],
    "PSMD": ["PSMD"],  # read from the PSMD file
    # Cognition (read from the cognitive file)
    "global_cog": ["cognitiveindex06"],
    "EF": ["executivefunction06"],
    "PS": ["psychomotorspeed06"],
    # Demographics
    "age": ["age"],
    "edu_yrs": ["educationyears"],
    # Additional ones
    "BMI": ["bmi"],
    "SVDp": ["SVDp"],
    "MMSE": ["MMSE"],
    "tia": ["tia"],
    "tiacount": ["tiacount"],
    "ischemic_stroke": ["ischemic_stroke"],
    "ischemic_count": ["ischemic_count"],
    "hemorrhage": ["hemorrhage"],
    "hemorrhage_count": ["hemorrhage_count"],
    "unspecified_CVA": ["unspecified_CVA"],
    "unspecified_CVA_count": ["unspecified_CVA_count"],
    # Endpoints
    "dementia_final": ["dementia1"],
    "T_survival": ["time_dementia"],
    "dementia_subtype": ["dementiacat"],
}

# Output feature name -> {'name': source column, 'mapping': raw label -> code}.
# Mapping keys are strings because the raw value is passed through str() before
# the lookup; raw values with no matching key are kept unchanged.
categorical_feature_dict = {
    output_name: {'name': source_column, 'mapping': label_to_code}
    for output_name, source_column, label_to_code in (
        ('sex', 'gender', {'Male': 0, 'Female': 1}),
        ('HTN', 'hypertension', {'no HT': 0, 'HT': 1}),
        ('HC', 'hypercholesterolemia', {'no HC': 0, 'HC': 1}),
        ('diabetes', 'diabetes', {'0': 0, '1': 1}),
        # Current and former smokers share code 1.
        ('smoking', 'smokingstatus', {
            'Never smoked': 0,
            'Current smoker': 1,
            'Former smoker': 1}),
        # Source labels are Dutch: "no depressive symptoms" / "depressive symptoms".
        ('depression', 'Depressive_symptoms_2006', {
            'geen depressieve symptomen': 0,
            'depressieve symptomen (CESD ge 16 and/or use of antidepr)': 1}),
    )
}

In [None]:
# Build one output record per longitudinal patient from the 2006 rows, then
# collect the records into output_df.

# The year filter does not depend on the patient, so apply it once up front.
baseline_long_data = run_dmc_long_data.loc[run_dmc_long_data['year']==2006]
baseline_psmd_data = run_dmc_psmd_data.loc[run_dmc_psmd_data['year']==2006]

output_data = []
for ID in long_patient_IDs:
    this_patient_output_data = {'ID': ID}
    # First 2006 row for this patient; raises IndexError if the patient has none.
    patient_long_data = baseline_long_data.loc[baseline_long_data['ID']==ID].iloc[0]

    for output_feature, source_spec in numeric_feature_dict.items():
        if len(source_spec) > 1: # combine features: [operation, column, column, ...]
            operation = source_spec[0]
            required_feature_names = source_spec[1:]
            feature_value, contains_NA, NA_features = calculate_feature(operation, required_feature_names, patient_long_data, [], False, output_feature)
        elif output_feature == 'PSMD':
            # Some patients in long_patient_IDs don't have PSMD: record pd.NA for
            # them. Only the "no row" case is tolerated; a missing 'PSMD' column
            # still raises instead of silently producing an all-NA feature.
            patient_psmd_values = baseline_psmd_data.loc[baseline_psmd_data['ID']==ID, 'PSMD']
            feature_value = patient_psmd_values.iloc[0] if len(patient_psmd_values) > 0 else pd.NA
        elif output_feature in ['global_cog', 'EF', 'PS']:
            # Cognitive scores come from the cognitive file (identifier column 'id').
            feature_value = run_dmc_cog_data.loc[run_dmc_cog_data['id']==ID, source_spec[0]].iloc[0]
        else:
            feature_value = patient_long_data[source_spec[0]]
        this_patient_output_data[output_feature] = feature_value

    for output_feature, category_spec in categorical_feature_dict.items():
        raw_value = patient_long_data[category_spec['name']]
        # Map the label to its code; if it cannot be mapped, take the feature as it is.
        this_patient_output_data[output_feature] = category_spec['mapping'].get(str(raw_value), raw_value)

    output_data.append(this_patient_output_data)

output_df = pd.DataFrame(output_data)
print(output_df.shape)
output_df.head()

### Save data

In [7]:
# Write the assembled per-patient table to disk, without the DataFrame index.
output_csv_path = '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/original_data/complete_RUN_DMC_503_subjects.csv'
output_df.to_csv(output_csv_path, index=False)

### Check whether data is correct

In [9]:
# Spot-check that selected output columns equal the 2006 source columns,
# patient by patient, in output_df order.
orig_data_long = run_dmc_long_data.loc[run_dmc_long_data['year']==2006].set_index('ID')
orig_data_cog = run_dmc_cog_data.set_index('id')
ID_list = list(output_df['ID'])

# (source column in the longitudinal file, column in output_df)
columns_to_verify = [('SVDp', 'SVDp'), ('bmi', 'BMI'), ('dementia1', 'dementia_final')]
for source_column, output_column in columns_to_verify:
    # Comparing Python lists with == fails whenever a value is NaN (NaN != NaN),
    # even if both sides are missing in the same position. assert_series_equal
    # treats matching NaNs as equal and reports where any real difference is.
    pd.testing.assert_series_equal(
        orig_data_long.loc[ID_list, source_column].reset_index(drop=True),
        output_df[output_column].reset_index(drop=True),
        check_names=False,
        check_dtype=False,
        check_exact=True,
        obj='%s vs output %s' % (source_column, output_column),
    )
# NOTE(review): the disabled check below refers to `augmented_df`, which is not
# defined in the visible part of this notebook - confirm before re-enabling.
# try:
#     orig_data_cog.loc[ID_list,'cognitiveindex11'].to_list() == augmented_df['global_cog_5yr'].to_list()
# except:
#     print(np.where((orig_data_cog.loc[ID_list,'cognitiveindex11'].to_numpy() == augmented_df['global_cog_5yr'].to_numpy()) == False))