In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from IPython.display import display
from preprocessing_utils import calculate_feature, adjust_skewness_for_dataframe

#### Merge complete longitudinal data with cognitive data (No Need to Run Again)

In [None]:
# Load the complete data set. The first CSV column is used as the row index and
# shifted down by one (presumably converting a 1-based row number to 0-based -- TODO confirm).
singapore_complete_data = pd.read_csv("/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/Handover_HARMO_all_available/Singapore_data_set_29_10.csv", header=0, index_col=0)
singapore_complete_data.index -= 1
patient_IDs = np.unique(np.array(singapore_complete_data.index))
no_patients = len(patient_IDs)
print(singapore_complete_data.shape)

# Load the standardized cognitive scores with the same index handling.
singapore_cog_data = pd.read_csv("/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/Marco_Email_30_11/Singapore_data_Marco_standardized_cog_scores.csv", header=0, index_col=0)
singapore_cog_data.index -= 1
patient_IDs_cog = np.unique(np.array(singapore_cog_data.index))
print(singapore_cog_data.shape)

# Split the cognitive columns (everything except the 'PID' join key) into those
# that already exist in the complete data and those that are new.
cog_columns = [col for col in singapore_cog_data.columns if col != 'PID']
shared_cog_columns = [col for col in cog_columns if col in singapore_complete_data.columns]
new_cog_columns = [col for col in cog_columns if col not in singapore_complete_data.columns]

# Overwrite the shared columns first. This assignment aligns on the row index,
# so it must happen BEFORE the merge below: DataFrame.merge returns a frame with
# a fresh 0..n-1 index, and interleaving merges with index-aligned assignments
# would align later columns against a different index than earlier ones.
for col in shared_cog_columns:
    singapore_complete_data.loc[:, [col]] = singapore_cog_data[[col]]

# Attach every new column with a single left merge on 'PID' instead of one
# merge (and one full copy of the frame) per column.
if new_cog_columns:
    singapore_complete_data = singapore_complete_data.merge(
        singapore_cog_data[['PID'] + new_cog_columns], how='left', on='PID')

singapore_complete_data.to_csv('/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/Rui_Combined_Data.csv', index=True)

#### Compute survival data (No Need to Run Again)

In [None]:
# Load the three follow-up tables. For each one, keep the 'PID' column as a
# plain list (in file order) before turning it into the index.
firstvisit_dementia_df = pd.read_excel('/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/5_year_follow_up/Data_time_dementia1.xlsx')
firstvisit_dementia_IDs = firstvisit_dementia_df['PID'].tolist()
firstvisit_dementia_df = firstvisit_dementia_df.set_index('PID')

five_year_df = pd.read_csv('/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/5_year_follow_up/5_year_updated_data.csv')
five_year_IDs = five_year_df['PID'].tolist()
five_year_df = five_year_df.set_index('PID')

lastvisit_dementia_df = pd.read_excel('/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/5_year_follow_up/PID_MCI_HARMONISATION_with dementia dates_SH.xlsx')
lastvisit_dementia_IDs = lastvisit_dementia_df['PID'].tolist()
lastvisit_dementia_df = lastvisit_dementia_df.set_index('PID')

# Check that the files contain the same patients in the same order.
# Comparing lists never raises, so the result has to be tested explicitly;
# wrapping the bare comparison in try/except could never report a mismatch.
if not (firstvisit_dementia_IDs == five_year_IDs == lastvisit_dementia_IDs):
    print('Mismatch in patient IDs!')

In [None]:
# Assemble the dementia outcome and first-visit date for every patient,
# indexed by patient ID.
result_df = pd.DataFrame({
    'ID': five_year_IDs,
    'dementia': five_year_df['Conversion_dementia_up_to_5_years'],
    'first_visit_date': firstvisit_dementia_df['Date_of_first_visit']}).set_index('ID')

# Manually supplied end dates for patients whose dates are partially missing.
# Written as MM/DD/YYYY strings; they are parsed with the same format below.
manual_end_dates = {
    'HD052': '07/01/2016',
    'HD123': '07/01/2013',
    'HD487': '06/09/2018',  # Mid point between the 2 year follow up and the most recent visit.
}

T_list = []
end_date_list = []
for ID in five_year_IDs:
    firstvisit_date = result_df.loc[ID, 'first_visit_date']

    # Pick the end of follow-up: manual override, censoring date, or event date.
    if ID in manual_end_dates:
        end_date = manual_end_dates[ID]
    elif result_df.loc[ID, 'dementia'] == 0:  # No dementia, find censoring time.
        end_date = lastvisit_dementia_df.loc[ID, 'date_of_most_recent_visit']
    elif result_df.loc[ID, 'dementia'] == 1:
        end_date = lastvisit_dementia_df.loc[ID, 'Date_of_event_dementia']
    else:
        print('Unrecognised dementia outcome! Please check.')
        end_date = 'NA'

    # Normalise the end date to a 'YYYY-MM-DD' string for the output table.
    if isinstance(end_date, str):
        try:  # end_date is an actual date in string format
            end_date = datetime.strptime(end_date, "%m/%d/%Y")
            end_date_list.append(end_date.strftime('%Y-%m-%d'))
        except ValueError:  # end_date is not an exact date, or is 'NA'
            print("Cannot convert end date for patient {}: {}".format(ID, end_date))
            end_date_list.append(end_date)
    elif pd.isna(end_date):
        # Empty date cell (NaT/NaN): it has no usable strftime, so record it as
        # missing instead of aborting the whole loop.
        print("Cannot convert end date for patient {}: {}".format(ID, end_date))
        end_date = 'NA'
        end_date_list.append(end_date)
    else:  # already a timestamp-like object
        end_date_list.append(end_date.strftime('%Y-%m-%d'))

    # Follow-up time in years (days / 365). If the end date could not be
    # parsed it is still a string and the subtraction raises TypeError.
    try:
        this_T = (end_date - firstvisit_date).days / 365
    except (TypeError, AttributeError, ValueError):
        this_T = 'NA'
        print('Cannot calculate T for patient {}: end_date is {}'.format(ID, end_date))
    T_list.append(this_T)

result_df['end_date'] = end_date_list
result_df['T_survival'] = T_list
result_df = result_df.join(five_year_df['Dementia_subtype'])
display(result_df)

In [4]:
# Persist the computed survival table (index is written as the first column).
survival_csv_path = '/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/HARMONISATION/Computed_survival_data.csv'
result_df.to_csv(survival_csv_path)

#### Read in data and get variables needed

In [None]:
# Both prepared inputs live in the same folder, relative to the notebook.
harmonisation_dir = "../Cohort_Data/HARMONISATION/"

# The combined baseline data is indexed by its second CSV column,
# the survival data by its first.
two_yr_combined_data = pd.read_csv(
    harmonisation_dir + "Full_data_with_age_std_cog_scores_by_Rui.csv",
    header=0,
    index_col=1,
)
five_yr_survival_data = pd.read_csv(
    harmonisation_dir + "Computed_survival_data.csv",
    header=0,
    index_col=0,
)

# The index labels of the combined data are used as the patient IDs from here on.
patient_IDs = two_yr_combined_data.index.tolist()
print('There are %i patients'%len(patient_IDs))

In [3]:
# Output feature name -> recipe for obtaining it.
# A one-element recipe is a single source column name. A longer recipe is an
# operation name followed by the source columns it is applied to.
# Order matters: it determines the order in which features are added per patient.
_numeric_feature_specs = [
    ('WMH_vol_ml', ['WMHml']),
    ('num_lacunes', ['Total_Lacunes']),
    ('num_mb', ['CMBs']),
    ('TBV_ml', ['sum', 'GM_ml', 'WM_ml']),
    ('WM_vol_ml', ['WM_ml']),
    ('GM_vol_ml', ['GM_ml']),
    ('ICV_ml', ['ICVml']),
    ('PSMD', ['PSMD_baseline']),
    ('global_cog', ['Global_Rui']),
    ('EF', ['EF_Rui']),  # this is standardized because there is only 1 test for this domain.
    ('PS', ['PS_Rui']),
    ('age', ['Age']),
    ('edu_yrs', ['TotNoYrs']),
    ('SVDp', ['divSumPerc', 'WMHml', 'GM_ml', 'WM_ml']),
    ('MMSE', ['V0mmsetotal']),
    ('MOCA', ['V0mocatotal']),
    ('dementia_final', ['dementia']),
    ('T_survival', ['T_survival']),
    ('dementia_subtype', ['Dementia_subtype']),
    ('MCI_diagnosis', ['MCI_diagnosis']),
]
numeric_feature_dict = dict(_numeric_feature_specs)

def _two_level_feature(source_column, label_for_0, label_for_1):
    """Describe a two-category feature: its source column and the label -> 0/1 coding."""
    return {
        'name': source_column,
        'mapping': {label_for_0: 0, label_for_1: 1},
    }


# Output feature name -> source column and the integer code of each category label.
categorical_feature_dict = {
    'sex': _two_level_feature('Gender', 'Male', 'Female'),
    'HTN': _two_level_feature('Hypertension_binary', 'No', 'Yes'),
    'HC': _two_level_feature('Hyperlipidemia_binary', 'No', 'Yes'),
    'diabetes': _two_level_feature('Diabetes_binary', 'No', 'Yes'),
    'smoking': _two_level_feature('Smoking_binary', 'Never', 'Ever'),
}

In [None]:
# Build one flat record per patient from the baseline and survival tables,
# then turn the list of records into a DataFrame in a single step.
output_data = []
for ID in patient_IDs:
    this_patient_output_data = {'ID': ID}
    patient_BL_data = two_yr_combined_data.loc[ID]

    # Numeric features: either derived from several columns or copied from one.
    for output_feature, feature_recipe in numeric_feature_dict.items():
        if len(feature_recipe) > 1:  # combine features
            operation = feature_recipe[0]
            required_feature_names = feature_recipe[1:]
            # calculate_feature is a project helper returning three values; only the
            # first is used here (meaning of the positional [] / False arguments is
            # defined in preprocessing_utils -- TODO confirm).
            feature_value, contains_NA, NA_features = calculate_feature(operation, required_feature_names, patient_BL_data, [], False, output_feature)
        elif output_feature in ['dementia_final', 'T_survival', 'dementia_subtype']:
            # Outcome variables come from the survival table, not the baseline row.
            feature_value = five_yr_survival_data.loc[ID, feature_recipe[0]]
        else:
            feature_value = patient_BL_data[feature_recipe[0]]
        this_patient_output_data[output_feature] = feature_value

    # Categorical features: translate the label to its integer code.
    for output_feature, feature_spec in categorical_feature_dict.items():
        orig_feature = feature_spec['name']
        raw_value = patient_BL_data[orig_feature]
        feature_cat = str(raw_value)
        # If the label cannot be mapped (e.g. missing value), take the feature as it is.
        feature_value = feature_spec['mapping'].get(feature_cat, raw_value)
        this_patient_output_data[output_feature] = feature_value

    output_data.append(this_patient_output_data)

output_df = pd.DataFrame(output_data)
print(output_df.shape)
output_df.head()

In [5]:
# Write the selected per-patient features to disk, without the row index.
output_csv_path = '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/original_data/complete_HARMONISATION_265_subjects.csv'
output_df.to_csv(output_csv_path, index=False)

### Check if data is correct.

In [6]:
# Spot-check two output columns against their source tables, looking the
# sources up in the same patient order as the output and ignoring missing values.
ID_list = list(output_df['ID'])

source_wmh_values = two_yr_combined_data.loc[ID_list, 'WMHml'].dropna().to_list()
output_wmh_values = output_df['WMH_vol_ml'].dropna().to_list()
assert source_wmh_values == output_wmh_values

source_dementia_values = five_yr_survival_data.loc[ID_list, 'dementia'].dropna().to_list()
output_dementia_values = output_df['dementia_final'].dropna().to_list()
assert source_dementia_values == output_dementia_values
