In [1]:
import sys; sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from preprocessing_utils import calculate_feature, adjust_skewness_for_dataframe

### Use baseline data to fill NA in longitudinal data (No need to run again)
The longitudinal data is used as the primary table because it contains more patients than the baseline data. For each feature to merge: if the value is NA in the longitudinal data but not NA in the baseline data, it is replaced with the baseline value. If both values are present but differ, the mismatch is reported.

In [None]:
# Longitudinal table: the first CSV column is dropped by position, then the
# distinct patient IDs are collected.
scans_long_data = pd.read_csv(
    "../OPTIMAL_Data/SCANS/Handover_SCANS/SCANS_long.csv", header=0
).iloc[:, 1:]
long_patient_IDs = np.unique(scans_long_data['ID'].to_numpy())
no_patients_long = len(long_patient_IDs)

# Baseline spreadsheet: same positional drop of the first column. Its ID values
# are strings whose first two characters are stripped before converting to int.
scans_BL_data = pd.read_csv(
    "../OPTIMAL_Data/SCANS/Handover_SCANS/Lawrence_2013_Baseline_Spreadsheet_complete_final.csv", header=0
).iloc[:, 1:]
BL_patient_IDs = np.unique(np.array([int(raw_ID[2:]) for raw_ID in scans_BL_data['ID']]))
no_patients_BL = len(BL_patient_IDs)

# Report cohort sizes and the IDs present only in the longitudinal table.
only_in_long = set(long_patient_IDs).difference(BL_patient_IDs)
summary_template = 'There are {} patients in the longitudinal data, and {} patients in the baseline data. Additional patients in longitudinal data are {}'
print(summary_template.format(no_patients_long, no_patients_BL, only_in_long))

In [None]:
# Features cross-checked between the two tables, with the column name each
# feature has in the longitudinal table (long_dict) and in the baseline
# spreadsheet (BL_dict).
features_to_merge = ['PSMD', 'EF', 'PS']
long_dict = {
    'PSMD': 'psmd_value_long',
    'EF': 'EF',
    'PS': 'PS'
}
BL_dict = {
    'PSMD': 'psmd_value_t0',
    'EF': 'EF_t0',
    'PS': 'PS_t0'
}
# Largest absolute difference still treated as agreement between the tables.
MISMATCH_TOLERANCE = 1e-6

# Baseline IDs are strings with a two-character prefix. Matching on the parsed
# integer (rather than rebuilding 'ID' + str(ID)) keeps the lookup working even
# if the original ID strings are zero-padded.
BL_numeric_IDs = scans_BL_data['ID'].map(lambda raw_ID: int(raw_ID[2:]))

for ID in BL_patient_IDs:
    if ID not in long_patient_IDs:
        print('Patient {} is not in longitudinal dataset.'.format(ID))
        continue

    # Rows of this patient at timepoint 0 in the longitudinal table.
    long_BL_rows = (scans_long_data['ID'] == ID) & (scans_long_data['TP'] == 0)
    long_patient_data = scans_long_data.loc[long_BL_rows]
    BL_patient_data = scans_BL_data.loc[BL_numeric_IDs == ID]

    if long_patient_data.empty:
        # Without a TP == 0 row there is nothing to fill or compare.
        print('Patient {} has no TP==0 row in longitudinal data.'.format(ID))
        continue

    for feature in features_to_merge:
        long_feature_value = long_patient_data.iloc[0][long_dict[feature]]
        BL_feature_value = BL_patient_data.iloc[0][BL_dict[feature]]
        long_is_NA = pd.isna(long_feature_value)
        BL_is_NA = pd.isna(BL_feature_value)

        if long_is_NA and not BL_is_NA:
            # Replace NA in longitudinal data with the baseline value.
            print('Replaced value for patient {} for feature {}'.format(ID, feature))
            scans_long_data.loc[long_BL_rows, long_dict[feature]] = BL_feature_value
        elif not long_is_NA and not BL_is_NA:
            # Both present: report when they disagree beyond the tolerance.
            try:
                is_mismatch = abs(long_feature_value - BL_feature_value) >= MISMATCH_TOLERANCE
            except TypeError:
                # Non-numeric values cannot be subtracted; fall back to equality.
                is_mismatch = long_feature_value != BL_feature_value
            if is_mismatch:
                print("Mismatch for feature {}. Longitudinal: {}, baseline: {}".format(feature, long_feature_value, BL_feature_value))
        elif not long_is_NA and BL_is_NA:
            print('Patient {} has non-NA value for feature {} only in longitudinal data.'.format(ID, feature))
        # Both NA: nothing to do.

## Sanity Check
# Patient 133 must have a PSMD value at timepoint 0 after the fill.
assert not pd.isna(scans_long_data.loc[(scans_long_data['ID']==133) & (scans_long_data['TP']==0)].iloc[0]['psmd_value_long'])

In [4]:
# scans_filled_BL_data = scans_long_data.loc[scans_long_data['TP']==0]
# scans_filled_BL_data.to_csv("/Users/lirui/Downloads/gmlvq-python-rui/OPTIMAL_Data/SCANS/SCANS_filled_BL_data.csv", index=False)

### Read in (Filled) Baseline Data

In [None]:
# Read the filled baseline table and list the distinct patient IDs it contains.
filled_BL_csv_path = "../Cohort_Data/SCANS/SCANS_filled_BL_data.csv"
scans_filled_BL_data = pd.read_csv(filled_BL_csv_path, header=0)

long_patient_IDs = np.unique(scans_filled_BL_data['ID'].to_numpy())
num_patients_long = len(long_patient_IDs)

print('There are {} patients in total'.format(num_patients_long))

#### Select variables, using patient IDs in longitudinal data

In [3]:
# Output column -> source specification for numeric features.
# A one-element list names the source column that is copied as-is.
# A longer list is [operation, source_column, ...]: the operation and the
# remaining column names are handed to calculate_feature by the build loop.
numeric_variable_dict = dict(
    WMH_vol_ml=['WMH_ml'],
    num_lacunes=['lacunes'],
    num_mb=['microbleeds'],
    TBV_ml=['TCV_ml'],  # defined as the sum of grey matter, white matter and WMH
    WM_vol_ml=['sum', 'NAWM_ml', 'WMH_ml'],
    GM_vol_ml=['GM_ml'],
    PSMD=['psmd_value_long'],
    global_cog=['Global'],
    EF=['EF'],
    PS=['PS'],
    age=['BSL_age'],
    edu_yrs=['education_years'],
    SVDp=['SVDp'],
    BMI=['BMI'],
    MMSE=['MMSE'],
    T_survival=['Time_dementia'],
)

# Output column -> {'name': source column, 'mapping': category label -> code}.
# The build loop looks up str(source value) in 'mapping' and keeps the raw
# value when the label is not listed.
categorical_variable_dict = {
    'sex': dict(name='sex', mapping={'male': 0, 'female': 1}),
    'HTN': dict(name='HTN', mapping={'No': 0, 'Yes': 1}),
    'HC': dict(name='HL', mapping={'No': 0, 'Yes': 1}),
    'diabetes': dict(
        name='diabetes',
        mapping={'non diabetic': 0, 'diabetic': 1, 'diet control': 1},
    ),
    'smoking': dict(
        name='smoking',
        mapping={'never smoked': 0, 'current smoker': 1, 'ex-smoker': 1},
    ),
    'dementia_final': dict(
        name='Dementia',
        mapping={'censored, no dementia': 0, 'developed dementia': 1},
    ),
}

In [None]:
# Upper bound applied to T_survival; larger values are replaced by this cap.
T_SURVIVAL_CAP = 5.2

# Build one output record per patient from the filled baseline table.
output_data = []
# Category labels that had no entry in a mapping, keyed by output column, so
# that pass-through values are reported instead of going unnoticed.
unmapped_categories = {}

for ID in long_patient_IDs:
    this_patient_output_data = {'ID': ID}
    # First row of this patient in the filled baseline table.
    patient_BL_data = scans_filled_BL_data.loc[scans_filled_BL_data['ID'] == ID].iloc[0]

    for output_feature, source_spec in numeric_variable_dict.items():
        if len(source_spec) > 1:
            # [operation, column, ...]: combine several source columns.
            operation = source_spec[0]
            required_feature_names = source_spec[1:]
            feature_value, _contains_NA, _NA_features = calculate_feature(
                operation, required_feature_names, patient_BL_data, [], False, output_feature)
        else:
            feature_value = patient_BL_data[source_spec[0]]
            if output_feature == 'T_survival' and not pd.isna(feature_value) and feature_value > T_SURVIVAL_CAP:
                feature_value = T_SURVIVAL_CAP
        this_patient_output_data[output_feature] = feature_value

    for output_feature, category_spec in categorical_variable_dict.items():
        raw_value = patient_BL_data[category_spec['name']]
        feature_cat = str(raw_value)
        mapping = category_spec['mapping']
        if feature_cat in mapping:
            feature_value = mapping[feature_cat]
        else:
            # If it cannot be mapped, take the feature as it is.
            feature_value = raw_value
            if not pd.isna(raw_value):
                unmapped_categories.setdefault(output_feature, set()).add(feature_cat)
        this_patient_output_data[output_feature] = feature_value

    output_data.append(this_patient_output_data)

if unmapped_categories:
    print('Category labels kept unmapped: {}'.format(unmapped_categories))

output_df = pd.DataFrame(output_data)
print(output_df.shape)
output_df.head()

In [8]:
# Write the selected baseline features to CSV (without the DataFrame index).
# The path is relative to the notebook, like the "../Cohort_Data/..." read
# above, instead of a user-specific absolute path.
# NOTE(review): assumes the notebook sits one level below the directory that
# contains Cohort_Data, and that the target folder already exists - confirm.
output_df.to_csv('../Cohort_Data/Selected_Data/Data_6.0/original_data/complete_SCANS_121_subjects.csv', index=False)

### Check whether data is correct

In [7]:
# Compare output columns with the source table, patient by patient.
orig_data_long = scans_filled_BL_data.set_index('ID')
ID_list = list(output_df['ID'])

# Directly copied columns must match position by position.
# np.testing.assert_array_equal treats NaN in the same position as equal, so
# missing values neither fail the check nor hide a misalignment the way
# comparing dropna() lists could.
# NOTE(review): assumes SVDp and BMI are numeric columns - confirm.
for copied_column in ['SVDp', 'BMI']:
    np.testing.assert_array_equal(
        orig_data_long.loc[ID_list, copied_column].to_numpy(dtype=float),
        output_df[copied_column].to_numpy(dtype=float),
    )

# T_survival must equal Time_dementia capped at 5.2; NaN stays NaN, matching
# the output-building loop, which only caps non-NA values.
orig_T_survival = orig_data_long.loc[ID_list,'Time_dementia'].to_list()
capped_T_survival = [i if pd.isna(i) else min(i, 5.2) for i in orig_T_survival]
np.testing.assert_array_equal(
    np.array(capped_T_survival, dtype=float),
    output_df['T_survival'].to_numpy(dtype=float),
)

# try:
#     orig_data_cog.loc[ID_list,'cognitiveindex11'].to_list() == augmented_df['global_cog_5yr'].to_list()
# except:
#     print(np.where((orig_data_cog.loc[ID_list,'cognitiveindex11'].to_numpy() == augmented_df['global_cog_5yr'].to_numpy()) == False))