In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from IPython.display import display
from scipy.stats import ttest_ind, chi2_contingency
from Utils.data_preparation import get_input_variables, get_feature_set
from decimal import Decimal


### Missing Data Analysis

In [2]:
def get_missingness_in_data(data, variables):
    """Count missing entries per variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding one column per variable.
    variables : iterable of str
        Column names of `data` to inspect.

    Returns
    -------
    dict
        Maps each variable name to its number of missing values, rendered
        as a string (e.g. ``'3'``).
    """
    total_rows = len(data)
    # Missing count = all rows minus the rows that survive dropna().
    return {
        var: str(total_rows - len(data[var].dropna()))
        for var in variables
    }

In [None]:
# Read in data and define the selected variables.
# The three cohort files sit in one directory, so the directory is written
# once and the file names are appended to it.
original_data_dir = '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/original_data/'
datapaths = {
    'RUN DMC': original_data_dir + 'complete_RUN_DMC_503_subjects.csv',
    'SCANS': original_data_dir + 'complete_SCANS_121_subjects.csv',
    'HARMONISATION': original_data_dir + 'complete_HARMONISATION_265_subjects.csv'
}

# Variables inspected in every cohort (BMI and stroke_history are not included).
common_variables = [
    'WMH_vol_ml', 'SVDp', 'WM_vol_ml', 'GM_vol_ml', 'TBV_ml', 'num_lacunes', 'num_mb', 'PSMD',
    'global_cog', 'EF', 'PS',
    'age', 'edu_yrs', 'sex', 'HTN', 'HC', 'diabetes', 'smoking',
    'dementia_final', 'T_survival'
]
# HARMONISATION additionally inspects ICV_ml, listed directly after TBV_ml.
icv_position = common_variables.index('TBV_ml') + 1
cohort_variables = {
    'RUN DMC': common_variables,
    'SCANS': common_variables,
    'HARMONISATION': common_variables[:icv_position] + ['ICV_ml'] + common_variables[icv_position:]
}
# Per cohort: rows missing any of these columns are removed before counting.
# Empty lists mean that no rows are removed.
cohort_vars_to_drop = {
    'RUN DMC': [],
    'SCANS': [],
    'HARMONISATION': []
}

all_cohort_results = {}
for cohort, datapath in datapaths.items():
    if cohort not in cohort_variables:
        # Skip instead of falling through: without this, the code below would
        # run with undefined variable lists or with those of the previous cohort.
        print('Unrecognised cohort!')
        continue

    data = pd.read_csv(datapath)
    variables = list(cohort_variables[cohort])
    vars_to_drop = list(cohort_vars_to_drop[cohort])

    selected_data = data.dropna(subset=vars_to_drop)

    # Dict of {variable: number of missing values as a string}
    results_for_cohort = get_missingness_in_data(selected_data, variables)

    # Append additional information: total number of rows in the cohort file
    results_for_cohort['N'] = data.shape[0]

    all_cohort_results[cohort] = results_for_cohort

# One column per cohort, one row per variable (plus the 'N' row)
all_cohort_results_df = pd.DataFrame.from_dict(all_cohort_results, orient='columns')
display(all_cohort_results_df)

In [6]:
# Save the per-cohort missingness table to disk as CSV.
all_cohort_results_df.to_csv(
    path_or_buf='/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/missingness_summary.csv'
)

### Get baseline data summary statistics

In [2]:
# Display names (keys of input_feature_name_dict) grouped by how they are summarised.
skewed_cont_variables = ['WMLL *', 'Num Lacunes *', 'Num MB *']
binary_variables = [
    'Bin MB (%)',
    'Female (%)',
    'Hypertension (%)',
    'Hypercholesterolemia (%)',
    'Diabetes (%)',
    'Smoking (%)',
    'Stroke History (%)',
]

# Which column-name variant of input_feature_name_dict to use: 'original' or 'transformed'.
var_type = 'original'

# One row per feature: (display name, transformed column name, original column name).
_feature_name_rows = [
    ('WMLL *', 'Trans_SVDp', 'SVDp'),
    ('WM', 'WM_vol_ml', 'WM_vol_ml'),
    ('GM', 'GM_vol_ml', 'GM_vol_ml'),
    ('TBV', 'TBV_ml', 'TBV_ml'),
    ('Num Lacunes *', 'Trans_num_lacunes', 'num_lacunes'),
    ('Num MB *', 'Trans_num_mb', 'num_mb'),
    ('Bin MB (%)', 'mb_bin', 'mb_bin'),
    ('PSMD', 'Trans_PSMD', 'PSMD'),

    ('Global cognition', 'global_cog', 'global_cog'),
    ('EF', 'EF', 'EF'),
    ('PS', 'PS', 'PS'),

    ('Age', 'age', 'age'),
    ('Edu', 'edu_yrs', 'edu_yrs'),

    ('Female (%)', 'sex', 'sex'),
    ('Hypertension (%)', 'HTN', 'HTN'),
    ('Hypercholesterolemia (%)', 'HC', 'HC'),
    ('Diabetes (%)', 'diabetes', 'diabetes'),
    ('Smoking (%)', 'smoking', 'smoking'),

    ('BMI', 'BMI', 'BMI'),
    ('Stroke History (%)', 'stroke_history', 'stroke_history'),
]

# Display name -> {'transformed': column name, 'original': column name}
input_feature_name_dict = {
    print_name: {'transformed': transformed_name, 'original': original_name}
    for print_name, transformed_name, original_name in _feature_name_rows
}

In [15]:
# Look up the 'Multi' feature set and turn its printable variable names into
# their 'transformed' column names, then add the two outcome columns.
(multi_input_variables_to_print,
 multi_FS_name,
 multi_var_description,
 multi_cat_feature_indices) = get_feature_set('Multi')
all_possible_variables = (
    get_input_variables(multi_input_variables_to_print, 'transformed')
    + ['T_survival', 'dementia_final']
)

# Per cohort: where the augmented CSV lives and which columns must be
# non-missing for a row to be kept. An empty list keeps every row; put column
# names (e.g. all_possible_variables) here to restrict to complete rows.
all_cohort_data = {
    'RUN DMC': {
        'datapath': '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/augmented_data/augmented_complete_RUN_DMC_503_subjects.csv',
        'variables_no_missing': [],
    },
    'SCANS': {
        'datapath': '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/augmented_data/augmented_complete_SCANS_121_subjects.csv',
        'variables_no_missing': [],
    },
    'HARMONISATION': {
        'datapath': '/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/augmented_data/augmented_complete_HARMONISATION_265_subjects.csv',
        'variables_no_missing': [],
    },
}

# Load each cohort and store the filtered table next to its configuration.
for cohort, cohort_entry in all_cohort_data.items():
    data = pd.read_csv(cohort_entry['datapath'])
    cohort_entry['data'] = data.dropna(subset=cohort_entry['variables_no_missing'])

# The pooled cohort stacks the three individual cohorts, in this order.
all_cohort_data['POOLED'] = {
    'data': pd.concat(
        [all_cohort_data[name]['data'] for name in ('RUN DMC', 'SCANS', 'HARMONISATION')]
    )
}

In [None]:
def _format_feature_summary(print_name, variable, values):
    """Format the baseline-table entry for one feature.

    Parameters
    ----------
    print_name : str
        Display name of the feature; decides which statistic is reported
        via `skewed_cont_variables` and `binary_variables`.
    variable : str
        Column name the values were read from.
    values : pandas.Series
        Values of the feature with missing entries already removed.

    Returns
    -------
    str
        'median [Q1, Q3]' for skewed variables, 'count (percent%)' for
        binary variables, otherwise 'mean (std)'.
    """
    if print_name in skewed_cont_variables:  # report median and IQR
        median = np.median(values)
        q1, q3 = np.percentile(values, [25, 75])
        # WMLL is shown with two decimals, the other skewed variables as whole numbers
        if print_name != 'WMLL *':
            return '{:.0f} [{:.0f}, {:.0f}]'.format(median, q1, q3)
        return '{:.2f} [{:.2f}, {:.2f}]'.format(median, q1, q3)

    if print_name in binary_variables:  # report count and percentage of 1
        count = np.sum(values)
        mean = np.mean(values)
        return "{} ({:.1f}%)".format(int(count), mean*100)

    # Everything else: report mean and std
    mean = np.mean(values)
    std = np.std(values)
    if variable == 'PSMD':
        # PSMD is printed in scientific notation
        return "{:.2E} ({:.2E})".format(Decimal(mean), Decimal(std))
    return "{:.2f} ({:.2f})".format(mean, std)


all_cohort_data_summary = {}

for cohort, cohort_entry in all_cohort_data.items():
    cohort_data = cohort_entry['data']
    num_subjects = cohort_data.shape[0]
    num_dementia_final = cohort_data[cohort_data['dementia_final']==1].shape[0]

    # Use only observed follow-up times: np.median / np.percentile return NaN
    # as soon as one value is missing. This matches the per-feature statistics
    # below, which also drop missing values first.
    follow_up_times = cohort_data['T_survival'].dropna()
    T_q1, T_q3 = np.percentile(follow_up_times, [25, 75])

    cohort_summary = {
        'Sample size in survival analysis': num_subjects,
        'Final dementia cases (%)': '{} ({}%)'.format(
            num_dementia_final,
            round((num_dementia_final/num_subjects)*100, 1)),
        'Follow-up time in years [median]': '{:.1f} [{:.1f}, {:.1f}]'.format(
            np.median(follow_up_times), T_q1, T_q3),
        'Follow-up time in years [mean]': '{:.1f} ({:.1f})'.format(
            np.mean(follow_up_times), np.std(follow_up_times)),
    }

    # Get 3-year dementia outcome stats (rows without a 3-year label are excluded)
    classification_cohort_data = cohort_data.dropna(subset=['dementia_3yr'])
    num_classification = classification_cohort_data.shape[0]
    num_dementia_3yr = classification_cohort_data[classification_cohort_data['dementia_3yr']==1].shape[0]
    cohort_summary['Sample size in classification analysis'] = num_classification
    cohort_summary['3-year dementia cases (%)'] = '{} ({}%)'.format(
        num_dementia_3yr,
        round(num_dementia_3yr*100/num_classification, 1)
    )

    for print_name, column_names in input_feature_name_dict.items():
        # BMI and stroke history are not summarised for the HARMONISATION cohorts
        if (cohort in ['HARMONISATION', 'HARMONISATION_High_SVD', 'HARMONISATION_Low_SVD']) and (print_name in ['BMI', 'Stroke History (%)']):
            continue

        variable = column_names[var_type]
        values = cohort_data[variable].dropna()
        cohort_summary[print_name] = _format_feature_summary(print_name, variable, values)

    all_cohort_data_summary[cohort] = cohort_summary

# One column per cohort, one row per summary item
all_cohort_data_summary_df = pd.DataFrame.from_dict(all_cohort_data_summary, orient='columns')
display(all_cohort_data_summary_df)


In [9]:
# Save the baseline summary table to disk as CSV, keeping the row labels.
all_cohort_data_summary_df.to_csv(
    path_or_buf='/Users/lirui/Downloads/Cohort_Dementia_Prediction/Cohort_Data/Selected_Data/Data_6.0/data_summary.csv',
    index=True,
)

In [None]:
# Compare the distribution of every feature between two datasets and print
# the features whose difference is significant at p < 0.05.
# NOTE(review): `original_data` and `classification_data` are not defined in
# the cells visible here -- they must exist before this cell is run.
data1 = original_data
data2 = classification_data

# Numerical features -- use Student's t test (equal_var=True) or Welch's t test (equal_var=False)
numerical_features = ['Trans_SVDp', 'num_lacunes', 'num_mb', 'TBV_ml', 'Trans_PSMD', 'global_cog', 'EF', 'PS', 'age', 'edu_yrs']
# Binary features -- use chi squared test
binary_features = ['mb_bin', 'sex', 'HTN', 'HC', 'diabetes', 'smoking']

for feature in all_possible_variables:
    if feature in numerical_features:
        # Drop missing values first: a single NaN makes the p-value NaN, which
        # never satisfies `p_val < 0.05` and would hide the feature silently.
        array1 = data1[feature].dropna().to_numpy()
        array2 = data2[feature].dropna().to_numpy()
        p_val = ttest_ind(array1, array2, equal_var=True)[1]
        if p_val <0.05:
            print(feature, p_val)

    elif feature in binary_features:
        # Label the rows on copies so that no helper column is written into
        # data1 / data2 (and therefore into the original DataFrames).
        df = pd.concat(
            [data1[[feature]].assign(DS='DS1'), data2[[feature]].assign(DS='DS2')],
            ignore_index=True,
        )
        table = pd.crosstab(index=df['DS'], columns=df[feature])
        chi2, p, dof, expected = chi2_contingency(table)
        if p < 0.05:
            print(feature, p)
    else:
        # Feature is in neither list, so no test is run; its name is printed
        print(feature)