In [2]:
# Load libraries

import os
import pandas 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
import sklearn.model_selection
from sklearn.linear_model import LinearRegression

In [40]:
# Functions


def custom_merge(volume_row, plasma_df, days_threshold):
    rid = volume_row['RID']
    date = volume_row['EXAMDATE']
    
    # Filter plasma_df for the current RID
    rid_subset = plasma_df[plasma_df['RID'] == rid].copy()
    
    if not rid_subset.empty:
        # Calculate date difference and find the closest date within the threshold
        rid_subset['date_diff'] = abs(rid_subset['EXAMDATE'] - date)
        closest_row = rid_subset[rid_subset['date_diff'] <= pandas.Timedelta(days=days_threshold)].nsmallest(1, 'date_diff')
        
        if not closest_row.empty:
            # Combine the volume row with the closest plasma row
            merged = pandas.concat([volume_row, closest_row.iloc[0].drop(labels=['RID', 'EXAMDATE'])])
            return merged
    
    return None


## All biomarker inputs 

In [98]:
# Combining all biomarkers into one csv

# Merge data frames
finalEBM_input = pandas.merge(cog_data, merged_df, on='RID', how='inner')
finalEBM_input = pandas.merge(finalEBM_input, tau_data, on='RID', how='inner')

# Rename columns 
finalEBM_input = finalEBM_input.rename(columns={'RID_x': 'RID', 'Diagnosis_x': 'Diagnosis'})

# Reorder and delete columns
finalEBM_input = finalEBM_input[['RID', 'Diagnosis', 'W-ADAS11', 'W-average_hippo', 'W-ctx_entorhinal_tau', 'W-Hippocampus_tau', 'W-Amygdala_tau', 'PTAU181_VALUE', 'AB4240_VALUE']]

finalEBM_input


Unnamed: 0,RID,Diagnosis,W-ADAS11,W-average_hippo,W-ctx_entorhinal_tau,W-Hippocampus_tau,W-Amygdala_tau,PTAU181_VALUE,AB4240_VALUE
0,21,CN,-0.822070,-0.442675,-0.019197,2.447936,-0.091983,1.000,152.506596
1,31,CN,-2.572310,1.208408,0.539259,0.366876,-0.070678,1.920,115.831135
2,56,MCI,-1.873024,0.401691,1.225730,1.013739,0.370188,0.974,158.430233
3,59,MCI,-0.659299,0.187756,0.672359,2.353455,0.632490,0.744,147.826087
4,69,MCI,-0.310073,-0.153926,0.864068,0.200932,1.198506,1.410,150.673854
...,...,...,...,...,...,...,...,...,...
116,6185,CN,0.294352,-0.811825,0.064458,-0.294280,-0.195782,1.090,112.222222
117,6226,CN,-0.887217,-1.036108,-0.443965,0.274523,0.174707,0.811,111.562500
118,6251,CN,-0.368289,0.594652,2.784326,0.583768,2.481325,0.994,109.318996
119,6292,CN,-1.067704,0.285416,-0.795378,-1.387867,-0.633964,1.200,144.179104


In [100]:
# Save finalEBM_input to a csv
finalEBM_input.to_csv('../EBM input/test.csv')

## Plasma Data

In [95]:
# Plasma data
plasma = pandas.read_csv('../data/plasma_data_long.csv')

# Delete rows with NaN entries
plasma = plasma.dropna(how='any')

# Merging data

# Convert EXAMDATE to datetime 
hippo_vol_data = hippo_vol_data.rename(columns={'merge_EXAMDATE':'EXAMDATE'})
hippo_vol_data['EXAMDATE'] = pandas.to_datetime(hippo_vol_data['EXAMDATE'], format='%Y-%m-%d')
plasma['EXAMDATE'] = pandas.to_datetime(plasma['EXAMDATE'], format='%Y-%m-%d')

# Sort rows by RID and EXAMDATE
plasma = plasma.sort_values(['RID', 'EXAMDATE'])

# Merge data frames
merged_rows = []
for _, row in hippo_vol_data.iterrows():
    merged_row = custom_merge(row, plasma, 360)
    if merged_row is not None:
        merged_rows.append(merged_row)

# Add merged rows to df
merged_df = pandas.DataFrame(merged_rows)

# Remove duplicate columns 
merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]

# # Add in columns with demographic data so we can perform age regression 
# with_demo = pandas.read_csv('../data/Regional_Tau_w_Demographics.csv')
# with_demo = with_demo[['AGE', 'merge_RID', 'merge_DX']]
# merged_df = pandas.merge(merged_df, with_demo, left_index=True, right_index=True)

# # Drop rows where all value is NaN 
# value_to_drop = [4513, 6038, 6505, 6598]  
# merged_df = merged_df[~merged_df['RID'].isin(value_to_drop)]

# # Age regression
# model = LinearRegression()

# # Container for residuals
# residuals_dict = {}

# for col in plasma.columns[0:6]:  
#     # Fit model
#     X = merged_df[['AGE']]
#     y = merged_df[col]
#     model.fit(X, y)
    
#     # Predict
#     y_pred = model.predict(X)
    
#     # Calculate residuals and store them
#     residuals_dict[col] = y - y_pred

# residuals_df = pandas.DataFrame(residuals_dict)

# for col in residuals_df.columns:
#     merged_df[f'W-{col}'] = residuals_df[col]
    
# # extract data for control subjects
# tau_data_control = merged_df[merged_df['Diagnosis'] == 'CN']

# for col in merged_df.columns[2:]:  
    
#     # compute the mean and standard deviation of the control population
#     mean_control = tau_data_control[[col]].mean()
#     std_control = tau_data_control[[col]].std()

#     # z-score the data
#     merged_df[[col]] = (merged_df[[col]]-mean_control)/std_control

merged_df


Unnamed: 0,RID,W-average_hippo,EXAMDATE,AB4240_VALUE,GFAP_VALUE,NFL_VALUE,PTAU181_VALUE,PTAU217_VALUE,PTAU_RATIO_VALUE,VISCODE2,date_diff
0,21,-0.442675,2017-11-27,152.506596,0.1400,4.05,1.000,2.753,4.92,m144,67 days
1,31,1.208408,2018-04-17,115.831135,0.2070,6.54,1.920,7.823,13.29,m150,1 days
2,56,0.401691,2017-11-28,158.430233,0.1170,3.63,0.974,1.588,1.89,m144,0 days
Unnamed 0,59,0.187756,2017-12-12,147.826087,0.1310,2.65,0.744,1.462,3.30,m144,0 days
Unnamed 1,69,-0.153926,2018-01-23,150.673854,0.1960,5.68,1.410,2.570,4.72,m144,8 days
...,...,...,...,...,...,...,...,...,...,...,...
Unnamed 113,6185,-0.811825,2018-02-14,112.222222,0.0782,5.18,1.090,1.813,4.43,bl,43 days
Unnamed 114,6226,-1.036108,2018-05-02,111.562500,0.0677,3.01,0.811,1.822,4.48,bl,0 days
Unnamed 115,6251,0.594652,2018-06-06,109.318996,0.0806,2.45,0.994,2.398,6.38,bl,0 days
Unnamed 116,6292,0.285416,2018-04-26,144.179104,0.0926,5.59,1.200,1.695,2.86,bl,0 days


## Regional Tau Input
Entorhinal, parahippocampal, hippocampal, amygdala, precuneus, anterior cingulate

In [69]:
tau_data = pandas.read_csv('../data/Cog_Tau_V1.csv')

# Change every '-' to '_' in column title names (to avoid errors later on)
tau_data.columns = [c.replace('-', '_') for c in tau_data.columns]

# Find the average of the right and left cerebellum 
tau_data['Average_Cerebellum'] = tau_data[['Left_Cerebellum_Cortex','Right_Cerebellum_Cortex']].mean(axis=1)

# Cerebellum correction
corrected = (tau_data.iloc[:, 7:].div(tau_data['Average_Cerebellum'], axis=0) - 1).values
tau_data.iloc[:, 7:] = corrected

# Calculate average for each region between hemispheres
tau_data['ctx_entorhinal_tau'] = tau_data[['ctx_rh_entorhinal', 'ctx_lh_entorhinal']].mean(axis=1)
tau_data['Hippocampus_tau'] = tau_data[['Right_Hippocampus', 'Left_Hippocampus']].mean(axis=1)
tau_data['Amygdala_tau'] = tau_data[['Right_Amygdala', 'Left_Amygdala']].mean(axis=1)

# Add in columns with demographic data so we can perform age regression 
with_demo = pandas.read_csv('../data/Regional_Tau_w_Demographics.csv')
with_demo = with_demo[['AGE', 'RID', 'merge_DX']]

# Housekeeping
tau_data = tau_data[['merge_RID', 'ctx_entorhinal_tau', 'Hippocampus_tau', 'Amygdala_tau']]
tau_data = tau_data.rename(columns={'merge_RID': 'RID'})
with_demo = with_demo.rename(columns={'AGE':'Age'})

# Merge with demographic data
tau_data = pandas.merge(tau_data, with_demo, left_index=True, right_index=True)
tau_data = tau_data.rename(columns={'RID_x': 'RID'})
tau_data = tau_data.rename(columns={'merge_DX': 'Diagnosis'})
tau_data = tau_data.drop(['RID_y'],axis=1)

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
tau_data = tau_data[~tau_data['RID'].isin(value_to_drop)]

# Drop duplicates
tau_data = tau_data.drop_duplicates(subset='RID', keep='first')


# Age regression
model = LinearRegression()

# Container for residuals
residuals_dict = {}

for col in tau_data.columns[1:4]:  
    # Fit model
    X = tau_data[['Age']]
    y = tau_data[col]
    model.fit(X, y)
    
    # Predict
    y_pred = model.predict(X)
    
    # Calculate residuals and store them
    residuals_dict[col] = y - y_pred

residuals_df = pandas.DataFrame(residuals_dict)

for col in residuals_df.columns:
    tau_data[f'W-{col}'] = residuals_df[col]
    
    
tau_data = tau_data[['RID','Diagnosis', 'W-ctx_entorhinal_tau', 'W-Hippocampus_tau', 'W-Amygdala_tau']]

# extract data for control subjects
tau_data_control = tau_data[tau_data['Diagnosis'] == 'CN']

for col in tau_data.columns[2:]:  
    
    # compute the mean and standard deviation of the control population
    mean_control = tau_data_control[[col]].mean()
    std_control = tau_data_control[[col]].std()

    # z-score the data
    tau_data[[col]] = (tau_data[[col]]-mean_control)/std_control
    
tau_data

Unnamed: 0,RID,Diagnosis,W-ctx_entorhinal_tau,W-Hippocampus_tau,W-Amygdala_tau
0,21,CN,-0.019197,2.447936,-0.091983
1,31,CN,0.539259,0.366876,-0.070678
3,56,MCI,1.225730,1.013739,0.370188
5,59,MCI,0.672359,2.353455,0.632490
6,69,MCI,0.864068,0.200932,1.198506
...,...,...,...,...,...
814,6704,CN,-0.144049,-0.251543,-0.030284
815,6705,Dementia,-0.759413,-1.081193,-0.340875
816,6713,Dementia,2.376429,0.937911,2.961076
817,6721,Dementia,0.513021,0.588945,0.574137


## Regional Volumes

In [4]:
# All volumes already z-scored, negative

vol_data = pandas.read_csv('residual_neg_z_scores.csv')

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
vol_data = vol_data[~vol_data['RID'].isin(value_to_drop)]

# Take averages of left and right superior frontal gyrus
vol_data['Average_superiorfrontal_vol'] = vol_data[['ctx-lh-superiorfrontal','ctx-rh-superiorfrontal']].mean(axis=1)

# Sum anterior cingulate subregions and average left and right hemis
vol_data['ctx-lh-anteriorcingulate'] = vol_data[['ctx-lh-rostralanteriorcingulate', 'ctx-lh-caudalanteriorcingulate']].sum(axis=1)
vol_data['ctx-rh-anteriorcingulate'] = vol_data[['ctx-rh-rostralanteriorcingulate','ctx-rh-caudalanteriorcingulate']].sum(axis=1)
vol_data['Average_anteriorcingulate_vol'] = vol_data[['ctx-lh-anteriorcingulate','ctx-rh-anteriorcingulate']].mean(axis=1)

# Take averages of left and right posterior cingulate cortices
vol_data['Average_posteriorcingulate_vol'] = vol_data[['ctx-lh-posteriorcingulate','ctx-rh-posteriorcingulate']].mean(axis=1)

# Take average of left and right precuneous
vol_data['Average_precuneus_vol'] = vol_data[['ctx-rh-precuneus','ctx-lh-precuneus']].mean(axis=1)

# Take average of left and right inferior parietal lobules
vol_data['Average_inferiorparietal_vol'] = vol_data[['ctx-rh-inferiorparietal','ctx-lh-inferiorparietal']].mean(axis=1)

# Take average of left and right inferior temporal gyrus
vol_data['Average_inferiortemporal_vol'] = vol_data[['ctx-rh-inferiortemporal','ctx-lh-inferiortemporal']].mean(axis=1)

# Delete unneeded rows
vol_data = vol_data[['RID','AGE','Average_superiorfrontal_vol','Average_anteriorcingulate_vol','Average_posteriorcingulate_vol','Average_precuneus_vol','Average_inferiorparietal_vol','Average_inferiortemporal_vol']]

# # Age regression
# model = LinearRegression()

# # Container for residuals
# residuals_dict = {}

# for col in vol_data.columns[2:]:  
#     # Fit model
#     X = vol_data[['AGE']]
#     y = vol_data[col]
#     model.fit(X, y)
    
#     # Predict
#     y_pred = model.predict(X)
    
#     # Calculate residuals and store them
#     residuals_dict[col] = y - y_pred

# residuals_df = pandas.DataFrame(residuals_dict)

# for col in residuals_df.columns:
#     vol_data[f'W-{col}'] = residuals_df[col]

#vol_data = vol_data[['RID','AGE','W-Average_superiorfrontal_vol','W-Average_anteriorcingulate_vol','W-Average_posteriorcingulate_vol','W-Average_precuneus_vol','W-Average_inferiorparietal_vol','W-Average_inferiortemporal_vol']]
vol_data = vol_data[['RID','AGE','Average_superiorfrontal_vol','Average_anteriorcingulate_vol','Average_posteriorcingulate_vol','Average_precuneus_vol','Average_inferiorparietal_vol','Average_inferiortemporal_vol']]
          
vol_data


Unnamed: 0,RID,AGE,Average_superiorfrontal_vol,Average_anteriorcingulate_vol,Average_posteriorcingulate_vol,Average_precuneus_vol,Average_inferiorparietal_vol,Average_inferiortemporal_vol
0,21,72.6,0.449522,0.301755,0.532955,0.757263,-0.239702,1.704690
1,31,77.7,0.819599,0.170225,0.503228,-0.052603,0.279301,1.286340
2,31,77.7,0.819599,0.170225,0.503228,-0.052603,0.279301,1.286340
3,56,69.6,0.502718,2.723248,0.207082,-0.088145,0.967623,1.773868
4,56,69.6,0.502718,2.723248,0.207082,-0.088145,0.967623,1.773868
...,...,...,...,...,...,...,...,...
812,6704,65.3,1.752011,1.306636,0.455020,0.965912,1.024877,1.925646
813,6705,74.9,-0.001617,1.134429,0.922593,-0.098058,0.080927,-0.253239
814,6713,71.1,-0.221338,0.301398,-1.040388,-0.699135,-0.754194,-0.464371
815,6721,77.9,0.185680,1.569582,-0.350836,0.463439,-0.699300,-0.347036


## Hippocampal Volume Input

In [75]:
# Preparing hippocampal volume data (already z-scored, neg, and age regressed)
hippo_vol_data = pandas.read_csv('../data/residual_neg_z_scores.csv')

# Delete unneeded rows 
hippo_vol_data = hippo_vol_data[['RID','merge_EXAMDATE','W-RHV', 'W-LHV']]

# Take average of right and left hippocmapal hemisphere columes 
hippo_vol_data['W-average_hippo'] = hippo_vol_data[['W-RHV', 'W-LHV']].mean(axis=1)

# Delete unneeded rows
hippo_vol_data = hippo_vol_data[['RID', 'W-average_hippo','merge_EXAMDATE']]

# Drop rows where a value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]    
hippo_vol_data = hippo_vol_data[~hippo_vol_data['RID'].isin(value_to_drop)]
hippo_vol_data = hippo_vol_data.drop_duplicates(subset='RID', keep='first')

hippo_vol_data

Unnamed: 0,RID,W-average_hippo,merge_EXAMDATE
0,21,-0.442675,2017-11-27
1,31,1.208408,2018-04-17
3,56,0.401691,2017-11-28
5,59,0.187756,2017-12-12
6,69,-0.153926,2018-01-23
...,...,...,...
812,6704,0.442492,2019-04-16
813,6705,1.443939,2019-04-23
814,6713,0.114254,2019-04-29
815,6721,2.023511,2019-05-07


## Lobe Volumes Input

In [16]:
# All volumes already z-scored, negative

lobe_vol_data = pandas.read_csv('residual_neg_z_scores.csv')

# Take average of all regions in the frontal lobe (including left and right)
lobe_vol_data['Average_frontal'] = lobe_vol_data[['ctx-rh-caudalmiddlefrontal', 'ctx-rh-lateralorbitofrontal', 'ctx-rh-medialorbitofrontal', 'ctx-rh-rostralmiddlefrontal', 'ctx-rh-superiorfrontal', 'ctx-rh-frontalpole', 'ctx-lh-caudalmiddlefrontal','ctx-lh-lateralorbitofrontal','ctx-lh-medialorbitofrontal','ctx-lh-rostralmiddlefrontal', 'ctx-lh-superiorfrontal', 'ctx-lh-frontalpole']].mean(axis=1)

# Take average of all regions in the temporal lobe (including left and right)
lobe_vol_data['Average_temporal'] = lobe_vol_data[['ctx-rh-inferiortemporal', 'ctx-rh-middletemporal','ctx-rh-superiortemporal','ctx-rh-temporalpole','ctx-rh-transversetemporal','ctx-lh-inferiortemporal','ctx-lh-middletemporal','ctx-lh-superiortemporal','ctx-lh-temporalpole','ctx-lh-transversetemporal']].mean(axis=1)

# Take average of all regions in the parietal lobe (including left and right)
lobe_vol_data['Average_parietal'] = lobe_vol_data[['ctx-rh-inferiorparietal','ctx-rh-superiorparietal','ctx-lh-inferiorparietal','ctx-lh-superiorparietal']].mean(axis=1)

# Take average of all regions in the occipital lobe (including left and right)
lobe_vol_data['Average_occipital'] = lobe_vol_data[['ctx-rh-lateraloccipital','ctx-lh-lateraloccipital']].mean(axis=1)

# Delete unneeded rows
lobe_vol_data = lobe_vol_data[['RID','AGE','Average_frontal','Average_temporal','Average_parietal','Average_occipital']]

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
lobe_vol_data = lobe_vol_data[~lobe_vol_data['RID'].isin(value_to_drop)]

# Age regression
model = LinearRegression()

# Container for residuals
residuals_dict = {}

for col in lobe_vol_data.columns[2:]:  
    # Fit model
    X = lobe_vol_data[['AGE']]
    y = lobe_vol_data[col]
    model.fit(X, y)
    
    # Predict
    y_pred = model.predict(X)
    
    # Calculate residuals and store them
    residuals_dict[col] = y - y_pred

residuals_df = pandas.DataFrame(residuals_dict)

for col in residuals_df.columns:
    lobe_vol_data[f'W-{col}'] = residuals_df[col]

lobe_vol_data = lobe_vol_data[['RID', 'W-Average_frontal', 'W-Average_temporal', 'W-Average_parietal', 'W-Average_occipital']]
#lobe_vol_data = lobe_vol_data[['RID', 'Average_frontal', 'Average_temporal', 'Average_parietal', 'Average_occipital']]

lobe_vol_data


Unnamed: 0,RID,W-Average_frontal,W-Average_temporal,W-Average_parietal,W-Average_occipital
0,21,0.198814,0.827801,0.370596,0.916692
1,31,0.196486,0.598093,0.190721,0.063724
2,31,0.196486,0.598093,0.190721,0.063724
3,56,0.580999,0.684701,0.579621,1.171055
4,56,0.580999,0.684701,0.579621,1.171055
...,...,...,...,...,...
812,6704,1.504040,1.119667,0.594320,1.510380
813,6705,0.192404,-0.070038,-0.292768,-0.421116
814,6713,-0.195858,-0.467507,-1.002440,-0.529634
815,6721,-0.064755,-0.395750,-0.314841,-0.129030


## Meta ROI Tau Input

In [16]:
# Preparing tau data

meta_tau_data = pandas.read_csv('Cog_Tau_V1.csv')

# Find the average of the right and left cerebellum 
meta_tau_data['Average_Cerebellum'] = (meta_tau_data['Left-Cerebellum-Cortex']+ meta_tau_data['Right-Cerebellum-Cortex'])/2

# Change every '-' to '_' in column title names (to avoid errors later on)
meta_tau_data.columns = [c.replace('-', '_') for c in meta_tau_data.columns]

# Divide each regional tau value by the average of R and L cerebellum tau and subtract 1
meta_tau_data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] = meta_tau_data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']].div(meta_tau_data['Average_Cerebellum'], axis=0)
meta_tau_data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] = meta_tau_data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] - 1

# Take average of all meta tau regions
meta_tau_data['Average_meta_tau'] = meta_tau_data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']].mean(axis=1)

# Add in columns with demographic data so we can perform age regression 
with_demo = pandas.read_csv('Regional_Tau_w_Demographics.csv')
with_demo = with_demo[['AGE', 'RID']]

# Rename column titles for consistency 
meta_tau_data = meta_tau_data.rename(columns={'merge_RID': 'RID'})
with_demo = with_demo.rename(columns={'AGE':'Age'})

meta_tau_data = pandas.merge(meta_tau_data, with_demo, left_index=True, right_index=True)
meta_tau_data = meta_tau_data[['RID_x', 'merge_DX', 'Age','Average_meta_tau']]
meta_tau_data = meta_tau_data.rename(columns={'RID_x': 'RID'})
meta_tau_data = meta_tau_data.rename(columns={'merge_DX': 'Diagnosis'})

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
meta_tau_data = meta_tau_data[~meta_tau_data['RID'].isin(value_to_drop)]

meta_tau_data = meta_tau_data[['RID','Diagnosis', 'Age', 'Average_meta_tau']]

# Extracting the columns for regression analysis
X = meta_tau_data[['Age']]  # Independent variable
y = meta_tau_data['Average_meta_tau']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column named 'W-Average_Tau' to the dataframe
meta_tau_data['W-Average_meta_tau'] = residuals

# Inclue only the relevant columns  
meta_tau_data = meta_tau_data[['RID', 'Diagnosis', 'W-Average_meta_tau']]
#meta_tau_data = meta_tau_data[['RID', 'Diagnosis', 'Average_meta_tau']]

# extract data for control subjects
#meta_tau_data_control = meta_tau_data[meta_tau_data['Diagnosis'] == 'CN']
    
# compute the mean and standard deviation of the control population
# meta_tau_mean_control = meta_tau_data_control[['W-Average_meta_tau']].mean()
# meta_tau_std_control = meta_tau_data_control[['W-Average_meta_tau']].std()
#meta_tau_mean_control = meta_tau_data_control[['Average_meta_tau']].mean()
#meta_tau_std_control = meta_tau_data_control[['Average_meta_tau']].std()

# z-score the data
#meta_tau_data[['W-Average_meta_tau']] = (meta_tau_data[['W-Average_meta_tau']]-meta_tau_mean_control)/meta_tau_std_control
#meta_tau_data[['Average_meta_tau']] = (meta_tau_data[['Average_meta_tau']]-meta_tau_mean_control)/meta_tau_std_control


meta_tau_data

Unnamed: 0,RID,Diagnosis,W-Average_meta_tau
0,21,CN,-0.046907
1,31,CN,-0.109110
2,31,CN,-0.099811
3,56,MCI,-0.009005
4,56,MCI,0.008748
...,...,...,...
814,6704,CN,-0.087506
815,6705,Dementia,-0.150065
816,6713,Dementia,0.062978
817,6721,Dementia,0.005101


## Cognitive Score Input

In [76]:
# Preparing cognitive score input
cog_data = pandas.read_csv('../data/Regional_Tau_w_Demographics.csv')


### Using ADAS11

In [77]:
# Rename columns for consistency
cog_data = cog_data.rename(columns={'merge_DX': 'Diagnosis'})
cog_data = cog_data.rename(columns={'AGE': 'Age'})

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
cog_data = cog_data[~cog_data['RID'].isin(value_to_drop)]
cog_data = cog_data.drop_duplicates(subset='RID', keep='first')

# Delete unneeded columns 
cog_data = cog_data[['RID', 'Age', 'Diagnosis', 'ADAS11_bl']]

# extract data for control subjects
cog_data_control = cog_data[cog_data['Diagnosis'] == 'CN']

# compute the mean and standard deviation of the control population
cog_mean_control = cog_data_control[['ADAS11_bl']].mean()
cog_std_control = cog_data_control[['ADAS11_bl']].std()

# z-score the data
cog_data[['ADAS11_bl']] = (cog_data[['ADAS11_bl']]-cog_mean_control)/cog_std_control

# Age regression for cognitive score

# Extracting the columns for regression analysis
X = cog_data[['Age']]  # Independent variable
y = cog_data['ADAS11_bl']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the 'MMSE_bl' values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column 
cog_data['W-ADAS11'] = residuals

# Delete unneeded columns
cog_data = cog_data[['RID', 'Diagnosis', 'W-ADAS11']]

cog_data

Unnamed: 0,RID,Diagnosis,W-ADAS11
0,21,CN,-0.822070
1,31,CN,-2.572310
3,56,MCI,-1.873024
5,59,MCI,-0.659299
6,69,MCI,-0.310073
...,...,...,...
814,6704,CN,0.582730
815,6705,Dementia,3.082260
816,6713,Dementia,3.110116
817,6721,Dementia,3.085278


### Using MMSE

In [3]:
# Rename columns for consistency
data = data.rename(columns={'merge_DX': 'Diagnosis'})
data = data.rename(columns={'AGE': 'Age'})

# Change diagnosis to numerical values
mapping = {'CN': 0, 'MCI': 1, 'Dementia': 2}
data['Diagnosis'] = data['Diagnosis'].map(mapping)

# Delete unneeded columns 
data = data[['RID', 'Age', 'Diagnosis', 'MMSE_bl']]

# extract data for control subjects
data_control = data[data['Diagnosis'] == 0]

# compute the mean and standard deviation of the control population
mean_control = data_control[['MMSE_bl']].mean()
std_control = data_control[['MMSE_bl']].std()

# z-score the data
data[['MMSE_bl']] = (data[['MMSE_bl']]-mean_control)/std_control

# Compute 30 - z-score
data[['MMSE_bl']] = 30 -data[['MMSE_bl']]

In [3]:
# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

      RID  Age  Diagnosis    MMSE_bl
734  6505  NaN          0  30.111267
778  6598  NaN          1  33.827959


In [20]:
# Age regression for cognitive score

from sklearn.linear_model import LinearRegression

# Extracting the columns for regression analysis
X = data[['Age']]  # Independent variable
y = data['MMSE_bl']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the 'MMSE_bl' values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column named 'W-MMSE' to the dataframe
data['W-MMSE'] = residuals

# Delete unneeded columns
data = data[['RID', 'Diagnosis', 'W-MMSE']]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['W-MMSE'] = residuals


Unnamed: 0,RID,Diagnosis,W-MMSE
0,21,0,-1.448714
1,31,0,-1.577825
2,31,0,-1.577825
3,56,1,0.485579
4,56,1,0.485579
...,...,...,...
814,6704,0,0.594438
815,6705,2,3.138925
816,6713,2,1.376779
817,6721,2,3.062977


In [21]:
data.to_csv('Cogscore.csv')