In [6]:
# Load libraries

import os
import pandas 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pySuStaIn
import statsmodels.formula.api as smf
from scipy import stats
import sklearn.model_selection

## Combining all three inputs (plus lobe and subregion volumes) into one csv

In [7]:
# combining all three biomarkers into one csv

hippo = pandas.read_csv('hippocampal_volume.csv')
tau = pandas.read_csv('Tau.csv')
cog = pandas.read_csv('Cogscore.csv') 
lobes = pandas.read_csv('lobe_volume.csv')
subregions = pandas.read_csv('lobe_subregion_volumes.csv')

# Merge hippo and tau
finalEBM_input = pandas.merge(hippo, tau, left_index=True, right_index=True)

# Merge cog
finalEBM_input = pandas.merge(finalEBM_input, cog, left_index=True, right_index=True)

# Housekeeping
finalEBM_input = finalEBM_input[['W-average_hippo','W-Average_Tau','RID','Diagnosis','W-ADAS11']]

# Merge lobes
finalEBM_input = pandas.merge(finalEBM_input, lobes, left_index=True, right_index=True)

# Merge subregions
finalEBM_input = pandas.merge(finalEBM_input, subregions, left_index=True, right_index=True)

# Rename columns 
finalEBM_input = finalEBM_input.rename(columns={
    'W-Average_Tau': 'W_average_tau',
    'W-average_hippo': 'W_average_hippo',
    #'W-MMSE': 'W_MMSE', # If using MMSE instead of ADAS 
    'W-ADAS11': 'W_ADAS11'}) 
    
# Keep only relevant columns
finalEBM_input = finalEBM_input[['RID',	'W_average_hippo', 'W_average_tau', 'Diagnosis', 'W_ADAS11', 'W_Average_frontal','W_Average_temporal','W_Average_parietal','W_Average_occipital','W_Average_superiorfrontal','W_Average_anteriorcingulate','W_Average_posteriorcingulate','W_Average_precuneus','W_Average_inferiorparietal','W_Average_inferiortemporal']] # If using MMSE instead of ADAS rewrite this with W_MMSE

# Reorder columns
new_order = ['RID', 'Diagnosis','W_average_hippo', 'W_ADAS11','W_average_tau','W_Average_frontal','W_Average_temporal','W_Average_parietal','W_Average_occipital','W_Average_superiorfrontal','W_Average_anteriorcingulate','W_Average_posteriorcingulate','W_Average_precuneus','W_Average_inferiorparietal','W_Average_inferiortemporal'] # If using MMSE instead of ADAS rewrite this with W_MMSE
finalEBM_input = finalEBM_input[new_order]

# Delete redundant RIDs
finalEBM_input = finalEBM_input.drop_duplicates(subset='RID', keep='first')


Unnamed: 0,RID,Diagnosis,W_average_hippo,W_ADAS11,W_average_tau,W_Average_frontal,W_Average_temporal,W_Average_parietal,W_Average_occipital,W_Average_superiorfrontal,W_Average_anteriorcingulate,W_Average_posteriorcingulate,W_Average_precuneus,W_Average_inferiorparietal,W_Average_inferiortemporal
0,21,0,-0.442675,-0.725910,-0.046907,0.198814,0.827801,0.370596,0.916692,0.270293,0.090236,0.469295,0.539510,-0.470478,1.517905
1,31,0,1.208408,-2.442518,-0.109110,0.196486,0.598093,0.190721,0.063724,0.527321,0.007342,0.381349,-0.402813,-0.091963,0.989604
3,56,1,0.401691,-1.784994,-0.009005,0.580999,0.684701,0.579621,1.171055,0.389989,1.311059,0.177668,-0.227982,0.819487,1.651760
5,59,1,0.187756,-0.571995,0.000454,0.491392,0.602563,0.774127,0.061574,0.943000,0.353885,0.628442,0.222930,1.562665,1.719080
6,69,1,-0.153926,-0.215380,0.062139,-0.480805,-0.428033,0.043973,-0.096262,-0.562571,-0.090341,0.362464,-0.352621,0.122907,-0.188402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,6704,0,0.442492,0.636912,-0.087506,1.504040,1.119667,0.594320,1.510380,1.734599,0.617195,0.474693,0.937756,0.995192,1.896241
809,6705,2,1.443939,3.167292,-0.150065,0.192404,-0.070038,-0.292768,-0.421116,-0.231828,0.498848,0.832678,-0.375546,-0.213206,-0.489609
810,6713,2,0.114254,3.177238,0.062978,-0.195858,-0.467507,-1.002440,-0.529634,-0.367316,0.095096,-1.086925,-0.877930,-0.943650,-0.618818
811,6721,2,2.023511,3.184310,0.005101,-0.064755,-0.395750,-0.314841,-0.129030,-0.111031,0.706349,-0.474998,0.108034,-1.076073,-0.648083


In [8]:
# Save finalEBM_input to a csv
finalEBM_input.to_csv('FinalEBM_input.csv')

## Lobe Subregion Volumes

In [42]:
# All volumes already z-scored, negative

data = pandas.read_csv('residual_neg_z_scores.csv')

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

# Take averages of left and right superior frontal gyrus
data['Average_superiorfrontal'] = data[['ctx-lh-superiorfrontal','ctx-rh-superiorfrontal']].mean(axis=1)

# Take averages of anterior cingulate cortices
data['Average_anteriorcingulate'] = data[['ctx-lh-rostralanteriorcingulate','ctx-rh-rostralanteriorcingulate','ctx-lh-caudalanteriorcingulate','ctx-rh-caudalanteriorcingulate']].mean(axis=1)

# Take averages of posterior cingulate cortices
data['Average_posteriorcingulate'] = data[['ctx-lh-posteriorcingulate','ctx-rh-posteriorcingulate']].mean(axis=1)

# Take averages of precuneous
data['Average_precuneus'] = data[['ctx-rh-precuneus','ctx-lh-precuneus']].mean(axis=1)

# Take averages of inferior parietal lobules
data['Average_inferiorparietal'] = data[['ctx-rh-inferiorparietal','ctx-lh-inferiorparietal']].mean(axis=1)

# Take averages of inferior temporal gyrus
data['Average_inferiortemporal'] = data[['ctx-rh-inferiortemporal','ctx-lh-inferiortemporal']].mean(axis=1)

# Delete unneeded rows
data = data[['RID','AGE','Average_superiorfrontal','Average_anteriorcingulate','Average_posteriorcingulate','Average_precuneus','Average_inferiorparietal','Average_inferiortemporal']]

Unnamed: 0,RID,AGE,Average_superiorfrontal,Average_anteriorcingulate,Average_posteriorcingulate,Average_precuneus,Average_inferiorparietal,Average_inferiortemporal
0,21,72.6,0.449522,0.150878,0.532955,0.757263,-0.239702,1.704690
1,31,77.7,0.819599,0.085112,0.503228,-0.052603,0.279301,1.286340
2,31,77.7,0.819599,0.085112,0.503228,-0.052603,0.279301,1.286340
3,56,69.6,0.502718,1.361624,0.207082,-0.088145,0.967623,1.773868
4,56,69.6,0.502718,1.361624,0.207082,-0.088145,0.967623,1.773868
...,...,...,...,...,...,...,...,...
812,6704,65.3,1.752011,0.653318,0.455020,0.965912,1.024877,1.925646
813,6705,74.9,-0.001617,0.567214,0.922593,-0.098058,0.080927,-0.253239
814,6713,71.1,-0.221338,0.150699,-1.040388,-0.699135,-0.754194,-0.464371
815,6721,77.9,0.185680,0.784791,-0.350836,0.463439,-0.699300,-0.347036


In [43]:
# Age regression for lobe subregion volumes

from sklearn.linear_model import LinearRegression
# data should have neg_zscore_orig.csv data plus age

for region in data.columns[2:]:
    # Extracting the columns for regression analysis
    X = data[['AGE']]  # Independent variable
    y = data[region]  # Dependent variable

    # Fitting the simple linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Predicting the 'MMSE_bl' values using the fitted model
    y_pred = model.predict(X)

    # Calculating residuals
    residuals = y - y_pred

    # Adding the residuals as a new column named 'W-Average_Tau' to the dataframe
    data['W_' + region] = residuals

In [45]:
# Include only the relevant columns  
data = data[['RID', 'W_Average_superiorfrontal','W_Average_anteriorcingulate','W_Average_posteriorcingulate','W_Average_precuneus','W_Average_inferiorparietal','W_Average_inferiortemporal']]

data


Unnamed: 0,RID,W_Average_superiorfrontal,W_Average_anteriorcingulate,W_Average_posteriorcingulate,W_Average_precuneus,W_Average_inferiorparietal,W_Average_inferiortemporal
0,21,0.270293,0.090236,0.469295,0.539510,-0.470478,1.517905
1,31,0.527321,0.007342,0.381349,-0.402813,-0.091963,0.989604
2,31,0.527321,0.007342,0.381349,-0.402813,-0.091963,0.989604
3,56,0.389989,1.311059,0.177668,-0.227982,0.819487,1.651760
4,56,0.389989,1.311059,0.177668,-0.227982,0.819487,1.651760
...,...,...,...,...,...,...,...
812,6704,1.734599,0.617195,0.474693,0.937756,0.995192,1.896241
813,6705,-0.231828,0.498848,0.832678,-0.375546,-0.213206,-0.489609
814,6713,-0.367316,0.095096,-1.086925,-0.877930,-0.943650,-0.618818
815,6721,-0.111031,0.706349,-0.474998,0.108034,-1.076073,-0.648083


In [46]:
# Save to lobe_subregion_volumes.csv
data.to_csv('lobe_subregion_volumes.csv')

## Lobe Volumes 

In [37]:
# All volumes already z-scored, negative

data = pandas.read_csv('residual_neg_z_scores.csv')

# Take average of all regions in the frontal lobe (including left and right)
data['Average_frontal'] = data[['ctx-rh-caudalmiddlefrontal', 'ctx-rh-lateralorbitofrontal', 'ctx-rh-medialorbitofrontal', 'ctx-rh-rostralmiddlefrontal', 'ctx-rh-superiorfrontal', 'ctx-rh-frontalpole', 'ctx-lh-caudalmiddlefrontal','ctx-lh-lateralorbitofrontal','ctx-lh-medialorbitofrontal','ctx-lh-rostralmiddlefrontal', 'ctx-lh-superiorfrontal', 'ctx-lh-frontalpole']].mean(axis=1)

# Take average of all regions in the temporal lobe (including left and right)
data['Average_temporal'] = data[['ctx-rh-inferiortemporal', 'ctx-rh-middletemporal','ctx-rh-superiortemporal','ctx-rh-temporalpole','ctx-rh-transversetemporal','ctx-lh-inferiortemporal','ctx-lh-middletemporal','ctx-lh-superiortemporal','ctx-lh-temporalpole','ctx-lh-transversetemporal']].mean(axis=1)

# Take average of all regions in the parietal lobe (including left and right)
data['Average_parietal'] = data[['ctx-rh-inferiorparietal','ctx-rh-superiorparietal','ctx-lh-inferiorparietal','ctx-lh-superiorparietal']].mean(axis=1)

# Take average of all regions in the occipital lobe (including left and right)
data['Average_occipital'] = data[['ctx-rh-lateraloccipital','ctx-lh-lateraloccipital']].mean(axis=1)

# Delete unneeded rows
data = data[['RID','AGE','Average_frontal','Average_temporal','Average_parietal','Average_occipital']]

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]


In [38]:
# Age regression for lobe volumes

from sklearn.linear_model import LinearRegression
# data should have neg_zscore_orig.csv data plus age

for region in data.columns[2:]:
    # Extracting the columns for regression analysis
    X = data[['AGE']]  # Independent variable
    y = data[region]  # Dependent variable

    # Fitting the simple linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Predicting the 'MMSE_bl' values using the fitted model
    y_pred = model.predict(X)

    # Calculating residuals
    residuals = y - y_pred

    # Adding the residuals as a new column named 'W-Average_Tau' to the dataframe
    data['W_' + region] = residuals

Unnamed: 0,RID,AGE,Average_frontal,Average_temporal,Average_parietal,Average_occipital,W_Average_frontal,W_Average_temporal,W_Average_parietal,W_Average_occipital
0,21,72.6,0.339853,1.036789,0.601625,1.100572,0.198814,0.827801,0.370596,0.916692
1,31,77.7,0.414346,0.930375,0.573440,0.406073,0.196486,0.598093,0.190721,0.063724
2,31,77.7,0.414346,0.930375,0.573440,0.406073,0.196486,0.598093,0.190721,0.063724
3,56,69.6,0.676851,0.821165,0.721422,1.261718,0.580999,0.684701,0.579621,1.171055
4,56,69.6,0.676851,0.821165,0.721422,1.261718,0.580999,0.684701,0.579621,1.171055
...,...,...,...,...,...,...,...,...,...,...
812,6704,65.3,1.535122,1.152178,0.608225,1.467432,1.504040,1.119667,0.594320,1.510380
813,6705,74.9,0.368088,0.194553,0.006670,-0.165769,0.192404,-0.070038,-0.292768,-0.421116
814,6713,71.1,-0.077413,-0.294780,-0.816025,-0.392363,-0.195858,-0.467507,-1.002440,-0.529634
815,6721,77.9,0.156117,-0.058634,0.073827,0.219533,-0.064755,-0.395750,-0.314841,-0.129030


In [40]:
# Include only the relevant columns  
data = data[['RID', 'W_Average_frontal','W_Average_temporal','W_Average_parietal','W_Average_occipital']]

Unnamed: 0,RID,W_Average_frontal,W_Average_temporal,W_Average_parietal,W_Average_occipital
0,21,0.198814,0.827801,0.370596,0.916692
1,31,0.196486,0.598093,0.190721,0.063724
2,31,0.196486,0.598093,0.190721,0.063724
3,56,0.580999,0.684701,0.579621,1.171055
4,56,0.580999,0.684701,0.579621,1.171055
...,...,...,...,...,...
812,6704,1.504040,1.119667,0.594320,1.510380
813,6705,0.192404,-0.070038,-0.292768,-0.421116
814,6713,-0.195858,-0.467507,-1.002440,-0.529634
815,6721,-0.064755,-0.395750,-0.314841,-0.129030


In [41]:
# Save to lobe_volume.csv
data.to_csv('lobe_volume.csv')

## Hippocampal Volume Input

In [31]:
# Preparing hippocampal volume data

data = pandas.read_csv('residual_neg_z_scores.csv')

# Delete unneeded rows 
data = data[['RID', 'W-RHV', 'W-LHV']]

# Take average of right and left hippocmapal hemisphere columes 
data['W-average_hippo'] = data[['W-RHV', 'W-LHV']].mean(axis=1)

# Delete unneeded rows
data = data[['RID','W-average_hippo']]

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

Unnamed: 0,RID,W-average_hippo
0,21,-0.442675
1,31,1.208408
2,31,1.208408
3,56,0.401691
4,56,0.401691
...,...,...
812,6704,0.442492
813,6705,1.443939
814,6713,0.114254
815,6721,2.023511


In [43]:
# Save to hippocampal_volume.csv
data.to_csv('hippocampal_volume.csv')

## Tau Input

In [47]:
# Preparing tau data

data = pandas.read_csv('Cog_Tau_V1.csv')

# Find the average of the right and left cerebellum 
data['Average_Cerebellum'] = (data['Left-Cerebellum-Cortex']+ data['Right-Cerebellum-Cortex'])/2

# Change every '-' to '_' in column title names (to avoid errors later on)
data.columns = [c.replace('-', '_') for c in data.columns]

# Divide each regional tau value by the average of R and L cerebellum tau and subtract 1
data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] = data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']].div(data['Average_Cerebellum'], axis=0)
data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] = data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']] - 1

# Take average of all meta tau regions
data['Average_Tau'] = data[['ctx_rh_inferiortemporal', 'ctx_rh_middletemporal', 'ctx_rh_parahippocampal', 'ctx_lh_entorhinal', 'ctx_lh_fusiform', 'ctx_lh_inferiortemporal', 'ctx_lh_middletemporal', 'ctx_lh_parahippocampal', 'Left_Amygdala', 'Right_Amygdala']].mean(axis=1)

# Add in columns with demographic data so we can perform age regression 
with_demo = pandas.read_csv('Regional_Tau_w_Demographics.csv')
with_demo = with_demo[['AGE', 'RID']]

# Rename column titles for consistency 
data = data.rename(columns={'merge_RID': 'RID'})
with_demo = with_demo.rename(columns={'AGE':'Age'})

data = pandas.merge(data, with_demo, left_index=True, right_index=True)
data = data[['RID_x', 'merge_DX', 'Age','Average_Tau']]
data = data.rename(columns={'RID_x': 'RID'})
data = data.rename(columns={'merge_DX': 'Diagnosis'})

# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

data

Unnamed: 0,RID,Diagnosis,Age,Average_Tau
0,21,CN,72.6,0.179657
1,31,CN,77.7,0.119621
2,31,CN,77.7,0.128919
3,56,MCI,69.6,0.216284
4,56,MCI,69.6,0.234038
...,...,...,...,...
814,6704,CN,65.3,0.135957
815,6705,Dementia,74.9,0.077476
816,6713,Dementia,71.1,0.288904
817,6721,Dementia,77.9,0.233916


In [50]:
# Age regression for tau

from sklearn.linear_model import LinearRegression
# data should have neg_zscore_orig.csv data plus age.

# Extracting the columns for regression analysis
X = data[['Age']]  # Independent variable
y = data['Average_Tau']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the 'MMSE_bl' values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column named 'W-Average_Tau' to the dataframe
data['W-Average_Tau'] = residuals

# Inclue only the relevant columns  
data = data[['RID', 'W-Average_Tau']]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['W-Average_Tau'] = residuals


Unnamed: 0,RID,W-Average_Tau
0,21,-0.046907
1,31,-0.109110
2,31,-0.099811
3,56,-0.009005
4,56,0.008748
...,...,...
814,6704,-0.087506
815,6705,-0.150065
816,6713,0.062978
817,6721,0.005101


In [51]:
# Save processed data to 'Tau.csv'
data.to_csv('Tau.csv')

## Cognitive Score Input

In [21]:
# Preparing cognitive score input

data = pandas.read_csv('Regional_Tau_w_Demographics.csv')

### Using ADAS11

In [22]:
# Rename columns for consistency
data = data.rename(columns={'merge_DX': 'Diagnosis'})
data = data.rename(columns={'AGE': 'Age'})

# Change diagnosis to numerical values
mapping = {'CN': 0, 'MCI': 1, 'Dementia': 2}
data['Diagnosis'] = data['Diagnosis'].map(mapping)

# Delete unneeded columns 
data = data[['RID', 'Age', 'Diagnosis', 'ADAS11_bl']]

# extract data for control subjects
data_control = data[data['Diagnosis'] == 0]

# compute the mean and standard deviation of the control population
mean_control = data_control[['ADAS11_bl']].mean()
std_control = data_control[['ADAS11_bl']].std()

# z-score the data
data[['ADAS11_bl']] = (data[['ADAS11_bl']]-mean_control)/std_control

In [24]:
# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

      RID   Age  Diagnosis  ADAS11_bl
262  4513  60.6          0        NaN
263  4513  60.6          0        NaN
454  6038  77.4          0        NaN
455  6038  77.4          0        NaN


Unnamed: 0,RID,Age,Diagnosis,ADAS11_bl
0,21,72.6,0,-0.172007
1,31,77.7,0,-1.736670
2,31,77.7,0,-1.736670
3,56,69.6,1,-1.320470
4,56,69.6,1,-1.320470
...,...,...,...,...
814,6704,65.3,0,0.973326
815,6705,74.9,2,3.789720
816,6713,71.1,2,3.686452
817,6721,77.9,2,3.896117


In [25]:
# Age regression for cognitive score

from sklearn.linear_model import LinearRegression

# Extracting the columns for regression analysis
X = data[['Age']]  # Independent variable
y = data['ADAS11_bl']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the 'MMSE_bl' values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column named 'W-MMSE' to the dataframe
data['W-ADAS11'] = residuals

# Delete unneeded columns
data = data[['RID', 'Diagnosis', 'W-ADAS11']]
data

Unnamed: 0,RID,Diagnosis,W-ADAS11
0,21,0,-0.725910
1,31,0,-2.442518
2,31,0,-2.442518
3,56,1,-1.784994
4,56,1,-1.784994
...,...,...,...
814,6704,0,0.636912
815,6705,2,3.167292
816,6713,2,3.177238
817,6721,2,3.184310


In [26]:
data.to_csv('Cogscore.csv')

### Using MMSE

In [3]:
# Rename columns for consistency
data = data.rename(columns={'merge_DX': 'Diagnosis'})
data = data.rename(columns={'AGE': 'Age'})

# Change diagnosis to numerical values
mapping = {'CN': 0, 'MCI': 1, 'Dementia': 2}
data['Diagnosis'] = data['Diagnosis'].map(mapping)

# Delete unneeded columns 
data = data[['RID', 'Age', 'Diagnosis', 'MMSE_bl']]

# extract data for control subjects
data_control = data[data['Diagnosis'] == 0]

# compute the mean and standard deviation of the control population
mean_control = data_control[['MMSE_bl']].mean()
std_control = data_control[['MMSE_bl']].std()

# z-score the data
data[['MMSE_bl']] = (data[['MMSE_bl']]-mean_control)/std_control

# Compute 30 - z-score
data[['MMSE_bl']] = 30 -data[['MMSE_bl']]

In [3]:
# Drop rows where all value is NaN 
value_to_drop = [4513, 6038, 6505, 6598]  
    
data = data[~data['RID'].isin(value_to_drop)]

      RID  Age  Diagnosis    MMSE_bl
734  6505  NaN          0  30.111267
778  6598  NaN          1  33.827959


In [20]:
# Age regression for cognitive score

from sklearn.linear_model import LinearRegression

# Extracting the columns for regression analysis
X = data[['Age']]  # Independent variable
y = data['MMSE_bl']  # Dependent variable

# Fitting the simple linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting the 'MMSE_bl' values using the fitted model
y_pred = model.predict(X)

# Calculating residuals
residuals = y - y_pred

# Adding the residuals as a new column named 'W-MMSE' to the dataframe
data['W-MMSE'] = residuals

# Delete unneeded columns
data = data[['RID', 'Diagnosis', 'W-MMSE']]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['W-MMSE'] = residuals


Unnamed: 0,RID,Diagnosis,W-MMSE
0,21,0,-1.448714
1,31,0,-1.577825
2,31,0,-1.577825
3,56,1,0.485579
4,56,1,0.485579
...,...,...,...
814,6704,0,0.594438
815,6705,2,3.138925
816,6713,2,1.376779
817,6721,2,3.062977


In [21]:
data.to_csv('Cogscore.csv')