In [1]:
import pandas as pd
import numpy as np
import os 

from sklearn.model_selection import train_test_split


In [2]:
# Locate the two OASIS CSV exports inside the `data` folder of the current working
# directory. os.path.join is used instead of gluing strings together with '/', so the
# separator handling is left to the standard library.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')

path_cross = os.path.join(data_dir, 'oasis_cross-sectional.csv')
path_long = os.path.join(data_dir, 'oasis_longitudinal.csv')

# Both files are parsed with identical options. Note that on_bad_lines="skip" drops
# malformed rows instead of raising an error for them.
read_options = dict(sep=",", engine="python", on_bad_lines="skip")

df_cross = pd.read_csv(path_cross, **read_options)
df_long = pd.read_csv(path_long, **read_options)

# Clean the column headers of both tables: spaces become underscores and every
# name is lower-cased.
def _normalise_headers(columns):
    """Return `columns` with spaces replaced by underscores, in lower case."""
    return columns.str.replace(' ', '_').str.lower()


df_cross.columns = _normalise_headers(df_cross.columns)
df_long.columns = _normalise_headers(df_long.columns)

# Rename two longitudinal columns to the names used as merge keys below.
new_column_names = {'subject_id': 'id', 'mr_delay': 'delay'}

# Longitudinal columns in the order they are kept: the twelve merge keys first,
# followed by the three columns that are not merge keys.
new_column_order = [
    'id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
    'nwbv', 'asf', 'delay',
    'mri_id', 'group', 'visit',
]

df_long = df_long.rename(columns=new_column_names)[new_column_order]

# Outer merge on every key column: a row is kept whether it appears in one table
# or in both.
# NOTE(review): the training frame displayed further down shows `educ` values
# such as 16.0 next to values in the 1-4 range - possibly the two tables encode
# education on different scales; verify before modelling on this column.
merge_keys = ['id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
              'nwbv', 'asf', 'delay']

df = df_cross.merge(df_long, on=merge_keys, how='outer')

# Row filter: keep records whose `visit` is missing or equal to 1.
# (Presumably the missing-visit rows are the cross-sectional records - verify.)
is_first_or_unnumbered_visit = df['visit'].isna() | df['visit'].eq(1)

# Columns that are dropped before modelling.
unused_columns = ['hand', 'delay', 'id', 'mri_id', 'group', 'visit']

df_clean = (
    df.loc[is_first_or_unnumbered_visit]
    .drop(columns=unused_columns)
    # Both scores feed the target below, so a row missing either one is removed.
    .dropna(subset=['cdr', 'mmse'])
    # Encode sex as 1 for "M" and 0 for anything else.
    .assign(**{'m/f': lambda frame: (frame['m/f'] == "M").astype(int)})
    # NOTE(review): every remaining gap is filled with 0, which may not be a
    # meaningful value for every column - confirm this imputation is intended.
    .fillna(0)
    .reset_index(drop=True)
)

# Create the fused target variable: a score that combines `cdr` and `mmse`,
# binarised against a fixed threshold (1 = score above the threshold).
dementia_threshold = np.log1p(1.5/26)
fusion_score = np.log1p((df_clean['cdr'] + 0.5) / df_clean['mmse'])
df_clean['target'] = (fusion_score > dementia_threshold).astype(int)

# Keep only the model inputs plus the target, in this order.
MRI = ["etiv", "nwbv"]
general_info = ["m/f", "age", "educ", "ses"]
df_clean = df_clean[MRI + general_info + ['target']]

In [3]:
# Whenever `cdr` is NaN, `mmse` is NaN as well (an observation about the data, not
# something this cell verifies), so every row with a missing `cdr` can be discarded:
# it has no target value. The count below is the number of rows where both are missing.
num_target_NaN = df[['mmse', 'cdr']].isna().all(axis=1).sum()
num_target_NaN

201

In [4]:
seed = 42


def _features_and_target(frame):
    """Renumber the rows of `frame` from 0 and return (features, target).

    `features` is the frame without its `target` column; `target` is that column.
    """
    frame = frame.reset_index(drop=True)
    return frame.drop(columns=['target']), frame.target


# Two-stage split: hold out 20% as the test set, then hold out 25% of the remainder
# as the validation set (roughly 60/20/20 overall). Both splits use the same seed.
df_full_train, df_test = train_test_split(df_clean, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

# Separate the target from the features for every partition.
df_full_train, y_full_train = _features_and_target(df_full_train)
df_train, y_train = _features_and_target(df_train)
df_val, y_val = _features_and_target(df_val)
df_test, y_test = _features_and_target(df_test)

# Feature-matrix aliases for the training and validation partitions.
X_train, X_val = df_train, df_val

X_train

Unnamed: 0,etiv,nwbv,m/f,age,educ,ses
0,1796,0.742,1,70,16.0,4.0
1,1376,0.701,0,77,1.0,4.0
2,1631,0.674,1,89,16.0,1.0
3,1453,0.727,0,81,2.0,0.0
4,1456,0.754,0,73,3.0,2.0
...,...,...,...,...,...,...
226,1350,0.763,0,76,3.0,2.0
227,1395,0.787,0,74,2.0,3.0
228,1596,0.817,1,59,3.0,2.0
229,1311,0.835,0,47,4.0,1.0


In [5]:
import pickle

# Create the destination folder up front so this cell also works when ./output
# does not exist yet (open() would otherwise raise FileNotFoundError).
os.makedirs('./output', exist_ok=True)

# Each split is stored as a (features, target) tuple. The `with` statement closes
# the file when its block exits, so no explicit close() call is needed afterwards.
with open('./output/train.pkl', 'wb') as f_out:
    pickle.dump((X_train, y_train), f_out)

with open('./output/val.pkl', 'wb') as f_out:
    pickle.dump((X_val, y_val), f_out)