In [1]:
import pandas as pd
import os 
import numpy as np
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, root_mean_squared_error, mutual_info_score


In [2]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("MRI_exp")

2024/08/12 10:51:27 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/08/12 10:51:28 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='file:///c:/Users/SebastianSy/Desktop/240606_Backup_Gateway_Final/Administration/Other/Sinn/StartUp_and_Job/Programmieren/MLOps_Zoomcamp/MRI_project/mlruns/1', creation_time=1723452691959, experiment_id='1', last_update_time=1723452691959, lifecycle_stage='active', name='MRI_exp', tags={}>

In [6]:
#There are two datasets - one in which a single MRI scan was done on every patient (cross-sectional dataset) and one in which each patient had \
#several MRI scans over time (longitudinal). In a first step, we combine both datasets, to use all measurements possible.  

cwd = os.getcwd()
cwd.replace('\\','/')

path_cross = cwd+'\\data\\oasis_cross-sectional.csv'
path_long = cwd+'\\data\\oasis_cross-sectional.csv'

df_cross = pd.read_csv(path_cross,sep=",", engine="python", on_bad_lines="skip") 
df_long = pd.read_csv(path_long,sep=",", engine="python", on_bad_lines="skip") 

#cleaning columns
df_cross.columns = df_cross.columns.str.replace(' ', '_').str.lower()
df_long.columns = df_long.columns.str.replace(' ', '_').str.lower()

new_column_names = {
                    'subject_id': 'id',
                    'mr_delay': 'delay'
                    }

df_long.rename(columns=new_column_names, inplace=True)

new_column_order = ['id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
        'nwbv', 'asf', 'delay', 'mri_id', 'group', 'visit']

#df_long = df_long[new_column_order]

df_cross.columns, df_long.columns


(Index(['id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
        'nwbv', 'asf', 'delay'],
       dtype='object'),
 Index(['id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
        'nwbv', 'asf', 'delay'],
       dtype='object'))

In [3]:
df_merged = pd.merge(df_cross, df_long, on=['id', 'm/f', 'hand', 'age', 'educ', 'ses', 'mmse', 'cdr', 'etiv',
        'nwbv', 'asf', 'delay'], how='outer')
df_merged

Unnamed: 0,id,m/f,hand,age,educ,ses,mmse,cdr,etiv,nwbv,asf,delay,mri_id,group,visit
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,,,,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.810,1.531,,,,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,,,,
3,OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,,,,
4,OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.010,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804,OAS2_0185,M,R,82,16.0,1.0,28.0,0.5,1693,0.694,1.037,842.0,OAS2_0185_MR2,Demented,2.0
805,OAS2_0185,M,R,86,16.0,1.0,26.0,0.5,1688,0.675,1.040,2297.0,OAS2_0185_MR3,Demented,3.0
806,OAS2_0186,F,R,61,13.0,2.0,30.0,0.0,1319,0.801,1.331,0.0,OAS2_0186_MR1,Nondemented,1.0
807,OAS2_0186,F,R,63,13.0,2.0,30.0,0.0,1327,0.796,1.323,763.0,OAS2_0186_MR2,Nondemented,2.0


In [5]:
df_merged.isnull().sum()

id          0
m/f         0
hand        0
age         0
educ      201
ses       239
mmse      203
cdr       201
etiv        0
nwbv        0
asf         0
delay     416
mri_id    436
group     436
visit     436
dtype: int64

In [192]:
df2 = df.drop(columns=['hand', 'delay', 'id'])
df_sorted = df2.sort_values(by='cdr')
df_cleaned = df_sorted.dropna(subset=['cdr'])
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned['m/f'] = (df_cleaned['m/f'] == "M").astype(int)
df_cleaned['cdr'] = (df_cleaned['cdr'] > 0).astype(int)
df_cleaned = df_cleaned.rename(columns={"m/f":"sex", "cdr": "dementia"})
df_cleaned=df_cleaned.fillna(0)

df_cleaned

Unnamed: 0,sex,age,educ,ses,mmse,dementia,etiv,nwbv,asf
0,0,74,2.0,3.0,29.0,0,1344,0.743,1.306
1,0,80,4.0,2.0,29.0,0,1341,0.737,1.309
2,0,85,5.0,1.0,29.0,0,1264,0.705,1.388
3,0,71,5.0,1.0,30.0,0,1426,0.737,1.231
4,1,70,5.0,1.0,30.0,0,1660,0.739,1.057
...,...,...,...,...,...,...,...,...,...
230,1,75,4.0,1.0,20.0,1,1613,0.715,1.088
231,0,78,1.0,4.0,23.0,1,1461,0.715,1.201
232,0,84,3.0,2.0,28.0,1,1402,0.695,1.252
233,0,78,3.0,3.0,15.0,1,1401,0.703,1.253


In [193]:
#MRI = ["etiv", "nwbv", "asf"]
MRI = ["etiv", "nwbv"]
general_info = ["sex", "age", "educ", "ses"]
exam = ["mmse"]

#df_cleaned = df_cleaned[MRI + ["dementia"]]
#df_cleaned = df_cleaned[general_info + ["dementia"]]
#df_cleaned = df_cleaned[exam + ["dementia"]]
df_cleaned = df_cleaned[exam + ["dementia"]]
df_cleaned

Unnamed: 0,mmse,dementia
0,29.0,0
1,29.0,0
2,29.0,0
3,30.0,0
4,30.0,0
...,...,...
230,20.0,1
231,23.0,1
232,28.0,1
233,15.0,1


In [194]:
seed = 42
df_full_train, df_test = train_test_split(df_cleaned, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_full_train = df_full_train.dementia
y_train = df_train.dementia
y_val = df_val.dementia
y_test = df_test.dementia

df_full_train = df_full_train.drop(columns = ['dementia'])
df_train = df_train.drop(columns = ['dementia'])
df_val = df_val.drop(columns = ['dementia'])
df_test = df_test.drop(columns = ['dementia'])


In [195]:
# dv = DictVectorizer(sparse=False)

# train_dict = df_train.to_dict(orient='records') #turns the df into a dictionary
# X_train = dv.fit_transform(train_dict)

# val_dict = df_val.to_dict(orient='records')
# X_val = dv.transform(val_dict) #note that this is transform instead of fit_transform


X_train = df_train
X_val = df_val

In [196]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

In [197]:
y_pred = model.predict(X_val)
result = (y_pred==y_val)
sum(result)/len(result)

0.851063829787234