In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as m
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('dementia_dataset.csv')
df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [25]:
df.drop(columns=['Subject ID','MRI ID','Visit','MR Delay','Hand','eTIV','nWBV','ASF'], inplace=True)
df.head()

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR
0,Nondemented,M,87,14,2.0,27.0,0.0
1,Nondemented,M,88,14,2.0,30.0,0.0
2,Demented,M,75,12,,23.0,0.5
3,Demented,M,76,12,,28.0,0.5
4,Demented,M,80,12,,22.0,0.5


In [26]:
# assuming 'df' is the name of your pandas dataframe
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(numeric_columns)
df[numeric_columns]

['Age', 'EDUC', 'SES', 'MMSE', 'CDR']


Unnamed: 0,Age,EDUC,SES,MMSE,CDR
0,87,14,2.0,27.0,0.0
1,88,14,2.0,30.0,0.0
2,75,12,,23.0,0.5
3,76,12,,28.0,0.5
4,80,12,,22.0,0.5
...,...,...,...,...,...
368,82,16,1.0,28.0,0.5
369,86,16,1.0,26.0,0.5
370,61,13,2.0,30.0,0.0
371,63,13,2.0,30.0,0.0


In [27]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)
df[categorical_cols]

['Group', 'M/F']


Unnamed: 0,Group,M/F
0,Nondemented,M
1,Nondemented,M
2,Demented,M
3,Demented,M
4,Demented,M
...,...,...
368,Demented,M
369,Demented,M
370,Nondemented,F
371,Nondemented,F


In [28]:
# Define column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # Impute missing values with median for numeric columns
        ("num", SimpleImputer(strategy="mean"),['Age', 'EDUC', 'SES', 'MMSE'] ),
        # Encode categorical columns with one-hot encoding
        ("cat", OneHotEncoder(handle_unknown="ignore", drop='first'), [ 'M/F']),
    ]
)
preprocessor

In [29]:
dec_classifier = DecisionTreeClassifier()

# Define pipeline to combine preprocessing and modeling
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", dec_classifier)
    ]    
)

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
from sklearn.model_selection import train_test_split

# Split the data into features and target variable
X = df.drop('Group', axis=1)
y = df['Group']
yEnc = LabelEncoder()
y = yEnc.fit_transform(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
y_test

array([1, 0, 1, 2, 0, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2,
       1, 1, 2, 1, 1, 0, 2, 1, 1, 2, 2, 1, 2, 0, 2, 2, 2, 1, 2, 0, 2, 0,
       2, 1, 1, 2, 2, 1, 1, 2, 1, 0, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 0, 1,
       2, 2, 1, 0, 1, 0, 2, 2, 0])

In [33]:
yEnc.classes_

array(['Converted', 'Demented', 'Nondemented'], dtype=object)

In [34]:
X_train.head()

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR
192,M,75,12,4.0,28.0,0.0
75,M,89,12,4.0,26.0,0.5
84,F,70,17,3.0,29.0,0.0
361,F,66,13,2.0,30.0,0.0
16,M,69,12,2.0,24.0,0.5


In [35]:
pipeline.fit(X_train, y_train)

In [36]:
from joblib import dump,load

In [37]:
dump(pipeline,"dementia_ai.jb")

['dementia_ai.jb']

In [38]:
import joblib
from sklearn.metrics import accuracy_score

# Load trained model
model = joblib.load("dementia_ai.jb")

# Make predictions on test data
y_pred = model.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.68


In [39]:
from sklearn.metrics import classification_report

# Load the trained model and test data
model = load('dementia_ai.jb')
model


In [40]:

# Get the predictions from the model
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.45      0.50        11
           1       0.79      0.72      0.75        32
           2       0.62      0.72      0.67        32

    accuracy                           0.68        75
   macro avg       0.66      0.63      0.64        75
weighted avg       0.69      0.68      0.68        75



In [41]:
from sklearn.metrics import confusion_matrix

# Get the predicted labels for the test data
y_pred = model.predict(X_test)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print(cm)


[[ 5  0  6]
 [ 1 23  8]
 [ 3  6 23]]


In [42]:
pipeline.predict(X_test)

array([1, 2, 2, 2, 0, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 0, 1, 2, 1, 2, 1,
       1, 1, 2, 1, 1, 0, 1, 0, 1, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2,
       2, 1, 1, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 1, 1, 0, 1, 0, 2, 2, 0])

In [45]:
X_test.loc[0].to_dict()

{'M/F': 'M', 'Age': 87, 'EDUC': 14, 'SES': 2.0, 'MMSE': 27.0, 'CDR': 0.0}

In [47]:
pd.DataFrame([{'M/F': 'M', 'Age': 87, 'EDUC': 14, 'SES': 2.0, 'MMSE': 27.0, 'CDR': 0.0}])

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR
0,M,87,14,2.0,27.0,0.0


In [48]:
X.CDR.unique()

array([0. , 0.5, 1. , 2. ])

In [50]:
X.SES.unique().tolist()

[2.0, nan, 3.0, 4.0, 1.0, 5.0]

In [53]:
sorted(X.EDUC.unique())

[6, 8, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23]

In [59]:
yEnc.inverse_transform([0])

array(['Converted'], dtype=object)