# Tirgul 8: Dementia Prediction with Tree-based Models


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error as mse

# Read Data
[A link to an explanation of the data](https://www.kaggle.com/majedahalrwaily/dementia-classification)

In [2]:
# Load the OASIS cross-sectional table from a CSV expected in the working directory.
data = pd.read_csv('oasis_cross-sectional.csv')

# Peek at the last rows; in the displayed output these have no Educ/SES/MMSE/CDR values.
data.tail()

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
431,OAS1_0285_MR2,M,R,20,,,,,1469,0.847,1.195,2.0
432,OAS1_0353_MR2,M,R,22,,,,,1684,0.79,1.042,40.0
433,OAS1_0368_MR2,M,R,22,,,,,1580,0.856,1.111,89.0
434,OAS1_0379_MR2,F,R,20,,,,,1262,0.861,1.39,2.0
435,OAS1_0395_MR2,F,R,26,,,,,1283,0.834,1.368,39.0


| Column      | Description |
|:----------- |:----------- |
| EDUC      | Years of Education       |
| SES       | Socioeconomic Status       |
| MMSE      | Mini Mental State Examination    |
| CDR       |  Clinical Dementia Rating      |
| eTIV      | Estimated Total Intracranial Volume     |
| nWBV      | Normalized Whole Brain Volume      |
| ASF       | Atlas Scaling Factor      |

## Workflow:
- Feature selection
- Filter data
- Determine prediction label
- Label categorization
- Train/Test split

In [None]:
# Columns fed to the model: the indicator columns produced below from
# 'M/F' and 'Hand', followed by the numeric measurements.
features = [
    'M', 'F', 'R',
    'Age', 'Educ', 'SES', 'MMSE',
    'eTIV', 'nWBV', 'ASF',
]
# Kept as a one-element list so it can be concatenated with `features`.
pred_label = ['CDR']

# get_dummies expands a categorical column into one indicator column per value.
hands_df = pd.get_dummies(data['Hand'])
fm_df = pd.get_dummies(data['M/F'])
fm_df.head()

In [None]:
# Join the indicator columns onto the table.
# Any indicator columns already present (i.e. this cell was run before) are
# dropped first, so re-running the cell cannot create duplicate 'M'/'F'/'R'
# columns. The left join on the index keeps exactly the rows currently in
# `data`, in their current order.
dummy_cols = list(fm_df.columns) + list(hands_df.columns)
data = (
    data
    .drop(columns=dummy_cols, errors='ignore')
    .join(fm_df)
    .join(hands_df)
)

# Keep only the model inputs, the label and the two raw categorical columns,
# then discard every row that is missing any of them.
data = data[features + pred_label+['M/F','Hand']].dropna()
data.head()

In [None]:
# Replace each distinct CDR value with its integer category code.

cdr_categorical = data['CDR'].astype('category')
data['CDR'] = cdr_categorical.cat.codes
# 0 => 0 , 0.5 => 1 , 1 => 2 , 2 => 3
data.head()

In [None]:
# Separate the inputs from the label, then hold out 20% of the rows for testing.
X, y = data[features], data[pred_label]

# Fixed random_state so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

X.head()

# Evaluation function

In [None]:
def eval(x,y,model):
    """Print accuracy, RMSE and the predicted vs. true "sick" rate of a model.

    Parameters
    ----------
    x : features handed unchanged to ``model.predict``.
    y : true labels for ``x``; a Series, a 1-D array or a single-column
        DataFrame are all accepted.
    model : fitted estimator exposing ``predict``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.

    Notes
    -----
    Any label greater than 0 counts as "sick". The RMSE is expressed in
    units of the encoded label.
    """
    pred = np.asarray(model.predict(x)).ravel()
    # Flatten the labels: a single-column DataFrame gives an (n, 1) array, and
    # comparing that with the (n,) predictions would broadcast to an (n, n)
    # matrix, yielding a meaningless accuracy.
    y_true = np.asarray(y).ravel()
    if pred.shape != y_true.shape:
        raise ValueError(
            "predictions and labels differ in length: {} vs {}".format(
                pred.shape[0], y_true.shape[0]))

    # Computed with numpy so the result does not depend on the `squared`
    # keyword of sklearn's mean_squared_error.
    errors = pred.astype(float) - y_true.astype(float)
    rmse = np.sqrt(np.mean(errors ** 2))

    print("Acc: {:.3f}".format((pred == y_true).mean()))
    print("RMSE: {:.3f}".format(rmse))   # note for CDR cat units
    print("Classified {:.3f}% as sick, where the in the true data {:.3f}% where sick.".format(100*(pred>0).sum()/len(pred), 100*(y_true>0).sum()/len(y_true)) )


# Tree plotting function

In [None]:
import sklearn.tree as tree
def plot_tree(tree_model,feat,size=(15,10)):
    """Draw a fitted decision tree with matplotlib.

    Parameters
    ----------
    tree_model : fitted tree estimator passed to ``sklearn.tree.plot_tree``.
    feat : feature names shown in the split nodes, in the order of the
        columns the model was trained on.
    size : (width, height) of the figure in inches.
    """
    # Explicit figure/axes pair so the tree is drawn on a known axes rather
    # than whichever one happens to be current.
    fig, ax = plt.subplots(figsize=size)
    tree.plot_tree(tree_model,
                   feature_names=feat,
                   filled=True,
                   rounded=True,
                   ax=ax)
    # show() must be *called*; a bare `plt.show` only references the function.
    plt.show()
    
# about plot_tree method: https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html


# Training

## Vanilla tree

In [None]:
# Baseline: no depth or leaf-size limits are set; only the seed is fixed.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print("Depth:", model.get_depth())
eval(X_test, y_test, model)

## Max-depth = 4

In [None]:
# Same seed as the baseline, but the tree may be at most four levels deep.
model = DecisionTreeClassifier(random_state=42, max_depth=4).fit(X_train, y_train)

# Score on the held-out split, then draw the fitted tree.
eval(X_test, y_test, model)
plot_tree(model, features)

## Max-depth = 1

In [None]:
# A single split only (max_depth=1), with the same seed as before.
model = DecisionTreeClassifier(random_state=42, max_depth=1).fit(X_train, y_train)

# Score on the held-out split, then draw the fitted tree.
eval(X_test, y_test, model)
plot_tree(model, features)

The value refers to the CDR column, whose values are [0, 0.5, 1, 2], where 0 means healthy.
The results seem to be better according to the evaluation function, but if we look at the leaves, we see that most of the subjects are not sick, so 'Not Sick' is the safest guess.

![smbc.png](smbc.png)

## Min samples leaf = 5

In [None]:
# Depth is unrestricted here; instead every leaf must hold at least 5 training samples.
model = DecisionTreeClassifier(random_state=42, min_samples_leaf=5).fit(X_train, y_train)

eval(X_test, y_test, model)

# Larger canvas than the default for this plot.
plot_tree(model, features, size=(35, 20))

## Min samples split = 5

In [None]:
# Depth is unrestricted here; a node needs at least 5 samples before it may be split.
model = DecisionTreeClassifier(random_state=42, min_samples_split=5).fit(X_train, y_train)

eval(X_test, y_test, model)

# Larger canvas than the default for this plot.
plot_tree(model, features, size=(35, 20))