<h1><center>Creating Severity Levels </center></h1>

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, roc_auc_score
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import silhouette_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import subprocess
import sys

try:
    from tabulate import tabulate
except ImportError:
    # Install only when the package is missing, and run pip through the
    # interpreter that executes this kernel so the package lands in the same
    # environment the notebook imports from (a bare "pip" on PATH may belong
    # to a different Python installation).
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tabulate"])
    from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Read the dataset and discard the columns the analysis does not need,
# producing the working frame in one chained expression.
data = pd.read_csv("dementia_dataset_1.csv").drop(
    columns=["PatientID", "DoctorInCharge"]
)


### Preprocessing

In [4]:
# Numerical features: every column except the "Diagnosis" target that has
# more than 5 distinct values.
num_cols = [
    name
    for name, n_unique in data.nunique().items()
    if name != "Diagnosis" and n_unique > 5
]

# Categorical features: whatever remains once the numerical columns and the
# target are removed (Index.difference returns the names sorted).
cat_cols = data.columns.difference(num_cols + ["Diagnosis"]).to_list()


In [5]:
# Preview the first five values of the MMSE column.
data["MMSE"].head(n=5)

0    21.463532
1    20.613267
2     7.356249
3    13.991127
4    13.517609
Name: MMSE, dtype: float64

### Clustering Function

In [6]:
def process_cluster(data, cluster_features, target_column="Diagnosis", n_clusters=3):
    """Split the positive-diagnosis rows into MMSE-ordered severity levels.

    Only rows where ``data[target_column] == 1`` are clustered with K-Means.
    The resulting clusters are ranked by their mean ``"MMSE"`` value: the
    cluster with the highest mean becomes severity 1, the next one severity 2,
    and so on. Rows that are not clustered keep their original
    ``target_column`` value in the new ``"Severity"`` column.

    The per-cluster MMSE means are printed as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is not modified.
        Must contain ``target_column``, ``"MMSE"`` and every column named in
        ``cluster_features``.
    cluster_features : list of str
        Columns handed to K-Means. They are used as-is; no scaling is applied
        here.
    target_column : str, default "Diagnosis"
        Column whose value ``1`` marks the rows to cluster. It is removed from
        the returned frame.
    n_clusters : int, default 3
        Number of K-Means clusters, i.e. the number of severity levels.

    Returns
    -------
    dict
        ``"clustered_data"``: copy of ``data`` with a ``"Severity"`` column
        and without ``target_column``.
        ``"silhouette_score"``: silhouette score of the clustering, computed
        on the clustered rows and ``cluster_features`` only.
    """
    # Work on a copy so the caller's frame stays untouched.
    data_clustering = data.copy()

    # Cluster ONLY the rows flagged with target_column == 1.
    dementia_mask = data_clustering[target_column] == 1
    dementia_rows = data_clustering.loc[dementia_mask]
    dementia_features = dementia_rows[cluster_features]

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(dementia_features)

    # Rank clusters by mean MMSE, highest first; computed once and reused for
    # both the printed report and the severity mapping.
    mmse_means = (
        dementia_rows.groupby(clusters)["MMSE"].mean().sort_values(ascending=False)
    )
    print("MMSE mean of each cluster:")
    print(mmse_means, "\n")

    # Build the mapping from the ranked index instead of hard-coding three
    # entries, so any n_clusters value (and any number of labels K-Means
    # actually produced) is handled.
    severity_mapping = {
        cluster: level for level, cluster in enumerate(mmse_means.index, start=1)
    }

    # Replace the target value of the clustered rows with their severity level.
    y = data_clustering[target_column].copy()
    y[dementia_mask] = [severity_mapping[c] for c in clusters]

    data_clustering["Severity"] = y
    data_clustering = data_clustering.drop(columns=[target_column])

    # Score the clustering on exactly the feature matrix K-Means was fitted on.
    silhouette_avg = silhouette_score(dementia_features, clusters)

    return {
        "clustered_data": data_clustering,
        "silhouette_score": silhouette_avg,
    }

### Testing Various Feature Sets for Clustering

In [7]:
# Configuration 1: cluster on every column except the "Diagnosis" target.
cluster_features = list(data.columns[data.columns != "Diagnosis"])

# Run the clustering with the default target column and cluster count.
clustered_data = process_cluster(data=data, cluster_features=cluster_features)

# Report the clustering quality.
print("Silhouette Score:", clustered_data["silhouette_score"])


MMSE mean of each cluster:
1    12.626733
2    11.981645
0    11.407447
Name: MMSE, dtype: float64 

Silhouette Score: 0.25946257194807976


In [8]:
# Configuration 2: cluster on MMSE together with ADL.
cluster_features_2 = ["MMSE", "ADL"]
clustered_data_2 = process_cluster(data=data, cluster_features=cluster_features_2)

# Report the clustering quality.
print("Silhouette Score:", clustered_data_2["silhouette_score"])


MMSE mean of each cluster:
0    20.957385
2    12.664652
1     4.583337
Name: MMSE, dtype: float64 

Silhouette Score: 0.4292872177446522


In [9]:
# Configuration 3: cluster on MMSE together with FunctionalAssessment.
cluster_features_3 = ["MMSE", "FunctionalAssessment"]
clustered_data_3 = process_cluster(data=data, cluster_features=cluster_features_3)

# Report the clustering quality.
print("Silhouette Score:", clustered_data_3["silhouette_score"])

MMSE mean of each cluster:
0    21.080635
2    12.860176
1     4.639195
Name: MMSE, dtype: float64 

Silhouette Score: 0.4415287253461713


In [10]:
# Configuration 4: cluster on MMSE alone.
cluster_features_4 = ["MMSE"]
clustered_data_4 = process_cluster(data=data, cluster_features=cluster_features_4)

# Report the clustering quality.
print("Silhouette Score:", clustered_data_4["silhouette_score"])

MMSE mean of each cluster:
2    20.904580
0    12.492109
1     4.486073
Name: MMSE, dtype: float64 

Silhouette Score: 0.5858768435462162


### Considering the silhouette scores and the mean MMSE of each cluster, clustering configuration 4 (MMSE only) is selected.