<h1><center>Creating Severity Levels </center></h1>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, roc_auc_score
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import silhouette_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import subprocess
import sys

# tabulate is only needed for pretty-printing the results table. Install it on
# demand, and only when it is actually missing, so a normal re-run does not
# shell out to pip. `sys.executable -m pip` installs into the interpreter that
# is running this kernel rather than whichever `pip` happens to be on PATH.
try:
    from tabulate import tabulate
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tabulate"])
    from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Columns retained for the analysis; every other column of the CSV is discarded.
columns_to_keep = [
    "Age",
    "Gender",
    "BMI",
    "FamilyHistoryAlzheimers",
    "Hypertension",
    "CardiovascularDisease",
    "MMSE",
    "ADL",
    "FunctionalAssessment",
    "MemoryComplaints",
    "BehavioralProblems",
    "Diagnosis",
]

# Read the raw CSV and narrow it to the retained columns in one step.
data = pd.read_csv("dementia_dataset_1.csv")[columns_to_keep]



In [3]:
# Preview the first rows of the reduced frame to confirm the column selection.
data.head()

Unnamed: 0,Age,Gender,BMI,FamilyHistoryAlzheimers,Hypertension,CardiovascularDisease,MMSE,ADL,FunctionalAssessment,MemoryComplaints,BehavioralProblems,Diagnosis
0,73,0,22.927749,0,0,0,21.463532,1.725883,6.518877,0,0,0
1,89,0,26.827681,0,0,0,20.613267,2.592424,7.118696,0,0,0
2,73,0,17.795882,1,0,0,7.356249,7.119548,5.895077,0,0,0
3,74,1,33.800817,0,0,0,13.991127,6.481226,8.965106,0,1,0
4,89,0,20.716974,0,0,0,13.517609,0.014691,6.045039,0,0,0


### Preprocessing

In [4]:
# Number of distinct values per candidate feature (the 'Diagnosis' target is left out).
feature_cardinality = {
    col: data[col].nunique() for col in data.columns if col != "Diagnosis"
}

# Numerical features: those with more than 5 distinct values.
num_cols = [col for col, n_unique in feature_cardinality.items() if n_unique > 5]

# Categorical features: everything that is neither numerical nor the target.
cat_cols = data.columns.difference(num_cols + ["Diagnosis"]).to_list()


In [5]:
# Peek at the MMSE column, which is later used to rank the clusters by severity.
data['MMSE'].head()

0    21.463532
1    20.613267
2     7.356249
3    13.991127
4    13.517609
Name: MMSE, dtype: float64

### Clustering Function

In [6]:
def process_cluster(data, cluster_features, target_column="Diagnosis", n_clusters=3):
    """Turn a binary target into ordered severity levels via K-Means.

    Only the rows where ``data[target_column] == 1`` are clustered, using the
    columns in ``cluster_features``. The resulting clusters are ranked by their
    mean ``MMSE`` from highest to lowest and numbered ``1..n_clusters`` in that
    order, so level 1 is the cluster with the highest mean MMSE. Rows whose
    target is not 1 keep their original target value.

    The per-cluster MMSE means are printed as a side effect. ``data`` itself
    is not modified; the function works on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``target_column``, an ``"MMSE"`` column and every column
        named in ``cluster_features``.
    cluster_features : list of str
        Columns fed to K-Means.
    target_column : str, default "Diagnosis"
        Column whose value 1 marks the rows to cluster.
    n_clusters : int, default 3
        Number of clusters, i.e. number of severity levels.

    Returns
    -------
    dict
        ``"clustered_data"``: copy of ``data`` with ``target_column`` replaced
        by a ``"Severity"`` column; ``"silhouette_score"``: silhouette score of
        the clustering computed on ``cluster_features``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 2.
    """
    if n_clusters < 2:
        raise ValueError("n_clusters must be at least 2")

    # Work on a copy so the caller's frame is left untouched.
    data_clustering = data.copy()

    # Cluster ONLY the rows flagged by the target column.
    dementia_mask = data_clustering[target_column] == 1
    dementia_rows = data_clustering.loc[dementia_mask]
    dementia_features = dementia_rows[cluster_features]

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(dementia_features)

    # Rank clusters by mean MMSE, highest first (computed once and reused for
    # both the printout and the severity mapping).
    mmse_means = (
        dementia_rows.groupby(clusters)["MMSE"].mean().sort_values(ascending=False)
    )
    print("MMSE mean of each cluster:")
    print(mmse_means, "\n")

    # Severity level = rank in that ordering. Built from the ranking itself so
    # it works for any number of clusters instead of exactly three.
    severity_mapping = {
        cluster: level for level, cluster in enumerate(mmse_means.index, start=1)
    }

    # Replace the target value of the clustered rows with their severity level.
    y = data_clustering[target_column].copy()
    y.loc[dementia_mask] = [severity_mapping[c] for c in clusters]

    data_clustering["Severity"] = y
    data_clustering = data_clustering.drop(columns=[target_column])

    # Cluster separation, measured on the same features that were clustered.
    silhouette_avg = silhouette_score(dementia_features, clusters)

    return {
        "clustered_data": data_clustering,
        "silhouette_score": silhouette_avg,
    }

### Testing Various Clusters

In [7]:
# Cluster on every column except the 'Diagnosis' target.
cluster_features = list(data.columns[data.columns != "Diagnosis"])

# Run the clustering with the default number of clusters.
clustered_data = process_cluster(data, cluster_features)

# Report how well separated the resulting clusters are.
print("Silhouette Score:", clustered_data["silhouette_score"])


MMSE mean of each cluster:
1    19.034165
0    12.206746
2     5.947568
Name: MMSE, dtype: float64 

Silhouette Score: 0.2388906321196244


In [11]:
# Second configuration: MMSE, FunctionalAssessment and ADL only.
cluster_features_2 = ["MMSE", "FunctionalAssessment", "ADL"]
clustered_data_2 = process_cluster(data, cluster_features=cluster_features_2)

# Report how well separated the resulting clusters are.
print("Silhouette Score:", clustered_data_2["silhouette_score"])


MMSE mean of each cluster:
2    20.952587
0    12.617803
1     4.542357
Name: MMSE, dtype: float64 

Silhouette Score: 0.34887179487214776


In [12]:
# Third configuration: MMSE and FunctionalAssessment only.
cluster_features_3 = ["MMSE", "FunctionalAssessment"]
clustered_data_3 = process_cluster(data, cluster_features=cluster_features_3)

# Report how well separated the resulting clusters are.
print("Silhouette Score:", clustered_data_3["silhouette_score"])

MMSE mean of each cluster:
0    21.080635
2    12.860176
1     4.639195
Name: MMSE, dtype: float64 

Silhouette Score: 0.4415287253461713


In [13]:
# Fourth configuration: MMSE alone.
cluster_features_4 = ["MMSE"]
clustered_data_4 = process_cluster(data, cluster_features=cluster_features_4)

# Report how well separated the resulting clusters are.
print("Silhouette Score:", clustered_data_4["silhouette_score"])

MMSE mean of each cluster:
2    20.904580
0    12.492109
1     4.486073
Name: MMSE, dtype: float64 

Silhouette Score: 0.5858768435462162


### Considering the silhouette scores and the per-cluster mean MMSE, the fourth configuration (clustering on MMSE only, `clustered_data_4`) is selected.

### Evaluation Function

In [15]:
def evaluate_model(X_train, y_train, X_test, y_test,name, model):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Data the model is fitted on; also the data handed to the 5-fold
        cross-validation.
    X_test, y_test
        Held-out data used for every other metric.
    name : str
        Label stored under the ``"model"`` key of the result.
    model
        Estimator exposing ``fit`` and ``predict``; ``predict_proba`` is
        optional.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``classification_report``, ``confusion_matrix`` and
        ``cross_validation_scores``. Accuracy, precision, recall, F1, ROC AUC
        and the cross-validation scores are multiplied by 100. Precision,
        recall and F1 use weighted averaging; ROC AUC uses one-vs-rest and is
        ``None`` when the model has no ``predict_proba``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report = {"model": name}
    report["accuracy"] = accuracy_score(y_test, predictions) * 100

    # The three class-weighted metrics share one call shape.
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in weighted_scorers:
        report[key] = scorer(y_test, predictions, average="weighted") * 100

    # ROC AUC needs class probabilities; models without predict_proba get None.
    if hasattr(model, "predict_proba"):
        report["roc_auc"] = (
            roc_auc_score(y_test, model.predict_proba(X_test), multi_class="ovr") * 100
        )
    else:
        report["roc_auc"] = None

    report["classification_report"] = classification_report(y_test, predictions)
    report["confusion_matrix"] = confusion_matrix(y_test, predictions)

    # NOTE(review): cross-validation runs on whatever training set the caller
    # passes in; if that set was resampled beforehand, the folds contain
    # resampled rows too — confirm this is intended.
    report["cross_validation_scores"] = (
        cross_val_score(model, X_train, y_train, cv=5) * 100
    )
    return report

In [16]:
# Split data into features and target
y = clustered_data_4["clustered_data"]["Severity"] 
X = clustered_data_4["clustered_data"].drop("Severity", axis=1)

In [17]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply resampling to the training data
resampler = SMOTE(random_state=42)  # You can replace SMOTE with another resampler if needed
X_train, y_train = resampler.fit_resample(X_train, y_train)

In [18]:
# Print class distribution after resampling
print("Class distribution after resampling:")
print(pd.Series(y_train).value_counts())

Class distribution after resampling:
Severity
0    1112
3    1112
1    1112
2    1112
Name: count, dtype: int64


In [19]:
# Instantiate classification models with default parameters
models = {
   "Decision Tree": DecisionTreeClassifier(),
   "Random Forest": RandomForestClassifier(),
   "K-Nearest Neighbors": KNeighborsClassifier(),
   "Support Vector Machine": SVC(),
   "Gradient Boosting Classifier": GradientBoostingClassifier(),
   "XGBClassifier": XGBClassifier(),
}

In [20]:
# Initialize a list to hold the results
all_results = []

# Initialize a dictionary to hold the confusion matrices
confusion_matrices = {}

In [21]:
# Evaluate each model
for name, model in models.items():
    results = evaluate_model(X_train, y_train, X_test, y_test, name, model)
    all_results.append(results)

In [22]:
# Create a DataFrame from the results (excluding the last three items)
results_df = pd.DataFrame([
    {
        "model": result["model"],
        "accuracy": result["accuracy"],
        "precision": result["precision"],
        "recall": result["recall"],
        "f1": result["f1"],
        "roc_auc": result["roc_auc"]
    }
    for result in all_results
])

# Sort the DataFrame by F1-score in descending order
results_df = results_df.sort_values(by="f1", ascending=False)

# Display the results
print("Model Evaluation Results:")
print(tabulate(results_df, headers="keys", tablefmt="psql", showindex=False))


Model Evaluation Results:
+------------------------------+------------+-------------+----------+---------+-----------+
| model                        |   accuracy |   precision |   recall |      f1 |   roc_auc |
|------------------------------+------------+-------------+----------+---------+-----------|
| XGBClassifier                |    81.6279 |     82.8257 |  81.6279 | 81.8801 |   95.2479 |
| Decision Tree                |    80.2326 |     81.427  |  80.2326 | 80.5345 |   85.8565 |
| Gradient Boosting Classifier |    80      |     83.0444 |  80      | 80.4434 |   95.9951 |
| Random Forest                |    80      |     80.614  |  80      | 80.1392 |   94.2635 |
| K-Nearest Neighbors          |    63.4884 |     70.2258 |  63.4884 | 64.836  |   81.5505 |
| Support Vector Machine       |    56.9767 |     68.8401 |  56.9767 | 57.7819 |  nan      |
+------------------------------+------------+-------------+----------+---------+-----------+


In [23]:
# Print the excluded items for XGBClassifier
for result in all_results:
   if result["model"] == "XGBClassifier":
      print("\nReports for XGBClassifier:")
      print("Classification Report:")
      print(result["classification_report"])
      print("\nConfusion Matrix:")
      print(result["confusion_matrix"])
      print("\nCross-Validation Scores:")
      print(result["cross_validation_scores"])
      break


Reports for XGBClassifier:
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.81      0.85       277
           1       0.66      0.72      0.69        46
           2       0.66      0.84      0.74        44
           3       0.77      0.90      0.83        63

    accuracy                           0.82       430
   macro avg       0.75      0.82      0.78       430
weighted avg       0.83      0.82      0.82       430


Confusion Matrix:
[[224  17  19  17]
 [ 13  33   0   0]
 [  7   0  37   0]
 [  6   0   0  57]]

Cross-Validation Scores:
[88.98876404 92.92134831 93.48314607 92.68841395 92.80089989]
