<h1><center>Severity Levels Creation </center></h1>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the processed dataset. It is a single CSV; the train/test split is
# created further down in this notebook.
dataset_path = "dementia_dataset_5.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame
df.shape

(775, 18)

In [4]:
# Tiny constant that keeps the Severity_Score denominator away from zero
epsilon = 1e-6

# Pauses relative to the number of key elements described; the +1 keeps the
# denominator non-zero when no key element was described
df["Disfluency_Ratio"] = df["Pauses"] / (df["Key_Elements_Described"] + 1)

# Composite severity score per row:
# (Pauses + Repair_Rate + Irrelevant_Details) / (Parse_Tree_Depth + Key_Elements_Described)
severity_numerator = df["Pauses"] + df["Repair_Rate"] + df["Irrelevant_Details"]
severity_denominator = df["Parse_Tree_Depth"] + df["Key_Elements_Described"] + epsilon
df["Severity_Score"] = severity_numerator / severity_denominator

# Weighted sum of TTR, Idea_Density and Parse_Tree_Depth (weights 0.1 / 0.1 / 0.05)
df["Cognitive_Expression_Score"] = (
    0.1 * df["TTR"] + 0.1 * df["Idea_Density"] + 0.05 * df["Parse_Tree_Depth"]
)


In [5]:
# Engineered columns that feed the severity clustering
selected_features = ["Cognitive_Expression_Score", "Disfluency_Ratio", "Severity_Score"]

In [6]:
# Keep only the rows labelled as dementia (Class_label == 1). The .copy()
# detaches the subset so later column assignments do not touch df.
is_dementia = df["Class_label"] == 1
dementia_df = df[is_dementia].copy()

In [7]:
# Feature matrix for clustering (dementia rows only)
X = dementia_df[selected_features]

# Standardise every feature so that no single column dominates the
# distance-based clustering that follows
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [8]:
# Partition the scaled features into three clusters; the fixed random_state
# keeps the result repeatable across runs
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)
dementia_df["Severity_Cluster"] = kmeans.labels_


In [9]:
# Report how many dementia rows landed in each KMeans cluster
Severity_counts = dementia_df["Severity_Cluster"].value_counts()
print("Number of entries per cluster:", Severity_counts, sep="\n")

Number of entries per cluster:
Severity_Cluster
0    266
2     99
1     50
Name: count, dtype: int64


In [10]:
# Per-cluster average of each clustering feature, used to interpret the clusters
features_by_cluster = dementia_df.groupby("Severity_Cluster")[selected_features]
cluster_means = features_by_cluster.mean()
print("Cluster means (for interpretation):", cluster_means, sep="\n")


Cluster means (for interpretation):
                  Cognitive_Expression_Score  Disfluency_Ratio  Severity_Score
Severity_Cluster                                                              
0                                   2.388518          0.368421        0.085169
1                                   2.289520          8.570000        1.981019
2                                   6.600171          0.634680        0.196063


In [11]:
# Map clusters to severity labels.
# KMeans cluster ids are arbitrary: they depend on centroid initialisation and
# can be permuted by a change in the data or the library version. Instead of
# hard-coding the ids, rank the clusters by their mean Severity_Score so the
# lowest-scoring cluster becomes "1" and the highest becomes "3".
# On the cluster means printed above this yields {0: "1", 2: "2", 1: "3"},
# i.e. the same assignment as the previous hand-written mapping.
clusters_by_severity = (
    dementia_df.groupby("Severity_Cluster")["Severity_Score"]
    .mean()
    .sort_values()
    .index
)
cluster_to_severity = {
    int(cluster_id): str(level)
    for level, cluster_id in enumerate(clusters_by_severity, start=1)
}
# Labels stay strings so they match the "0" label given to the control group
dementia_df["Severity"] = dementia_df["Severity_Cluster"].map(cluster_to_severity)


In [12]:
# Control rows (Class_label == 0) are not clustered; each one receives the
# severity label "0" before being recombined with the dementia rows
control_df = df[df["Class_label"] == 0].assign(Severity="0")


In [13]:
# Stack the control rows on top of the dementia rows with a fresh 0..n-1 index.
# control_df has no Severity_Cluster column, so those rows get NaN there.
frames_to_stack = [control_df, dementia_df]
final_df = pd.concat(frames_to_stack, ignore_index=True)


In [14]:
# Mean silhouette coefficient of the clustering, computed on the scaled
# features and the cluster ids assigned to the dementia rows
cluster_labels = dementia_df["Severity_Cluster"]
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.607553743476794


In [15]:
# Remove the binary class label and the intermediate clustering columns so the
# exported data carries the Severity label instead.
# Rebinding final_df (rather than inplace=True) together with errors="ignore"
# makes this cell safe to re-run: a second execution is a no-op, not a KeyError.
helper_columns = [
    "Class_label",
    "Severity_Cluster",
    "Disfluency_Ratio",
    "Severity_Score",
    "Cognitive_Expression_Score",
]
final_df = final_df.drop(columns=helper_columns, errors="ignore")


In [16]:
# Split the final dataset into 80% train and 20% test.
# The severity levels are imbalanced, so stratify on "Severity" to keep each
# level at the same proportion in both splits; random_state keeps the split
# reproducible.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
    stratify=final_df["Severity"],
)

# Print the shapes to verify the split
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

# Save train and test datasets separately; index=False keeps the shuffled row
# index out of the CSV files
train_df.to_csv("dementia_dataset_6.csv", index=False)
test_df.to_csv("test_dataset_speech.csv", index=False)

print("Results saved to 'dementia_dataset_6.csv', and 'test_dataset_speech.csv'.")

Train dataset shape: (620, 18)
Test dataset shape: (155, 18)
Results saved to 'dementia_dataset_6.csv', and 'test_dataset_speech.csv'.


In [17]:
# Report how many rows of the combined dataset carry each severity label
# ("0" = control, "1"-"3" = dementia severity levels)
Severity_counts = final_df["Severity"].value_counts()
print("Number of entries per cluster:", Severity_counts, sep="\n")

Number of entries per cluster:
Severity
0    360
1    266
2     99
3     50
Name: count, dtype: int64
