In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import ttest_ind
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from dash import Input, Output, dcc, html
from dash import Dash

In [2]:
# Move into the project folder so relative paths (e.g. the dataset CSV) resolve.
# The folder can be overridden with the PARKINSONS_PROJECT_DIR environment
# variable; the original author's location is kept as the fallback.
project_dir = os.environ.get(
    "PARKINSONS_PROJECT_DIR",
    r"C:\Users\USER\Desktop\My Project\Parkinson's Disease Telemonitoring - Clustering",
)
# Only change directory when the folder actually exists, so the notebook keeps
# running from its current folder on other machines instead of failing here.
if os.path.isdir(project_dir):
    os.chdir(project_dir)
os.getcwd()

"C:\\Users\\USER\\Desktop\\My Project\\Parkinson's Disease Telemonitoring - Clustering"

In [3]:
# create a function to wrangle the data
def wrangle(filepath):
    """Load the dataset from CSV and remove saved-index artifact columns.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Source of the CSV data; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The CSV contents without any column whose name starts with
        ``"Unnamed:"``.
    """
    df = pd.read_csv(filepath)

    # A CSV written together with its index has a header-less first column,
    # which pandas names "Unnamed: 0". It is only a row counter; if it stays in
    # the frame it ends up being treated as a feature by downstream code.
    index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed:")]
    df = df.drop(columns=index_artifacts)

    return df

In [4]:
# Load the dataset (path is relative to the working directory), report its
# dimensions, then preview the first rows as the cell output.
df = wrangle(filepath="parkinsons.csv")
print("Shape:", df.shape)
df.head()

Shape: (5875, 22)


Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [5]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched
voice = df.copy(deep=True)

In [6]:
# Columns excluded from the clustering input; every other column of `voice`
# is kept as a feature.
excluded_columns = [
    "subject#", "sex", "test_time", "total_UPDRS",
    "motor_UPDRS", "age", "Jitter(Abs)", "Jitter:RAP",
    "Shimmer:APQ3", "Shimmer:APQ5", "Shimmer:APQ11",
    "RPDE", "DFA",
]
features = voice.drop(columns=excluded_columns)

# `X` is the feature matrix read by the modelling functions below
X = features

In [7]:
# Build the Dash application object and describe its page layout

## Instantiate the application
app = Dash(__name__)

## Theme colors shared by the layout and by the figures
colors = dict(
    background="#1e1e1e",  # Charcoal Grey
    text="#ffffff",        # White
    accent="#2ecc71",      # Emerald Green
)

## App layout: headings, the k slider, the metrics area and three graphs
app.layout = html.Div(
    style={"backgroundColor": colors["background"]},
    children=[
        # Centered title and subtitle
        html.H1(
            "Parkinson's Disease Progression",
            style={"textAlign": "center", "color": colors["text"]},
        ),
        html.H2(
            "A Clustering Analysis of Patients' Vocal Features",
            style={"textAlign": "center", "color": colors["text"]},
        ),

        # Section heading and label for the slider
        html.H2("K-Means Clustering", style={"color": colors["text"]}),
        html.H3("Number of Clusters (k)", style={"color": colors["text"]}),

        # Slider selecting k; "drag" fires the callbacks while the handle moves
        dcc.Slider(
            id="k-slider",
            min=2,
            max=12,
            step=1,
            value=2,
            tooltip={"placement": "bottom", "always_visible": True},
            updatemode="drag",
        ),

        # Container filled by the metrics callback
        html.Div(id="metrics", style={"color": colors["accent"]}),
        # PCA scatter plot
        dcc.Graph(id="pca-scatter"),
        # Bar chart of mean feature values per cluster
        dcc.Graph(id="kind-bar"),
        # Bar chart of mean UPDRS scores per cluster
        dcc.Graph(id="kind-bar2"),
    ],
)


In [8]:
# Build the clustering pipeline and, optionally, compute its quality metrics
def get_model_metrics(k=2, return_metrics=False):
    """Fit a ``StandardScaler`` + ``KMeans`` pipeline on the feature matrix ``X``.

    Parameters
    ----------
    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False``, returns the fitted pipeline. If ``True``, returns a
        ``dict`` with the inertia and the silhouette score.

    Returns
    -------
    sklearn.pipeline.Pipeline or dict
        The fitted pipeline, or ``{"inertia": ..., "silhouette": ...}`` with
        the inertia rounded to an integer and the silhouette score rounded to
        three decimals.
    """
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=k, random_state=42)
    )
    model.fit(X)

    if not return_metrics:
        return model

    kmeans = model.named_steps["kmeans"]

    # KMeans was fitted on the standardized features, so the silhouette score
    # has to be measured in that same space. Scoring the raw `X` would let the
    # largest-scale columns dominate the distances and would not describe the
    # clustering that was actually fitted.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    ss = silhouette_score(X_scaled, kmeans.labels_)

    return {
        "inertia": round(kmeans.inertia_),
        "silhouette": round(float(ss), 3)
    }

In [9]:
# Callback: refresh the "metrics" container whenever the slider value changes
@app.callback(
    Output("metrics", "children"),
    Input("k-slider", "value")
)
def serve_metrics(k=2):
    """Return two H3 elements showing the inertia and silhouette score
    of the KMeans model built with ``k`` clusters."""

    scores = get_model_metrics(k=k, return_metrics=True)

    # One caption per metric, each rendered in the accent color
    captions = (
        f"Inertia: {scores['inertia']}",
        f"Silhouette Score: {scores['silhouette']}",
    )
    return [
        html.H3(caption, style={'color': colors["accent"]})
        for caption in captions
    ]


In [10]:
# Create a function that returns the 2-component PCA projection of the
# features together with the KMeans cluster label of every row
def get_pca_labels(k=2):
    """Project the features onto two principal components and attach labels.

    Parameters
    ----------
    k : int, default=2
        Number of clusters used for the ``KMeans`` labels.

    Returns
    -------
    pd.DataFrame
        Columns ``"PC1"``, ``"PC2"`` and ``"labels"`` (cluster id as a
        string), with rows ordered by numeric cluster id.
    """
    # Fit the clustering pipeline first so its scaler can be reused
    model = get_model_metrics(k=k, return_metrics=False)

    # Run PCA in the same standardized space the clusters were formed in;
    # on the raw features the components would follow the largest-scale
    # columns rather than the geometry KMeans saw.
    X_scaled = model.named_steps["standardscaler"].transform(X)

    # Build transformer and reduce to two components
    transformer = PCA(n_components=2, random_state=42)
    X_pca = pd.DataFrame(transformer.fit_transform(X_scaled), columns=["PC1", "PC2"])

    # Sort on the integer ids before converting to strings: sorting the
    # strings would put "10" ahead of "2" once k reaches 10 or more.
    X_pca["cluster_id"] = model.named_steps["kmeans"].labels_
    X_pca = X_pca.sort_values("cluster_id", kind="stable")
    X_pca["labels"] = X_pca.pop("cluster_id").astype(str)

    return X_pca

In [11]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("k-slider", "value")
)
def serve_scatter_plot(k):
    """Return a 2D scatter of the PCA-reduced data, colored by KMeans label."""
    background = colors["background"]
    text_color = colors["text"]
    accent = colors["accent"]

    clustered_points = get_pca_labels(k=k)
    scatter = px.scatter(
        data_frame=clustered_points,
        x="PC1",
        y="PC2",
        color="labels",
        title="PCA Representation of Clusters",
        color_discrete_sequence=px.colors.qualitative.Set2,  # soft but visible colors
    )

    # Apply the dark theme: shared background, white text, accent-colored title
    theme = {
        "plot_bgcolor": background,
        "paper_bgcolor": background,
        "font": {"color": text_color},
        "title": {"font": {"color": accent}},
        "xaxis_title": "PC1",
        "yaxis_title": "PC2",
        "xaxis": {"color": text_color},
        "yaxis": {"color": text_color},
    }
    scatter.update_layout(**theme)
    return scatter


In [12]:
# Helper returning the per-cluster mean of every feature as a dataframe
def get_cluster_mean(k=2):
    """Compute the mean of each feature in ``X`` within every cluster.

    Parameters
    ----------
    k : int, default=2
        Number of clusters.

    Returns
    -------
    pd.DataFrame
        One row per cluster label, one column per feature.
    """
    fitted = get_model_metrics(k=k, return_metrics=False)
    cluster_ids = fitted.named_steps["kmeans"].labels_
    return X.groupby(cluster_ids).mean()

In [13]:
@app.callback(
    Output("kind-bar", "figure"),
    Input("k-slider", "value")
)
def serve_cluster_mean(k):
    """Return a grouped bar chart of the mean feature values per cluster."""
    cluster_means = get_cluster_mean(k=k)
    bar_chart = px.bar(
        data_frame=cluster_means,
        barmode="group",
        title="Mean Features of Each Cluster",
    )

    # Dark theme with accent-colored title and legend title
    accent_font = {"font": {"color": colors["accent"]}}
    theme = {
        "plot_bgcolor": colors["background"],
        "paper_bgcolor": colors["background"],
        "font": {"color": colors["text"]},
        "title": dict(accent_font),
        "legend_title": dict(accent_font),
    }
    bar_chart.update_layout(**theme)

    return bar_chart


In [14]:
# Helper returning the mean UPDRS scores of every cluster
def get_updrs_scores(k=2):
    """Average ``motor_UPDRS`` and ``total_UPDRS`` within each cluster.

    Parameters
    ----------
    k : int, default=2
        Number of clusters.

    Returns
    -------
    pd.DataFrame
        Transposed summary: one row per UPDRS variable, one column per
        cluster label.
    """
    updrs_columns = ["motor_UPDRS", "total_UPDRS"]

    fitted = get_model_metrics(k=k, return_metrics=False)
    cluster_ids = fitted.named_steps["kmeans"].labels_

    per_cluster = voice.groupby(cluster_ids)[updrs_columns].agg("mean")
    return per_cluster.T

In [15]:
@app.callback(
    Output("kind-bar2", "figure"),
    Input("k-slider", "value")
)
def serve_updrs_scores(k=2):
    """Return a grouped bar chart of the mean UPDRS scores per cluster."""
    accent = colors["accent"]

    updrs_means = get_updrs_scores(k=k)
    bar_chart = px.bar(
        data_frame=updrs_means,
        barmode="group",
        title="Mean UPDRS Scores for Each Patient Cluster",
    )

    # Dark theme; title, axis titles and legend title use the accent color
    theme = {
        "plot_bgcolor": colors["background"],
        "paper_bgcolor": colors["background"],
        "font": {"color": colors["text"]},
        "title": {"font": {"color": accent}},
        "xaxis": {"title": {"text": "UPDRS Variables", "font": {"color": accent}}},
        "yaxis": {"title": {"text": "Scores", "font": {"color": accent}}},
        "legend_title": {"text": "Patient Clusters", "font": {"color": accent}},
    }
    bar_chart.update_layout(**theme)

    return bar_chart


In [16]:
# Call the callback function directly (outside the running app) to preview
# the UPDRS bar chart for k=2 as this cell's output.
serve_updrs_scores(k=2)

In [17]:
# Start the Dash server for the app defined above.
# NOTE(review): debug=True enables Dash's development tooling; presumably
# this is meant for local use only - confirm before sharing or deploying.
app.run(debug=True)