In [1]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
from collections import Counter
from matplotlib import style
import seaborn as sns
import sqlite3
from sqlalchemy import create_engine, text
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import holoviews as hv
import hvplot.pandas

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create a DataFrame for the healthcare-dataset-stroke-data.csv. 
file_path = Path("alzheimers_disease_data.csv")
alzheimer_df = pd.read_csv(file_path, index_col="Diagnosis")
alzheimer_df.head()

Unnamed: 0_level_0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,DoctorInCharge
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,XXXConfid
0,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,XXXConfid
0,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,XXXConfid
0,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,XXXConfid
0,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,XXXConfid


---
### Data cleaning and preparation process 


In [3]:
# determine the number of rows and columns.
alzheimer_df_rc, alzheimer_df_cc = alzheimer_df.shape
print('Number of total rows:', alzheimer_df_rc)
print('Number of total columns:', alzheimer_df_cc)

Number of total rows: 2149
Number of total columns: 34


In [4]:
# Check all columns inside of the DataFrame
alzheimer_df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'DoctorInCharge'],
      dtype='object')

In [5]:
# show duplicates
duplicate = alzheimer_df[alzheimer_df.duplicated()]
print("Duplicate Rows:", len(duplicate), "\n")

Duplicate Rows: 0 



In [6]:
# Check for missing values
alzheimer_df.isna().sum()

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [7]:
# Drop all rows with missing information 
alzheimer_df = alzheimer_df.dropna(how='any')

In [8]:
# print out columns and number of unique values
for col in alzheimer_df.columns:
    print(col, alzheimer_df[col].nunique())

PatientID 2149
Age 31
Gender 2
Ethnicity 4
EducationLevel 4
BMI 2149
Smoking 2
AlcoholConsumption 2149
PhysicalActivity 2149
DietQuality 2149
SleepQuality 2149
FamilyHistoryAlzheimers 2
CardiovascularDisease 2
Diabetes 2
Depression 2
HeadInjury 2
Hypertension 2
SystolicBP 90
DiastolicBP 60
CholesterolTotal 2149
CholesterolLDL 2149
CholesterolHDL 2149
CholesterolTriglycerides 2149
MMSE 2149
FunctionalAssessment 2149
MemoryComplaints 2
BehavioralProblems 2
ADL 2149
Confusion 2
Disorientation 2
PersonalityChanges 2
DifficultyCompletingTasks 2
Forgetfulness 2
DoctorInCharge 1


In [9]:
# Drop irrelevent columns
alzheimer_df = alzheimer_df.drop(['DoctorInCharge', 'PatientID'], axis=1)
alzheimer_df.head()

Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,21.463532,6.518877,0,0,1.725883,0,0,0,1,0
0,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,20.613267,7.118696,0,0,2.592424,0,0,0,0,1
0,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,7.356249,5.895077,0,0,7.119548,0,1,0,1,0
0,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,13.991127,8.965106,0,1,6.481226,0,0,0,0,0
0,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,13.517609,6.045039,0,0,0.014691,0,0,1,1,0


In [10]:
# Create a summary of statistics
alzheimer_df.describe()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,7.051081,...,14.755132,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536
std,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,1.763573,...,8.613151,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032
min,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,4.002629,...,0.005312,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0
25%,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,5.482997,...,7.167602,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0
50%,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,7.115646,...,14.44166,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0
75%,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,8.562521,...,22.161028,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0
max,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,9.99984,...,29.991381,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0


In [11]:
# Plot the data to see what's in the DataFrame
alzheimer_df.hvplot.bar( 
    width=800,
    height=400,
    rot=90,
    title='AD Data Visualization',
    xlabel='Categories',
    ylabel='Values')

---
### Exploratory Data Analysis (EDA)



In [12]:
# Add five correlated features to the dataset.
alzheimer_cleanML_opt_df = alzheimer_df[['Ethnicity', 'Gender', 'Age', 'EducationLevel','MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL']]
alzheimer_cleanML_opt_df.head()

Unnamed: 0_level_0,Ethnicity,Gender,Age,EducationLevel,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,73,2,21.463532,6.518877,0,0,1.725883
0,0,0,89,0,20.613267,7.118696,0,0,2.592424
0,3,0,73,1,7.356249,5.895077,0,0,7.119548
0,0,1,74,1,13.991127,8.965106,0,1,6.481226
0,0,0,89,0,13.517609,6.045039,0,0,0.014691


In [13]:
# Plot the data to see what's in the DataFrame
alzheimer_cleanML_opt_df.hvplot.bar(
    width=800,
    height=400,
    rot=90
)

In [14]:
# Preprocess the data
# Select relevant features for clustering
features = ['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'MemoryComplaints', 'BehavioralProblems','MMSE', 'FunctionalAssessment', 'ADL']  
X = alzheimer_cleanML_opt_df[features]

In [15]:
data_scaled = StandardScaler().fit_transform(X)
data_scaled [0:10]

array([[-0.21236841, -1.01264391, -0.70040826,  0.78883348, -0.51247653,
        -0.4312567 ,  0.77903679,  0.49750588, -1.10443449],
       [ 1.56775727, -1.01264391, -0.70040826, -1.42278185, -0.51247653,
        -0.4312567 ,  0.68029675,  0.70490696, -0.81060109],
       [-0.21236841, -1.01264391,  2.31195467, -0.31697418, -0.51247653,
        -0.4312567 , -0.85922158,  0.28181278,  0.72449145],
       [-0.10111056,  0.98751396, -0.70040826, -0.31697418, -0.51247653,
         2.31880456, -0.08872276,  1.34334607,  0.50804427],
       [ 1.56775727, -1.01264391, -0.70040826, -1.42278185, -0.51247653,
        -0.4312567 , -0.14371176,  0.3336654 , -1.68467896],
       [ 1.23398371,  0.98751396,  0.30371272, -0.31697418, -0.51247653,
        -0.4312567 ,  1.48207863,  0.14871322,  1.36744896],
       [-0.76865769, -1.01264391,  2.31195467,  0.78883348, -0.51247653,
        -0.4312567 , -1.48536765,  0.33957317,  1.44226598],
       [ 0.0101473 , -1.01264391, -0.70040826, -0.31697418, -0

In [16]:
# Create a DataFrame with the scaled data
data_scaled_df = pd.DataFrame(
    data_scaled,
    columns=features)

data_scaled_df["Diagnosis"] = data_scaled_df.index

# Set Diagnosis column as index
data_scaled_df = data_scaled_df.set_index("Diagnosis")

data_scaled_df.head()

Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,MemoryComplaints,BehavioralProblems,MMSE,FunctionalAssessment,ADL
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-0.212368,-1.012644,-0.700408,0.788833,-0.512477,-0.431257,0.779037,0.497506,-1.104434
1,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,0.680297,0.704907,-0.810601
2,-0.212368,-1.012644,2.311955,-0.316974,-0.512477,-0.431257,-0.859222,0.281813,0.724491
3,-0.101111,0.987514,-0.700408,-0.316974,-0.512477,2.318805,-0.088723,1.343346,0.508044
4,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,-0.143712,0.333665,-1.684679


### Find the Best Value for k Using the Original Data

In [17]:
# Create a list with the number of k-values from 1 to 11
k=list(range(1,11))

In [18]:
# Create an empty list to store the inertia values
inertia=[]

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `data_scaled_df`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(data_scaled_df)
    inertia.append(k_model.inertia_)

In [19]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)
df_elbow

Unnamed: 0,k,inertia
0,1,19341.0
1,2,17628.142508
2,3,15892.38718
3,4,15288.962967
4,5,13887.541062
5,6,12549.546302
6,7,12114.270727
7,8,11716.16214
8,9,11387.172373
9,10,11119.736632


In [20]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [21]:
print('The best K value is 3')

The best K value is 3


In [22]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=3, random_state=1)

In [23]:
# Fit the K-Means model using the scaled data
model.fit(data_scaled_df)

In [24]:
# Predict the clusters using the scaled data
data_predict = model.predict(data_scaled_df)

# Print the resulting array of cluster values.
data_predict

array([1, 1, 1, ..., 1, 0, 1])

In [25]:
# Create a copy of the DataFrame
data_scaled_df_copy = data_scaled_df.copy()
data_scaled_df_copy.head()

Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,MemoryComplaints,BehavioralProblems,MMSE,FunctionalAssessment,ADL
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-0.212368,-1.012644,-0.700408,0.788833,-0.512477,-0.431257,0.779037,0.497506,-1.104434
1,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,0.680297,0.704907,-0.810601
2,-0.212368,-1.012644,2.311955,-0.316974,-0.512477,-0.431257,-0.859222,0.281813,0.724491
3,-0.101111,0.987514,-0.700408,-0.316974,-0.512477,2.318805,-0.088723,1.343346,0.508044
4,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,-0.143712,0.333665,-1.684679


In [26]:
# Add a new column to the DataFrame with the predicted clusters
data_scaled_df_copy["predicted_clusters"] = data_predict

# Display sample data
data_scaled_df_copy.head()

Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,MemoryComplaints,BehavioralProblems,MMSE,FunctionalAssessment,ADL,predicted_clusters
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-0.212368,-1.012644,-0.700408,0.788833,-0.512477,-0.431257,0.779037,0.497506,-1.104434,1
1,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,0.680297,0.704907,-0.810601,1
2,-0.212368,-1.012644,2.311955,-0.316974,-0.512477,-0.431257,-0.859222,0.281813,0.724491,1
3,-0.101111,0.987514,-0.700408,-0.316974,-0.512477,2.318805,-0.088723,1.343346,0.508044,2
4,1.567757,-1.012644,-0.700408,-1.422782,-0.512477,-0.431257,-0.143712,0.333665,-1.684679,1


In [27]:
# Create a scatter plot using hvPlot by setting 

data_scaled_df_copy.hvplot.scatter(
        x="Age",
        y="MMSE",
        by="predicted_clusters",
        hover_cols = "Diagnosis")

### Optimize Clusters with Principal Component Analysis

In [28]:
# Create a PCA model instance 

pca = PCA(n_components=3)

In [29]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components

data_pca = pca.fit_transform(data_scaled_df_copy)
# View the first five rows of the DataFrame
data_pca[0:5]

array([[-0.47676334, -0.84671854,  0.76921901],
       [-0.92856124,  1.10237578, -0.48436328],
       [ 0.63663114, -0.91043642, -1.06952619],
       [ 1.5696726 ,  1.70529366, -0.05733638],
       [-1.7662299 ,  0.81053319, -0.91218213]])

In [30]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([0.13961105, 0.11304323, 0.11056333])

In [31]:
total_explained_variance = np.sum(pca.explained_variance_ratio_)
total_explained_variance

0.36321760417972837

In [32]:
# Create a new DataFrame with the PCA data.

# Creating a DataFrame with the PCA data
data_pca_df = pd.DataFrame(
    data_pca,
    columns=["PC1", "PC2" , "PC3"])

# Set Diagnosis column as index
data_pca_df["Diagnosis"] = data_pca_df.index

# Set Diagnosis column as index
data_pca_df = data_pca_df.set_index("Diagnosis")
# Display sample data
data_pca_df.head()

Unnamed: 0_level_0,PC1,PC2,PC3
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.476763,-0.846719,0.769219
1,-0.928561,1.102376,-0.484363
2,0.636631,-0.910436,-1.069526
3,1.569673,1.705294,-0.057336
4,-1.76623,0.810533,-0.912182


### Find the Best Value for k Using the PCA Data

In [33]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [34]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(data_pca_df)
    inertia.append(model.inertia_)

In [35]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_2 = {
    "k_pca": k,
    "inertia": inertia}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow_2 = pd.DataFrame(elbow_data_2)
df_elbow_2.head()

Unnamed: 0,k_pca,inertia
0,1,7420.155534
1,2,5217.274276
2,3,4097.953799
3,4,3392.690323
4,5,2874.651545


In [36]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_2.hvplot.line(x="k_pca", y="inertia", title="Elbow Curve_pca", xticks=k)

### Cluster features with K-means Using the PCA Data

In [37]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=2, random_state=0)

In [38]:
# Fit the K-Means model using the PCA data
model.fit(data_pca_df)

In [39]:
# Predict the clusters using the PCA data
data_predict_2 = model.predict(data_pca_df)
# Print the resulting array of cluster values.
data_predict_2[:50] # To view all the figures (0 to 3 in the array)

array([1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1])

In [40]:
# Create a copy of the DataFrame with the PCA data
df_scaled_data_pca = data_pca_df.copy()
df_scaled_data_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.476763,-0.846719,0.769219
1,-0.928561,1.102376,-0.484363
2,0.636631,-0.910436,-1.069526
3,1.569673,1.705294,-0.057336
4,-1.76623,0.810533,-0.912182


In [41]:
# Add a new column to the DataFrame with the predicted clusters
df_scaled_data_pca["predicted_clusters_2"]=data_predict_2

# Display sample data
df_scaled_data_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters_2
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-0.476763,-0.846719,0.769219,1
1,-0.928561,1.102376,-0.484363,1
2,0.636631,-0.910436,-1.069526,0
3,1.569673,1.705294,-0.057336,0
4,-1.76623,0.810533,-0.912182,1


In [42]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`

df_scaled_data_pca.hvplot.scatter(
        x="PC1",
        y="PC2",
        by="predicted_clusters_2",
        hover_cols=['Diagnosis'])

### Visualize and Compare the Results

In [43]:
# Composite plot to contrast the Elbow curves
(df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve - Original Data") + df_elbow_2.hvplot.line(x="k_pca", y="inertia", xticks=k, title="Elbow Curve - PCA Data"))


In [44]:
# Composite plot to contrast the clusters
(data_scaled_df_copy.hvplot.scatter(x="Age", y="MMSE", by="predicted_clusters", hover_cols="Diagnosis", title="Original Scaled Data Plot") + df_scaled_data_pca.hvplot.scatter(x="PC1", y="PC2", by="predicted_clusters_2", hover_cols="Diagnosis", title="PCA Data Plot"))