In [4]:
# Import required libraries and dependencies.
#
# OMP_NUM_THREADS is set first, ahead of scikit-learn and the numeric stack it
# pulls in: thread-limit environment variables are generally only honoured if
# they are in place before the native OpenMP/BLAS libraries are loaded, so
# setting it after the imports may leave the KMeans workaround without effect.
import os

os.environ["OMP_NUM_THREADS"] = "1"

import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [5]:
# Workaround suggested by TA Andrew Smigaj so KMeans runs on a local machine:
# cap OpenMP at a single thread through the process environment.
# NOTE(review): thread-limit variables are usually read when the native
# libraries load — confirm this assignment runs before scikit-learn is imported.
import os
os.environ.update({"OMP_NUM_THREADS": '1'})

In [10]:
# Read the heart-failure clinical records CSV into a Pandas DataFrame
data = pd.read_csv(filepath_or_buffer="static/heart_failure_clinical_records_dataset.csv")

# Preview the first ten rows
data.head(n=10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10,1
8,65.0,0,157,0,65,0,263358.03,1.5,138,0,0,10,1
9,80.0,1,123,0,35,1,388000.0,9.4,133,1,1,10,1


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# the percentiles are spelled out explicitly and match pandas' defaults.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [14]:
# Quick visual overview of the DataFrame: a line chart of the columns hvPlot
# picks up by default, drawn against the row index.
data.hvplot(kind="line", rot=90, height=400, width=800)

---

### Prepare the Data

In [15]:
# Standardize every column of the CSV data with scikit-learn's StandardScaler
# (zero mean, unit variance per column). StandardScaler is already imported in
# the notebook's first cell, so it is not re-imported here.
# NOTE(review): DEATH_EVENT (presumably the outcome variable) is scaled along
# with the other columns and therefore takes part in the clustering below —
# confirm that this is intended.

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the data and transform it in one step
scaled_data = scaler.fit_transform(data)

# Wrap the scaled array in a DataFrame that keeps the original row and column
# labels. The `df_market_data_scaled` name is kept because later cells use it.
df_market_data_scaled = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

# Display the first five rows of the scaled DataFrame
df_market_data_scaled.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,1.192945,-0.871105,0.000166,-0.847579,-1.53056,1.359272,0.01681648,0.490057,-1.504036,0.735688,-0.687682,-1.629502,1.454161
1,-0.491279,-0.871105,7.51464,-0.847579,-0.007077,-0.735688,7.53566e-09,-0.284552,-0.141976,0.735688,-0.687682,-1.603691,1.454161
2,0.350833,-0.871105,-0.449939,-0.847579,-1.53056,-0.735688,-1.038073,-0.0909,-1.731046,0.735688,1.454161,-1.590785,1.454161
3,-0.912335,1.147968,-0.486071,-0.847579,-1.53056,-0.735688,-0.5464741,0.490057,0.085034,0.735688,-0.687682,-1.590785,1.454161
4,0.350833,1.147968,-0.435486,1.17983,-1.53056,-0.735688,0.6517986,1.264666,-4.682176,-1.359272,-0.687682,-1.577879,1.454161


In [18]:
# Build the frame used for clustering: the scaled values, indexed by a row
# identifier named "id" that is taken from the original data's index.
df_scaled_with_names = (
    df_market_data_scaled
    .assign(id=data.index)
    .set_index('id')
)

# Display sample data
df_scaled_with_names.head()

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1.192945,-0.871105,0.000166,-0.847579,-1.53056,1.359272,0.01681648,0.490057,-1.504036,0.735688,-0.687682,-1.629502,1.454161
1,-0.491279,-0.871105,7.51464,-0.847579,-0.007077,-0.735688,7.53566e-09,-0.284552,-0.141976,0.735688,-0.687682,-1.603691,1.454161
2,0.350833,-0.871105,-0.449939,-0.847579,-1.53056,-0.735688,-1.038073,-0.0909,-1.731046,0.735688,1.454161,-1.590785,1.454161
3,-0.912335,1.147968,-0.486071,-0.847579,-1.53056,-0.735688,-0.5464741,0.490057,0.085034,0.735688,-0.687682,-1.590785,1.454161
4,0.350833,1.147968,-0.435486,1.17983,-1.53056,-0.735688,0.6517986,1.264666,-4.682176,-1.359272,-0.687682,-1.577879,1.454161


---

### Find the Best Value for k Using the Original Data.

In [20]:
# Candidate cluster counts to try: k = 1 through 11
k_values = [*range(1, 12)]

In [21]:
# Elbow-method sweep over the scaled data.
# For every candidate k: fit a KMeans model (random_state=0) on
# `df_scaled_with_names` and record the fitted model's inertia.
inertia_values = []

for k in k_values:
    # `fit` returns the fitted estimator, so create-and-fit is a single step
    kmeans_model = KMeans(random_state=0, n_clusters=k).fit(df_scaled_with_names)

    # Keep the inertia for this k, in the same order as `k_values`
    inertia_values += [kmeans_model.inertia_]



In [23]:
# Pair each k with its inertia so the elbow curve can be plotted
elbow_data = dict(k_values=k_values, inertia_values=inertia_values)

# Tabulate the pairs: one row per k
df_elbow = pd.DataFrame(data=elbow_data)

In [24]:
# Plot a line chart of the inertia computed for each value of k, to visually
# identify the optimal k (the "elbow" of the curve).
# hvplot.pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
elbow_plot = df_elbow.hvplot.line(x='k_values', y='inertia_values', title='Elbow Curve', xticks=k_values)
elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** `k = 2`, based on the sharpest bend (the "elbow") of the curve, which occurs at k = 2.

---

### Cluster Patient Records with K-means Using the Original Data

In [25]:
# Build the K-Means model with the k chosen from the elbow curve.
# `random_state` seeds the random number generator used for centroid
# initialization; pinning it makes that initialization reproducible, so
# rerunning on the same data yields the same cluster assignments.
best_k = 2
kmeans_model = KMeans(random_state=0, n_clusters=best_k)

In [26]:
# Train the K-Means model on the scaled, id-indexed data
kmeans_model.fit(X=df_scaled_with_names)



In [27]:
# Assign each row of the scaled data to one of the fitted clusters
clusters = kmeans_model.predict(X=df_scaled_with_names)

# Show the resulting array of cluster labels
print(clusters)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1
 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1
 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


In [29]:
# Work on an independent (deep) copy so the scaled frame itself is left untouched
df_with_clusters = df_scaled_with_names.copy(deep=True)

In [32]:
# Attach the predicted cluster label of each row as a "Cluster" column
df_with_clusters = df_with_clusters.assign(Cluster=clusters)

# Display sample data
df_with_clusters.head()

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1.192945,-0.871105,0.000166,-0.847579,-1.53056,1.359272,0.01681648,0.490057,-1.504036,0.735688,-0.687682,-1.629502,1.454161,0
1,-0.491279,-0.871105,7.51464,-0.847579,-0.007077,-0.735688,7.53566e-09,-0.284552,-0.141976,0.735688,-0.687682,-1.603691,1.454161,0
2,0.350833,-0.871105,-0.449939,-0.847579,-1.53056,-0.735688,-1.038073,-0.0909,-1.731046,0.735688,1.454161,-1.590785,1.454161,0
3,-0.912335,1.147968,-0.486071,-0.847579,-1.53056,-0.735688,-0.5464741,0.490057,0.085034,0.735688,-0.687682,-1.590785,1.454161,0
4,0.350833,1.147968,-0.435486,1.17983,-1.53056,-0.735688,0.6517986,1.264666,-4.682176,-1.359272,-0.687682,-1.577879,1.454161,0


In [60]:
# Scatter of (scaled) age against ejection fraction, colored by cluster label;
# the "id" index is added to the hover tooltip.
scatter_plot_a = df_with_clusters.hvplot(
    kind="scatter",
    x="age",
    y="ejection_fraction",
    c="Cluster",
    hover_cols=["id"],
)

# Display the scatter plot
scatter_plot_a

In [61]:
# Scatter of (scaled) platelets against serum creatinine, colored by cluster
# label; the "id" index is added to the hover tooltip.
scatter_plot_b = df_with_clusters.hvplot.scatter(
    "platelets",
    "serum_creatinine",
    hover_cols=["id"],
    c="Cluster",
)

# Display the scatter plot
scatter_plot_b

In [68]:
# Scatter of (scaled) ejection fraction against serum sodium, colored by
# cluster label; the "id" index is added to the hover tooltip.
scatter_plot_c = df_with_clusters.hvplot(
    kind="scatter",
    x="ejection_fraction",
    y="serum_sodium",
    c="Cluster",
    hover_cols=["id"],
)

# Display the scatter plot
scatter_plot_c

In [71]:
# Scatter of (scaled) platelets against the smoking flag, colored by cluster
# label; the "id" index is added to the hover tooltip.
scatter_plot_d = df_with_clusters.hvplot.scatter(
    "platelets",
    "smoking",
    hover_cols=["id"],
    c="Cluster",
)

# Display the scatter plot
scatter_plot_d

---

### Optimize Clusters with Principal Component Analysis.

In [38]:
# Create a PCA model instance with n_components=3.
# PCA is already imported in the notebook's first cell, so it is not
# re-imported here.
## PCA looks for a smaller set of features (a new set of axes) called
## "principal components" that represent the data effectively, reducing its
## complexity and allowing a more concise, interpretable view of the dataset.
pca_model = PCA(n_components=3)

In [40]:
# Reduce the scaled data to three principal components with `fit_transform`
pca_data = pca_model.fit_transform(df_scaled_with_names)

# Wrap the components in a DataFrame that reuses the "id" index of the
# scaled frame, so every row stays tied to its original record.
df_pca = pd.DataFrame(
    pca_data,
    index=df_scaled_with_names.index,
    columns=['PC1', 'PC2', 'PC3'],
)

# View the first five rows of the DataFrame
df_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.11212,0.435577,-0.568258
1,1.008003,1.987544,-2.527496
2,2.529498,2.218907,-0.786422
3,2.000081,0.342222,-0.34709
4,3.513147,-1.465274,-3.242868


In [43]:
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component.
#--------------------------------------------------------------------
# Note: despite the variable name, this reads `explained_variance_ratio_`,
# i.e. one ratio per fitted principal component rather than raw variances.
explained_variance = pca_model.explained_variance_ratio_

# Display the explained-variance ratio of each principal component
explained_variance

array([0.15622596, 0.12776378, 0.10093015])

In [44]:
# Combined share of variance captured by the three principal components
total_explained_variance = sum(ratio for ratio in explained_variance)
total_explained_variance

0.38491988567241947

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** PC1 ≈ 0.156 (15.6%) // PC2 ≈ 0.128 (12.8%) // PC3 ≈ 0.101 (10.1%) // Total for PC1 through PC3 ≈ 0.385 (38.5%)

In [45]:
# Create a new DataFrame with the PCA data.
# The three components are labelled PC1-PC3 and the frame reuses the "id"
# index of the scaled data, so each row stays tied to its original record.
df_pca = pd.DataFrame(
    pca_data,
    index=df_scaled_with_names.index,
    columns=['PC1', 'PC2', 'PC3'],
)

# Display sample data
df_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.11212,0.435577,-0.568258
1,1.008003,1.987544,-2.527496
2,2.529498,2.218907,-0.786422
3,2.000081,0.342222,-0.34709
4,3.513147,-1.465274,-3.242868


---

### Find the Best Value for k Using the PCA Data

In [46]:
# Candidate cluster counts for the PCA data: k = 1 through 11
k_values2 = [*range(1, 12)]
k_values2

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [47]:
# Elbow-method sweep over the PCA data.
# For every candidate k: fit a KMeans model (random_state=0) on `df_pca`
# and record the fitted model's inertia.
inertia_values2 = []

for k in k_values2:
    # `fit` returns the fitted estimator, so create-and-fit is a single step
    kmeans_model2 = KMeans(random_state=0, n_clusters=k).fit(df_pca)

    # Keep the inertia for this k, in the same order as `k_values2`
    inertia_values2 += [kmeans_model2.inertia_]



In [48]:
# Pair each k with its PCA-data inertia so the Elbow curve can be plotted
elbow_data2 = dict(k=k_values2, inertia=inertia_values2)

# Tabulate the pairs: one row per k
df_elbow2 = pd.DataFrame(data=elbow_data2)

In [49]:
# Elbow curve for the PCA data: inertia for each candidate k, used to pick
# the optimal k visually.
df_elbow2.hvplot(
    kind="line",
    x='k',
    y='inertia',
    xticks=k_values2,
    title='PCA Elbow Curve',
)

In [50]:
## NOTE:
## In k-means clustering, inertia is the sum of squared distances between each data point and its assigned cluster center (centroid).
## Lower inertia means the points sit closer to their centroids, but inertia always falls as k grows, so the elbow of the curve (not the minimum) marks a good balance between cluster compactness and the number of clusters.
## NOTE(review): the inertia of 49.665 for k=4 quoted in the earlier version of this note is not shown in any output of this notebook, and the answers below select k=2; confirm the figure against `inertia_values2` or remove it.

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** k-value: 2 // inertia: 


* **Question:** Does it differ from the best k value found using the original data?

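  * **Answer:** No. The best value chosen for the PCA data (k = 2) is the same as the one chosen for the original scaled data (k = 2).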

### Cluster Patient Records with K-means Using the PCA Data

In [51]:
# Build the K-Means model for the PCA data with the k chosen from its elbow curve
best_k_pca = 2
kmeans_model_pca = KMeans(random_state=0, n_clusters=best_k_pca)

In [52]:
# Train the K-Means model on the three principal components
kmeans_model_pca.fit(X=df_pca)



In [53]:
# Assign each row of the PCA data to one of the fitted clusters
clusters_pca = kmeans_model_pca.predict(X=df_pca)

# Show the resulting array of cluster labels
print(clusters_pca)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1
 0 0 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1
 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


In [55]:
# Derive a new frame from the PCA data with the predicted cluster label of
# each row attached as a "Cluster" column; `assign` returns a new DataFrame,
# so `df_pca` itself is left unchanged.
df_pca_with_clusters = df_pca.assign(Cluster=clusters_pca)

# Display sample data
df_pca_with_clusters.head()

Unnamed: 0_level_0,PC1,PC2,PC3,Cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.11212,0.435577,-0.568258,0
1,1.008003,1.987544,-2.527496,0
2,2.529498,2.218907,-0.786422,0
3,2.000081,0.342222,-0.34709,0
4,3.513147,-1.465274,-3.242868,0


In [57]:
# Scatter of the first two principal components (`x="PC1"`, `y="PC2"`).
# Points are grouped by the K-Means cluster label, and the "id" index is added
# to the hover tooltip so each point can be traced back to its record.
scatter_plot2 = df_pca_with_clusters.hvplot(
    kind="scatter",
    x="PC1",
    y="PC2",
    by="Cluster",
    hover_cols=["id"],
    title="(PCA Data)",
    xlabel="Principal Component 1 (PC1)",
    ylabel="Principal Component 2 (PC2)",
    height=400,
    width=800,
)

# Display the scatter plot
scatter_plot2

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [58]:
# Overlay the elbow curve of the scaled data with the elbow curve of the PCA
# data (`*` composes the two plots into a single overlay) to contrast them.
elbow_plot * df_elbow2.hvplot(
    kind="line",
    x="k",
    y="inertia",
    title="Elbow Curve Comparison",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    height=400,
    width=800,
)

In [63]:
# Composite plot to contrast the clusters: `+` lays the two plots out side by
# side — the platelets vs. serum_creatinine scatter from the scaled data
# (scatter_plot_b) next to the PC1 vs. PC2 scatter from the PCA data.
scatter_plot_b + scatter_plot2

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** 
 