In [None]:
import os
os.environ['OMP_NUM_THREADS'] = '1'
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the crypto market CSV into a DataFrame, using the `coin_id` column as the row index
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id",
)

# Preview the first ten rows
df_market_data.head(10)

In [None]:
# Generate summary statistics for each column of the raw (unscaled) market data,
# to get a feel for the value ranges before the scaling step further down
df_market_data.describe()

In [None]:
# Line plot of the DataFrame's columns to see what the raw data looks like.
# The x tick labels are rotated 90 degrees so they do not overlap.
df_market_data.hvplot.line(rot=90, height=400, width=800)

# Prepare the Data (Most Important Step)

In [None]:
# Create an instance of the StandardScaler
scaler = StandardScaler()

# Normalize the data: fit the scaler on every column and transform in one step
scaled_data = scaler.fit_transform(df_market_data)

# Convert the normalized data back to a DataFrame. The original index is
# passed along so every row keeps its coin label instead of falling back to
# a positional 0..n-1 index.
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    index=df_market_data.index,
    columns=df_market_data.columns,
)

# Display the normalized data
df_market_data_scaled.head()


In [None]:
# Build the scaled DataFrame with the crypto names as its index in one step.
# The index is taken from the original data and explicitly named `coin_id`,
# which avoids adding a temporary column and mutating the frame in place.
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    index=df_market_data.index.rename("coin_id"),
    columns=df_market_data.columns,
)

# Display sample data
df_market_data_scaled.head()


### Find the Best Value for k Using the Original Data

In [None]:
# Candidate cluster counts to evaluate: 1 through 11 inclusive
# (the upper bound 12 of `range` is exclusive)
k_values = [*range(1, 12)]
k_values

In [None]:
# Create an empty list to store the inertia values
inert_values = []

# Compute the inertia of a K-Means fit for each candidate value of k
for k in k_values:
    # n_init=10 runs ten initialisations and keeps the best one; the fixed
    # random_state makes the inertia values (and therefore the elbow curve)
    # identical on every Restart & Run All.
    kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit the model to the scaled data
    kmeans_model.fit(df_market_data_scaled)

    # Record the inertia of this fit
    inert_values.append(kmeans_model.inertia_)


In [None]:
# Pair every candidate k with the inertia measured for it
elbow_data = dict(k_values=k_values, inert_values=inert_values)

# Turn the pairs into a two-column DataFrame used to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)


In [None]:
# Plot inertia against the number of clusters to locate the elbow.
# The x ticks are taken from the `k_values` column itself so every evaluated
# k gets a tick; `range(1, len(df_elbow))` stopped one short and dropped the
# last value.
df_elbow.hvplot.line(
    x='k_values',
    y='inert_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    xticks=df_elbow['k_values'].tolist(),
)



The elbow appears to occur at k=3: beyond that point the curve flattens out noticeably, so each additional cluster reduces the inertia by much less. This suggests that the best value for k in the k-means clustering algorithm for this dataset is 3. Choosing k=3 should provide a reasonable segmentation of the data into clusters without introducing unnecessary complexity or overfitting.

### Cluster Cryptocurrencies with K-Means Using the Original Data

In [None]:
# Initialize the K-Means model using the best value for k.
# n_init=10 matches the setting used when computing the elbow curve (the
# library default for n_init differs between scikit-learn versions), and the
# fixed random_state makes the cluster assignments reproducible between runs.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=42)

In [None]:
# Fit the K-Means model using the scaled data.
# As the last expression of the cell, the value returned by `fit` is what the
# notebook displays.
kmeans_model.fit(df_market_data_scaled)


In [None]:
# Predict the clusters to group the cryptocurrencies using the scaled data
# (the same data the model was fitted on)
cluster_labels = kmeans_model.predict(df_market_data_scaled)

# Print the resulting array of cluster values
print(cluster_labels)


In [None]:
# Work on an independent (deep) copy so the scaled DataFrame itself stays untouched
df_copy = df_market_data_scaled.copy(deep=True)


In [None]:
# Attach the predicted cluster of each row as a new `Cluster` column
df_copy = df_copy.assign(Cluster=cluster_labels)

# Display sample data
df_copy.head()


In [None]:
# Scatter the 24h price change (x) against the 7d price change (y) with hvPlot.
# Points are grouped by the K-Means label in `Cluster`, and `coin_id` is added
# to the hover columns so each point can be traced back to its cryptocurrency.
colors = [
    '#1f77b4',  # blue
    '#ff7f0e',  # orange
    '#2ca02c',  # green
]

df_copy.hvplot.scatter(
    hover_cols=["coin_id"],
    color=colors,
    by="Cluster",  # one series per cluster
    y="price_change_percentage_7d",
    x="price_change_percentage_24h",
)


### Optimize Clusters with Principal Component Analysis.

In [None]:
# Create a PCA model instance and set `n_components=3`,
# i.e. the data will be reduced to three principal components
pca_model = PCA(n_components=3)


In [None]:
# Fit the PCA model on the scaled data and project it onto three principal components
pca_data = pca_model.fit_transform(df_market_data_scaled)

# Wrap the projected values in a DataFrame with one named column per component
df_pca = pd.DataFrame(
    data=pca_data,
    columns=[f'PC{n}' for n in (1, 2, 3)],
)

# View the first five rows of the DataFrame
df_pca.head()


In [None]:
# Retrieve the explained variance ratio of each principal component to
# determine how much of the original information the three components
# retain together
explained_variance = pca_model.explained_variance_ratio_

# Add up the per-component ratios, then express the total as a percentage too
total_explained_variance = sum(explained_variance)
total_explained_variance_percent = 100 * total_explained_variance

print(f'Total Variance in percentage: {total_explained_variance_percent}%')
print(f'Total Variance in float: {total_explained_variance}')


In [None]:
# Create a new DataFrame with the PCA data, indexed by the crypto names.
# The index is taken from the original data and explicitly named `coin_id`,
# so there is no temporary column to add and no in-place `set_index`.
df_pca_data = pd.DataFrame(
    pca_data,
    index=df_market_data.index.rename("coin_id"),
    columns=['PC1', 'PC2', 'PC3'],
)

# Display sample data
df_pca_data.head()
