In [None]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the crypto market CSV, using the `coin_id` column as the row index
market_data_path = "Resources/crypto_market_data.csv"
df_market_data = pd.read_csv(market_data_path, index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)


In [None]:
# Summary statistics of the market data, as computed by `describe()`
summary_statistics = df_market_data.describe()
summary_statistics


In [None]:
# Line chart of the DataFrame to get a first look at what it contains;
# the tick labels are rotated 90 degrees
line_chart_options = {"width": 800, "height": 400, "rot": 90}
df_market_data.hvplot.line(**line_chart_options)

# Prepare the Data

In [None]:
# Normalize the data from the CSV file with scikit-learn's `StandardScaler`
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_market_data)


In [None]:
# Wrap the scaled array in a DataFrame that reuses the original column names
# and the original `coin_id` index (the crypto names)
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    columns=df_market_data.columns,
    index=df_market_data.index,
)

# Display sample data
df_market_data_scaled.head()

# Find the Best Value for k Using the Original Data

In [None]:
# Create a list with the number of k-values from 1 to 11.
# It is materialized as a real list (not a lazy `range`) because the same
# object is reused as a DataFrame column and as explicit x-axis ticks.
k = list(range(1, 12))
k


In [None]:
# Create an empty list to store the inertia values
inertia = []

# For each candidate k:
# 1. Create a KMeans model with that many clusters
# 2. Fit the model to `df_market_data_scaled`
# 3. Append the model's inertia to the list
# A fixed `random_state` makes the centroid initialization, and therefore the
# elbow curve, reproducible from run to run.
for i in k:
    model = KMeans(n_clusters=i, random_state=1)
    model.fit(df_market_data_scaled)
    inertia.append(model.inertia_)

inertia


In [None]:
# Gather the k values and their inertias as the two columns of the elbow data
elbow_data = {"k": k, "inertia": inertia}
df_elbow_original = pd.DataFrame(elbow_data)


In [None]:
# Elbow curve: inertia against k, with the k values as x-axis ticks,
# to visually identify the optimal value for k
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_plot = df_elbow_original.hvplot.line(**elbow_plot_options)

elbow_plot


# Cluster Cryptocurrencies with K-means Using the Original Data

In [None]:
# Initialize the K-Means model using the best value for k.
# `random_state` is fixed so the cluster assignments are reproducible across
# runs, in line with how the PCA K-Means model is seeded.
model = KMeans(n_clusters=4, random_state=1)


In [None]:
# Train the K-Means model on the scaled market data
model.fit(X=df_market_data_scaled)


In [None]:
# Assign each cryptocurrency in the scaled data to one of the fitted clusters
clusters = model.predict(X=df_market_data_scaled)

# Show the resulting array of cluster values
print(clusters)


In [None]:
# Create a copy of the DataFrame.
# `.copy()` is required here: plain assignment only binds a second name to the
# same object, so any later in-place change to `df_market_data_scaled` would
# show up in the "copy" as well.
df_market_data_scaled_copy = df_market_data_scaled.copy()
df_market_data_scaled_copy.head(10)


In [None]:
# Store the predicted cluster labels in a new `cluster_original` column
df_market_data_scaled["cluster_original"] = clusters

# Preview the first rows, now including the cluster column
df_market_data_scaled.head()


In [None]:
# Create a scatter plot using hvPlot with the 14-day price change
# (`price_change_percentage_14d`) on the x-axis and the 1-year price change
# (`price_change_percentage_1y`) on the y-axis.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
plot_original_clusters = df_market_data_scaled.hvplot.scatter(
    x="price_change_percentage_14d",
    y="price_change_percentage_1y",
    by="cluster_original",
    title="Cryptocurrencies Standarized Returns. K-Mean Clusters with k=4.",
    hover_cols=["coin_id"],
)
plot_original_clusters


# Optimize Clusters with Principal Component Analysis

In [None]:
# PCA model that keeps three principal components
pca = PCA(n_components=3)

# Input for PCA: the scaled data without the K-Means label column
df_market_data_scaled_no_clusters = df_market_data_scaled.drop(
    "cluster_original", axis=1
)


In [None]:
# Fit PCA on the label-free scaled data and project it onto the components
market_data_pca = pca.fit_transform(X=df_market_data_scaled_no_clusters)

# Show the first five rows of the transformed data
market_data_pca[0:5]


In [None]:
# Share of the variance that can be attributed to each principal component
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio


In [None]:
# Total explained variance with 3 components.
# The array's own `.sum()` is used because `np` is not among the names
# imported by this notebook's import cell, so `np.sum(...)` would raise a
# NameError.
total_variance = pca.explained_variance_ratio_[:3].sum()
total_variance


In [None]:
# Build a DataFrame from the PCA output: one column per principal component,
# with rows labeled by the same index as the scaled data
pca_column_names = ["PC1", "PC2", "PC3"]
df_market_data_pca = pd.DataFrame(
    market_data_pca,
    index=df_market_data_scaled.index,
    columns=pca_column_names,
)

# Display sample data
df_market_data_pca.head()


# Find the Best Value for k Using the PCA Data

In [None]:
# Candidate cluster counts: the integers 1 through 11, as a list
k = [*range(1, 12)]
k


In [None]:
# Create an empty list to store the inertia values
inertia_pca = []

# For each candidate k:
# 1. Create a KMeans model with that many clusters
# 2. Fit the model to `df_market_data_pca`
# 3. Append the model's inertia to the list
# A fixed `random_state` makes the centroid initialization, and therefore the
# elbow curve, reproducible from run to run.
for i in k:
    model_pca = KMeans(n_clusters=i, random_state=1)
    model_pca.fit(df_market_data_pca)
    inertia_pca.append(model_pca.inertia_)

inertia_pca


In [None]:
# Create a dictionary with the data to plot the Elbow curve.
# It gets its own name so that `inertia_pca` stays the plain list of inertia
# values and this cell produces the same result when it is re-run.
elbow_data_pca = {'k': k, 'inertia_pca': inertia_pca}

# Create a DataFrame with the data to plot the Elbow curve, indexed by k
inertia_pca_df = pd.DataFrame(elbow_data_pca).set_index('k')
inertia_pca_df


In [None]:
# Line chart of the inertia values for the PCA data against k (the index of
# `inertia_pca_df`), used to visually identify the optimal value for k
inertia_pca_plot_options = {
    "title": 'Principal Components of the Standarized Performance: Inertia v/s Number of Clusters',
    "xlabel": 'Number of Clusters (k)',
    "ylabel": 'Inertia (units)',
}
inertia_pca_series = inertia_pca_df['inertia_pca']
inertia_pca_plot = inertia_pca_series.hvplot(**inertia_pca_plot_options)

inertia_pca_plot


# Cluster Cryptocurrencies with K-means Using the PCA Data


In [None]:
# Initialize the K-Means model using the best value for k (4).
# The value is passed directly instead of being assigned to `k`, so `k` keeps
# holding the list of candidate values and the elbow cells can be re-run.
model_pca = KMeans(n_clusters=4, random_state=1)


In [None]:
# Train the K-Means model on the PCA data
model_pca.fit(X=df_market_data_pca)


In [None]:
# Predict the clusters to group the cryptocurrencies using the PCA data.
# `predict` returns one cluster label per row; `transform` would instead
# return each row's distance to every cluster center, which cannot be stored
# as a single label column.
cluster = model_pca.predict(df_market_data_pca)

# Print the first five cluster labels
print(cluster[0:5])


In [None]:
# Create a copy of the DataFrame with the PCA data.
# `.copy()` is required here: plain assignment only binds a second name to the
# same object, so any later in-place change to `df_market_data_pca` would show
# up in the "copy" as well.
df_market_data_pca_copy = df_market_data_pca.copy()
df_market_data_pca_copy.head()


In [None]:
# Attach the values computed from the PCA K-Means model as `cluster_pca`
cluster_labels = cluster
df_market_data_pca['cluster_pca'] = cluster_labels

# Preview the first five rows
df_market_data_pca.head(5)


In [None]:
# Create a scatter plot using hvPlot with the first principal component
# (`PC1`) on the x-axis and the second (`PC2`) on the y-axis.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
# The plot size is passed straight to hvPlot rather than through the legacy
# grouped `.opts(plot=dict(...))` form; hvPlot enables the hover tool by
# default, so it does not need to be requested separately.
market_pca_plot = df_market_data_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="cluster_pca",
    title="Cryptocurrencies Principal Components. K-Means Clusters with k=4.",
    hover_cols=["coin_id"],
    width=600,
    height=400,
)

market_pca_plot


# Visualize and Compare the Results
In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.



In [None]:
# Combine the two Elbow curves into one composite plot to contrast them
elbow_comparison = elbow_plot + inertia_pca_plot
elbow_comparison


In [None]:
# Combine the two cluster scatter plots into one composite plot to contrast them
cluster_comparison = plot_original_clusters + market_pca_plot
cluster_comparison
