In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    Path("Resources/crypto_market_data.csv"),
    index_col="coin_id")

# Display sample data
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Plot the Dataframe Data
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

In [5]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaled_data = StandardScaler().fit_transform(df_market_data)

# Create a DataFrame with the scaled data
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    columns=df_market_data.columns
)


# Copy the crypto names from the original data
df_market_data_scaled["coin_id"] = df_market_data.index

df_market_data_scaled.rename(columns={'coin_id': 'Ticker'}, inplace=True)

# Set the coinid column as index
df_market_data_scaled = df_market_data_scaled.set_index("Ticker")

# Display sample data
df_market_data_scaled.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

In [6]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k_list = list(range(1, 11))

In [7]:
# Iterate through each value of k
for k in k_list:
    # Create a KMeans model with the current value of k
    kmeans = KMeans(n_clusters=k)
    
    # Fit the model to the scaled data
    kmeans.fit(df_market_data_scaled) 
    
    # Append the inertia to the inertia list
    inertia.append(kmeans.inertia_)

In [8]:
# Create a dictionary with the data to plot the Elbow curve
elbow_curve = {"k": k_list, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
elbow_data= pd.DataFrame(elbow_curve)

elbow_data

Unnamed: 0,k,inertia
0,1,287.0
1,2,198.571818
2,3,123.190482
3,4,79.022435
4,5,65.122199
5,6,55.595772
6,7,43.91469
7,8,36.658386
8,9,32.880514
9,10,28.635171


In [9]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbowgraph = elbow_data.hvplot.line(x="k", y="inertia", title= "Elbow Curve", xticks=k_list)
display(elbowgraph)

In [10]:
# Initialize the K-Means model with the best value for k
kmeans_model = KMeans(n_clusters=4, random_state=0)

# Fit the K-Means model using the original data
kmeans_model.fit(df_market_data) 

# Predict the clusters to group the cryptocurrencies using the original data
clusters = kmeans_model.predict(df_market_data)

# Create a copy of the original data
data_with_clusters = df_market_data.copy()

# Add a new column with the predicted clusters
data_with_clusters['Cluster'] = clusters

# Create a scatter plot using hvPlot
custom_colors=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
scattergraph = data_with_clusters.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    c="Cluster",
    colormap = custom_colors,
    hover_cols=["crypto_name"],
    title="Cryptocurrency Clusters",
)
display(scattergraph)

In [11]:
# Fit the K-Means model using the scaled data
kmeans_model.fit(df_market_data_scaled) 

# Predict the clusters to group the cryptocurrencies using the scaled data
scaled_clusters = kmeans_model.predict(df_market_data_scaled)

# Create a copy of the original data
data_with_scaled_clusters = df_market_data_scaled.copy()

# Add a new column with the predicted clusters
data_with_scaled_clusters['Cluster'] = scaled_clusters

scaled_scattergraph = data_with_scaled_clusters.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    c="Cluster",
    colormap = custom_colors,
    hover_cols=["crypto_name"],
    title="Scaled Cryptocurrency Clusters",
)
display(scaled_scattergraph)

---

In [12]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
crypto_pca = pca.fit_transform(df_market_data)

# View the first five rows of the DataFrame. 
crypto_pca[:5]

array([[-341.80096268,  -51.36677548,   12.52547089],
       [-249.42046633,   24.11754777,  -14.23146597],
       [-402.61472077, -118.71073742,   24.83839662],
       [-406.75243715,  -79.48728629,    1.56633057],
       [-382.42994789, -103.43195906,   16.75307273]])

In [13]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
explained_variance=pca.explained_variance_ratio_
explained_variance

array([9.76037313e-01, 2.30282949e-02, 7.48308214e-04])

In [14]:
total_explained_variance = sum(explained_variance)
print("Total Explained Variance:", total_explained_variance)

Total Explained Variance: 0.9998139161298989


In [15]:
# Create the PCA DataFrame
cryptopca = pd.DataFrame(
    crypto_pca,
    index=df_market_data.index,
    columns=["Component 1", "Component 2", "Component 3"])

cryptopca.head()


Unnamed: 0_level_0,Component 1,Component 2,Component 3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-341.800963,-51.366775,12.525471
ethereum,-249.420466,24.117548,-14.231466
tether,-402.614721,-118.710737,24.838397
ripple,-406.752437,-79.487286,1.566331
bitcoin-cash,-382.429948,-103.431959,16.753073


---

In [16]:
# Create a a list to store inertia values and the values of k
pcainertia = []
kpca = list(range(1, 11))

In [17]:
# Create a for loop to compute the inertia with each possible value of k
for k in kpca:
    k_model = KMeans(n_clusters=k, random_state=0)
    k_model.fit(cryptopca)
    pcainertia.append(k_model.inertia_)

In [18]:
# Create a dictionary with the data to plot the Elbow curve
pcaelbow={"k": kpca, "inertia": pcainertia}

# Create a DataFrame with the data to plot the Elbow curve
pcadf = pd.DataFrame(pcaelbow)

In [19]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.

pcagraph= pcadf.hvplot.line(
    x="k",
    y="inertia",
    title="PCA KMeans Elbow Curve",
    xticks =k)
display(pcagraph)

---

In [20]:
# Convert the NumPy array crypto_pca to a Pandas DataFrame
cryptoframe = pd.DataFrame(cryptopca, columns=['PC1', 'PC2', 'PC3'])



In [21]:
# Initialize the K-Means model using the best value for k
pcamodel = KMeans(n_clusters=2, random_state=0)

# Fit the K-Means model using the PCA data
pcamodel.fit(pcadf)

# Predict the clusters to group the cryptocurrencies using the PCA data
pcadiction= pcamodel.predict(pcadf)

df_market_data.reset_index(inplace=True)
cryptopca2 = pcadf.copy()
cryptopca2['crypto_name'] = df_market_data['coin_id']
cryptopca2['predicted clusters'] = pcadiction

pcascatter = cryptopca2.hvplot.scatter(
    x="k",
    y="inertia",
    c="predicted clusters",
    over_cols = "crypto_name",
    colormap = custom_colors,
    title="Cryptocurrency Clusters").opts(yformatter="%.0f")
display(pcascatter)



---

In [22]:
# Composite plot to contrast the Elbow curves
pcagraph + elbowgraph

In [23]:
# Compoite plot to contrast the clusters
pcascatter + scaled_scattergraph + scattergraph

Fewer features can be positively or negatively impactful when presenting a dataset. Fewer features means the data can be interpreted with greater ease and with less concern for noise this also lowers the barrier to entry for understanding data visualizations. However, there's a drastic loss of data and information, difficulties seperating clusters and a risk of misclustering of data when using more features in your K-Means clusters. 