In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import numpy as np

In [2]:
def elbow_calinski_scores(range_k_end_point, df):
    """Compute KMeans inertia and Calinski-Harabasz scores for k = 1..range_k_end_point.

    Parameters
    ----------
    range_k_end_point : int
        Largest number of clusters to evaluate (inclusive). Must be a positive
        integer. NumPy integers are accepted; booleans are rejected even though
        ``bool`` is a subclass of ``int``.
    df : DataFrame or array-like
        Feature matrix, passed unchanged to ``KMeans.fit`` and to
        ``metrics.calinski_harabasz_score``.

    Returns
    -------
    pandas.DataFrame or None
        One row per evaluated k with the columns ``k``, ``inertia`` and
        ``calinski``. ``calinski`` is left empty whenever the score is not
        defined for the fitted labels (always the case for k=1).
        ``None`` is returned, after printing a message, when
        ``range_k_end_point`` is not a positive integer.
    """
    # bool passes an isinstance(..., int) check, so it has to be excluded explicitly
    is_integer = (isinstance(range_k_end_point, (int, np.integer))
                  and not isinstance(range_k_end_point, bool))
    if not is_integer or range_k_end_point < 1:
        print('First argument should be a positive integer number')
        return None

    # Cluster counts to evaluate: 1, 2, ..., range_k_end_point
    k = list(range(1, int(range_k_end_point) + 1))
    # Inertia and Calinski-Harabasz score per cluster count
    inertia_list = []
    calinski_metric_list = []

    for i in k:
        # Fit a KMeans model with i clusters on the full data
        k_model = KMeans(n_clusters=i, random_state=0, n_init=5)
        k_model.fit(df)
        inertia_list.append(k_model.inertia_)

        # labels_ holds the cluster assignment of the rows the model was fitted on
        labels = k_model.labels_
        n_samples = len(labels)
        n_labels = len(np.unique(labels))
        # The score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range scikit-learn raises a ValueError
        if 1 < n_labels < n_samples:
            calinski_metric_list.append(metrics.calinski_harabasz_score(df, labels))
        else:
            calinski_metric_list.append(None)

    # One row per k, ready for further analysis or for plotting the two curves
    inertia_calinski_df = pd.DataFrame({'k': k, 'inertia': inertia_list, 'calinski': calinski_metric_list})
    return inertia_calinski_df

In [3]:
def kmeans_predictions_for_train_df(n_clusters, df):
    """Cluster ``df`` with KMeans and return a copy of it with the cluster labels attached.

    A KMeans model with ``n_clusters`` clusters is fitted on ``df`` and then
    used to predict a cluster index for each row of that same ``df``. The
    labels are stored in a new column named
    ``'KMpredition_<n_clusters>_clusters'``; ``df`` itself is not modified.
    """
    # Fit on the data, then label the very same rows
    model = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto').fit(df)
    cluster_labels = model.predict(df)

    # Work on a copy so the caller's DataFrame stays untouched
    labelled_df = df.copy()
    labelled_df[f'KMpredition_{n_clusters}_clusters'] = cluster_labels
    return labelled_df

In [4]:
def outliers(df, column_name):
    """Return the index labels of the rows whose ``column_name`` value is an IQR outlier.

    A value is an outlier when it lies strictly outside the fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. Values exactly on a fence are not
    outliers, so a constant column (IQR = 0) yields no outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    column_name : str
        Name of the column of ``df`` to check.

    Returns
    -------
    list
        Index labels of the outlier rows, in the order they appear in ``df``.
    """
    values = df[column_name]
    # First and third quartiles and the interquartile range
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Lower and upper fences
    l_boundary = q1 - 1.5 * iqr
    h_boundary = q3 + 1.5 * iqr
    # Vectorised comparison instead of a row-by-row loop
    is_outlier = (values < l_boundary) | (values > h_boundary)
    # Use the positional mask so duplicate index labels cannot cause realignment
    return df.index[is_outlier.to_numpy()].tolist()

In [5]:
# Read the crypto market CSV into a DataFrame, using the coin identifier as the row index
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [6]:
# Check the number of rows, the non-null count per column and the column dtypes
df_market_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41 entries, bitcoin to digibyte
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price_change_percentage_24h   41 non-null     float64
 1   price_change_percentage_7d    41 non-null     float64
 2   price_change_percentage_14d   41 non-null     float64
 3   price_change_percentage_30d   41 non-null     float64
 4   price_change_percentage_60d   41 non-null     float64
 5   price_change_percentage_200d  41 non-null     float64
 6   price_change_percentage_1y    41 non-null     float64
dtypes: float64(7)
memory usage: 2.6+ KB


In [7]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for every column
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [8]:
# Plot the raw data as line charts to get a first look at what is in the DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [9]:
# Plot the same data transposed, so that the price-change periods (the original columns)
# run along the x-axis and the percentage values are on the y-axis
df_market_data_T=df_market_data.transpose()
df_market_data_T.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [10]:
# Standardize the data from the CSV file with scikit-learn's `StandardScaler()`
columns_list = [
    'price_change_percentage_24h',
    'price_change_percentage_7d',
    'price_change_percentage_14d',
    'price_change_percentage_30d',
    'price_change_percentage_60d',
    'price_change_percentage_200d',
    'price_change_percentage_1y',
]
scaled_data = StandardScaler().fit_transform(df_market_data[columns_list])

# Preview the first five scaled rows
scaled_data[0:5]

array([[ 0.50852937,  0.49319307,  0.77220043,  0.23545963, -0.0674951 ,
        -0.35595348, -0.25163688],
       [ 0.18544589,  0.93444504,  0.55869212, -0.05434093, -0.27348273,
        -0.11575947, -0.19935211],
       [ 0.02177396, -0.70633685, -0.02168042, -0.06103015,  0.00800452,
        -0.55024692, -0.28206051],
       [-0.04076438, -0.81092807,  0.24945797, -0.05038797, -0.37316402,
        -0.45825882, -0.29554614],
       [ 1.19303608,  2.00095907,  1.76061001,  0.54584206, -0.29120287,
        -0.49984776, -0.27031695]])

In [11]:
# Build a DataFrame from the scaled data, reusing the original `coin_id` index
# so that every row keeps its crypto name
scaled_df = pd.DataFrame(scaled_data, columns=columns_list, index=df_market_data.index)

# Display sample data
scaled_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


In [12]:
# Check the summary statistics after re-scaling
# (every column should now have a mean of ~0 and a standard deviation of ~1)
scaled_df.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,0.0,1.895503e-16,2.707861e-17,2.978647e-17,-5.4157220000000004e-18,-1.326852e-16,4.197185e-17
std,1.012423,1.012423,1.012423,1.012423,1.012423,1.012423,1.012423
min,-4.981042,-1.682027,-2.217108,-1.393153,-0.9560487,-0.5511464,-0.2963296
25%,-0.127467,-0.7066688,-0.6299628,-0.460558,-0.5517599,-0.4998478,-0.2817468
50%,0.077497,-0.1906843,-0.009190922,-0.06103015,-0.1592496,-0.3550537,-0.2255326
75%,0.33128,0.4931931,0.6435649,0.1165382,0.01606038,-0.0473611,-0.1454693
max,1.919812,2.572251,2.907054,5.351455,4.769913,4.63238,6.088625


In [13]:
# Plot the re-scaled data as line charts
scaled_df.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [14]:
# Plot the re-scaled data transposed, so that the price-change periods run along the x-axis
scaled_df_T=scaled_df.transpose()
scaled_df_T.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

In [15]:
# The plots above show two clear 'outliers': 'ethlend' and 'celsius-degree-token'.
# Out of curiosity, cross-check them with the IQR rule (the `outliers` helper) on the
# 200-day column; the outliers are kept in the data regardless of the result.
outl_ind_list=outliers(scaled_df,'price_change_percentage_200d')
outl_ind_list

['theta-token', 'ethlend', 'havven', 'omisego', 'celsius-degree-token']

### Find the Best Value for k Using the Original Data.

In [16]:
# Compute the inertia and Calinski-Harabasz score for every cluster count in the range [1;10]
# so that we can decide on the optimal number of clusters
inertia_calinski_df=elbow_calinski_scores(10,scaled_df)

In [17]:
# Keep the evaluated cluster counts as a separate array (used as x-ticks in the plots below)
# and show it together with the score table
k=np.array(inertia_calinski_df['k'])
display(k)
display(inertia_calinski_df)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

Unnamed: 0,k,inertia,calinski
0,1,287.0,
1,2,198.571818,17.367515
2,3,123.190482,25.264783
3,4,79.022435,32.459853
4,5,66.002038,30.135155
5,6,52.101806,31.559124
6,7,46.073087,29.632321
7,8,37.284807,31.573954
8,9,32.161417,31.694945
9,10,28.075253,31.766478


In [18]:
# Overlay the inertia (elbow) curve and the Calinski-Harabasz curve over the
# evaluated values of k to visually identify the optimal value for k.
i = inertia_calinski_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k,
    color='green',
    label='Elbow Curve',
)
c = inertia_calinski_df.hvplot.line(
    x='k',
    y='calinski',
    xticks=k,
    color='blue',
    label='Calinski Curve',
).opts(
    xlabel='number of clusters',
    ylabel='metric value',
    title='Elbow and Calinski Scores X Cluster Count',
)
i * c

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:**
The elbow curve shows that inertia drops significantly up to k=4, after which the decrease slows down. At the same time, the Calinski-Harabasz score peaks at k=4. Based on both criteria, k=4 is the best choice for the number of clusters for our data.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [19]:
# Cluster the scaled data with KMeans using 4 clusters (the best k found above)
clusters_number=4
scaled_df_res=kmeans_predictions_for_train_df(clusters_number,scaled_df)
scaled_df_res.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,KMpredition_4_clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,2
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,2
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,0
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,0
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,2


In [20]:
# Count how many coins fall into each cluster.
# The column name has to match the one built inside `kmeans_predictions_for_train_df`.
col_name = f'KMpredition_{clusters_number}_clusters'
scaled_df_res[col_name].value_counts()

KMpredition_4_clusters
0    26
2    13
1     1
3     1
Name: count, dtype: int64

In [21]:
# Scatter plot of the clustered, scaled data with hvPlot:
# 24h price change on the x-axis, 7d price change on the y-axis.
# Points are coloured by the K-Means label, and 'coin_id' is passed in
# `hover_cols` to identify the cryptocurrency represented by each data point.
scaled_df_res.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by=col_name,
    hover_cols=['coin_id'],
    title='Original Data Clustering by KMeans (24h X 7d)',
)


---

### Optimize Clusters with Principal Component Analysis.

In [22]:
# Create a PCA model instance that keeps three principal components (`n_components=3`).
pca = PCA(n_components=3)

In [23]:
# Use the PCA model with `fit_transform` to reduce the scaled data to
# three principal components.
pca_a_list=pca.fit_transform(scaled_df)

# View the first five rows of the resulting array.
pca_a_list[0:5]

array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])

In [24]:
# Retrieve the explained variance ratio to determine how much information
# can be attributed to each principal component.
var_rac_arr=pca.explained_variance_ratio_
var_rac_arr

array([0.3719856 , 0.34700813, 0.17603793])

In [25]:
# Sum the ratios to get the total share of variance explained by the three principal components
total_perc=np.sum(var_rac_arr)
total_perc

0.8950316570309842

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The three principal components together explain about 89.5% of the total variance (0.372 + 0.347 + 0.176 ≈ 0.895).

In [26]:
# Create a new DataFrame with the PCA data, reusing the original `coin_id` index
# so that every row keeps its crypto name
df_market_data_pca = pd.DataFrame(
    pca_a_list,
    columns=['PC1', 'PC2', 'PC3'],
    index=df_market_data.index,
)

# Display sample data
df_market_data_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [27]:
# Compute the inertia and Calinski-Harabasz score on the PCA data for every cluster count
# in the range [1;10] so that we can decide on the optimal number of clusters
inertia_calinski_pca_df=elbow_calinski_scores(10,df_market_data_pca)


In [28]:
# Keep the evaluated cluster counts as a separate array (used as x-ticks in the plots below)
# and show it together with the score table for the PCA data
k_pca=np.array(inertia_calinski_pca_df['k'])
display(k_pca)
display(inertia_calinski_pca_df)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

Unnamed: 0,k,inertia,calinski
0,1,256.874086,
1,2,168.811896,20.344688
2,3,93.774626,33.046144
3,4,49.665497,51.455694
4,5,37.985344,51.862073
5,6,27.720626,57.865728
6,7,21.630165,61.629158
7,8,17.016962,66.4487
8,9,13.668325,71.173538
9,10,10.48489,80.942562


In [37]:
# Overlay the inertia (elbow) curve and the Calinski-Harabasz curve computed on the
# PCA data over the evaluated values of k to visually identify the optimal value for k.
i = inertia_calinski_pca_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k_pca,
    color='green',
    label='Elbow Curve',
)
c = inertia_calinski_pca_df.hvplot.line(
    x='k',
    y='calinski',
    xticks=k_pca,
    color='blue',
    label='Calinski Curve',
).opts(
    xlabel='number of clusters',
    ylabel='metric value',
    title='Elbow and Calinski Scores X Cluster Count',
)
i * c


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

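  * **Answer:** The elbow curve for the PCA data has the same shape as the one for the original data: inertia drops sharply up to k=4 (from about 256.9 at k=1 to about 49.7 at k=4) and flattens afterwards, so the best number of clusters for the PCA data is 4 (k=4). Note that, unlike for the original data, the Calinski-Harabasz score does not peak at k=4 here; it keeps increasing with k, so the choice rests on the elbow criterion.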


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** The best number of clusters for PCA data is the same as for original data.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [30]:
# Cluster the PCA data (only 3 features: PC1, PC2, PC3) with KMeans,
# using the same number of clusters (4) as for the original data
df_market_data_pca_clustered=kmeans_predictions_for_train_df(clusters_number,df_market_data_pca)
df_market_data_pca_clustered.head()


Unnamed: 0_level_0,PC1,PC2,PC3,KMpredition_4_clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,2
ethereum,-0.458261,0.458466,0.952877,2
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,2


In [31]:
# Count how many coins fall into each cluster
# (`col_name` is the label column name defined earlier for the original data)
df_market_data_pca_clustered[col_name].value_counts()

KMpredition_4_clusters
0    26
2    13
1     1
3     1
Name: count, dtype: int64

In [32]:
# Scatter plot of the clustered PCA data with hvPlot: PC1 on the x-axis, PC2 on the y-axis.
# Points are coloured by the K-Means label, and the crypto name is passed in
# `hover_cols` to identify the cryptocurrency represented by each data point.
df_market_data_pca_clustered.hvplot.scatter(
    x='PC1',
    y='PC2',
    by=col_name,
    hover_cols=['coin_id'],
    title='PCA Data Clustering by KMeans',
)



### Visualize and Compare the Results

In this section, we will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [33]:
# Composite plot to contrast the elbow curves: original data on top, PCA data below
i_original = inertia_calinski_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k,
    label='Elbow Curve (Original Data)',
).opts(
    xlabel='number of clusters',
    ylabel='inertia value',
    title='Elbow Curve (Original Data)',
)
i_pca = inertia_calinski_pca_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k_pca,
    label='Elbow Curve (PCA Data)',
).opts(
    xlabel='number of clusters',
    ylabel='inertia value',
    title='Elbow Curve (PCA Data)',
)
# Stack the two plots in a single column
iop_combined = (i_original + i_pca).cols(1)
iop_combined

In [34]:
# Composite plot to contrast the Calinski-Harabasz curves (both curves on the same axes)
c_original = inertia_calinski_df.hvplot.line(
    x='k',
    y='calinski',
    xticks=k,
    color='darkviolet',
    label='Calinski (Original Data)',
)
c_pca = inertia_calinski_pca_df.hvplot.line(
    x='k',
    y='calinski',
    xticks=k_pca,
    color='indigo',
    label='Calinski (PCA Data)',
).opts(
    xlabel='number of clusters',
    title='Calinski Score (Original vs PCA Data)',
)
c_original * c_pca

In [38]:
# Composite plot to contrast the clusters: original data on top, PCA data below
original_data_clustering = scaled_df_res.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by=col_name,
    alpha=0.7,
    line_color='black',
    hover_cols=['coin_id'],
).opts(title='Original Data Clustering')
pca_data_clustering = df_market_data_pca_clustered.hvplot.scatter(
    x='PC1',
    y='PC2',
    by=col_name,
    alpha=0.7,
    line_color='black',
    hover_cols=['coin_id'],
).opts(title='PCA Data Clustering')
# Stack the two plots in a single column
op_combined = (original_data_clustering + pca_data_clustering).cols(1)
op_combined

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** When clustering crypto_market_data with fewer dimensions, the inertia (elbow) curve follows a very similar pattern to the one obtained with the full set of features, with the elbow at k=4 in both cases, so the optimal number of clusters remains consistent. The Calinski-Harabasz scores do differ: they are higher for the PCA data and keep increasing with k instead of peaking at k=4. With k=4 the cluster sizes are identical for both data sets (26, 13, 1 and 1 coins). The clusters in the PCA scatter plot look well separated, unlike those in the scatter plot of the original features.