# Clustering Crypto

In [10]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [11]:
# Load the crypto_data.csv dataset.
crypto_df = pd.read_csv('crypto_data.csv', index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [12]:
# Keep all the cryptocurrencies that are being traded.
traded_crypto_df = crypto_df.query('IsTrading==True')
traded_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [13]:
# Remove the "IsTrading" column. 
traded_crypto_df.drop(columns=['IsTrading'], axis=1, inplace=True)
traded_crypto_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [14]:
# Remove rows that have at least 1 null value.
traded_crypto_df.dropna(how='any', axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [16]:
# Keep the rows where coins are mined.
traded_crypto_df = traded_crypto_df[traded_crypto_df['TotalCoinsMined']>0]
traded_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [17]:
# Create a new DataFrame that holds only the cryptocurrencies names.
Crypto_Names_DF = traded_crypto_df[['CoinName']]
Crypto_Names_DF.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [19]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
traded_crypto_df.drop(columns='CoinName',axis=1,inplace=True)
traded_crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [20]:
# Use get_dummies() to create variables for text features.
traded_crypto_dummies_df = pd.get_dummies(traded_crypto_df, columns=['Algorithm', 'ProofType'])
traded_crypto_dummies_df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Standardize the data with StandardScaler().
scaler = StandardScaler()
traded_crypto_scaled = scaler.fit_transform(traded_crypto_dummies_df)

### Deliverable 2: Reducing Data Dimensions Using PCA

In [22]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
traded_crypto_pca = pca.fit_transform(traded_crypto_scaled)
traded_crypto_pca

array([[-0.32820554,  0.9822063 , -0.28004673],
       [-0.31148477,  0.98216833, -0.28016738],
       [ 2.31080747,  1.65360261, -0.28537762],
       ...,
       [ 0.31282274, -2.25241767,  0.207118  ],
       [-0.13658201, -2.14286434,  0.20821083],
       [-0.2631786 ,  0.74391773, -0.14531337]])

In [23]:
# Create a DataFrame with the three principal components.
principal_components_df = pd.DataFrame(data=traded_crypto_pca, columns=['Principal Component 1','Principal Component 2', 'Principal Component 3'], index=traded_crypto_dummies_df.index)
principal_components_df.head(10)

Unnamed: 0,Principal Component 1,Principal Component 2,Principal Component 3
42,-0.328206,0.982206,-0.280047
404,-0.311485,0.982168,-0.280167
1337,2.310807,1.653603,-0.285378
BTC,-0.15618,-1.245211,0.096818
ETH,-0.170575,-1.933563,0.136315
LTC,-0.160861,-1.159224,-0.031387
DASH,-0.408509,1.287307,-0.193242
XMR,-0.162338,-2.16505,0.16927
ETC,-0.169012,-1.93368,0.136312
ZEC,-0.136581,-2.142864,0.208211


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [28]:
# Create an elbow curve to find the best value for K.
inertia = []
K = list(range(1,11))
for i in K:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(principal_components_df)
    inertia.append(km.inertia_)

elbow_data = {'K': K, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='K', y='inertia', xticks='K', title="Elbow Curve")


  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [30]:
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=1)

# Fit the model
model.fit(principal_components_df)

# Predict clusters
predictions = model.predict(principal_components_df)
predictions

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,

In [31]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clusters_df = pd.concat([traded_crypto_df,principal_components_df], axis=1, join='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clusters_df = pd.concat([pd.concat([clusters_df,Crypto_Names_DF],axis=1, join='inner')])

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clusters_df['Class'] = model.labels_

# Print the shape of the clustered_df
print(clusters_df.shape)
clusters_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Principal Component 1,Principal Component 2,Principal Component 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.328206,0.982206,-0.280047,42 Coin,1
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.311485,0.982168,-0.280167,404Coin,1
1337,X13,PoW/PoS,29279420000.0,314159265359,2.310807,1.653603,-0.285378,EliteCoin,1
BTC,SHA-256,PoW,17927180.0,21000000,-0.15618,-1.245211,0.096818,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.170575,-1.933563,0.136315,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000,-0.160861,-1.159224,-0.031387,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000,-0.408509,1.287307,-0.193242,Dash,1
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.162338,-2.16505,0.16927,Monero,0
ETC,Ethash,PoW,113359700.0,210000000,-0.169012,-1.93368,0.136312,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000,-0.136581,-2.142864,0.208211,ZCash,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [34]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clusters_df,
    x='Principal Component 1',
    y='Principal Component 2',
    z='Principal Component 3',
    color='Class',
    symbol='Class',
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [44]:
# Create a table with tradable cryptocurrencies.
clusters_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'])

In [37]:
# Print the total number of tradable cryptocurrencies.
len(clusters_df)

532

In [45]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
scatter_crypto = MinMaxScaler().fit_transform(clusters_df[['TotalCoinSupply', 'TotalCoinsMined']])
scatter_crypto

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [47]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = pd.DataFrame(data=scatter_crypto, columns=['TotalCoinSupply', 'TotalCoinsMined'], index=clusters_df.index.copy())

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df['CoinName'] = clusters_df['CoinName']

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df['Class'] = clusters_df['Class']

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,1
404,0.000532,0.001066,404Coin,1
1337,0.3141593,0.029576,EliteCoin,1
BTC,2.1e-05,1.8e-05,Bitcoin,0
ETH,0.0,0.000109,Ethereum,0
LTC,8.4e-05,6.4e-05,Litecoin,0
DASH,2.2e-05,9e-06,Dash,1
XMR,0.0,1.7e-05,Monero,0
ETC,0.00021,0.000115,Ethereum Classic,0
ZEC,2.1e-05,7e-06,ZCash,0


In [49]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(x="TotalCoinsMined",y="TotalCoinSupply", by='Class',hover_cols='CoinName')
