# Clustering Crypto

In [410]:
# Initial imports
import pandas as pd
import hvplot.pandas
import hvplot
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from unicodedata import name

### Deliverable 1: Preprocessing the Data for PCA

In [411]:
# Load the crypto_data.csv dataset.
file_path = "/Users/delonjackson/Desktop/GitHub/Cryptocurrencies/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [412]:
crypto_df.drop(columns=['Unnamed: 0'], inplace=True)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [413]:
# Keep all the cryptocurrencies that are being traded.
crypto_df = crypto_df[crypto_df['IsTrading']==True]
crypto_df.head(10)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0
5,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015 coin,X11,True,PoW/PoS,,0
7,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,Ethereum,Ethash,True,PoW,107684200.0,0
9,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [414]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE

In [415]:
# Remove the "IsTrading" column. 
crypto_df  = crypto_df.drop(['IsTrading'], axis=1) 
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [416]:
# Remove rows that have at least 1 null value.
crypto_df = crypto_df.dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0
9,Litecoin,Scrypt,PoW,63039240.0,84000000
10,Dash,X11,PoW/PoS,9031294.0,22000000
11,Monero,CryptoNight-V7,PoW,17201140.0,0
12,Ethereum Classic,Ethash,PoW,113359700.0,210000000


In [417]:
# Keep the rows where coins are mined.
crypto_df[crypto_df['TotalCoinsMined'] > 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [418]:
# Create a new DataFrame that holds only the cryptocurrencies names.
coins_names = crypto_df.filter(['CoinName'], axis = 1)
coins_names.head(10)

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
4,808
5,EliteCoin
7,Bitcoin
8,Ethereum
9,Litecoin
10,Dash
11,Monero
12,Ethereum Classic


In [419]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(685, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
4,SHA-256,PoW/PoS,0.0,0
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0
9,Scrypt,PoW,63039240.0,84000000
10,X11,PoW/PoS,9031294.0,22000000
11,CryptoNight-V7,PoW,17201140.0,0
12,Ethash,PoW,113359700.0,210000000


In [420]:
# Use get_dummies() to create variables for text features.
crypto_scaled = pd.get_dummies(data=crypto_df, columns=['Algorithm', 'ProofType'])
print(crypto_scaled.shape)
crypto_scaled.head(10)

(685, 109)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,9031294.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11,17201140.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,113359700.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [421]:
# Standardize the data with StandardScaler().
crypto_scaled = StandardScaler().fit_transform(crypto_scaled)
print(crypto_scaled)

[[-0.10282804 -0.03823841 -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]
 [-0.07661326 -0.03823825 -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]
 [-0.10282804 -0.03823841 -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]
 ...
 [-0.0784756  -0.03823801 -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]
 [-0.10264677 -0.0382384  -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]
 [-0.10282485 -0.03823841 -0.03823596 ... -0.03823596 -0.03823596
  -0.03823596]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [422]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)

crypto_pca = pca.fit_transform(crypto_scaled)

crypto_pca

array([[-0.25527958,  1.19533585, -0.46932219],
       [-0.24083354,  1.19416803, -0.46997245],
       [-0.12886361,  0.84067095, -0.35382029],
       ...,
       [-0.10586378, -2.31637382,  0.26964612],
       [-0.23158259, -2.00766761,  0.27196833],
       [-0.12166026,  0.72513287, -0.16351273]])

In [423]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(data= crypto_pca, columns=['PC 1', 'PC 2', 'PC 3'], index=crypto_df.index)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.25528,1.195336,-0.469322
2,-0.240834,1.194168,-0.469972
4,-0.128864,0.840671,-0.35382
5,0.287404,1.990405,-0.568933
7,-0.17782,-1.451417,0.098989
8,-0.268246,-2.159427,0.265452
9,-0.303618,-1.096802,-0.016541
10,-0.216349,1.543631,-0.480741
11,-0.26811,-2.214056,0.360319
12,-0.268168,-2.159433,0.265449


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [424]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))


Running K-Means with `k=4`

In [425]:
# Initialize the K-Means model.
model = KMeans(n_clusters=1, random_state=0)

# Fit the model
model.fit(pcs_df)
print(model.fit(pcs_df))

# Predict clusters
predictions = model.predict(pcs_df)
print(predictions)

KMeans(n_clusters=1, random_state=0)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [426]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
coin_names = pd.DataFrame(crypto_df, index=crypto_df.index)

# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([crypto_df, pcs_df ], axis=1, sort=False)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df["CoinName"] = coins_names ["CoinName"]

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["class"]=model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(685, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class
0,Scrypt,PoW/PoS,41.99995,42,-0.25528,1.195336,-0.469322,42 Coin,0
2,Scrypt,PoW/PoS,1055185000.0,532000000,-0.240834,1.194168,-0.469972,404Coin,0
4,SHA-256,PoW/PoS,0.0,0,-0.128864,0.840671,-0.35382,808,0
5,X13,PoW/PoS,29279420000.0,314159265359,0.287404,1.990405,-0.568933,EliteCoin,0
7,SHA-256,PoW,17927180.0,21000000,-0.17782,-1.451417,0.098989,Bitcoin,0
8,Ethash,PoW,107684200.0,0,-0.268246,-2.159427,0.265452,Ethereum,0
9,Scrypt,PoW,63039240.0,84000000,-0.303618,-1.096802,-0.016541,Litecoin,0
10,X11,PoW/PoS,9031294.0,22000000,-0.216349,1.543631,-0.480741,Dash,0
11,CryptoNight-V7,PoW,17201140.0,0,-0.26811,-2.214056,0.360319,Monero,0
12,Ethash,PoW,113359700.0,210000000,-0.268168,-2.159433,0.265449,Ethereum Classic,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [428]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
    hover_name="CoinName", 
    hover_data=["Algorithm"]
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [429]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns= ['Algorithm','CoinName', 'class', 'TotalCoinsMined', 'TotalCoinSupply', 'ProofType', 'PC 1', 'PC 2', 'PC 3'], sortable=True, selectable=True)

In [430]:
# Print the total number of tradable cryptocurrencies.
print(clustered_df)

        Algorithm ProofType  TotalCoinsMined TotalCoinSupply      PC 1  \
0          Scrypt   PoW/PoS     4.199995e+01              42 -0.255280   
2          Scrypt   PoW/PoS     1.055185e+09       532000000 -0.240834   
4         SHA-256   PoW/PoS     0.000000e+00               0 -0.128864   
5             X13   PoW/PoS     2.927942e+10    314159265359  0.287404   
7         SHA-256       PoW     1.792718e+07        21000000 -0.177820   
...           ...       ...              ...             ...       ...   
1238      SHA-256      DPoS     2.000000e+09      2000000000  4.084533   
1242       Scrypt   PoW/PoS     1.493105e+07       250000000 -0.255075   
1245  CryptoNight       PoW     9.802226e+08      1400222610 -0.105864   
1246     Equihash       PoW     7.296538e+06        21000000 -0.231583   
1247       Scrypt       PoS     1.283270e+05         1000000 -0.121660   

          PC 2      PC 3     CoinName  class  
0     1.195336 -0.469322      42 Coin      0  
2     1.194168 -0

In [445]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
scaled_df = MinMaxScaler().fit_transform(clustered_df[["TotalCoinSupply", "TotalCoinsMined"]])
print(scaled_df)

[[4.55364914e-16 5.94230127e-03]
 [5.76795558e-09 7.00182308e-03]
 [0.00000000e+00 5.94230122e-03]
 ...
 [1.51812440e-08 6.92655266e-03]
 [2.27682457e-10 5.94962775e-03]
 [1.08420218e-11 5.94243008e-03]]


In [446]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = pd.DataFrame(data=scaled_df, index=clustered_df.index)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"] = clustered_df["CoinName"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["class"]= clustered_df ['class']

plot_df.head(10)

Unnamed: 0,0,1,CoinName,class
0,4.553649e-16,0.005942,42 Coin,0
2,5.767956e-09,0.007002,404Coin,0
4,0.0,0.005942,808,0
5,3.406122e-06,0.035342,EliteCoin,0
7,2.276825e-10,0.00596,Bitcoin,0
8,0.0,0.00605,Ethereum,0
9,9.107298e-10,0.006006,Litecoin,0
10,2.385245e-10,0.005951,Dash,0
11,0.0,0.00596,Monero,0
12,2.276825e-09,0.006056,Ethereum Classic,0


In [438]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(
   x='TotalCoinsMined',
   y='TotalCoinSupply',
   hover_cols=['CoinName'],
   by='class',)

DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['TotalCoinsMined', 'TotalCoinSupply']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html