# Clustering Crypto

In [1]:
!pip install   -U altair

Collecting altair
[?25l  Downloading https://files.pythonhosted.org/packages/a8/07/d8acf03571db619ff117df5730dd5c0b1ad0822aa02ad1084d73e2659442/altair-4.0.1-py3-none-any.whl (708kB)
[K    100% |████████████████████████████████| 716kB 9.6MB/s eta 0:00:01
[?25hRequirement not upgraded as not directly required: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (2.10)
Requirement not upgraded as not directly required: entrypoints in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (0.2.3)
Requirement not upgraded as not directly required: toolz in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (0.9.0)
Requirement not upgraded as not directly required: jsonschema in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (2.6.0)
Requirement not upgraded as not directly required: numpy in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from alt

In [26]:
# Initial imports
import pandas as pd
#import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import altair as alt


### Data Preprocessing

In [5]:
# Loade the cryptocurrencies data
file_path = Path("crypto_data.csv")
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [6]:
# Keep only cryptocurrencies that are on trading
crypto_df = crypto_df.loc[crypto_df['IsTrading']==True]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,True,PoW/PoS,2.927942e+10,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,True,PoW,1.076842e+08,0
LTC,Litecoin,Scrypt,True,PoW,6.303924e+07,84000000


In [7]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df['Algorithm']!='N/A']

crypto_df.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,980222600.0,1400222610
ZEN,Horizen,Equihash,True,PoW,7296538.0,21000000
XBC,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000


In [8]:
# Remove the "IsTrading" column

crypto_df = crypto_df.drop(['IsTrading'], axis=1)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [10]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [11]:
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [12]:
# Remove rows with cryptocurrencies without coins mined
crypto_df= crypto_df[crypto_df['TotalCoinsMined']>0]


In [13]:
# Fetch the cryptocurrencies names prior to drop them from crypto_df
names = crypto_df['CoinName']
names.tail()

ZEPH         ZEPHYR
GAP         Gapcoin
BDX          Beldex
ZEN         Horizen
XBC     BitcoinPlus
Name: CoinName, dtype: object

In [14]:
# Remove the cryptocurrency name since it's not going to be used on the clustering algorithm
crypto_df1 = crypto_df.drop(['CoinName'], axis=1)
crypto_df1.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [15]:
# Create dummies variables for text features
dummies = pd.get_dummies(crypto_df1, columns=['Algorithm', 'ProofType'])
dummies.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Standardize data
crypto_scaled = StandardScaler().fit_transform(dummies)
print(crypto_scaled[0:5])


Data with input dtype uint8, float64, object were all converted to float64 by StandardScaler.


Data with input dtype uint8, float64, object were all converted to float64 by StandardScaler.



[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Reducing Dimensions Using PCA

In [17]:
# Use PCA to reduce dimension to 3 principal components
pca = PCA(n_components=3)
pca_crypto = pca.fit_transform(crypto_scaled)

In [18]:
# Create a DataFrame with the principal components data
pca_df = pd.DataFrame(data=pca_crypto, 
                      columns=['pc_1', 'pc_2', 'pc_3']
                     )
pca_df.head()

Unnamed: 0,pc_1,pc_2,pc_3
0,-0.333707,1.055805,-0.405019
1,-0.317049,1.056018,-0.405079
2,2.30442,1.696648,-0.494578
3,-0.147242,-1.363722,0.120897
4,-0.155469,-1.978008,0.251193


In [19]:
#Fetch the explained variance
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

[0.02793065 0.02137264 0.0204663 ]
[0.02793065 0.04930329 0.06976959]


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [42]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range ok k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_df)
    inertia.append(km.inertia_)
# Create the Elbow Curve using altair
elbow_data = {'k':k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)

In [59]:
#Altair scatter plot
elbow_curve = alt.Chart(elbow_df).mark_circle().encode(
    alt.X('pc_1', type='quantitative'),
    alt.Y('pc_2', type='quantitative')
)
elbow_curve

Running K-Means with `k=4`

In [60]:
# Initialize the K-Means model
km = KMeans(n_clusters=4, random_state=0)

# Fit the model
km.fit(pca_df)

# Predict clusters
predictions = km.predict(pca_df)
# Creating a new DataFrame including predicted clusters and cryptocurrencies features
pca_df['class'] = km.labels_
pca_df.index = crypto_df1.index

In [61]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_cluster = pd.DataFrame({
    'Algorithm': crypto_df1.Algorithm,
    'ProofType': crypto_df1.ProofType,
    'TotalCoinsMined': crypto_df1.TotalCoinsMined,
    'TotalCoinSupply': crypto_df1.TotalCoinSupply,
    'principal component 1': pca_df.pc_1,
    'principal component 2': pca_df.pc_2,
    'principal component 3': pca_df.pc_3,
    'CoinName': crypto_df.CoinName,
    'Class': km.labels_,
    },
    index=crypto_df1.index
)
df_cluster.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,principal component 1,principal component 2,principal component 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.333707,1.055805,-0.405019,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317049,1.056018,-0.405079,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.30442,1.696648,-0.494578,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.147242,-1.363722,0.120897,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.155469,-1.978008,0.251193,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.166041,-1.098227,-0.018555,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.397751,1.182152,-0.361569,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.133145,-2.257153,0.298566,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.153914,-1.9781,0.25118,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.114579,-2.095426,0.141422,ZCash,1


### Visualizing Results

#### 3D-Scatter with Clusters

In [None]:
# Create a 3D-Scatter with the PCA data and the clusters
df_cluster = pd.concat([crypto_df1, pca_df, names], axis=1)

source = df_cluster
alt.Chart(source).mark_point().encode(
    
    x='pc_1',
    y='pc_2',
    
    color='Origin',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"] # show in a tooltip
).interactive()

#### Table of Tradable Cryptocurrencies

In [91]:
# Table with tradable cryptos
display(df_cluster.head())

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pc_1,pc_2,pc_3,class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.333707,1.055805,-0.405019,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317049,1.056018,-0.405079,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.30442,1.696648,-0.494578,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.147242,-1.363722,0.120897,1,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.155469,-1.978008,0.251193,1,Ethereum


In [21]:
# Print the total number of tradable cryptocurrencies
df_cluster.CoinName.count()

532

#### Scatter Plot with Tradable Cryptocurrencies

In [22]:
# Scale data to create the scatter plot
df_cluster['TotalCoinSupply'] = df_cluster['TotalCoinSupply'].astype('float')

In [23]:
# Standardize  and scale data down dividing by 1 million
scaled_2 = df_cluster[['TotalCoinSupply', 'TotalCoinsMined']]
scaled_3 = scaled_2.astype(float) / 1_000_000     #tried to get better data as range is too wide
scaled_3 = scaled_2.sort_values(by=['TotalCoinSupply'], ascending=True, inplace=False)
scaled_3.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
EXCL,0.0,5679705.0
BLU,0.0,635423900.0
FLT,0.0,461829900.0
CYP,0.0,6365285.0
UIS,0.0,65113300.0


In [24]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
scaled_3.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply')