# Clustering Crypto

In [27]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder


### Deliverable 1: Preprocessing the Data for PCA

In [28]:
# Loading the crypto_data.csv dataset.
filePath = Path('Resources/crypto_data.csv')
cryptoDf = pd.read_csv(filePath, index_col=0)
cryptoDf.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [29]:
# Checking Value type for each column
cryptoDf.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [30]:
# Keeping all the cryptocurrencies that are being traded by filtering the data frame
cryptoTradingDf = cryptoDf[cryptoDf['IsTrading'] == True]
cryptoTradingDf

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [31]:
# Looking for columns with null values
cryptoTradingDf.isnull().value_counts()

CoinName  Algorithm  IsTrading  ProofType  TotalCoinsMined  TotalCoinSupply
False     False      False      False      False            False              685
                                           True             False              459
dtype: int64

In [32]:
# Removing the "IsTrading column.
cryptoTradingDf.drop(columns=['IsTrading'], axis=1, inplace=True)
cryptoTradingDf.head()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [33]:
# Remove rows that have at least 1 null value.
cryptoTradingDf.dropna(inplace=True)
cryptoTradingDf



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [34]:
# Keeping the rows where coins are mined.
cryptoTradingDf = cryptoTradingDf[cryptoTradingDf['TotalCoinsMined'] > 0]
cryptoTradingDf

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [35]:
# Creating a new DataFrame that holds only the cryptocurrencies names.
cryptoNameDf = cryptoTradingDf['CoinName']
cryptoNameDf

42          42 Coin
404         404Coin
1337      EliteCoin
BTC         Bitcoin
ETH        Ethereum
           ...     
ZEPH         ZEPHYR
GAP         Gapcoin
BDX          Beldex
ZEN         Horizen
XBC     BitcoinPlus
Name: CoinName, Length: 532, dtype: object

In [36]:
# Dropping the 'CoinName' column since it's not going to be used on the clustering algorithm.
cryptoTradingDf.drop(columns=['CoinName'], axis=1, inplace=True)
cryptoTradingDf.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [37]:
# Using get_dummies() to create variables for text features.


cryptoTradingDfEncoded = cryptoTradingDf.copy()


X = pd.get_dummies(cryptoTradingDfEncoded, columns=['Algorithm', 'ProofType'])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Standardize the data with StandardScaler().
cryptoTradingScaled = StandardScaler().fit_transform(X)

print(cryptoTradingScaled[0:5])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Deliverable 2: Reducing Data Dimensions Using PCA

In [39]:
# Using PCA to reduce dimension to three principal components.
indexList = (X.index.to_list())
# Initializing PCA model
pca = PCA(n_components=3)

# getting 3 principal components
cryptoPca = pca.fit_transform(cryptoTradingScaled)

# Transforming PCA data to a Dataframe
cryptoPcaDf = pd.DataFrame(
    data=cryptoPca,
    columns=['principal component 1', 'principal component 2', 'principal component 3'],
    index=indexList

)

cryptoPcaDf

Unnamed: 0,principal component 1,principal component 2,principal component 3
42,-0.334232,1.009840,-0.405680
404,-0.317549,1.009936,-0.405930
1337,2.309971,1.634033,-0.488289
BTC,-0.150525,-1.337592,0.209693
ETH,-0.153238,-2.039426,0.304013
...,...,...,...
ZEPH,2.453460,0.837964,0.252091
GAP,-0.332275,1.009711,-0.405699
BDX,0.324726,-2.330784,0.335851
ZEN,-0.138689,-2.032197,0.279947


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [40]:
# Initializing inertia list and Instantiating a list of K values to test with
inertia = []
k = list(range(1,11))

# Looking for best K value
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(cryptoPcaDf)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [41]:
# Creating an elbow curve to find the best value for K.
elbowData = {'k':k, 'inertia': inertia}
dfElbow = pd.DataFrame(elbowData)
dfElbow.hvplot.line(x='k', y='inertia', title='Crypto Elbow Curve', xticks=k)

Running K-Means with `k=4`

In [42]:
# Initializing the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# fitting the model
model.fit(cryptoPcaDf)

# Predicting K clusters
predictions = model.predict(cryptoPcaDf)
print(predictions)


# Adding class column to df
cryptoPcaDf['Class'] = model.labels_

[0 0 0 3 3 3 0 3 3 3 0 3 0 0 3 0 3 3 0 0 3 3 3 3 3 0 3 3 3 0 3 0 3 3 0 0 3
 3 3 3 3 3 0 0 3 3 3 3 3 0 0 3 0 3 3 3 3 0 3 3 0 3 0 0 0 3 3 3 0 0 0 0 0 3
 3 3 0 0 3 0 3 0 0 3 3 3 3 0 0 3 0 3 3 0 0 3 0 0 3 3 0 0 3 0 0 3 0 3 0 3 0
 3 0 0 3 3 0 3 3 3 0 3 3 3 3 3 0 0 3 3 3 0 3 0 3 3 0 3 0 3 0 0 3 3 0 3 3 0
 0 3 0 3 0 0 0 3 3 3 3 0 0 0 0 0 3 3 0 0 0 0 0 3 0 0 0 0 0 3 0 3 0 0 3 0 3
 0 0 3 0 3 0 3 0 3 0 0 0 0 3 0 0 0 0 0 3 3 0 0 3 3 0 0 0 0 0 3 0 0 0 0 0 0
 0 0 3 0 0 0 0 0 0 3 3 3 0 0 0 0 3 0 3 0 0 3 0 3 3 0 3 3 0 3 0 0 0 3 0 0 3
 0 0 0 0 0 0 0 3 0 3 0 0 0 0 3 0 3 0 3 3 3 3 0 3 0 0 3 0 3 3 3 0 3 0 3 3 3
 0 3 0 3 0 0 0 3 0 3 3 3 3 3 0 0 3 0 0 0 3 0 3 0 3 0 3 0 0 0 0 3 0 0 3 0 0
 0 3 3 3 3 0 0 0 0 3 0 3 3 3 0 0 3 3 0 0 3 0 3 3 3 0 3 3 0 0 0 3 3 3 0 0 0
 3 3 0 3 3 3 3 0 1 1 3 3 3 0 1 0 0 0 0 3 3 3 3 0 0 0 3 0 3 0 0 0 0 3 0 0 3
 0 0 3 3 0 3 0 3 3 3 3 0 0 3 0 3 0 0 0 0 0 0 3 3 3 0 0 0 0 0 0 3 0 3 3 3 3
 0 0 0 0 3 0 0 3 0 0 3 0 3 0 3 3 0 0 3 0 3 3 0 3 3 0 3 0 3 0 0 3 0 0 0 0 0
 3 3 3 0 0 0 3 0 3 0 3 0 

In [43]:
# Creating a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentating the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = cryptoTradingDf.join(cryptoPcaDf,how='inner')
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,principal component 1,principal component 2,principal component 3,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.334232,1.00984,-0.40568,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317549,1.009936,-0.40593,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.309971,1.634033,-0.488289,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.150525,-1.337592,0.209693,3
ETH,Ethash,PoW,107684200.0,0,-0.153238,-2.039426,0.304013,3


In [44]:
#  Adding a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies.
clustered_df = clustered_df.join(cryptoNameDf,how='inner')
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,principal component 1,principal component 2,principal component 3,Class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.334232,1.00984,-0.40568,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317549,1.009936,-0.40593,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.309971,1.634033,-0.488289,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.150525,-1.337592,0.209693,3,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.153238,-2.039426,0.304013,3,Ethereum


In [45]:
# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,principal component 1,principal component 2,principal component 3,Class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.334232,1.00984,-0.40568,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317549,1.009936,-0.40593,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.309971,1.634033,-0.488289,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.150525,-1.337592,0.209693,3,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.153238,-2.039426,0.304013,3,Ethereum
LTC,Scrypt,PoW,63039240.0,84000000,-0.16662,-1.130174,0.011091,3,Litecoin
DASH,X11,PoW/PoS,9031294.0,22000000,-0.394276,1.243601,-0.399694,0,Dash
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.148786,-2.17741,0.272191,3,Monero
ETC,Ethash,PoW,113359700.0,210000000,-0.151678,-2.039537,0.303998,3,Ethereum Classic
ZEC,Equihash,PoW,7383056.0,21000000,-0.138688,-2.032197,0.279947,3,ZCash


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [46]:
# Creating a 3D-Scatter with the PCA data and the clusters

fig = px.scatter_3d(
    clustered_df,
    x='principal component 1',
    y='principal component 2',
    z='principal component 3',
    color='Class',
    symbol='Class',
    hover_name='CoinName',
    hover_data=['TotalCoinsMined','TotalCoinSupply','Algorithm', 'ProofType']
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [47]:
# Creating a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['CoinName','Algorithm','ProofType','TotalCoinSupply','TotalCoinsMined','Class'])

In [48]:
# Print the total number of tradable cryptocurrencies.
clustered_df['CoinName'].count()

532

In [49]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
clustDf = clustered_df[['TotalCoinSupply','TotalCoinsMined']]
minMax = MinMaxScaler().fit_transform(clustDf)
minMax

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [50]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
scatterDf = pd.DataFrame(
    data=minMax,
    columns=['TotalCoinSupplyScaled', 'TotalCoinsMinedScaled'],
    index= clustered_df.index.tolist()
)
scatterDf.head()

# Adding the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
scatterDf = scatterDf.join(cryptoNameDf,how='inner')

# Adding the "Class" column from the clustered_df DataFrame to the new DataFrame.
classColumn = clustered_df['Class']
scatterDf = scatterDf.join(classColumn, how='inner')

scatterDf.head(10)

Unnamed: 0,TotalCoinSupplyScaled,TotalCoinsMinedScaled,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,3
ETH,0.0,0.000109,Ethereum,3
LTC,8.4e-05,6.4e-05,Litecoin,3
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,3
ETC,0.00021,0.000115,Ethereum Classic,3
ZEC,2.1e-05,7e-06,ZCash,3


In [51]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
scatterDf.hvplot.scatter(x='TotalCoinsMinedScaled',
                         y='TotalCoinSupplyScaled',
                         by='Class',
                         xlabel='Total Coins Mined',
                         ylabel='Total Coin Supply',
                         title='Crypto Scatter',
                         hover_cols=['CoinName']
                         )
