# Clustering Crypto

In [142]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os

### Deliverable 1: Preprocessing the Data for PCA

In [143]:
# Load the crypto_data.csv dataset.
file_path = os.path.join("Resources", "crypto_data.csv")

file = pd.read_csv(file_path)

df = file.set_index('Unnamed: 0')

df.index.name =''

df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000


In [144]:
# Keep all the cryptocurrencies that are being traded.
df_trading = df[df['IsTrading'] == True]

df_trading

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610


In [145]:
# Keep all the cryptocurrencies that have a working algorithm.

df_trading['Algorithm'].isna().sum()

0

In [146]:
# Remove the "IsTrading" column. 
df_trading.drop(columns= ['IsTrading'], inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [147]:
# Remove rows that have at least 1 null value.
df_trading = df_trading.dropna()

df_trading

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [148]:
# Keep the rows where coins are mined.
df_mine = df_trading[df_trading['TotalCoinsMined'] > 0]

df_mine

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [149]:
# Create a new DataFrame that holds only the cryptocurrencies names.
df_names = pd.DataFrame(data = df_mine['CoinName'])

df_names

Unnamed: 0,CoinName
,
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex


In [150]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
df_mine.drop(columns = ['CoinName'], inplace= True)

df_mine



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610


In [151]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(df_mine, columns = ["Algorithm", "ProofType"])

X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [152]:
# Standardize the data with StandardScaler().
df_scaled = StandardScaler().fit_transform(X)

df_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [153]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)

pca_list = pca.fit_transform(df_scaled)

pca_list

array([[-0.33757259,  1.04507756, -0.62455251],
       [-0.32075602,  1.04534246, -0.62522905],
       [ 2.32268521,  1.65191201, -0.71367931],
       ...,
       [ 0.32413919, -2.32158556,  0.44630535],
       [-0.14685872, -2.07091715,  0.43189146],
       [-0.28527685,  0.84641197, -0.33858194]])

In [154]:
# Create a DataFrame with the three principal components.
pca_df = pd.DataFrame(data=pca_list, columns = ['pc1', 'pc2', 'pc3'])

pca_df

Unnamed: 0,pc1,pc2,pc3
0,-0.337573,1.045078,-0.624553
1,-0.320756,1.045342,-0.625229
2,2.322685,1.651912,-0.713679
3,-0.153502,-1.336117,0.227348
4,-0.171906,-2.052186,0.457476
...,...,...,...
527,2.341245,0.711833,0.182889
528,-0.335596,1.044976,-0.624617
529,0.324139,-2.321586,0.446305
530,-0.146859,-2.070917,0.431891


In [155]:
print(f'pca ratio {pca.explained_variance_ratio_}')

print(f'pca sum {sum(pca.explained_variance_ratio_)}')

pca ratio [0.02792164 0.02141309 0.02049886]
pca sum 0.06983359420849507


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [156]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
inertia = []
k= list(range(1,11))

for i in k:
    km = KMeans(n_clusters = i, random_state = 1)
    km.fit(pca_df)
    inertia.append(km.inertia_)
    
elbow_data = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)

df_elbow.hvplot.line(x='k', y='inertia')


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [157]:
# Initialize the K-Means model.
model =KMeans(n_clusters = 4, random_state = 1)

# Fit the model
model.fit(pca_df)

# Predict clusters
prediction = model.predict(pca_df)

print(prediction)

[1 1 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0
 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 1 1 1 1 1 0
 0 0 1 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1
 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 1 0 0 1
 1 0 1 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 0
 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 1 0
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0
 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1
 1 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 1
 0 0 1 0 0 0 0 1 3 3 0 0 0 1 3 1 1 1 1 0 0 0 0 1 1 1 0 1 0 1 1 1 1 0 1 1 0
 1 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0
 1 1 1 1 0 1 1 0 1 1 0 3 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1
 0 0 0 1 1 1 0 1 0 1 0 1 

In [158]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
pca_df.index = df_mine.index

clustered_df = pd.concat([df_mine, pca_df, df_names], axis =1 )

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

clustered_df['class'] = model.labels_ 

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pc1,pc2,pc3,CoinName,class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.337573,1.045078,-0.624553,42 Coin,1.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.320756,1.045342,-0.625229,404Coin,1.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0,2.322685,1.651912,-0.713679,EliteCoin,1.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.153502,-1.336117,0.227348,Bitcoin,0.0
ETH,Ethash,PoW,107684200.0,0.0,-0.171906,-2.052186,0.457476,Ethereum,0.0
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.16514,-1.105361,-0.047971,Litecoin,0.0
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.397422,1.204324,-0.471145,Dash,1.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.148779,-2.241577,0.435255,Monero,0.0
ETC,Ethash,PoW,113359700.0,210000000.0,-0.170331,-2.052274,0.457425,Ethereum Classic,0.0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [179]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
        clustered_df,
        x='pc1',
        y='pc2',
        z='pc3',
        color= 'class',
        symbol= 'class',
        hover_name = 'CoinName',
        hover_data = ['CoinName', 'Algorithm'],
        width=800)
fig.update_layout(legend=dict(x=0, y=1))

fig.show()


In [182]:
# Create a table with tradable cryptocurrencies.
tradable = clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply',
                                              'TotalCoinsMined', 'class'] ,sortable=True, selectable=True)
tradable

In [193]:
# Print the total number of tradable cryptocurrencies.
coin = len(clustered_df['CoinName'].unique())
print(f'Total number of tradable currencies {coin}')

Total number of tradable currencies 531


In [202]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
supplymine = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
supplymine_scaled = MinMaxScaler().fit_transform(supplymine)
print(supplymine_scaled.shape)
supplymine_scaled

(532, 2)


array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [214]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
supplymine_df = pd.DataFrame(data=supplymine_scaled, columns = ['TotalCoinSupply', 'TotalCoinsMined'])

supplymine_df.index = clustered_df.index

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE
move = clustered_df[['CoinName', 'class']]

plot_df = supplymine_df.join(move)
plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,class
,,,,
42,4.2e-11,0.0,42 Coin,1.0
404,0.000532,0.001066,404Coin,1.0
1337,0.3141593,0.029576,EliteCoin,1.0
BTC,2.1e-05,1.8e-05,Bitcoin,0.0
ETH,0.0,0.000109,Ethereum,0.0
LTC,8.4e-05,6.4e-05,Litecoin,0.0
DASH,2.2e-05,9e-06,Dash,1.0
XMR,0.0,1.7e-05,Monero,0.0
ETC,0.00021,0.000115,Ethereum Classic,0.0


In [234]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
plot_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    color = 'class',
    hover_cols='CoinName',
    xlabel = "Total Coins Mined",
    ylabel = "Total Coin Supply"
    )