# Clustering Crypto

In [38]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [39]:
# Read the raw cryptocurrency dataset from disk.
# The first CSV column is used as the row index.

file_path = Path("Resources/crypto_data.csv")
crypto_df = pd.read_csv(
    file_path,
    index_col=[0],
)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [45]:
# Restrict the frame to the coins whose IsTrading flag is True.
is_trading = crypto_df["IsTrading"]
crypto_df = crypto_df[is_trading]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000


In [46]:
# Keep all the cryptocurrencies that have a working algorithm.
# Implemented by discarding every row that has a missing value in any column.
crypto_df = crypto_df.dropna(axis=0, how="any")

crypto_df.head()


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000


In [51]:
# Drop the "IsTrading" column from the frame.
crypto_df = crypto_df.drop(columns=["IsTrading"])

In [52]:
# Remove rows that have at least 1 null value.
# No action needed here: the earlier dropna(how='any', axis='rows') call already removed every row containing a null.
# crypto_df = crypto_df.dropna(how="any", axis="rows")

In [53]:
# Retain only the coins with a strictly positive TotalCoinsMined value.
has_mined_coins = crypto_df["TotalCoinsMined"].gt(0)
crypto_df = crypto_df.loc[has_mined_coins]

In [54]:
# Set the coin names aside as a Series that shares crypto_df's index.
crypto_df_names = crypto_df["CoinName"]
crypto_df_names.head()

42        42 Coin
404       404Coin
1337    EliteCoin
BTC       Bitcoin
ETH      Ethereum
Name: CoinName, dtype: object

In [55]:
# CoinName is not used by the clustering algorithm, so remove it from the frame.
crypto_df = crypto_df.drop(columns=["CoinName"])
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [56]:
# One-hot encode the Algorithm and ProofType text columns with get_dummies().
text_features = ["Algorithm", "ProofType"]
X = pd.get_dummies(crypto_df, columns=text_features)

In [57]:
# Fit a StandardScaler on the encoded features and transform them in one step.
feature_scaler = StandardScaler()
X_scale = feature_scaler.fit_transform(X)

### Deliverable 2: Reducing Data Dimensions Using PCA

In [59]:
# Reduce the standardized features to three principal components.
# `principal_comp` keeps the fitted PCA object; `crypto_principal_comp` holds
# the transformed data.
principal_comp = PCA(
    n_components=3,
)
crypto_principal_comp = principal_comp.fit_transform(X_scale)

In [60]:
# Wrap the PCA output in a DataFrame that shares crypto_df's index.
pcs_df = pd.DataFrame(
    crypto_principal_comp,
    index=crypto_df.index,
    columns=["pc1", "pc2", "pc3"],
)

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [61]:
# Elbow curve: fit K-Means on the principal components for k = 1..10 and
# record each fitted model's inertia.
k = list(range(1, 11))
inertia = []

for num_clusters in k:
    # fit() returns the estimator itself, so construction and fitting chain.
    km = KMeans(n_clusters=num_clusters, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

# Plot inertia against k; the line chart is the cell's output.
elbow = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)


  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [62]:
# Build the four-cluster K-Means model and fit it on the principal components
# (fit() returns the fitted estimator).
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)

# Predict a cluster label for every row of the principal-component frame.
predictions = model.predict(pcs_df)

In [64]:
# Put the fitted cluster labels into a one-column frame ("Class") indexed like X.
predictions_df = pd.DataFrame({"Class": model.labels_}, index=X.index)

# Concatenate the features, principal components, coin names and classes
# column-wise, keeping only the index labels present in every piece (inner join).
pieces = [crypto_df, pcs_df, crypto_df_names, predictions_df]
clustered_df = pd.concat(pieces, axis=1, join="inner")

# Print the shape of clustered_df, then preview its first ten rows.
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pc1,pc2,pc3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.339865,1.039011,-0.596318,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.323226,1.039303,-0.596765,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.303675,1.714735,-0.669352,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.147228,-1.335536,0.209545,Bitcoin,2
ETH,Ethash,PoW,107684200.0,0,-0.142755,-1.993729,0.419899,Ethereum,2
LTC,Scrypt,PoW,63039240.0,84000000,-0.169848,-1.126708,-0.015434,Litecoin,2
DASH,X11,PoW/PoS,9031294.0,22000000,-0.389307,1.242192,-0.541664,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.153739,-2.236119,0.399907,Monero,2
ETC,Ethash,PoW,113359700.0,210000000,-0.1412,-1.99382,0.419874,Ethereum Classic,2
ZEC,Equihash,PoW,7383056.0,21000000,-0.145456,-2.037694,0.409435,ZCash,2


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [65]:
# Interactive 3D scatter of the three principal components, coloured by the
# "Class" column; hovering shows the coin name, algorithm and total supply.
scatter_options = dict(
    title="PCA - Crypto Data",
    x="pc1",
    y="pc2",
    z="pc3",
    color="Class",
    opacity=0.9,
    width=700,
    hover_name="CoinName",
    hover_data=["Algorithm", "TotalCoinSupply"],
)
fig = px.scatter_3d(clustered_df, **scatter_options)

# Place the legend at x=0, y=1 in the figure layout.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [18]:
# Create a table with tradable cryptocurrencies.
# The configured table has to be the cell's final expression: Jupyter renders
# only the value of the last expression, so a trailing bare
# `clustered_df.hvplot.table()` call would discard this column selection and the
# sortable/selectable options and show an unconfigured table instead.
clustered_df.hvplot.table(
    columns=[
        'CoinName',
        'Algorithm',
        'ProofType',
        'TotalCoinSupply',
        'TotalCoinsMined',
        'Class',
    ],
    sortable=True,
    selectable=True,
)

In [73]:
# Report how many cryptocurrencies (rows) clustered_df contains.
size = len(clustered_df.index)
print(f'The total number of cryptocurrencies to trade is {size}.')

The total number of cryptocurrencies to trade is 532.


In [72]:
# Rescale the two coin-count columns with MinMaxScaler for the scatter plot.
# The scaled values go into a new DataFrame that keeps clustered_df's index.
columns = ['TotalCoinSupply', 'TotalCoinsMined']
scaled_values = MinMaxScaler().fit_transform(clustered_df[columns])
plot_df = pd.DataFrame(scaled_values, index=clustered_df.index, columns=columns)


Unnamed: 0,TotalCoinSupply,TotalCoinsMined
42,4.200000e-11,0.000000e+00
404,5.320000e-04,1.065855e-03
1337,3.141593e-01,2.957551e-02
BTC,2.100000e-05,1.810842e-05
ETH,0.000000e+00,1.087731e-04
...,...,...
ZEPH,2.000000e-03,2.020225e-03
GAP,2.500000e-04,1.508199e-05
BDX,1.400223e-03,9.901351e-04
ZEN,2.100000e-05,7.370282e-06


In [75]:
# Copy the "CoinName" and "Class" columns from clustered_df into plot_df,
# one column at a time.
for label_column in ('CoinName', 'Class'):
    plot_df[label_column] = clustered_df[label_column]
plot_df.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,2
ETH,0.0,0.000109,Ethereum,2


In [76]:
# hvplot scatter of TotalCoinsMined (x) against TotalCoinSupply (y), grouped by
# "Class", with the coin name added to the hover columns.
plot_df.hvplot.scatter(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    by='Class',
    hover_cols="CoinName",
    legend="bottom_left",
)
