# Clustering Crypto

In [45]:
# Initial imports
!pip install -U altair
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import altair as alt



### Fetching Cryptocurrency Data

In [46]:
crypto_df = pd.read_csv("Resources/crypto_data.csv", index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [47]:
crypto_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

### Data Preprocessing

In [49]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

#There is no column TotalCoinSupply, Im going to use CirculatingSupply
x_cols=['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']
crypto_df=crypto_df[x_cols]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [50]:
# Keep only cryptocurrencies that are trading
crypto_df=crypto_df.loc[crypto_df['IsTrading']==True]

In [51]:
# Keep only cryptocurrencies with a working algorithm
crypto_df=crypto_df.loc[crypto_df['Algorithm']!='N/A']

In [52]:
# Remove the "IsTrading" column
crypto_df=crypto_df.drop(columns=['IsTrading'])

In [53]:
# Remove rows with at least 1 null value
crypto_df=crypto_df.dropna()

In [54]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df=crypto_df.loc[crypto_df['TotalCoinsMined']>0]

In [55]:
# Drop rows where there are 'N/A' text values
for i in crypto_df.columns:
    crypto_df=crypto_df.loc[crypto_df[i]!='N/A']

In [56]:
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [57]:
# Store the 'CoinName' Column in its own DataFrame
CoinName_df=crypto_df['CoinName']

In [58]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df=crypto_df.drop(columns=['CoinName'])

In [59]:
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [60]:
# Create dummy variables for text features
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])

In [61]:
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Standardize data
scaler = StandardScaler()
scaler.fit(X)
X=scaler.transform(X)

### Reducing Dimensions Using PCA

In [63]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X)

In [64]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(
    data=crypto_pca, columns=["PC1","PC2","PC3"], index=crypto_df.index
)
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.358724,0.912893,-0.468987
404,-0.342126,0.912771,-0.46931
1337,2.3126,1.644803,-0.604635
BTC,-0.120456,-1.224542,0.150412
ETH,-0.136211,-1.969202,0.397813


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [65]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia',
).interactive()


  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [66]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Add the predicted class columns
pcs_df["class"] = model.labels_
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3,class
42,-0.358724,0.912893,-0.468987,3
404,-0.342126,0.912771,-0.46931,3
1337,2.3126,1.644803,-0.604635,3
BTC,-0.120456,-1.224542,0.150412,0
ETH,-0.136211,-1.969202,0.397813,0


In [78]:
#Adding back orginal dataframe
clustered_df=crypto_df.join([pcs_df, CoinName_df])

In [85]:
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,-0.117108,-0.15287,-0.358724,0.912893,-0.468987,3,42 Coin
404,Scrypt,PoW/PoS,-0.09397,-0.145009,-0.342126,0.912771,-0.46931,3,404Coin
1337,X13,PoW/PoS,0.524946,4.489424,2.3126,1.644803,-0.604635,3,EliteCoin
BTC,SHA-256,PoW,-0.116715,-0.15256,-0.120456,-1.224542,0.150412,0,Bitcoin
ETH,Ethash,PoW,-0.114747,-0.15287,-0.136211,-1.969202,0.397813,0,Ethereum


#### Scatter Plot with Tradable Cryptocurrencies

In [86]:
# Use the altair scatter plot to visualize the clusters. Since this is a 2D-Scatter, use x="PC 1" and y="PC 2" for the axes, 
# and add the following columns as tool tips: "CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply".
alt.Chart(clustered_df).mark_circle(size=30).encode(
    x='PC1',
    y='PC2',
    color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

In [87]:
alt.Chart(clustered_df).mark_circle(size=30).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

In [88]:
# Scale data to create the scatter plot
scaled_clustered_df=clustered_df
scaler = StandardScaler()
scaler.fit(scaled_clustered_df[["TotalCoinsMined","TotalCoinSupply"]])
scaled_clustered_df[["TotalCoinsMined","TotalCoinSupply"]]=scaler.transform(scaled_clustered_df[["TotalCoinsMined","TotalCoinSupply"]])

In [89]:
alt.Chart(scaled_clustered_df).mark_circle(size=30).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

#### Table of Tradable Cryptocurrencies

In [90]:
# Table with tradable cryptos
display(scaled_clustered_df)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,-0.117108,-0.152870,-0.358724,0.912893,-0.468987,3,42 Coin
404,Scrypt,PoW/PoS,-0.093970,-0.145009,-0.342126,0.912771,-0.469310,3,404Coin
1337,X13,PoW/PoS,0.524946,4.489424,2.312600,1.644803,-0.604635,3,EliteCoin
BTC,SHA-256,PoW,-0.116715,-0.152560,-0.120456,-1.224542,0.150412,0,Bitcoin
ETH,Ethash,PoW,-0.114747,-0.152870,-0.136211,-1.969202,0.397813,0,Ethereum
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,-0.073251,-0.123317,2.531886,1.061982,-0.541969,3,ZEPHYR
GAP,Scrypt,PoW/PoS,-0.116781,-0.149176,-0.356779,0.912737,-0.468986,3,Gapcoin
BDX,CryptoNight,PoW,-0.095613,-0.132179,0.324055,-2.286180,0.380359,0,Beldex
ZEN,Equihash,PoW,-0.116948,-0.152560,-0.157032,-1.925380,0.322654,0,Horizen


In [92]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)