# Clustering Crypto

In [4]:
#Initial imports
!pip install -U altair
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import altair as alt

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
     |████████████████████████████████| 727 kB 21.3 MB/s            
Installing collected packages: altair
Successfully installed altair-4.1.0


### Fetching Cryptocurrency Data

In [6]:
# Load the raw coin data; index_col=0 makes the CSV's first (unnamed) column the row index.
crypto_df = pd.read_csv("crypto_data.csv", index_col=0)
# Preview the first five rows.
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [7]:
# List the column labels available after loading.
crypto_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

### Data Preprocessing

In [8]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

# The loaded data already exposes exactly these six columns (including 'TotalCoinSupply'),
# so this selection simply makes the expected schema explicit.
x_cols=['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']
crypto_df=crypto_df[x_cols]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [9]:
# Retain only the rows whose 'IsTrading' flag is True
crypto_df = crypto_df[crypto_df['IsTrading'].eq(True)]

In [10]:
# Discard rows whose 'Algorithm' is the literal text 'N/A'
crypto_df = crypto_df[crypto_df['Algorithm'].ne('N/A')]

In [11]:
# 'IsTrading' has served its purpose as a filter above; remove the column
crypto_df = crypto_df.drop('IsTrading', axis='columns')

In [12]:
# Drop every row that contains one or more missing values
crypto_df = crypto_df.dropna(axis=0, how='any')

In [13]:
# Keep only coins with a strictly positive 'TotalCoinsMined'
crypto_df = crypto_df[crypto_df['TotalCoinsMined'].gt(0)]

In [14]:
# Column by column, discard any row holding the literal text 'N/A'
for column_name in crypto_df.columns:
    keep_rows = crypto_df[column_name].ne('N/A')
    crypto_df = crypto_df[keep_rows]

In [15]:
# Preview the data after the filtering steps above.
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [16]:
# Set the 'CoinName' column aside (as a Series sharing crypto_df's index)
# so it can be joined back onto the results later
CoinName_df = crypto_df.loc[:, 'CoinName']

In [17]:
# 'CoinName' is not used as a clustering feature, so remove it from the feature frame
crypto_df = crypto_df.drop('CoinName', axis='columns')

In [18]:
# Preview the remaining feature columns.
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [19]:
# One-hot encode the two text columns; the other columns pass through unchanged
X = pd.get_dummies(
    data=crypto_df,
    columns=['Algorithm', 'ProofType'],
)

In [20]:
# Preview the encoded feature matrix.
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Standardize every feature column; fit and transform in a single call
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Reducing Dimensions Using PCA

In [22]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state is fixed (matching the seed used for K-Means in this notebook) so the
# components are reproducible if scikit-learn selects a randomized SVD solver;
# it has no effect when a deterministic solver is used.
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(X)

In [23]:
# Wrap the PCA output in a DataFrame that shares crypto_df's index,
# one column per principal component
pcs_df = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=["PC1", "PC2", "PC3"])
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.3315,1.045974,-0.539637
404,-0.314828,1.046204,-0.540031
1337,2.302053,1.671213,-0.649254
BTC,-0.150126,-1.309574,0.109148
ETH,-0.159483,-2.079329,0.467582


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [24]:
# Fit on the principal-component columns only, so that re-running this cell after a
# "class" label column has been added to pcs_df does not change the curve.
pc_features = pcs_df[["PC1", "PC2", "PC3"]]

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia',
).interactive()


Running K-Means with `k=4`

In [25]:
# Cluster on the principal-component columns only: this cell adds a "class" column
# to pcs_df, and fitting on the whole frame would feed those labels back in as a
# feature if the cell were run a second time.
pc_features = pcs_df[["PC1", "PC2", "PC3"]]

# Initialize the K-Means model (random_state fixed for reproducible labels)
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and obtain each row's cluster label in one step
predictions = model.fit_predict(pc_features)

# Add the predicted class column
pcs_df["class"] = predictions
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3,class
42,-0.3315,1.045974,-0.539637,0
404,-0.314828,1.046204,-0.540031,0
1337,2.302053,1.671213,-0.649254,0
BTC,-0.150126,-1.309574,0.109148,3
ETH,-0.159483,-2.079329,0.467582,3


In [26]:
# Join the principal components / cluster labels and the coin names back onto
# the original feature frame (all three share the same index)
clustered_df = crypto_df.join(other=[pcs_df, CoinName_df], how="left")

In [27]:
# Preview the combined frame: original features, principal components, cluster label and coin name.
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.3315,1.045974,-0.539637,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.314828,1.046204,-0.540031,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.302053,1.671213,-0.649254,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.150126,-1.309574,0.109148,3,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.159483,-2.079329,0.467582,3,Ethereum


#### Scatter Plot with Tradable Cryptocurrencies

In [28]:
# Use the altair scatter plot to visualize the clusters. Since this is a 2D-Scatter, use x="PC1" and y="PC2" for the axes,
# and add the following columns as tool tips: "CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply".
# "class" holds integer cluster labels, so it is encoded as nominal (":N") to get
# one discrete color per cluster instead of a continuous gradient.
alt.Chart(clustered_df).mark_circle(size=30).encode(
    x='PC1',
    y='PC2',
    color='class:N',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

In [29]:
# Scatter of the unscaled coin counts, colored by cluster.
# "class" holds integer cluster labels, so it is encoded as nominal (":N") to get
# one discrete color per cluster instead of a continuous gradient.
alt.Chart(clustered_df).mark_circle(size=30).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class:N',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

In [30]:
# Scale data to create the scatter plot.
# Work on a copy: plain assignment would only alias clustered_df, so the scaling
# below would silently overwrite its raw TotalCoinsMined/TotalCoinSupply values.
supply_cols = ["TotalCoinsMined", "TotalCoinSupply"]
scaled_clustered_df = clustered_df.copy()
# Note: this rebinds `scaler`, replacing the feature scaler fitted earlier in the notebook.
scaler = StandardScaler()
scaled_clustered_df[supply_cols] = scaler.fit_transform(scaled_clustered_df[supply_cols])

In [31]:
# Scatter of the standardized coin counts, colored by cluster.
# "class" holds integer cluster labels, so it is encoded as nominal (":N") to get
# one discrete color per cluster instead of a continuous gradient.
alt.Chart(scaled_clustered_df).mark_circle(size=30).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class:N',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

#### Table of Tradable Cryptocurrencies

In [32]:
# Table with tradable cryptos
# Show the frame whose TotalCoinsMined/TotalCoinSupply columns were standardized above.
display(scaled_clustered_df)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,-0.117108,-0.152870,-0.331500,1.045974,-0.539637,0,42 Coin
404,Scrypt,PoW/PoS,-0.093970,-0.145009,-0.314828,1.046204,-0.540031,0,404Coin
1337,X13,PoW/PoS,0.524946,4.489424,2.302053,1.671213,-0.649254,0,EliteCoin
BTC,SHA-256,PoW,-0.116715,-0.152560,-0.150126,-1.309574,0.109148,3,Bitcoin
ETH,Ethash,PoW,-0.114747,-0.152870,-0.159483,-2.079329,0.467582,3,Ethereum
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,-0.073251,-0.123317,2.459714,0.766315,-0.124086,0,ZEPHYR
GAP,Scrypt,PoW/PoS,-0.116781,-0.149176,-0.329544,1.045869,-0.539661,0,Gapcoin
BDX,CryptoNight,PoW,-0.095613,-0.132179,0.326393,-2.309637,0.421195,3,Beldex
ZEN,Equihash,PoW,-0.116948,-0.152560,-0.136899,-2.006932,0.346501,3,Horizen


In [92]:
# NOTE(review): `sagemaker` and `linear_predictor` are not imported or defined anywhere
# in the visible notebook, and this cell's execution count (92) is out of sequence with
# the preceding cells. It looks like a leftover from another notebook and would raise
# NameError on Restart & Run All — confirm and remove.
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)