# Clustering Crypto

In [362]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [363]:
# CryptoCompare endpoint that serves the full coin list as JSON
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [364]:
# Download the coin list. The timeout keeps the notebook from hanging forever
# on a stalled connection, and raise_for_status() surfaces HTTP errors here
# rather than as a confusing failure when the JSON payload is parsed later.
crypto = requests.get(url, timeout=30)
crypto.raise_for_status()


In [365]:
# Build a frame from the payload's 'Data' entry and flip it so that the
# entries of 'Data' become rows and their attributes become columns.
crypto_df = pd.DataFrame(crypto.json()['Data']).T

In [366]:
# Preview the first five rows of the raw coin table
crypto_df.head(n=5)

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.488885,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [367]:
# List the available column names before choosing which ones to keep
crypto_df.columns

Index(['Id', 'Url', 'ImageUrl', 'ContentCreatedOn', 'Name', 'Symbol',
       'CoinName', 'FullName', 'Description', 'AssetTokenStatus', 'Algorithm',
       'ProofType', 'SortOrder', 'Sponsored', 'Taxonomy', 'Rating',
       'IsTrading', 'TotalCoinsMined', 'CirculatingSupply', 'BlockNumber',
       'NetHashesPerSecond', 'BlockReward', 'BlockTime', 'AssetLaunchDate',
       'AssetWhitepaperUrl', 'AssetWebsiteUrl', 'MaxSupply', 'MktCapPenalty',
       'IsUsedInDefi', 'IsUsedInNft', 'PlatformType', 'AlgorithmType',
       'Difficulty', 'BuiltOn', 'SmartContractAddress', 'DecimalPoints'],
      dtype='object')

### Data Preprocessing

In [368]:
# Columns requested for the analysis:
#   'CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply'
# The API response has no 'TotalCoinSupply' column, so 'CirculatingSupply' is
# used in its place.
# NOTE(review): 'MaxSupply' is also present and may be a closer proxy - confirm.
x_cols = [
    'CoinName',
    'Algorithm',
    'IsTrading',
    'ProofType',
    'TotalCoinsMined',
    'CirculatingSupply',
]
crypto_df = crypto_df[x_cols]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,41.999952
300,300 token,,True,,300.0,0.0
365,365Coin,X11,True,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,0.0
433,433 Token,,False,,,


In [369]:
# Retain only the coins flagged as currently trading
crypto_df = crypto_df.loc[crypto_df['IsTrading'].eq(True)]

In [370]:
# Discard coins whose algorithm is recorded as the 'N/A' placeholder
crypto_df = crypto_df.loc[crypto_df['Algorithm'].ne('N/A')]

In [371]:
# 'IsTrading' has served its purpose as a filter; drop it from the frame
crypto_df = crypto_df.drop('IsTrading', axis='columns')

In [372]:
# Discard any row that has a null in at least one column
crypto_df = crypto_df.dropna(axis='index', how='any')

In [373]:
# Keep only coins with a strictly positive number of mined coins
crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'].gt(0)]

In [374]:
# Drop every row that holds the text placeholder 'N/A' in any column:
# a row survives only if all of its cells differ from 'N/A'.
crypto_df = crypto_df.loc[crypto_df.ne('N/A').all(axis=1)]

In [375]:
# Inspect the first five rows after cleaning
crypto_df.head(n=5)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952
NSR,NuShares,PoS,PoS,6178033605.8373,0.0
TRI,Triangles Coin,X13,PoW/PoS,191624.022943,0.0
CMTC,CometCoin,Scrypt,PoW,872830.0,0.0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,0.0


In [376]:
# Set the coin names aside (a Series sharing crypto_df's index) so they can be
# re-attached after clustering
CoinName_df = crypto_df.loc[:, 'CoinName']

In [377]:
# 'CoinName' is a label, not a feature, so keep it out of the clustering input
crypto_df = crypto_df.drop('CoinName', axis='columns')

In [378]:
# Preview the feature columns that remain
crypto_df.head(n=5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,Scrypt,PoW/PoS,41.999952,41.999952
NSR,PoS,PoS,6178033605.8373,0.0
TRI,X13,PoW/PoS,191624.022943,0.0
CMTC,Scrypt,PoW,872830.0,0.0
CHAT,Scrypt,PoW/PoS,1000000000.0,0.0


In [379]:
# One-hot encode the two text features; the other columns are carried over as-is
X = pd.get_dummies(
    data=crypto_df,
    columns=['Algorithm', 'ProofType'],
)

In [380]:
# Preview the encoded feature matrix
X.head(n=5)

Unnamed: 0,TotalCoinsMined,CirculatingSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,41.999952,41.999952,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6178033605.8373,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,191624.022943,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [381]:
# Standardize every feature column; X is replaced by the scaled array
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Reducing Dimensions Using PCA

In [382]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state pins the randomized SVD solver that PCA may pick automatically,
# so the components (and the clusters built on them) are reproducible across
# runs, in line with the seeded KMeans models used in this notebook.
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(X)

In [383]:
# Wrap the principal components in a DataFrame that reuses crypto_df's index
pcs_df = pd.DataFrame(
    crypto_pca,
    index=crypto_df.index,
    columns=["PC1", "PC2", "PC3"],
)
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.229946,1.084832,-1.437895
NSR,-0.124271,1.417266,-0.284618
TRI,-0.174194,1.869168,-1.747097
CMTC,-0.367993,-0.916144,-0.371212
CHAT,-0.229939,1.084831,-1.437895


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [384]:
# Fit only on the principal-component columns: pcs_df gains a "class" column
# once clustering has run, and re-running this cell afterwards would otherwise
# feed the cluster labels back in as a feature.
pc_features = pcs_df[["PC1", "PC2", "PC3"]]

k = list(range(1, 11))

# Inertia of a seeded K-Means fit for each candidate number of clusters
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pc_features).inertia_
    for n_clusters in k
]

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [385]:
# Cluster on the principal components only. Selecting the PC columns explicitly
# keeps this cell idempotent: pcs_df receives a "class" column below, and
# without the selection a re-run would cluster on the previous labels as well.
pc_columns = ["PC1", "PC2", "PC3"]

# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and obtain the cluster label of every row in one pass
predictions = model.fit_predict(pcs_df[pc_columns])

# Add the predicted class column
pcs_df["class"] = predictions
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3,class
42,-0.229946,1.084832,-1.437895,0
NSR,-0.124271,1.417266,-0.284618,0
TRI,-0.174194,1.869168,-1.747097,0
CMTC,-0.367993,-0.916144,-0.371212,1
CHAT,-0.229939,1.084831,-1.437895,0


In [386]:
# Combine the original features with the PCA components, cluster labels and coin names
clustered_df = crypto_df.join(other=[pcs_df, CoinName_df], how="left")

In [387]:
# Preview the combined frame
clustered_df.head(n=5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,41.999952,41.999952,-0.229946,1.084832,-1.437895,0,42 Coin
NSR,PoS,PoS,6178033605.8373,0.0,-0.124271,1.417266,-0.284618,0,NuShares
TRI,X13,PoW/PoS,191624.022943,0.0,-0.174194,1.869168,-1.747097,0,Triangles Coin
CMTC,Scrypt,PoW,872830.0,0.0,-0.367993,-0.916144,-0.371212,1,CometCoin
CHAT,Scrypt,PoW/PoS,1000000000.0,0.0,-0.229939,1.084831,-1.437895,0,OpenChat


#### Scatter Plot with Tradable Cryptocurrencies

In [388]:
# Scatter of x="TotalCoinsMined" against y="CirculatingSupply" (the stand-in
# for "TotalCoinSupply"), grouped by cluster
clustered_df.hvplot.scatter(
    by="class",
    hover_cols=["CoinName"],
    x="TotalCoinsMined",
    y="CirculatingSupply",
)

In [389]:
# Scale data to create the scatter plot.
# Work on a copy: plain assignment would only alias clustered_df, so the
# scaling would overwrite its raw supply figures and every later use of
# clustered_df (such as the summary table) would show standardized values.
supply_columns = ["TotalCoinsMined", "CirculatingSupply"]
scaled_clustered_df = clustered_df.copy()
scaler = StandardScaler()
scaled_clustered_df[supply_columns] = scaler.fit_transform(
    scaled_clustered_df[supply_columns]
)

In [390]:
# Scatter of the standardized supply columns, grouped by cluster
scaled_clustered_df.hvplot.scatter(
    by="class",
    hover_cols=["CoinName"],
    x="TotalCoinsMined",
    y="CirculatingSupply",
)

#### Table of Tradable Cryptocurrencies

In [391]:
# Tabulate the tradable cryptocurrencies together with their assigned cluster
clustered_df.hvplot.table(
    columns=[
        "CoinName",
        "Algorithm",
        "ProofType",
        "CirculatingSupply",
        "TotalCoinsMined",
        "class",
    ]
)