# Clustering Crypto

In [25]:
# Initial imports
import requests
import pandas as pd
from path import Path
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [26]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [27]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
# Use the following endpoint to fetch json data
#url = "https://min-api.cryptocompare.com/data/all/coinlist"
# Create a DataFrame 
#r = requests.get('url')
#data = r.json()

In [28]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
coins_df = pd.read_csv(file_path)
coins_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [29]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = coins_df.drop('Unnamed: 0', axis=1)
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [30]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df.loc[coins_df['IsTrading'] == True]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [31]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df.Algorithm != 'N/A']
#coins_df.head()

In [32]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop('IsTrading', axis=1)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [33]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [34]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df.TotalCoinsMined != 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [35]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df != "N/A"]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [36]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name = crypto_df['CoinName']
coin_df = pd.DataFrame({"Coin Name": coin_name})
coin_df.head()


Unnamed: 0,Coin Name
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [37]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop('CoinName', axis=1)
crypto_df


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [38]:
# Create dummy variables for text features
crypto_dummies = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
crypto_dummies

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Standardize data
crypto_scaler = StandardScaler().fit_transform(crypto_dummies)
len(crypto_scaler)

533

### Reducing Dimensions Using PCA

In [40]:
# Use PCA to reduce dimensions to 3 principal components
# Initialize PCA model
pca = PCA(n_components=3)

# Get two principal components for the iris data.
iris_pca = pca.fit_transform(crypto_scaler)

In [41]:
# Create a DataFrame with the principal components data
crypto_pca = pd.DataFrame(data=iris_pca, columns=['principal component 1', 'principal component 2', 'principal component 3'])
crypto_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.33574,1.008901,-0.619126
1,-0.319037,1.009427,-0.619696
2,2.316265,1.66139,-0.759087
3,-0.142452,-1.315012,0.203002
4,-0.15245,-2.02268,0.3953


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [42]:
# Finding the best value for k
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(crypto_pca)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [43]:
# Initialize the K-Means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(crypto_pca)

# Predict clusters
predictions = model.predict(crypto_pca)
len(predictions)


533

In [44]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features
crypto_df["Class"] = model.labels_
crypto_df["Predictions"]= predictions
crypto_df["Coin Name"]= coin_name
crypto_df[["PC1","PC2","PC3"]] = crypto_pca

clustered_df = crypto_df
clustered_df.head()



Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Class,Predictions,Coin Name,PC1,PC2,PC3
0,Scrypt,PoW/PoS,41.99995,42,0,0,42 Coin,-0.33574,1.008901,-0.619126
2,Scrypt,PoW/PoS,1055185000.0,532000000,0,0,404Coin,2.316265,1.66139,-0.759087
5,X13,PoW/PoS,29279420000.0,314159265359,0,0,EliteCoin,-0.165752,-1.073178,-0.008329
7,SHA-256,PoW,17927180.0,21000000,1,1,Bitcoin,-0.140627,-2.193439,0.503228
8,Ethash,PoW,107684200.0,0,1,1,Ethereum,-0.150887,-2.022737,0.395271


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [45]:
# Scale data to create the scatter plot
scaled = MinMaxScaler().fit_transform(clustered_df[["TotalCoinSupply","TotalCoinsMined"]])
scaled_df = pd.DataFrame(scaled, columns = ["TotalCoinSupply","TotalCoinsMined"], index = crypto_df.index)
clustered_df = clustered_df.drop(columns=['TotalCoinsMined','TotalCoinSupply'])

clustered_df = clustered_df.join(scaled_df)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,Class,Predictions,Coin Name,PC1,PC2,PC3,TotalCoinSupply,TotalCoinsMined
0,Scrypt,PoW/PoS,0,0,42 Coin,-0.33574,1.008901,-0.619126,4.2e-11,0.005942
2,Scrypt,PoW/PoS,0,0,404Coin,2.316265,1.66139,-0.759087,0.000532,0.007002
5,X13,PoW/PoS,0,0,EliteCoin,-0.165752,-1.073178,-0.008329,0.3141593,0.035342
7,SHA-256,PoW,1,1,Bitcoin,-0.140627,-2.193439,0.503228,2.1e-05,0.00596
8,Ethash,PoW,1,1,Ethereum,-0.150887,-2.022737,0.395271,0.0,0.00605


In [46]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["Coin Name"])

#### Table of Tradable Cryptocurrencies

In [47]:
# Table with tradable cryptos
tradable_crypto = clustered_df[["Coin Name", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"]]
tradable_crypto.hvplot.table(sortable=True, selectable=True)




In [48]:
# Print the total number of tradable cryptocurrencies
tradable_coins = clustered_df['Coin Name'].unique()
tradable_coins = list(tradable_coins)
print(f" The Total Number of Tradable cryptocurrencies are {len(tradable_coins)}.")

 The Total Number of Tradable cryptocurrencies are 532.
