## Clustering Crypto
---
* This is similar to the first clustering project. The difference is that I imported altair and removed all hvplot and Plotly Express. I did this so the visualizations would be compatiable with AWS SageMaker, which I later deployed this on. 

In [39]:
# Initial imports
import requests
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [3]:
# Create a DataFrame
path = ("~/Desktop/crypto_data.csv")
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [4]:
# Keep only necessary columns
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [5]:
df['IsTrading'].value_counts(False)

True     1144
False     108
Name: IsTrading, dtype: int64

In [6]:
# Keep only cryptocurrencies that are trading
df = df[df.IsTrading]
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [7]:
df.shape

(1144, 6)

In [8]:
# Keep only cryptocurrencies with a working algorithm
df['Algorithm'].dropna()
df.shape

(1144, 6)

In [9]:
# Remove the "IsTrading" column
df = df.drop(['IsTrading'], axis=1)
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [10]:
# Remove rows with at least 1 null value
df = df.dropna()
df.shape

(685, 5)

In [11]:
# Remove rows with cryptocurrencies having no coins mined
indexname = df[df['TotalCoinsMined'] == 0.000000e+00].index
df.drop(indexname, inplace=True)
df.shape

(533, 5)

In [12]:
# Drop rows where there are 'N/A' text values
df[df.CoinName.str.contains('N/A', case=False)]
df[df.Algorithm.str.contains('N/A', case=False)]
df[df.ProofType.str.contains('N/A', case=False)]
df.shape

(533, 5)

In [13]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name = df['CoinName']
coin_name = pd.DataFrame(coin_name)
coin_name.head()

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [14]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df = df.drop(['CoinName'], axis=1)
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [15]:
# Create dummy variables for text features
features = pd.get_dummies(df)
features.head()

Unnamed: 0,TotalCoinsMined,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,TotalCoinSupply_91388946,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
0,41.99995,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1055185000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,29279420000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,17927180.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Standardize data
from sklearn import preprocessing
scaler = StandardScaler().fit_transform(features)
print(scaler[0:5])

[[-0.11674788 -0.0433555  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.09358885 -0.0433555  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [ 0.52587231 -0.0433555  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.11635442 -0.0433555  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.11438445 -0.0433555  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]]


###  Reducing Dimensions Using PCA

In [17]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
pca = pca.fit_transform(scaler)

In [18]:
# Create a DataFrame with the principal components data
df_pca = pd.DataFrame(
    data=pca, columns=["PC1", "PC2", "PC3"]
)
df_pca.head()

Unnamed: 0,PC1,PC2,PC3
0,-0.297104,-0.125681,-0.004635
1,-0.281407,-0.125776,-0.012389
2,0.455793,0.019764,0.180462
3,-0.253011,-0.053749,-0.162912
4,0.023762,-0.078886,-0.281624


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for k Using the Elbow Curve

In [19]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)
    
    # Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [20]:
alt.Chart(df_elbow).mark_circle(size=60).encode(
    x='k',
    y='inertia',
    tooltip=['k', 'inertia']
).interactive()

* Running K-Means k = 4

In [21]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
    
# Fit the model
model.fit(df_pca)
    
# Predict clusters
predictions = model.predict(df_pca)
    
# Create a new DataFrame including predicted clusters and cryptocurrencies features
df['Class'] = model.labels_
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Class
0,Scrypt,PoW/PoS,41.99995,42,0
2,Scrypt,PoW/PoS,1055185000.0,532000000,0
5,X13,PoW/PoS,29279420000.0,314159265359,0
7,SHA-256,PoW,17927180.0,21000000,0
8,Ethash,PoW,107684200.0,0,0


In [22]:
df_p = df.join(df_pca)
df_new = df_p.join(coin_name)
df_new.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Class,PC1,PC2,PC3,CoinName
0,Scrypt,PoW/PoS,41.99995,42,0,-0.297104,-0.125681,-0.004635,42 Coin
2,Scrypt,PoW/PoS,1055185000.0,532000000,0,0.455793,0.019764,0.180462,404Coin
5,X13,PoW/PoS,29279420000.0,314159265359,0,-0.296886,-0.0912,-0.131029,EliteCoin
7,SHA-256,PoW,17927180.0,21000000,0,-0.043039,-0.031452,-0.206425,Bitcoin
8,Ethash,PoW,107684200.0,0,0,-0.098026,-0.045993,-0.283385,Ethereum


### Visualizing Results

#### 2D-Scatter with Clusters

In [23]:
alt.Chart(df_new).mark_circle(size=60).encode(
    x='PC1',
    y='PC2',
    tooltip=['CoinName', "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

### Table of Tradable Cryptocurrencies

In [37]:
# Table with tradable cryptos
alt.Chart.display(table)

In [25]:
# Print the total number of tradable cryptocurrencies
df_new['CoinName'].count()

533

### Scatter Plot with Tradable Cryptocurrencies

In [26]:
a = df_new['TotalCoinSupply'].values
a = pd.DataFrame(a)

a.head()

Unnamed: 0,0
0,42
1,532000000
2,314159265359
3,21000000
4,0


In [27]:
# Scale data to create the scatter plot
min_max = MinMaxScaler()
a_scaled = min_max.fit_transform(a)
a['TotalCoinSupply'] = a_scaled
a.drop([0])
a.head()

Unnamed: 0,0,TotalCoinSupply
0,42,4.2e-11
1,532000000,0.000532
2,314159265359,0.3141593
3,21000000,2.1e-05
4,0,0.0


In [28]:
df_new = df_new.drop(['Algorithm', 'ProofType', 'TotalCoinSupply', 'Class', 'PC1', 'PC2', 'PC3'], axis=1)
df_new.head()

Unnamed: 0,TotalCoinsMined,CoinName
0,41.99995,42 Coin
2,1055185000.0,404Coin
5,29279420000.0,EliteCoin
7,17927180.0,Bitcoin
8,107684200.0,Ethereum


In [35]:
scaled_df = df_new.join(a)
scaled_df = scaled_df.drop([0], axis=1)
scaled_df.head()

Unnamed: 0,TotalCoinsMined,CoinName,TotalCoinSupply
0,41.99995,42 Coin,4.2e-11
2,1055185000.0,404Coin,0.3141593
5,29279420000.0,EliteCoin,8.4e-05
7,17927180.0,Bitcoin,0.0
8,107684200.0,Ethereum,0.00021


In [36]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(scaled_df).mark_circle(size=60).encode(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    tooltip=["TotalCoinsMined", "TotalCoinSupply"]
).interactive()