# Clustering Crypto

In [138]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hvplot.pandas

### Deliverable 1: Preprocessing the Data for PCA

In [139]:
# Load data

file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)

crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279424623.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927175.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684223.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039243.0,84000000


In [140]:
# lets check the data for cleansing
crypto_df.shape

(1252, 7)

In [141]:
# Columns 
crypto_df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [142]:
# List dataframe data types
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [143]:
# Find duplicate entries
print(f"Duplicate entries: {crypto_df.duplicated().sum()}")

Duplicate entries: 0


In [144]:
# Keep all the cryptocurrencies that are being traded.
# Transform String column
def change_string(IsTrading):
    if IsTrading == "False":
        return 0
    else:
        return 1
    
crypto_df["IsTrading"] = crypto_df["IsTrading"].apply(change_string)
crypto_df.shape

(1252, 7)

In [145]:
# remove not trade coins to keep all the cryptocurrencies that are being traded
crypto_df = crypto_df[-(crypto_df == 0).any(axis=1)]
crypto_df.shape

(1086, 7)

In [146]:
# Keep all the cryptocurrencies that are being traded.
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,1,PoW/PoS,42.0,42
1,365,365Coin,X11,1,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,1,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,1,PoW,,611000
5,1337,EliteCoin,X13,1,PoW/PoS,29279424623.0,314159265359


In [147]:
# Remove the "IsTrading" column. 
crypto_df.drop(columns=["IsTrading"], inplace=True)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,42.0,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
5,1337,EliteCoin,X13,PoW/PoS,29279424623.0,314159265359


In [148]:
# Remove rows that have at least 1 null value.
# step 1: Find null values
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 508 null values
Column TotalCoinSupply has 0 null values


In [149]:
# Remove rows that have at least 1 null value.
# step 2: dropna
crypto_df = crypto_df.dropna()
crypto_df.shape

(578, 6)

In [150]:
# Keep the rows where coins are mined.
pd.options.display.float_format = '{:,.0f}'.format

In [151]:
crypto_df = crypto_df[crypto_df.TotalCoinsMined > 0]

In [152]:
crypto_df.shape

(577, 6)

In [153]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame.

coins_name = crypto_df.set_index(["Unnamed: 0"])
coins_name

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,42,42
404,404Coin,Scrypt,PoW/PoS,1055184902,532000000
1337,EliteCoin,X13,PoW/PoS,29279424623,314159265359
BTC,Bitcoin,SHA-256,PoW,17927175,21000000
ETH,Ethereum,Ethash,PoW,107684223,0
...,...,...,...,...,...
GAP,Gapcoin,Scrypt,PoW/PoS,14931046,250000000
BDX,Beldex,CryptoNight,PoW,980222595,1400222610
ZEN,Horizen,Equihash,PoW,7296538,21000000
XBC,BitcoinPlus,Scrypt,PoS,128327,1000000


In [154]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
coins_name.drop(columns=["CoinName"], inplace=True)
coins_name

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,42,42
404,Scrypt,PoW/PoS,1055184902,532000000
1337,X13,PoW/PoS,29279424623,314159265359
BTC,SHA-256,PoW,17927175,21000000
ETH,Ethash,PoW,107684223,0
...,...,...,...,...
GAP,Scrypt,PoW/PoS,14931046,250000000
BDX,CryptoNight,PoW,980222595,1400222610
ZEN,Equihash,PoW,7296538,21000000
XBC,Scrypt,PoS,128327,1000000


In [155]:
# We know that our model can’t have strings passed into it. 
# To make sure we can use our string data, we’ll transform our strings of PoW/PoS, PoS, and PoW from the ProofType column to 0,1, and 2, respectively. 
# The function will then be run on the whole column with the .apply method.
coins_name['TotalCoinSupply'] = coins_name['TotalCoinSupply'].apply(float)

In [156]:
# Transform String column
def change_string(ProofType):
    if ProofType == "PoW/PoS":
        return 0
    if ProofType == "Pos":
        return 1
    else:
        return 2
    
coins_name["ProofType"] = coins_name["ProofType"].apply(change_string)
coins_name.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,0,42,42
404,Scrypt,0,1055184902,532000000
1337,X13,0,29279424623,314159265359
BTC,SHA-256,2,17927175,21000000
ETH,Ethash,2,107684223,0


In [157]:
# Saving cleaned data
file_path = "Resources/coins_name.csv"
coins_name.to_csv(file_path, index=False)

In [158]:
# Use get_dummies() to create variables for text features.
X = coins_name[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']].copy()
X = pd.get_dummies(X, columns=['Algorithm'], drop_first=True)
X = X.dropna()
X.head()

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
# Saving cleaned data
file_path = "Resources/X.csv"
X_pca.to_csv(file_path, index=False)

In [160]:
# Standardize the data with StandardScaler().
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame. 
# this is an important step prior to using PCA and K-means algorithms.

from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)
X_scaled

array([[0.00000000e+00, 0.00000000e+00, 4.20000000e-11, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.06585544e-03, 5.32000000e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.95755135e-02, 3.14159265e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 7.37028150e-06, 2.10000000e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.29582282e-07, 1.00000000e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.17085015e-05, 1.00000000e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [161]:
# Using PCA to reduce dimension to three principal components.
#Standardize data with StandardScaler
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-1.40917431 -0.11450078 -0.15072489 -0.04166667 -0.04166667 -0.05897678
  -0.0934947  -0.04166667 -0.05897678 -0.05897678 -0.04166667 -0.04166667
  -0.18949048 -0.05897678 -0.0934947  -0.04166667 -0.11081833 -0.0722944
  -0.04166667 -0.04166667 -0.1518211  -0.04166667 -0.13280318 -0.04166667
  -0.04166667 -0.0835512  -0.05897678 -0.04166667 -0.04166667 -0.04166667
  -0.05897678 -0.04166667 -0.0835512  -0.0934947  -0.10250796 -0.04166667
  -0.1258772  -0.13280318 -0.1518211  -0.04166667 -0.0835512  -0.04166667
  -0.04166667 -0.0722944  -0.17423301 -0.04166667 -0.04166667 -0.04166667
  -0.0722944  -0.16888013 -0.30802055 -0.04166667 -0.0934947  -0.0934947
  -0.05897678  1.39963365 -0.04166667 -0.04166667 -0.04166667 -0.0835512
  -0.04166667 -0.04166667 -0.04166667 -0.04166667 -0.04166667 -0.05897678
  -0.04166667 -0.04166667 -0.39536391 -0.04166667 -0.17423301 -0.04166667
  -0.0835512  -0.0835512  -0.10250796]
 [-1.40917431 -0.09041991 -0.14255828 -0.04166667 -0.04166667 -0.05897678
  

In [162]:
# Initialize PCA model
pca = PCA(n_components=3)

In [163]:
# Get two principal components for the iris data.
X_pca = pca.fit_transform(X_scaled)

In [164]:
# Transform PCA data to a DataFrame
X_pca = pd.DataFrame(
    data=X_pca, columns=["PC 1", "PC 2", "PC 3"]
)
X_pca.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,0,2,-1
1,0,2,-1
2,3,2,1
3,0,-1,1
4,0,-2,0


In [165]:
# Saving cleaned data
file_path = "Resources/X_pca.csv"
X_pca.to_csv(file_path, index=False)

In [166]:
# Loading data
file_path ="Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)

In [167]:
df_y = pd.DataFrame(data=crypto_df, columns=['Unnamed: 0'])
df = X_pca.join(df_y, how='inner')
df.head()

Unnamed: 0.1,PC 1,PC 2,PC 3,Unnamed: 0
0,0,2,-1,42
1,0,2,-1,365
2,3,2,1,404
3,0,-1,1,611
4,0,-2,0,808


In [168]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE
df = df.set_index(["Unnamed: 0"])
df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,0,2,-1
365,0,2,-1
404,3,2,1
611,0,-1,1
808,0,-2,0


In [169]:
df.count()

PC 1    577
PC 2    577
PC 3    577
dtype: int64

### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [170]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE

# import our libraries
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 


In [171]:
# Loading data
X.head(10)

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,2,63039243,84000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,0,9031294,22000000,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
XMR,2,17201143,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,2,113359703,210000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,2,7383056,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [191]:
# Initializing model with K = 3 (since we already know there are three classes of crypto currency)
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [192]:
### Data Points Assigned to Nearest Centroid

In [193]:
# Fitting model
model.fit(X)

KMeans(n_clusters=3, random_state=5)

In [194]:
# Get predictions
predictions = model.predict(X)
print(predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [195]:
#Add a new class column to X
X["class"] = model.labels_
X.head()

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [196]:
# elbow curve
inertia = []
k = range(1, 11)

In [197]:
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

In [198]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}

In [199]:
df_elbow = pd.DataFrame(elbow_data)

In [200]:
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [201]:
#Use the Elbow Curve to Determine the Best K Valu
inertia = []
k = range(1, 11)
# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

In [202]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [203]:
def get_clusters(k, data):
    # Initialize the K-means model
    model = KMeans(n_clusters=k, random_state=0)
    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)
    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_
    
    return data

Running K-Means with `k=4`

In [207]:
# Cluster 0: medium mined, low supply
# Cluster 1: low mined, low supply
# Cluster 2: high mined, low supply
# Cluster 3: low mined, high supply
# Cluster 4: medium mined, high supply
# Cluster 5: very high mined, high supply

In [208]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# Loading data
file_path ="Resources/coins_name.csv"
coins_name = pd.read_csv(file_path)

In [209]:
# Loading data
file_path ="Resources/X_pca.csv"
X_pca = pd.read_csv(file_path)

In [210]:
df_y = pd.DataFrame(data=X_pca, columns=["PC 1", "PC 2", "PC 3"])
df = coins_name.join(df_y, how='inner')
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3
0,Scrypt,0,42,42,0,2,-1
1,Scrypt,0,1055184902,532000000,0,2,-1
2,X13,0,29279424623,314159265359,3,2,1
3,SHA-256,2,17927175,21000000,0,-1,1
4,Ethash,2,107684223,0,0,-2,0


In [211]:
# Loading data
file_path ="Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)

In [212]:
df_y = pd.DataFrame(data=crypto_df, columns=['CoinName'])
df2 = df.join(df_y, how='inner')
df2.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName
0,Scrypt,0,42,42,0,2,-1,42 Coin
1,Scrypt,0,1055184902,532000000,0,2,-1,365Coin
2,X13,0,29279424623,314159265359,3,2,1,404Coin
3,SHA-256,2,17927175,21000000,0,-1,1,SixEleven
4,Ethash,2,107684223,0,0,-2,0,808


In [213]:
df_y = pd.DataFrame(data=four_clusters, columns=['class'])
df3 = df2.join(df_y, how='inner')
df3.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class


In [214]:
df_y = pd.DataFrame(data=crypto_df, columns=['Unnamed: 0'])
clustered_df = df3.join(df_y, how='inner')
clustered_df.head()

Unnamed: 0.1,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class,Unnamed: 0


In [215]:
clustered_df = clustered_df.set_index(["Unnamed: 0"])
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [216]:
# clustering

def get_clusters(k, data):
    # Initialize the K-means model
    model = KMeans(n_clusters=k, random_state=0)
    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)
    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_
    
    return data

In [217]:
# Initialize the K-Means model. k=4
four_clusters = get_clusters(4, X)
four_clusters.head()

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
# Initialize the K-Means model.
# YOUR CODE HERE

# Fit the model
# YOUR CODE HERE

# Predict clusters
# YOUR CODE HERE

In [16]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.332855,1.038358,-0.564944,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.316201,1.038515,-0.565371,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.30004,1.643532,-0.570651,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.149023,-1.309646,0.18262,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.162646,-2.019908,0.380155,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.159391,-1.123165,-0.021041,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.410793,1.224033,-0.517184,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.148242,-2.196597,0.375973,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.161087,-2.02001,0.380143,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.179011,-2.0247,0.433256,ZCash,1


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [204]:
# Initialize the K-Means model. k=4
four_clusters = get_clusters(4, X)
four_clusters.head()

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
# Plotting the 2D-Scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
four_clusters.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="class")

In [206]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plotting the 3D-Scatter with x="TotalCoinsMined", y="TotalCoinSupply" and z="ProofType"
fig = px.scatter_3d(
    four_clusters,
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    z="ProofType",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [18]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE

In [19]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

There are 532 tradable cryptocurrencies.


In [20]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [21]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,1
ETH,0.0,0.000109,Ethereum,1
LTC,8.4e-05,6.4e-05,Litecoin,1
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,1
ETC,0.00021,0.000115,Ethereum Classic,1
ZEC,2.1e-05,7e-06,ZCash,1


In [22]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
