# Preprocessing

In [38]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px
import plotly.figure_factory as ff

In [39]:
# Load the data
file_path = "./Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [40]:
crypto_df.count()
crypto_df.shape

(1252, 7)

In [41]:
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [42]:
crypto_df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [43]:
# STEP 1
# Remove all cryptocurrencies that aren’t trading
#crypto_df[crypto_df["IsTrading"] == True].count()
crypto_df.drop(crypto_df[crypto_df["IsTrading"] == False].index, inplace=True)
crypto_df.shape

(1144, 7)

108 cryptocurrencies were removed because they are not trading, leaving the new dataframe with 1144 cryptocurrencies

In [44]:
# STEP 2
# Remove all cryptocurrencies that don’t have an algorithm defined
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column IsTrading has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


When we explore the dataset for null values we see that all cryptocurrencies have a defined algorithm, meaning there are no null values for the Algorithm column. 

In [45]:
# STEP 3
# Remove the IsTrading column.
crypto_df.drop(columns=["IsTrading"], inplace=True)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [46]:
# STEP 4
# Remove all cryptocurrencies with at least one null value.
crypto_df = crypto_df.dropna()
crypto_df.shape

(685, 6)

We can see from an exploratory analysis we did during Step 2 that there were 459 null values in the TotalCoinsMined column. Those 459 rows were dropped, leaving us with 685 rows.

In [47]:
# STEP 5
# Remove all cryptocurrencies without coins mined.
crypto_df.drop(crypto_df[crypto_df["TotalCoinsMined"] <= 0].index, inplace=True)

In [48]:
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


Finding and removies all cryptocurrencies with less than or equal to 0 coins mined; leaving our dataframe with 532 cryptocurrencies.

In [49]:
# Create a cleaned dataframe
clean_crypto_df = crypto_df
clean_crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [50]:
clean_crypto_df.shape

(532, 6)

In [51]:
# STEP 6
# Store the names of all cryptocurrencies on a DataFramed named coins_name, 
# and use the crypto_df.index as the index for this new DataFrame.
coins_name = pd.DataFrame(clean_crypto_df[["Unnamed: 0", "CoinName"]])
coins_name.set_index("Unnamed: 0", drop=True, inplace=True)
coins_name

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
LTC,Litecoin
DASH,Dash
XMR,Monero
ETC,Ethereum Classic
ZEC,ZCash


In [52]:
# STEP 7
# Remove the CoinName column from the clean_crypto_df
clean_crypto_df.drop(columns=["CoinName"], inplace=True)
clean_crypto_df

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,4.199995e+01,42
2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethash,PoW,1.076842e+08,0
9,LTC,Scrypt,PoW,6.303924e+07,84000000
10,DASH,X11,PoW/PoS,9.031294e+06,22000000
11,XMR,CryptoNight-V7,PoW,1.720114e+07,0
12,ETC,Ethash,PoW,1.133597e+08,210000000
13,ZEC,Equihash,PoW,7.383056e+06,21000000


In [53]:
# STEP 8
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X.

# Look at datatypes in coins_name
clean_crypto_df.dtypes

Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [54]:
# First, change TotalCoinSupply datatype from object to float
clean_crypto_df["TotalCoinSupply"] = clean_crypto_df["TotalCoinSupply"].apply(float)
clean_crypto_df.dtypes

Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [55]:
# Next, we want to convert the ProofType column to numbers
# First, let's see what ProofTypes are the most common
# proofType_df = clean_crypto_df.groupby("ProofType").nunique()
# print(proofType_df)

POS, POW, and POW/POS contain the most types of cryptocurrencies. So we can group those 3 individually and the rest as 'other'.

In [56]:
# Transform the string column.
# def change_string(ProofType):
#     if ProofType == "PoS":
#         return 0
#     if ProofType == "PoW":
#         return 1
#     if ProofType == "PoW/PoS":
#         return 2
#     else:
#         return 3
    
# coins_name["ProofType"] = coins_name["ProofType"].apply(change_string)
# coins_name

In [57]:
# Next create dummy variables for all text features (Algorithm and ProofType)
X = clean_crypto_df[["Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply"]].copy()
X = pd.get_dummies(X, columns=["Algorithm", "ProofType"], drop_first=True)
X = X.dropna()
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# STEP 9
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame.
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:2])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.06142951 -0.07530656
  -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963  -0.19245009
  -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656 -0.0433963
  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963  -0.0433963
  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963  -0.13118084
  -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963  -0.07530656
  -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656 -0.15826614
  -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951  1.38675049
  -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.39879994
  -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883 -0.10680283
  -0.13118084 -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.43911856 -0.0433963  -0.06142951 -0.

In [59]:
# Save cleaned data
file_path = "./Resources/X_cleaned_crypto.csv"
X.to_csv(file_path, index=False)

# Reducing Data Dimensions Using PCA

In [60]:
# Initialize PCA model
pca = PCA(n_components=3)

In [61]:
# Get three principal components for the X_scaled data.
X_pca = pca.fit_transform(X_scaled)

In [62]:
# Transform PCA data to a dataframe
X_pca_df = pd.DataFrame(
    data=X_pca, columns=["PC 1", "PC 2", "PC 3"]
)
X_pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.333955,0.993608,-0.522746
1,-0.317289,0.993657,-0.523065
2,2.294921,1.563401,-0.619588
3,-0.147424,-1.27547,0.171421
4,-0.15175,-2.044473,0.356146


In [63]:
# Fetch the explained variance
pca.explained_variance_ratio_

array([0.02851297, 0.02177838, 0.02092444])

In [64]:
# Join the index from crypto_df
join_df = pd.DataFrame(data=clean_crypto_df, columns=["Unnamed: 0"])
pca_df = X_pca_df.join(join_df, how="inner")
pca_df.head()

Unnamed: 0.1,PC 1,PC 2,PC 3,Unnamed: 0
0,-0.333955,0.993608,-0.522746,42
2,2.294921,1.563401,-0.619588,404
5,-0.162368,-1.160095,0.015884,1337
7,-0.148343,-2.231451,0.415101,BTC
8,-0.150192,-2.044587,0.356132,ETH


In [65]:
# Remove "Unnamed: 0"
pca_df = pca_df.set_index(["Unnamed: 0"])
pca_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.333955,0.993608,-0.522746
404,2.294921,1.563401,-0.619588
1337,-0.162368,-1.160095,0.015884
BTC,-0.148343,-2.231451,0.415101
ETH,-0.150192,-2.044587,0.356132
LTC,-0.154867,-2.041512,0.432079
DASH,-0.282779,1.347116,-0.020772
XMR,0.167585,-1.748403,0.321259
ETC,-0.317841,0.878202,-0.367228
ZEC,-0.271402,0.607574,-0.062275


In [66]:
# Save the PC data
file_path = "./Resources/principal_components.csv"
pca_df.to_csv(file_path, index=False)

# Clustering Cryptocurrencies Using K-Means

In [67]:
# STEP 1
# Create an elbow curve to find the best value for K, and use the pcs_df DataFrame.

# Create empty lists
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(pca_df)
   inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Based off the elbow curve, I'd say that our ideal number of clusters is 4. 

In [68]:
km = KMeans(n_clusters=4)
km.fit(pca_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [69]:
# STEP 2
# Once you define the best value for K, run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data.
# Use the pcs_df to run the K-means algorithm.

# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pca_df)

# Predict clusters
predictions = model.predict(pca_df)
print(predictions)

[0 0 2 2 2 2 0 2 0 0 2 0 2 0 2 2 2 2 2 0 2 2 2 2 2 0 0 2 0 2 2 0 0 2 2 0 0
 2 0 0 2 2 2 0 2 2 2 0 0 2 0 0 0 2 0 2 2 2 2 0 0 2 2 2 0 0 2 0 0 2 2 0 2 2
 0 0 0 0 0 2 0 0 2 0 0 2 0 2 0 2 0 0 0 0 0 0 2 2 0 0 0 2 0 0 2 0 0 0 0 0 2
 2 0 0 0 0 2 0 2 2 0 0 0 0 2 0 0 0 2 2 0 0 2 2 0 2 0 2 0 0 2 2 0 0 0 2 2 2
 0 0 0 0 2 0 0 2 0 2 0 0 0 2 0 2 2 0 2 0 2 0 0 2 2 0 0 2 0 0 2 2 2 0 0 2 0
 2 2 2 2 0 2 2 0 0 0 2 2 2 2 2 2 0 3 1 0]


In [70]:
# STEP 3
# Create a new DataFrame named “clustered_df,” that includes the following columns:
# Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, and Class. 

clustered_df = clean_crypto_df.merge(pca_df, on="Unnamed: 0")
clustered_df = clustered_df.merge(coins_name, on="Unnamed: 0")

clustered_df["class"] = model.labels_

clustered_df.set_index("Unnamed: 0", drop=True, inplace=True)
clustered_df.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-0.333955,0.993608,-0.522746,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,2.294921,1.563401,-0.619588,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159300000.0,-0.162368,-1.160095,0.015884,EliteCoin,2
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.148343,-2.231451,0.415101,Bitcoin,2
ETH,Ethash,PoW,107684200.0,0.0,-0.150192,-2.044587,0.356132,Ethereum,2
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.154867,-2.041512,0.432079,Litecoin,2
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.282779,1.347116,-0.020772,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,0.167585,-1.748403,0.321259,Monero,2
ETC,Ethash,PoW,113359700.0,210000000.0,-0.317841,0.878202,-0.367228,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000.0,-0.271402,0.607574,-0.062275,ZCash,0


# Visualizing Results

In [71]:
# STEP 1 
# Create a 3D scatter plot using Plotly Express to plot the clusters using the clustered_df DataFrame.

# Plot the df with a third axis to show what's happening with points in the middle
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [72]:
# STEP 2 
# Use hvplot.table to create a data table with all the current tradable cryptocurrencies.

clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "class"], width=800)

In [73]:
# STEP 3
# Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies
# having x="TotalCoinsMined" and y="TotalCoinSupply" 
# to contrast the number of available coins versus the total number of mined coins.

clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="class", hover_cols=["CoinName"])