# Data Preprocessing

In [1]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cryptocurrency dataset from disk, using its 'Index' column as
# the row labels, then preview the first ten rows.
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col="Index")
crypto_df.head(n=10)

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314000000000.0
2015,2015 coin,X11,True,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000.0


In [3]:
# Replace the index name with an empty string
crypto_df.index.name = ''

In [4]:
# Drop every cryptocurrency whose IsTrading flag is False:
# build a boolean mask, collect the matching row labels, then drop them in place.
not_trading_mask = crypto_df['IsTrading'].eq(False)
IsTrading_False = crypto_df.loc[not_trading_mask].index
crypto_df.drop(index=IsTrading_False, inplace=True)

In [5]:
# Check for cryptocurrencies without a defined algorithm by reporting
# how many null values each column contains.
null_counts = crypto_df.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column IsTrading has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


All cryptocurrencies have a defined algorithm

In [6]:
# Discard the IsTrading column from the dataframe
crypto_df = crypto_df.drop(columns='IsTrading')

In [7]:
# Keep only the rows that have no null value in any column
crypto_df = crypto_df.dropna(axis='index', how='any')

In [8]:
# Drop every cryptocurrency whose TotalCoinsMined is exactly zero:
# build a boolean mask, collect the matching row labels, then drop them in place.
zero_mined_mask = crypto_df['TotalCoinsMined'].eq(0)
TotalCoinsMined_0 = crypto_df.loc[zero_mined_mask].index
crypto_df.drop(index=TotalCoinsMined_0, inplace=True)

In [9]:
# Keep the cryptocurrency names in their own single-column DataFrame, coins_name
coins_name = crypto_df['CoinName'].to_frame()

coins_name.head()

Unnamed: 0,CoinName
,
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [10]:
# Remove the CoinName column from the dataframe
crypto_df = crypto_df.drop(columns='CoinName')

In [11]:
# Encode the text columns (Algorithm and ProofType) as dummy variables;
# the encoded result is stored on a DataFrame named X
text_columns = ["Algorithm", "ProofType"]
X = pd.get_dummies(data=crypto_df, columns=text_columns)
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,41.99995,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,1055185000.0,532000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1337,29279420000.0,314000000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BTC,17927180.0,21000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ETH,107684200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Standardize data: fit a StandardScaler on X, then rebind X to the scaled values
scaler = StandardScaler()
X = scaler.fit(X).transform(X)
print(X)

[[-0.11674415 -0.15284751 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.0935849  -0.14497819 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [ 0.5258821   4.49182814 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 ...
 [-0.09523018 -0.13213547 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.11658401 -0.15253688 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.11674134 -0.15283272 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]]


# Reducing Data Dimensions Using PCA

In [13]:
# Import dependencies
from sklearn.decomposition import PCA

In [14]:
# Initialize PCA model to reduce dimensions down to 3 principal components.
# random_state is pinned so that, whenever scikit-learn's default 'auto'
# solver selects a randomized SVD, re-running the notebook reproduces the
# same components (and therefore the same clusters downstream).
pca = PCA(n_components=3, random_state=0)

# Use PCA to reduce the number of features in the standardized data
X_pca = pca.fit_transform(X)

In [15]:
# Wrap the principal components in a dataframe that carries the same
# row index as crypto_df
component_names = ["PC 1", "PC 2", "PC 3"]
pca_df = pd.DataFrame(X_pca, index=crypto_df.index, columns=component_names)

pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
,,,
42,-0.325621,1.031866,-0.629063
404,-0.308961,1.031847,-0.629525
1337,2.290854,1.561271,-0.822842
BTC,-0.140117,-1.280641,0.234827
ETH,-0.159184,-2.046977,0.386747


# Clustering Cryptocurrencies Using K-means

In [16]:
# import dependencies
from sklearn.cluster import KMeans
import hvplot.pandas

In [17]:
# Candidate K values to test (1 through 10)
k = list(range(1, 11))

# For each candidate K, fit K-means on the principal components and
# record the resulting inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pca_df).inertia_
    for n_clusters in k
]

In [18]:
# Build the Elbow Curve DataFrame (one row per tested K) and draw it
# as an hvPlot line chart
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

From the elbow curve, it appears that the best value for k is 5

In [19]:
# Build a K-means model with 5 clusters
model = KMeans(random_state=5, n_clusters=5)

# Fit the model on the principal components (assign data points to nearest centroid)
model.fit(X=pca_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=5, tol=0.0001, verbose=0)

In [20]:
# Ask the fitted model which cluster each data point belongs to
predictions = model.predict(X=pca_df)

In [21]:
# Create new dataframe with all requested data.
# Start from an explicit copy: pd.DataFrame(data=crypto_df) does not copy a
# DataFrame input by default, so (depending on the pandas version) the columns
# added below could also show up on crypto_df.
clustered_df = crypto_df.copy()

# Principal components and coin names are aligned on the row index
for component in ("PC 1", "PC 2", "PC 3"):
    clustered_df[component] = pca_df[component]
clustered_df["CoinName"] = coins_name["CoinName"]

# Cluster labels are a plain array, so they are assigned by row position
clustered_df["Class"] = model.labels_

clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.325621,1.031866,-0.629063,42 Coin,2.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.308961,1.031847,-0.629525,404Coin,2.0
1337,X13,PoW/PoS,29279420000.0,314000000000.0,2.290854,1.561271,-0.822842,EliteCoin,0.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.140117,-1.280641,0.234827,Bitcoin,4.0
ETH,Ethash,PoW,107684200.0,0.0,-0.159184,-2.046977,0.386747,Ethereum,4.0


# Visualizing Results

### Please Note: The first two graphs include two extreme outliers (BitTorrent and BiblePay). These 2 outliers have been removed from the third and fourth graphs to make the visualizations more meaningful.

In [22]:
# import dependencies
import plotly.express as px

In [23]:
# Show the current tradable cryptocurrencies as an hvPlot data table
table_columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']
clustered_df.hvplot.table(columns=table_columns, width=400)



In [24]:
# 3D scatter plot of the clusters in clustered_df: one axis per principal
# component, with color and marker symbol both driven by the Class column
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig1 = px.scatter_3d(clustered_df, **scatter_3d_options)
fig1.update_layout(legend={"x": 0, "y": 1})
fig1.show()

In [25]:
# Cast TotalCoinSupply to float so the Y axis labels display properly
# NOTE(review): the original comment says the column is object dtype here — confirm
total_coin_supply = clustered_df["TotalCoinSupply"].astype("float64")
clustered_df["TotalCoinSupply"] = total_coin_supply

In [26]:
# Scatter plot of TotalCoinsMined (x) against TotalCoinSupply (y),
# grouped by Class, with the coin name shown on hover
scatter_options = dict(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)
clustered_df.hvplot.scatter(**scatter_options)

### Visualizations with Outliers Removed

In [27]:
# Remove the 2 extreme outliers (BiblePay and BitTorrent) in place:
# match them by coin name, collect their row labels, then drop those rows.
outlier_names = ['BiblePay', 'BitTorrent']
outlier_mask = clustered_df['CoinName'].isin(outlier_names)
clustered_df.drop(index=clustered_df.loc[outlier_mask].index, inplace=True)

In [28]:
# 3D scatter plot of the clusters in clustered_df without outliers: one axis
# per principal component, with color and marker symbol both driven by Class
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig1 = px.scatter_3d(clustered_df, **scatter_3d_options)
fig1.update_layout(legend={"x": 0, "y": 1})
fig1.show()

In [29]:
# Scatter plot without outliers: TotalCoinsMined (x) against TotalCoinSupply (y),
# grouped by Class, with the coin name shown on hover
scatter_options = dict(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)
clustered_df.hvplot.scatter(**scatter_options)