# Clustering Crypto

In [138]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hvplot.pandas

### Deliverable 1: Preprocessing the Data for PCA

In [139]:
# Load data

file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)

crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279424623.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927175.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684223.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039243.0,84000000


In [140]:
# lets check the data for cleansing
crypto_df.shape

(1252, 7)

In [141]:
# Columns 
crypto_df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [142]:
# List dataframe data types
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [143]:
# Find duplicate entries
print(f"Duplicate entries: {crypto_df.duplicated().sum()}")

Duplicate entries: 0


In [144]:
# Keep all the cryptocurrencies that are being traded.
# Transform String column
def change_string(IsTrading):
    if IsTrading == "False":
        return 0
    else:
        return 1
    
crypto_df["IsTrading"] = crypto_df["IsTrading"].apply(change_string)
crypto_df.shape

(1252, 7)

In [145]:
# remove not trade coins to keep all the cryptocurrencies that are being traded
crypto_df = crypto_df[-(crypto_df == 0).any(axis=1)]
crypto_df.shape

(1086, 7)

In [146]:
# Keep all the cryptocurrencies that are being traded.
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,1,PoW/PoS,42.0,42
1,365,365Coin,X11,1,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,1,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,1,PoW,,611000
5,1337,EliteCoin,X13,1,PoW/PoS,29279424623.0,314159265359


In [147]:
# Remove the "IsTrading" column. 
crypto_df.drop(columns=["IsTrading"], inplace=True)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,42.0,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055184902.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
5,1337,EliteCoin,X13,PoW/PoS,29279424623.0,314159265359


In [148]:
# Remove rows that have at least 1 null value.
# step 1: Find null values
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 508 null values
Column TotalCoinSupply has 0 null values


In [149]:
# Remove rows that have at least 1 null value.
# step 2: dropna
crypto_df = crypto_df.dropna()
crypto_df.shape

(578, 6)

In [150]:
# Keep the rows where coins are mined.
pd.options.display.float_format = '{:,.0f}'.format

In [151]:
crypto_df = crypto_df[crypto_df.TotalCoinsMined > 0]

In [152]:
crypto_df.shape

(577, 6)

In [153]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame.

coins_name = crypto_df.set_index(["Unnamed: 0"])
coins_name

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,42,42
404,404Coin,Scrypt,PoW/PoS,1055184902,532000000
1337,EliteCoin,X13,PoW/PoS,29279424623,314159265359
BTC,Bitcoin,SHA-256,PoW,17927175,21000000
ETH,Ethereum,Ethash,PoW,107684223,0
...,...,...,...,...,...
GAP,Gapcoin,Scrypt,PoW/PoS,14931046,250000000
BDX,Beldex,CryptoNight,PoW,980222595,1400222610
ZEN,Horizen,Equihash,PoW,7296538,21000000
XBC,BitcoinPlus,Scrypt,PoS,128327,1000000


In [154]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
coins_name.drop(columns=["CoinName"], inplace=True)
coins_name

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,42,42
404,Scrypt,PoW/PoS,1055184902,532000000
1337,X13,PoW/PoS,29279424623,314159265359
BTC,SHA-256,PoW,17927175,21000000
ETH,Ethash,PoW,107684223,0
...,...,...,...,...
GAP,Scrypt,PoW/PoS,14931046,250000000
BDX,CryptoNight,PoW,980222595,1400222610
ZEN,Equihash,PoW,7296538,21000000
XBC,Scrypt,PoS,128327,1000000


In [155]:
# We know that our model can’t have strings passed into it. 
# To make sure we can use our string data, we’ll transform our strings of PoW/PoS, PoS, and PoW from the ProofType column to 0,1, and 2, respectively. 
# The function will then be run on the whole column with the .apply method.
coins_name['TotalCoinSupply'] = coins_name['TotalCoinSupply'].apply(float)

In [156]:
# Transform String column
def change_string(ProofType):
    if ProofType == "PoW/PoS":
        return 0
    if ProofType == "Pos":
        return 1
    else:
        return 2
    
coins_name["ProofType"] = coins_name["ProofType"].apply(change_string)
coins_name.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,0,42,42
404,Scrypt,0,1055184902,532000000
1337,X13,0,29279424623,314159265359
BTC,SHA-256,2,17927175,21000000
ETH,Ethash,2,107684223,0


In [157]:
# Saving cleaned data
file_path = "Resources/coins_name.csv"
coins_name.to_csv(file_path, index=False)

In [158]:
# Use get_dummies() to create variables for text features.
X = coins_name[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']].copy()
X = pd.get_dummies(X, columns=['Algorithm'], drop_first=True)
X = X.dropna()
X.head()

Unnamed: 0_level_0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,0,1055184902,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,0,29279424623,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
BTC,2,17927175,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,2,107684223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
# Saving cleaned data
file_path = "Resources/X.csv"
X_pca.to_csv(file_path, index=False)

In [160]:
# Standardize the data with StandardScaler().
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame. 
# this is an important step prior to using PCA and K-means algorithms.

from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)
X_scaled

array([[0.00000000e+00, 0.00000000e+00, 4.20000000e-11, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.06585544e-03, 5.32000000e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.95755135e-02, 3.14159265e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 7.37028150e-06, 2.10000000e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.29582282e-07, 1.00000000e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.17085015e-05, 1.00000000e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [161]:
# Using PCA to reduce dimension to three principal components.
#Standardize data with StandardScaler
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-1.40917431 -0.11450078 -0.15072489 -0.04166667 -0.04166667 -0.05897678
  -0.0934947  -0.04166667 -0.05897678 -0.05897678 -0.04166667 -0.04166667
  -0.18949048 -0.05897678 -0.0934947  -0.04166667 -0.11081833 -0.0722944
  -0.04166667 -0.04166667 -0.1518211  -0.04166667 -0.13280318 -0.04166667
  -0.04166667 -0.0835512  -0.05897678 -0.04166667 -0.04166667 -0.04166667
  -0.05897678 -0.04166667 -0.0835512  -0.0934947  -0.10250796 -0.04166667
  -0.1258772  -0.13280318 -0.1518211  -0.04166667 -0.0835512  -0.04166667
  -0.04166667 -0.0722944  -0.17423301 -0.04166667 -0.04166667 -0.04166667
  -0.0722944  -0.16888013 -0.30802055 -0.04166667 -0.0934947  -0.0934947
  -0.05897678  1.39963365 -0.04166667 -0.04166667 -0.04166667 -0.0835512
  -0.04166667 -0.04166667 -0.04166667 -0.04166667 -0.04166667 -0.05897678
  -0.04166667 -0.04166667 -0.39536391 -0.04166667 -0.17423301 -0.04166667
  -0.0835512  -0.0835512  -0.10250796]
 [-1.40917431 -0.09041991 -0.14255828 -0.04166667 -0.04166667 -0.05897678
  

In [162]:
# Initialize PCA model
pca = PCA(n_components=3)

In [163]:
# Get two principal components for the crypto data.
X_pca = pca.fit_transform(X_scaled)

In [164]:
# Transform PCA data to a DataFrame
X_pca = pd.DataFrame(
    data=X_pca, columns=["PC 1", "PC 2", "PC 3"]
)
X_pca.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,0,2,-1
1,0,2,-1
2,3,2,1
3,0,-1,1
4,0,-2,0


In [165]:
# Saving cleaned data
file_path = "Resources/X_pca.csv"
X_pca.to_csv(file_path, index=False)

In [166]:
# Loading data
file_path ="Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)

In [167]:
df_y = pd.DataFrame(data=crypto_df, columns=['Unnamed: 0'])
df = X_pca.join(df_y, how='inner')
df.head()

Unnamed: 0.1,PC 1,PC 2,PC 3,Unnamed: 0
0,0,2,-1,42
1,0,2,-1,365
2,3,2,1,404
3,0,-1,1,611
4,0,-2,0,808


In [168]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE
df = df.set_index(["Unnamed: 0"])
df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,0,2,-1
365,0,2,-1
404,3,2,1
611,0,-1,1
808,0,-2,0


In [169]:
df.count()

PC 1    577
PC 2    577
PC 3    577
dtype: int64