In [54]:
# Prepare the data for dimensions reduction with PCA and clustering using K-means.
# Reduce data dimensions using PCA algorithms from sklearn.
# Predict clusters using cryptocurrencies data using the K-means algorithm form sklearn.
# Create some plots and data tables to present your results.

In [55]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

In [56]:
# Load the raw cryptocurrency table; the first CSV column becomes the row index.
file_path = "crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [57]:
# Inspect the (rows, columns) size of the raw data set before any cleaning is applied.
crypto_df.shape

(1252, 6)

In [58]:
# Inspect the column dtypes of the raw frame to spot columns that need conversion
# before they can be used as numeric features.
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [59]:
# Report how many missing values each column contains (counts computed in one vectorised pass).
for column, n_missing in crypto_df.isnull().sum().items():
    print(f"Column \"{column}\" has {n_missing} null values.")

Column &quot;CoinName&quot; has 0 null values.
Column &quot;Algorithm&quot; has 0 null values.
Column &quot;IsTrading&quot; has 0 null values.
Column &quot;ProofType&quot; has 0 null values.
Column &quot;TotalCoinsMined&quot; has 508 null values.
Column &quot;TotalCoinSupply&quot; has 0 null values.


In [60]:
# Map every column name to a one-column DataFrame of that column's distinct values,
# so the categories can be eyeballed for data cleansing.
df_dict = {
    col: pd.DataFrame(crypto_df[col].unique(), columns=[col])
    for col in crypto_df.columns
}
df_dict

{&#39;CoinName&#39;:             CoinName
 0            42 Coin
 1            365Coin
 2            404Coin
 3          SixEleven
 4                808
 ...              ...
 1240     BitcoinPlus
 1241      DivotyCoin
 1242     Giotto Coin
 1243  OpenSourceCoin
 1244       SteamPunk
 
 [1245 rows x 1 columns],
 &#39;Algorithm&#39;:           Algorithm
 0            Scrypt
 1               X11
 2           SHA-256
 3               X13
 4            Ethash
 ..              ...
 90  Equihash+Scrypt
 91             PHI2
 92      Avesta hash
 93       Slatechain
 94            TRC10
 
 [95 rows x 1 columns],
 &#39;IsTrading&#39;:    IsTrading
 0       True
 1      False,
 &#39;ProofType&#39;:                                 ProofType
 0                                 PoW/PoS
 1                                     PoW
 2                                     PoS
 3                                     PoC
 4                             PoS/PoW/PoT
 5                                    PoST
 6 

In [61]:
# Clean the raw table down to actively traded, fully populated, mined coins.
# Every step reassigns crypto_df explicitly instead of relying on inplace=True,
# so no filtering step can be silently discarded.

# Remove all cryptocurrencies that aren't trading.
# A boolean mask is used rather than dropping by index label, so only the
# non-trading rows themselves are removed even if index labels repeat.
crypto_df = crypto_df[crypto_df['IsTrading']]

# Remove all cryptocurrencies that don't have an algorithm defined.
# dropna() returns a new frame, so the result has to be assigned to take effect.
crypto_df = crypto_df.dropna(subset=['Algorithm'])

# Remove the IsTrading column (constant after the filter above).
crypto_df = crypto_df.drop(columns=['IsTrading'])

# Remove all cryptocurrencies with at least one null value.
crypto_df = crypto_df.dropna()

# Remove all cryptocurrencies without coins mined.
# .copy() makes the result an independent frame so later cells can modify it
# without triggering SettingWithCopyWarning.
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] > 0].copy()

# Store the names of all cryptocurrencies in a DataFrame named coins_name;
# it keeps crypto_df's index so names can be joined back onto results later.
coins_name = crypto_df[['CoinName']].copy()

In [62]:
# Preview the coin-name lookup table (a copy of crypto_df's CoinName column, sharing its index).
coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [63]:
# Check the (rows, columns) size of the frame after the cleaning steps.
crypto_df.shape

(532, 5)

In [64]:
# Preview the cleaned frame.
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [65]:
# Sanity check after cleaning: per-column null counts, frame shape, and dtypes in one printout.
a = crypto_df.isna().sum()
print(
    f'Nulls: \n{a}\n\n'
    f'Shape: {crypto_df.shape}\n'
    f'{crypto_df.dtypes}'
)

Nulls: 
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

Shape: (532, 5)
CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object


In [66]:
# Scratch comparison: build the coin-name table via the DataFrame constructor
# (passing crypto_df's index explicitly) instead of crypto_df[['CoinName']].copy().
coins_name2 = pd.DataFrame(crypto_df['CoinName'], index=crypto_df.index)
coins_name2
# The displayed rows match the coins_name table produced with copy().

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [67]:
# Remove the CoinName column from crypto_df to set up for unsupervised ML:
# it is a non-numeric label, not a feature. Reassigning (rather than
# inplace=True) yields a fresh frame that is safe to modify below.
crypto_df = crypto_df.drop(columns='CoinName')

# TotalCoinSupply is read from the CSV as strings (object dtype); convert it to
# float so it can be used as a numeric feature.
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].astype(float)

# Create dummy variables for the text features and store the result in X.
# `columns` must be passed as a keyword argument: the previous `columns[...]`
# was a subscript of an undefined name and raised NameError, so X was never created.
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])

In [68]:
# Check dtypes after the TotalCoinSupply conversion; Algorithm and ProofType
# stay as object columns in crypto_df itself.
crypto_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object