In [1]:
# Dependencies.
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
#import hvplot.pandas
#import plotly.figure_factory as ff
#import plotly.express as px

Preprocessing Data

In [2]:
# Loading Data
file_path = "./crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.info()
crypto_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
# Renaming the Unnamed header as CryptID 
crypto_df = crypto_df.rename({'Unnamed: 0': 'CryptID'}, axis=1)
crypto_df.head()

Unnamed: 0,CryptID,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [4]:
# Instruction 1. Remove all cryptocurrencies that are not trading.
crypto_df = crypto_df[crypto_df["IsTrading"] == True]
crypto_df.info()
crypto_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1144 entries, 0 to 1247
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CryptID          1144 non-null   object 
 1   CoinName         1144 non-null   object 
 2   Algorithm        1144 non-null   object 
 3   IsTrading        1144 non-null   bool   
 4   ProofType        1144 non-null   object 
 5   TotalCoinsMined  685 non-null    float64
 6   TotalCoinSupply  1144 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 63.7+ KB


Unnamed: 0,CryptID,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [5]:
# Instruction 2. Removing all cryptocurrencies that donâ€™t have an algorithm defined
crypto_df['Algorithm'].isnull().values.any()

False

In [6]:
# Instruction 3. Removing the IsTrading column.
crypto_df.drop(['IsTrading'], axis=1, inplace=True)
crypto_df.info()
crypto_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1144 entries, 0 to 1247
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CryptID          1144 non-null   object 
 1   CoinName         1144 non-null   object 
 2   Algorithm        1144 non-null   object 
 3   ProofType        1144 non-null   object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  1144 non-null   object 
dtypes: float64(1), object(5)
memory usage: 62.6+ KB


Unnamed: 0,CryptID,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [7]:
# Instruction 4. Removing all cryptocurrencies with at least one null value.
crypto_df.dropna(inplace=True)
crypto_df.info()
crypto_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 0 to 1247
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CryptID          685 non-null    object 
 1   CoinName         685 non-null    object 
 2   Algorithm        685 non-null    object 
 3   ProofType        685 non-null    object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  685 non-null    object 
dtypes: float64(1), object(5)
memory usage: 37.5+ KB


Unnamed: 0,CryptID,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [8]:
# Instruction 5.Remove all cryptocurrencies without coins mined values. 
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
crypto_df.info()
crypto_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 532 entries, 0 to 1247
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CryptID          532 non-null    object 
 1   CoinName         532 non-null    object 
 2   Algorithm        532 non-null    object 
 3   ProofType        532 non-null    object 
 4   TotalCoinsMined  532 non-null    float64
 5   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(5)
memory usage: 29.1+ KB


Unnamed: 0,CryptID,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [9]:
# Instruction 6. Store the names of all cryptocurrencies on a Data Frame named coins_name, and use the crypto_df.index as the index for this new DataFrame.
coins_name_df=crypto_df[["CryptID","Algorithm","ProofType","TotalCoinsMined","TotalCoinSupply"]].copy().set_index("CryptID")
coins_name_df.info()
coins_name_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
CryptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [10]:
# Instruction 8. Creating dummy variables for all of the text features, and store the resulting data on a DataFrame named X
le = LabelEncoder()
X = coins_name_df.copy()
X['Algorithm'] = le.fit_transform(X['Algorithm']) 
X['ProofType'] = le.fit_transform(X['ProofType']) 
X.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
CryptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,52,15,41.99995,42
404,52,15,1055185000.0,532000000
1337,66,15,29279420000.0,314159265359
BTC,47,12,17927180.0,21000000
ETH,20,12,107684200.0,0


In [11]:
# Instruction 9. Use sklearn StandardScaler to standardize all of the data
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[ 3.76459118e-01  8.91356555e-01 -1.17108170e-01 -1.52870298e-01]
 [ 3.76459118e-01  8.91356555e-01 -9.39695522e-02 -1.45008997e-01]
 [ 1.21543803e+00  8.91356555e-01  5.24945609e-01  4.48942416e+00]
 [ 7.68237937e-02  1.67233875e-03 -1.16715055e-01 -1.52559984e-01]
 [-1.54120696e+00  1.67233875e-03 -1.14746818e-01 -1.52870298e-01]]


In [12]:
# Initializing PCS model
pca = PCA(n_components=3)

In [13]:
# Applying the reduction to the scaled dataset.
X_pca=pca.fit_transform(X_scaled)

In [14]:
# Transforming PCA data to a DataFrame keeping the original index.
pcs_df = pd.DataFrame(
    data= X_pca, index=X.index, columns = ["PC 1", "PC 2","PC 3"])
pcs_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3
CryptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.417875,0.810296,0.372138
404,-0.396564,0.815135,0.373256
1337,3.124076,2.20978,0.504335
BTC,-0.192083,0.016266,-0.07291
ETH,-0.044116,-1.167492,1.012525


In [15]:
# 1a - Define the inertia and K cluster. Ranges of 9-11 does seem apppropriate, based on testing.
inertia = []
k = list(range(1, 9)) 

# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(pcs_df)
   inertia.append(km.inertia_)