In [260]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [261]:
#loading data
path = "./Data/crypto_data.csv"
df = pd.read_csv(path)
df.rename({'Unnamed: 0': 'Name'}, axis=1, inplace=True)
df

Unnamed: 0,Name,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [262]:
crypto_df = df.copy()

In [263]:
#1 removing all cryptocurrencies that aren't trading
indexNames = crypto_df[crypto_df['IsTrading'] ==False].index
 
# Delete these row indexes from dataFrame
crypto_df.drop(indexNames , inplace=True)


In [264]:
#2 removing all cryptocurrencies that don’t have an algorithm defined
crypto_df=crypto_df[crypto_df["Algorithm"] != "NaN"]
crypto_df.dropna(subset=['Algorithm'], how='all', inplace=True)
crypto_df

Unnamed: 0,Name,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [265]:
#3 Remove the IsTrading column 
crypto_df = crypto_df.drop("IsTrading", axis = 1)
crypto_df

Unnamed: 0,Name,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,PoW,,1000000000
1244,UOS,UOS,SHA-256,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [266]:
#4 Remove all cryptocurrencies with at least one null value

indexNames = crypto_df[crypto_df['TotalCoinsMined'] <=0].index
 
# Delete these row indexes from dataFrame
crypto_df.drop(indexNames , inplace=True)

crypto_df.dropna(subset=['ProofType'], how='all', inplace=True)

#crypto_df.dtypes

crypto_df.shape

(991, 6)

In [267]:
#5 Remove all cryptocurrencies without coins mined
crypto_df.dropna(subset=['TotalCoinsMined'], how='all', inplace=True)
crypto_df.dropna(subset=['Name'], how='all', inplace=True)

crypto_df.set_index(crypto_df.Name, inplace=True)
del crypto_df.index.name
crypto_df = crypto_df.drop("Name", axis=1)

crypto_df.head(10)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [268]:
#6 Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame

coins_name = crypto_df.copy()
coins_name = coins_name.drop("Algorithm", axis=1)
coins_name = coins_name.drop("ProofType", axis=1)
coins_name = coins_name.drop("TotalCoinsMined", axis=1)
coins_name = coins_name.drop("TotalCoinSupply", axis=1)

coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [269]:
#7 Remove the CoinName column

crypto_df = crypto_df.drop("CoinName", axis=1)

In [270]:
crypto_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [271]:
#8 Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X
X = crypto_df.copy()
X = pd.get_dummies(X, columns=["Algorithm", "ProofType"])

In [272]:
#9 Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame

X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

In [273]:
#Reducing Data Dimensions using PCA

# Initialize PCA model
pca = PCA(n_components=3)

X_pca = pca.fit_transform(X_scaled)

In [274]:
#transform PCA data to a DF

pcs_df = pd.DataFrame(
    data=X_pca,
    columns=["PC 1", "PC 2", "PC 3"]
)

pcs_df.set_index(crypto_df.index, inplace=True)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.333048,1.020246,-0.577213
404,-0.316374,1.020348,-0.577684
1337,2.302712,1.692130,-0.764208
BTC,-0.150450,-1.298871,0.179716
ETH,-0.151896,-2.037274,0.398158
...,...,...,...
ZEPH,2.458212,0.879249,-0.139405
GAP,-0.331092,1.020123,-0.577243
BDX,0.323714,-2.307650,0.427934
ZEN,-0.149833,-2.044228,0.448889


In [275]:
#fetch explained variance 

pca.explained_variance_ratio_

array([0.02793101, 0.02141061, 0.02050698])

In [276]:
#Using Elbow curve with the generated principle components 

#Find the best value for K
inertia = []
k = list(range(1, 10))

# Calculate the inertia for the range of K values
for i in k:
	km = KMeans(n_clusters=i, random_state=0)
	km.fit(X_pca)
	inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [277]:
# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
#predictions = model.predict(pcs_df)

# Add the predicted class columns


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [278]:
clustered_df['PC 1']= pcs_df['PC 1']
clustered_df['PC 2']= pcs_df['PC 2']
clustered_df['PC 3']= pcs_df['PC 3']

clustered_df['CoinName']= coins_name['CoinName']

#clustered_df["Class"] = model.labels_

clustered_df.head(13)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.333048,1.020246,-0.577213,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.316374,1.020348,-0.577684,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.302712,1.69213,-0.764208,EliteCoin,4
BTC,SHA-256,PoW,17927180.0,21000000,-0.15045,-1.298871,0.179716,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.151896,-2.037274,0.398158,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.164141,-1.141362,0.01677,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.396776,1.2447,-0.545957,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.144421,-2.220007,0.443953,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.150336,-2.037379,0.398136,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.149832,-2.044228,0.448889,ZCash,1


In [279]:
import plotly.express as px

In [280]:
#plotting a 3d plot
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    color="Class",
    symbol="Class"
    )
fig.show()

In [281]:
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply','Class'], width=800)

In [282]:
#plotting a scatter plot to present the clustered data about cryptocurrencies
fig = clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y = "TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class"
    )

fig