# Clustering Crypto

In [211]:
!pip install -U altair

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [212]:
# Initial imports
"""Arrange Initial imports by length because it looks "So Nice" """
import csv
import json
import requests
import pandas as pd
import altair as alt
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Fetching Cryptocurrency Data

In [213]:
# CryptoCompare endpoint used to fetch the coin list as JSON
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [214]:
# Bound the wait and surface HTTP errors up front, instead of hanging on a
# stalled connection or trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

In [215]:
"""Saving a hard-copy of the API data so I dont have to repeatedly call
the API every time I want to acsess the original copy of said data"""

# Columns kept in the local CSV snapshot of the API response.
fieldnames = ['Id', 'Url', 'ImageUrl', 'ContentCreatedOn',
              'Name', 'Symbol', 'CoinName', 'FullName', 'Description',
              'AssetTokenStatus', 'Algorithm', 'ProofType', 'SortOrder',
              'Sponsored', 'Taxonomy', 'Rating', 'IsTrading',
              'TotalCoinsMined', 'BlockNumber', 'NetHashesPerSecond',
              'BlockReward', 'BlockTime', 'AssetLaunchDate', 'MaxSupply', 'MktCapPenalty',
              'PlatformType', 'BuiltOn', 'DecimalPoints', 'SmartContractAddress',
              'Difficulty', 'IsUsedInDefi']

# The encoding is explicit so the file does not depend on the machine's locale
# when it is read back later.
with open('coins_data.csv', 'w', newline='', encoding='utf-8') as f:
    # extrasaction='ignore': a key in the response that is not listed in
    # fieldnames is skipped instead of aborting the export with ValueError.
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()

    # data['Data'] maps a key to one record per coin; each record is one row.
    writer.writerows(data['Data'].values())

In [216]:
# """Alternate method--of importing the API data directly to a dataframe"""
# Commented out because I don't want this cell to print a string
# pd.DataFrame(data['Data']).T

In [217]:
# Path of the CSV snapshot of the API data; the DataFrame is created from this
# file rather than directly from the 'Data' key of the json response.
csv_path = Path('coins_data.csv')

In [218]:
# Load the saved coin list from the CSV file into a DataFrame
crypto_df = pd.read_csv(csv_path)

### Data Preprocessing

In [219]:
# Columns needed for the analysis. Note that the supply column in this data
# is named 'MaxSupply'.
feature_columns = ['CoinName', 'Algorithm', 'IsTrading', 'ProofType',
                   'TotalCoinsMined', 'MaxSupply']

crypto_clean = (
    crypto_df[feature_columns]
    # Keep only cryptocurrencies that are trading. The explicit comparison
    # (instead of using the column itself as a mask) also copes with missing
    # flags, which compare as False.
    .loc[lambda df: df['IsTrading'] == True]
    # The flag is constant after the filter, so it carries no information.
    .drop(columns='IsTrading')
    # Remove rows with at least one null value (presumably 'N/A' text was
    # already parsed as null when the CSV was read -- verify).
    .dropna()
    # Remove cryptocurrencies having no coins mined.
    .loc[lambda df: df['TotalCoinsMined'] > 0]
    # Renumber rows 0..n-1: later cells join this table to the PCA output
    # column-wise, which lines rows up by index label.
    .reset_index(drop=True)
)

# NOTE(review): the displayed MaxSupply values include -1 and 0, which look like
# placeholders rather than real supplies -- confirm whether they should be
# filtered or recoded before scaling.

# Keep the coin names aside; they are not used by the clustering algorithm
# but are needed again to label the results.
coins_name = crypto_clean['CoinName']

# Feature table used for encoding, scaling and clustering
crypto_df8 = crypto_clean.drop(columns='CoinName')

In [220]:
# Display the cleaned feature table that is encoded and scaled below
crypto_df8

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
0,Scrypt,PoW/PoS,3.078407e+06,-1.000000e+00
1,SHA-256,PoW,2.615225e+06,-1.000000e+00
2,PoS,PoS,5.849667e+09,0.000000e+00
3,Scrypt,PoW,7.999310e+07,-1.000000e+00
4,X13,PoW/PoS,1.662081e+05,0.000000e+00
...,...,...,...,...
90,SHA-256,PoW,1.857473e+07,2.100000e+07
91,Ethash,PoW,1.138884e+08,-1.000000e+00
92,Leased POS,LPoS,1.038306e+08,-1.000000e+00
93,Ouroboros,PoS,3.178190e+10,4.500000e+10


In [221]:
# Get dummies: encode the text columns as indicator columns so the feature
# table can be passed to the scaler
X = pd.get_dummies(crypto_df8)

In [222]:
# Standardize the encoded features. fit() hands back the scaler itself, so the
# fitted instance is chained straight into transform().
Scaler = StandardScaler()
scaled_data = Scaler.fit(X).transform(X)

### Reducing Dimensions Using PCA

In [223]:
# Fit a 3-component PCA on the standardized features, then project the same
# data onto those components
pca = PCA(n_components=3).fit(scaled_data)
X_pca = pca.transform(scaled_data)

In [224]:
# Create a DataFrame with the principal components data; columns keep the
# default integer labels 0, 1, 2 and are renamed later
x_pca_df = pd.DataFrame(X_pca)

### Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [225]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for n_clusters in k:
    # n_init is pinned to 10 (the long-standing default) so the curve does not
    # change with the scikit-learn version installed.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=1)
    model.fit(x_pca_df)
    inertia.append(model.inertia_)

inertia_df = pd.DataFrame({'x': k, 'y': inertia})

# Create the Elbow Curve using Altair, with axis titles so the figure can be
# read on its own
alt.Chart(inertia_df).mark_line().encode(
    x=alt.X('x', title='Number of clusters (k)'),
    y=alt.Y('y', title='Inertia')
).properties(title='Elbow Curve')

Running K-Means with `k=4`

In [226]:
# Initialize the K-Means model. n_init is pinned to 10 (the long-standing
# default) so the cluster assignment does not change with the scikit-learn
# version installed.
model = KMeans(n_clusters=4, n_init=10, random_state=1)
# Fit the model
model.fit(x_pca_df)
# Predict clusters
predictions = model.predict(x_pca_df)

In [227]:
# Combine the features, the principal components and the coin names side by
# side, append the cluster label of each row, and give the component columns
# readable names.
clustered_df = (
    pd.concat([crypto_df8, x_pca_df, coins_name], axis=1)
    .assign(Class=model.labels_)
    .rename(columns={0: 'PC 1', 1: 'PC 2', 2: 'PC 3'})
    .sort_index(ascending=True)
)
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
0,Scrypt,PoW/PoS,3.078407e+06,-1.000000e+00,-0.380124,-1.167683,-1.779031,NovaCoin,0
1,SHA-256,PoW,2.615225e+06,-1.000000e+00,-0.473714,1.610628,0.037110,CounterParty,1
2,PoS,PoS,5.849667e+09,0.000000e+00,-0.170049,-1.066453,1.010254,NuShares,3
3,Scrypt,PoW,7.999310e+07,-1.000000e+00,-0.453235,0.904988,-0.542259,MonaCoin,1
4,X13,PoW/PoS,1.662081e+05,0.000000e+00,-0.357033,-1.983798,-2.300401,Triangles Coin,0
...,...,...,...,...,...,...,...,...,...
90,SHA-256,PoW,1.857473e+07,2.100000e+07,-0.473525,1.610651,0.037096,Bitcoin,1
91,Ethash,PoW,1.138884e+08,-1.000000e+00,-0.461942,1.714705,0.179304,Ethereum,1
92,Leased POS,LPoS,1.038306e+08,-1.000000e+00,-0.269076,-1.810980,3.045140,Waves,3
93,Ouroboros,PoS,3.178190e+10,4.500000e+10,0.407978,-1.548307,1.117781,Cardano,3


### Visualizing Results

#### 2D Scatter of Principal Components with Clusters

In [228]:
# Scatter of the first two principal components, colored by cluster.
# The chart is 2D, so the third component is exposed through the tooltip.
# 'Class:N' marks the cluster label as nominal, so each cluster gets its own
# color instead of a position on a continuous gradient.
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='Class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'MaxSupply', 'PC 3']
).interactive()

#### Scatter Plot with Tradable Cryptocurrencies

In [229]:
# Scale data to create the scatter plot.
# A dedicated scaler and a direct result are used here: refitting the shared
# `Scaler` and overwriting `scaled_data` would silently replace the state the
# PCA cell was built on, so re-running that cell afterwards would use the
# wrong input.
Y = clustered_df[['TotalCoinsMined', 'MaxSupply']]
scatter_scaler = StandardScaler()
# Default row and column labels are kept on purpose: the next cell joins on the
# positional index and renames columns 0 and 1.
scatter_plot_data = pd.DataFrame(scatter_scaler.fit_transform(Y))

In [230]:
# Data for the scatter plot: the scaled columns joined with the coin names,
# with the positional column labels replaced by readable ones
# (x is the coins mined, y is the max supply).
scatter_df = (
    pd.concat([scatter_plot_data, coins_name], axis=1)
    .rename(columns={0: 'CoinsMined', 1: 'MaxSupply'})
)

In [231]:
# Build the scatter of scaled coins mined against scaled max supply, with the
# coin name shown on hover; the interactive chart is the cell's output.
scatter_chart = alt.Chart(scatter_df).mark_circle(size=60).encode(
    x='CoinsMined',
    y='MaxSupply',
    tooltip=['CoinName'],
)
scatter_chart.interactive()

#### Table of Tradable Cryptocurrencies

In [232]:
# Overwrite the coin name stored in the row at position 80.
# NOTE(review): the row is addressed by a hard-coded position, so which coin
# gets renamed depends on the row order of the current data -- confirm it still
# points at the intended coin whenever the data is refreshed.
coin_name_col = clustered_df.columns.get_loc('CoinName')
clustered_df.iat[80, coin_name_col] = 'PIVT'

In [233]:
# Table with tradable cryptos, indexed by coin name.
# Guarded so that re-running the cell is a no-op once 'CoinName' is already the
# index; an unguarded set_index would raise KeyError on the second run.
if 'CoinName' in clustered_df.columns:
    clustered_df = clustered_df.set_index('CoinName')

In [234]:
# Print the whole table without row/column truncation or line wrapping; the
# display options only apply inside the `with` block.
show_everything = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', 100000,
)
with pd.option_context(*show_everything):
    print(clustered_df)

                               Algorithm             ProofType  TotalCoinsMined     MaxSupply       PC 1      PC 2      PC 3  Class
CoinName                                                                                                                           
NovaCoin                          Scrypt               PoW/PoS     3.078407e+06 -1.000000e+00  -0.380124 -1.167683 -1.779031      0
CounterParty                     SHA-256                   PoW     2.615225e+06 -1.000000e+00  -0.473714  1.610628  0.037110      1
NuShares                             PoS                   PoS     5.849667e+09  0.000000e+00  -0.170049 -1.066453  1.010254      3
MonaCoin                          Scrypt                   PoW     7.999310e+07 -1.000000e+00  -0.453235  0.904988 -0.542259      1
Triangles Coin                       X13               PoW/PoS     1.662081e+05  0.000000e+00  -0.357033 -1.983798 -2.300401      0
Emercoin                         SHA-256               PoW/PoS     4.673348e

In [235]:
# Total number of tradable cryptocurrencies = number of rows in the table
clustered_df.shape[0]

95