# Clustering Crypto

In [20]:
!pip install -U altair

Requirement already up-to-date: altair in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (4.1.0)
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [21]:
# Sagemaker imports
!pip install -U altair
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3  # AWS Python sdk
import altair as alt

# Initial imports
import pandas as pd
# import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

Requirement already up-to-date: altair in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (4.1.0)
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


### Data Preprocessing

In [None]:
# Keep only cryptocurrencies that are on trading
clean_crypto_df = crypto_df.loc[crypto_df['IsTrading']==True]
clean_crypto_df.sample(10)

In [None]:
# Keep only cryptocurrencies with a working algorithm
# A non working algorithm is not present in data set - so nothing to drop
# clean_crypto_df = clean_crypto_df.loc[clean_crypto_df['Algorithm'] != '']


In [None]:
# Remove the "IsTrading" column
clean_crypto_df = clean_crypto_df.drop(columns='IsTrading')
clean_crypto_df.head(5)


In [None]:
# Remove rows with at least 1 null value

# Check if there are nulls
clean_crypto_df.isnull().mean()

# drop nulls
clean_crypto_df = clean_crypto_df.dropna()

In [None]:
# Remove rows with cryptocurrencies without coins mined
clean_crypto_df = clean_crypto_df.loc[clean_crypto_df['TotalCoinsMined'] != 0]
clean_crypto_df = clean_crypto_df.loc[clean_crypto_df['TotalCoinsMined'] != 'NaN']


In [None]:
# Fetch the cryptocurrencies names prior to drop them from crypto_df
crypto_names = clean_crypto_df['CoinName']
crypto_names.head(10)

In [None]:
# Remove the cryptocurrency name since it's not going to be used on the clustering algorithm
clean_crypto_df = clean_crypto_df.drop(columns='CoinName')
clean_crypto_df.head()

In [None]:
# Create dummies variables for text features
X = pd.get_dummies(clean_crypto_df, columns=['Algorithm', 'ProofType'])
X.head(5)

In [None]:
# Standardize data
X_scaled = StandardScaler().fit_transform(X)
X_scaled

### Reducing Dimensions Using PCA

In [None]:
# Use PCA to reduce dimension to 3 principal components

# Initialize PCA model
pca = PCA(n_components=3)

# Deploy model
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(data=X_pca, columns=['PC 1', "PC 2", "PC 3"], index= clean_crypto_df.index)
pcs_df.head(5)

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [None]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range ok k values

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [None]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=5)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)
print(predictions)


In [None]:
# Creating a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.DataFrame({
    "Algorithm": clean_crypto_df.Algorithm,
    "ProofType": clean_crypto_df.ProofType,
    "TotalCoinsMined": clean_crypto_df.TotalCoinsMined,
    "TotalCoinSupply": clean_crypto_df.TotalCoinSupply,
    "PC 1": pcs_df['PC 1'],
    "PC 2": pcs_df['PC 2'],
    "PC 3": pcs_df['PC 3'],
    "CoinName": crypto_names,
    "Class": model.labels_, 
    },
)
clustered_df.head()



In [None]:
### Visualizing Results

# #### 3D-Scatter with Clusters
# fig = px.scatter_3d(
#     clustered_df,
#     x="PC 1",
#     y="PC 2",
#     z="PC 3",
#     hover_name='CoinName',
#     hover_data= ['Algorithm'],
#     color="Class",
#     symbol="Class",
# )
# fig.update_layout(legend=dict(x=0, y=1))
# fig.show()

In [None]:
# Sagemaker Scatter
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='class',
    tooltip=['Algorithm','TotalCoinsMined','TotalCoinSupply','CoinName']
).interactive()

In [None]:
#### Table of Tradable Cryptocurrencies

In [None]:
# Table with tradable cryptos
# columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']
# table = clustered_df.hvplot.table(columns)
# table


In [None]:
# # Print the total number of tradable cryptocurrencies
# tradable_currencies = table.shape[0]
# print(f'The total number of tradable cryptocurrencies is {tradable_currencies}')


In [None]:
# Sagemaker tradable currencies
data_scaler = MinMaxScaler()
clustered_df[['TotalCoinsMined', 'TotalCoinSupply']] = data_scaler.fit_transform(clustered_df[['TotalCoinsMined', 'TotalCoinSupply']])

alt.Chart(results).mark_circle(size=60).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class',
).interactive()

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Scale data to create the scatter plot
data_scaler = MinMaxScaler()
clustered_df[['TotalCoinsMined', 'TotalCoinSupply']] = data_scaler.fit_transform(clustered_df[['TotalCoinsMined', 'TotalCoinSupply']])

In [None]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
# clustered_df.hvplot(
#     kind="scatter", 
#     x="TotalCoinsMined", 
#     y="TotalCoinSupply", 
#     c='Class', 
#     colormap="Inferno_r",
#     hover_cols=['CoinName'],
# )