In [None]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

# Machine Learning
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics import silhouette_score, calinski_harabasz_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the crypto market CSV; each row is keyed by its coin_id
df = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df.head(10)

In [None]:
# Generate summary statistics for the columns of the raw data
df.describe()

In [None]:
# Interactive line chart of the raw DataFrame (x tick labels rotated 90 degrees)
df.hvplot.line(
    rot=90,
    height=400,
    width=800,
)

In [None]:
# Column names, dtypes and non-null counts of the raw data
df.info()

In [None]:
# Summary statistics of the raw data (same call as the earlier describe cell)
df.describe()

---

### Prepare the Data

In [None]:
# Standardize every column of the raw data (fit and transform in one step)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Rebuild a DataFrame with the original column names and the coin_id index,
# then move coin_id out of the index into a regular column
df_scaled = pd.DataFrame(scaled_data, columns=df.columns, index=df.index).reset_index()

# Display sample data
df_scaled.head()

---

### Find the Best Value for k Using the Original Data.

In [None]:

# Can this even be clustered?

# Keep only the numeric feature columns of df_scaled (drops the coin_id name column)
numerical_columns = df_scaled.select_dtypes(include=['float64', 'int64']).columns
df_numerical = df_scaled[numerical_columns]

# Project the features to 2-D with t-SNE.
# t-SNE is stochastic: a fixed random_state makes the embedding (and this plot)
# repeatable on every Restart & Run All.
tsne = TSNE(perplexity=4, random_state=1)
df_tsne = pd.DataFrame(tsne.fit_transform(df_numerical))

# Scatter the two embedding dimensions
plt.scatter(df_tsne[0], df_tsne[1])
plt.title("t-SNE on Numerical Data")
plt.show()

In [None]:
# Define the X features for the k search: the scaled data
# (the following cell selects only its numeric columns before fitting)
X=df_scaled

In [None]:
# Restrict X to its numeric columns; non-numeric columns such as the
# cryptocurrency names are not features
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
X_numerical = X[numerical_columns]

# Candidate cluster counts, plus one result list per metric
k = list(range(2, 20))
inertia, silhouettes, cha_chas = [], [], []

# For every candidate k: fit K-means on X_numerical, predict the labels, and
# record the inertia, the silhouette score and the Calinski-Harabasz score
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(X_numerical)
    preds = k_model.predict(X_numerical)

    inertia.append(k_model.inertia_)
    silhouettes.append(silhouette_score(X_numerical, preds))
    cha_chas.append(calinski_harabasz_score(X_numerical, preds))

    print(f"Finished {i} out of {max(k)}")

In [None]:
# Collect k and every metric recorded during the k search into one table
elbow_data = dict(k=k, inertia=inertia, silhouette_score=silhouettes, cha_score=cha_chas)
df_elbow = pd.DataFrame(elbow_data)

# Row-to-row change in inertia (NaN for the first k)
df_elbow["acc"] = df_elbow["inertia"].diff()

# Review the DataFrame
df_elbow.head(20)

In [None]:
# Elbow curve: inertia as a function of k, one tick per candidate k
fig, ax = plt.subplots()
ax.plot(df_elbow["k"], df_elbow["inertia"])
ax.set_title("Elbow Curve")
ax.set_xticks(df_elbow["k"])
ax.set_ylabel("inertia")
ax.set_xlabel("k")
plt.show()

In [None]:
# Rebuild the elbow table with only k and inertia.
# This replaces df_elbow, so the table no longer carries any other columns.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Review the DataFrame
df_elbow.head(20)




In [None]:
# Line chart of inertia against k, used to spot the elbow visually
fig, ax = plt.subplots()
ax.plot(df_elbow["k"], df_elbow["inertia"])
ax.set_title("Elbow Curve")
ax.set_xticks(df_elbow["k"])
ax.set_ylabel("inertia")
ax.set_xlabel("k")
plt.show()

In [None]:
# Create the elbow curve plot using hvPlot.
# xticks is given as a plain list of the k values (hvPlot documents an int or
# a list for this option) rather than a pandas Series.
elbow_plot = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    xticks=df_elbow["k"].tolist(),
    line_color='blue'  # Optional: Set a color for the line
)

# Display the elbow plot
elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for k is typically where the inertia starts to decrease more slowly, indicating diminishing returns in terms of clustering improvement. In this case, the best value for k is likely 4. This choice is supported by the highest variance ratio criterion (Calinski-Harabasz score), the sharpest bend in the inertia plot, and a good silhouette score at the elbow point.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [None]:

# Scratch copy of the scaled data
df_sub = df_scaled.copy()

# Numeric feature columns only (the coin_id name column is excluded)
numeric_columns = df_scaled.select_dtypes(include=['float64', 'int64'])

# K-Means with the chosen k, fitted on the scaled features
model = KMeans(n_clusters=4, random_state=1)
model.fit(numeric_columns)

# Cluster label for each cryptocurrency, printed as a raw array
preds = model.predict(numeric_columns)
print(preds)

# New frame: the scaled data plus a 'clusters' column with the predicted labels
df_sub2 = df_scaled.assign(clusters=preds)

# Display sample data
df_sub2.head()

In [None]:
# Select numerical columns from df_scaled
numerical_columns = df_scaled.select_dtypes(include=['float64', 'int64']).columns
df_numerical = df_scaled[numerical_columns]

# Apply t-SNE on the numerical data.
# A fixed random_state keeps the stochastic embedding identical across runs.
tsne = TSNE(perplexity=4, random_state=1)
df_tsne = pd.DataFrame(tsne.fit_transform(df_numerical))

# Plot the embedding, colouring each point by its cluster label
# (factorize maps the labels to consecutive integer colour codes)
plt.scatter(df_tsne[0], df_tsne[1], c=pd.factorize(df_sub2['clusters'])[0])
plt.title("t-SNE on Numerical Data with Clusters")
plt.show()

In [None]:
# Can this even be clustered?

# Take the numeric feature columns (as named in df_scaled) from df_sub2,
# which leaves out the coin names and the 'clusters' column
numerical_columns = df_scaled.select_dtypes(include=['float64', 'int64']).columns
df_numerical = df_sub2[numerical_columns]

# Apply t-SNE on the numerical data.
# A fixed random_state keeps the stochastic embedding identical across runs.
tsne = TSNE(perplexity=4, random_state=1)
df_tsne = pd.DataFrame(tsne.fit_transform(df_numerical))

# Plot the embedding without cluster colours
plt.scatter(df_tsne[0], df_tsne[1])
plt.title("t-SNE on SCALED Data")
plt.show()

In [None]:
# 24h vs 7d price change, one point per coin, coloured by cluster label
fig, ax = plt.subplots()
ax.scatter(
    df_sub2["price_change_percentage_24h"],
    df_sub2["price_change_percentage_7d"],
    c=df_sub2["clusters"],
)
ax.set_xlabel("price_change_percentage_24h")
ax.set_ylabel("price_change_percentage_7d")
plt.show()

In [None]:
# Interactive scatter of 24h vs 7d price change, grouped (and coloured) by cluster;
# the coin_id column is shown when hovering over a point
scaled_plot = df_sub2.hvplot.scatter(
    title="Cryptocurrency Price Changes",
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    xlabel="Price Change Percentage (24h)",
    ylabel="Price Change Percentage (7d)",
    by="clusters",
    hover_cols=["coin_id"],
)

# Display the plot
scaled_plot

In [None]:
# df_sub4 = df_sub2.copy()
# scaled_plot = df_sub4.hvplot.scatter(
#     x="price_change_percentage_24h",  # Fixed typo here
#     y="price_change_percentage_7d",
#     by="clusters",
#     hover_cols=["coin_id"],
#     marker=["hex", "square", "cross", "inverted_triangle"],
#     title="Cryptocurrencies clusters",
# )

# # Display the plot
# scaled_plot

In [None]:
# hvPlot scatter of the scaled data:
#   x = price_change_percentage_24h, y = price_change_percentage_7d,
#   points coloured by the K-Means label in 'clusters',
#   coin_id shown on hover to identify each cryptocurrency.
df_sub2.reset_index().hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    color="clusters",
    hover_cols="coin_id",
    width=800,
    height=400,
)


---

### Optimize Clusters with Principal Component Analysis.

In [None]:
# Numeric columns of the scaled data ("number" selects every numeric dtype)
numeric_cols = df_scaled.select_dtypes(include="number")

# Pairwise correlation between those columns
corrs = numeric_cols.corr()
corrs

In [None]:
# Annotated heatmap of the correlation matrix
ax = sns.heatmap(data=corrs, annot=True)
plt.show()

In [None]:
# PCA keeping as many components as there are numeric columns in df_scaled
numeric_df = df_scaled.select_dtypes(include=['float64', 'int64'])
n_components = numeric_df.shape[1]
pca = PCA(n_components=n_components)

# Fit and project, then label the components PCA_1 .. PCA_n
pca_data = pca.fit_transform(numeric_df)
pca_labels = [f"PCA_{x}" for x in range(1, n_components + 1)]
df_pca7 = pd.DataFrame(pca_data, columns=pca_labels)

# View the first five rows of the DataFrame.
df_pca7.head()

In [None]:
# Explained variance ratio of each principal component of the fitted PCA model,
# used to judge whether dimensionality reduction will help
explained_variance_ratio = pca.explained_variance_ratio_

# One row per component: its 1-based number and its explained variance ratio
df2 = pd.DataFrame({
    'Principal Component': range(1, n_components + 1),
    'Explained Variance Ratio': explained_variance_ratio
})

# Elbow plot of the explained variance ratio
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df2['Principal Component'], df2['Explained Variance Ratio'], marker='o', linestyle='--')
ax.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Elbow Plot of PCA Explained Variance Ratio',
)
ax.grid(True)
plt.show()

In [None]:
# Numeric feature columns of the scaled data (the coin_id name column is excluded)
numeric_df = df_scaled.select_dtypes(include=['float64', 'int64'])

# Set the number of components to 3
n_components = 3

# Use the PCA model with `fit_transform` to reduce to
# three principal components.
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(numeric_df)

# Label the components PCA_1..PCA_3 and index the rows by coin_id.
# df_scaled carries coin_id as a regular column (its own index is a plain
# 0..n-1 range), so the ids are taken from that column; without them the PCA
# frame cannot identify coins, e.g. in hover tooltips.
df_pca3 = pd.DataFrame(
    pca_data,
    columns=["PCA_" + str(x) for x in range(1, n_components + 1)],
    index=pd.Index(df_scaled["coin_id"], name="coin_id"),
)

df_pca3.head()

In [None]:
# Per-component explained variance ratios, as stored in explained_variance_ratio
explained_variance_ratio

In [None]:
# Total explained variance ratio of the first three principal components
sum(explained_variance_ratio[0:3])


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The three principal components together still explain 90% of the variance, so dimensionality reduction is worth doing.

In [None]:

# Display sample rows of the three-component PCA data
df_pca3.head()

---

### Find the Best Value for k Using the PCA Data

In [None]:
# Sample rows of the PCA data that the k search below is run on
df_pca3.head()

In [None]:
# Elbow search on the three-component PCA data (df_pca3)
inertia_values = []
k_values = list(range(1, 11))  # You can adjust the range of k values

# Perform K-means clustering for different values of k.
# The loop variable is deliberately not called `k`: other cells keep a
# notebook-level list named `k` for their elbow tables, and a loop over `k`
# here would silently replace that list with an integer.
for n_clusters in k_values:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(df_pca3)  # Fit K-means to the PCA data
    inertia_values.append(kmeans.inertia_)  # Store the inertia value

# Create a DataFrame for the elbow plot
df_pca_elbow = pd.DataFrame({
    'k': k_values,
    'inertia': inertia_values
})

# Create the elbow curve plot for PCA data.
# xticks is given as a plain list (hvPlot documents an int or a list here).
pca_elbow_plot = df_pca_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="PCA Elbow Curve",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    xticks=k_values,
    line_color='orange'  # Optional: Set a color for the line
)

# Display the PCA elbow plot
pca_elbow_plot

In [None]:
# Embed the three-component PCA data (df_pca3, the data this section clusters)
# in 2-D with t-SNE. A fixed random_state keeps the stochastic embedding
# identical across runs, so later plots that reuse df_tsne1 stay comparable.
tsne = TSNE(random_state=1)
df_tsne1 = pd.DataFrame(tsne.fit_transform(df_pca3))

# Plot the embedding (second t-SNE dimension on x, first on y)
plt.scatter(df_tsne1[1], df_tsne1[0])
plt.title("t-SNE on PCA Data")
plt.show()

In [None]:
# Define the X features for this section's k search: the three-component PCA
# data, so that the elbow analysis below really evaluates the PCA data and
# not the original scaled data again
X = df_pca3

In [None]:
# Restrict X to its numeric columns; non-numeric columns such as the
# cryptocurrency names are not features
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
X_numerical = X[numerical_columns]

# Candidate cluster counts, plus one result list per metric
k = list(range(2, 20))
inertia, silhouettes, cha_chas = [], [], []

# For every candidate k: fit K-means on X_numerical, predict the labels, and
# record the inertia, the silhouette score and the Calinski-Harabasz score
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(X_numerical)
    preds = k_model.predict(X_numerical)

    inertia.append(k_model.inertia_)
    silhouettes.append(silhouette_score(X_numerical, preds))
    cha_chas.append(calinski_harabasz_score(X_numerical, preds))

    print(f"Finished {i} out of {max(k)}")


In [None]:
# Collect k and every metric recorded during the k search into one table
elbow_data = dict(k=k, inertia=inertia, silhouette_score=silhouettes, cha_score=cha_chas)

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

# Row-to-row change in inertia (NaN for the first k)
df_elbow["acc"] = df_elbow["inertia"].diff()

# Review the DataFrame
df_elbow.head(20)



In [None]:
# Line chart of inertia against k, used to spot the elbow visually
fig, ax = plt.subplots()
ax.plot(df_elbow["k"], df_elbow["inertia"])
ax.set_title("Elbow Curve")
ax.set_xticks(df_elbow["k"])
ax.set_ylabel("inertia")
ax.set_xlabel("k")
plt.show()


In [None]:
# Interactive elbow curve of the current df_elbow table.
# xticks is given as a plain list of the k values (hvPlot documents an int or
# a list for this option) rather than a pandas Series.
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=df_elbow["k"].tolist()
)

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** k = 4 again looks to be the best value. The elbow in the PCA curve looks better defined.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No. k = 4 looks to be the best value again, the same as with the original data; the PCA results simply look better.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [None]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4, random_state=1)

# Fit the K-Means model using the PCA data.
# The model is fitted on df_pca3 (the three principal components), not on the
# scaled original features, so the labels really describe the PCA clustering.
model.fit(df_pca3)

# Predict the clusters to group the cryptocurrencies using the PCA data
preds = model.predict(df_pca3)

# Print the resulting array of cluster values.
print(preds)

# Create a copy of the DataFrame with the PCA data
df_sub2 = df_pca3.copy()

# Add a new column to the DataFrame with the predicted clusters
df_sub2['clusters'] = preds

# Display sample data
df_sub2.head()

In [None]:


# New frame: the PCA data plus a 'clusters' column holding the labels in preds
df_sub2 = df_pca3.assign(clusters=preds)

# Display sample data
df_sub2.head()



In [None]:
# t-SNE embedding (second dimension on x, first on y), coloured by cluster label
fig, ax = plt.subplots()
ax.scatter(df_tsne1[1], df_tsne1[0], c=df_sub2["clusters"])
ax.set_title("t-SNE on PCA Data")
plt.show()

In [None]:
# First two principal components, coloured by cluster label.
# PCA_1 is plotted on the x axis and PCA_2 on the y axis, so the axis labels
# are set to match.
plt.scatter(df_sub2.PCA_1, df_sub2.PCA_2, c=df_sub2.clusters)
plt.xlabel("PCA_1")
plt.ylabel("PCA_2")
plt.show()

In [None]:
# hvPlot scatter of the PCA data:
#   x = PCA_1, y = PCA_2, points coloured by the K-Means label in 'clusters',
#   coin_id requested on hover to identify each cryptocurrency.
# NOTE(review): hover_cols needs a coin_id column to exist after reset_index();
# confirm that df_sub2's index carries the coin ids.
df_sub2.reset_index().hvplot.scatter(
    x="PCA_1",
    y="PCA_2",
    color="clusters",
    hover_cols="coin_id",
    width=800,
    height=400,
)


In [None]:
# hvPlot scatter of the first two principal components, grouped by cluster,
# with a separate marker per group.
# NOTE(review): hover_cols asks for coin_id; confirm df_sub2 carries the coin ids.
df_sub3 = df_sub2.copy()
pca_plot = df_sub3.hvplot.scatter(
    title="Cryptocurrencies clusters Using PCA Data",
    x="PCA_1",
    y="PCA_2",
    by="clusters",
    marker=["hex", "square", "cross", "inverted_triangle"],
    hover_cols=["coin_id"],
)

pca_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [None]:
# Composite plot to contrast the Elbow curves:
# elbow_plot and pca_elbow_plot combined into one layout
elbow_plot + pca_elbow_plot

In [None]:
# Composite plot to contrast the clusters:
# scaled_plot and pca_plot combined into one layout
scaled_plot + pca_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Visually, reducing the number of features used to cluster the data with K-Means appears to have had a significant impact. In the initial plot, which depicted the clustering of the original data, the elbow curve indicated that the optimal value for k was 4, resulting in 4 clusters.