In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [10]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
pop_df = pd.read_csv("Census_Population_By_City.csv", index_col="State")

# Review the DataFrame
pop_df.head()

Unnamed: 0_level_0,City,Total Population
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,Auburn,80009
Alabama,Birmingham,196353
Alabama,Dothan,70524
Alabama,Hoover,92427
Alabama,Huntsville,222363


In [11]:
# Generate summary statistics
pop_df.describe()

Unnamed: 0,Total Population
count,646.0
mean,191153.0
std,417611.1
min,63991.0
25%,79304.25
50%,104932.5
75%,169260.8
max,8335897.0


In [12]:
# Plot your data to see what's in your DataFrame
pop_df.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [14]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
pop_data_scaled = StandardScaler().fit_transform(
    pop_df[["Total Population"]]
)

In [20]:
# Create a DataFrame with the scaled data
pop_data_scaled = pd.DataFrame(
    pop_df,
    columns=["Total Population"]
)

# Copy the crypto names from the original data
pop_data_scaled["State"] = pop_df.index

# Set the coinid column as index
pop_data_scaled = pop_data_scaled.set_index("State")

# Display sample data
pop_data_scaled.head()

Unnamed: 0_level_0,Total Population
State,Unnamed: 1_level_1
Alabama,80009
Alabama,196353
Alabama,70524
Alabama,92427
Alabama,222363


In [21]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [22]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(pop_data_scaled)
    inertia.append(k_model.inertia_)




In [23]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()


Unnamed: 0,k,inertia
0,1,112487400000000.0
1,2,45133490000000.0
2,3,18589930000000.0
3,4,8186160000000.0
4,5,4828371000000.0


In [24]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)


In [25]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4, random_state=1)

In [26]:
# Fit the K-Means model using the scaled data
model.fit(pop_data_scaled)



In [27]:
# Predict the clusters to group the cryptocurrencies using the scaled data
k_4 = model.predict(pop_data_scaled)

# Print the resulting array of cluster values.
print(k_4)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 0 0 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2
 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2
 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2
 2 2 2 2 2 2 0 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2
 2 2 2 3 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2
 2 0 2 2 2 2 2 2 2 0 2 2 

In [28]:
# Create a copy of the DataFrame
predictions_df = pop_data_scaled.copy()

In [29]:
# Add a new column to the DataFrame with the predicted clusters
predictions_df["predicted_cluster"] = k_4

# Display sample data
predictions_df.head()

Unnamed: 0_level_0,Total Population,predicted_cluster
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,80009,2
Alabama,196353,2
Alabama,70524,2
Alabama,92427,2
Alabama,222363,2


In [31]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

predictions_df.hvplot.scatter(
    x="Total Population",
    y="State",
    by="predicted_cluster",
    hover_cols="City"
)

DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['State']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html