In [14]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from pathlib import Path
from sklearn.decomposition import PCA

In [9]:
# Load the per-county counts table from the Resources folder and preview it.
df = pd.read_csv(Path('Resources', 'counts_per_county.csv'))
df.head()

Unnamed: 0,statecode_of_county,county,income_yr_2021,income_yr_2020,income_yr_2019,vehicle_cnt,station_cnt
0,CA,Alameda,164437681,149239559,135758980,192172,792
1,CA,Alpine,89826,90114,94986,38,5
2,CA,Amador,2052718,1937437,1768935,768,12
3,CA,Butte,11393205,10798585,9983194,3423,30
4,CA,Calaveras,2668137,2509072,2275336,999,5


In [29]:
# Keep only the three numeric features used for clustering.
df = df.loc[:, ['income_yr_2021', 'vehicle_cnt', 'station_cnt']]
df.head()

Unnamed: 0,income_yr_2021,vehicle_cnt,station_cnt
0,164437681,192172,792
1,89826,38,5
2,2052718,768,12
3,11393205,3423,30
4,2668137,999,5


In [30]:
# Standardize the features with StandardScaler so that no single column
# dominates the distance computation because of its magnitude.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df_scaled.dtype

dtype('float64')

In [31]:
# Inspect the standardized feature array (NumPy abbreviates long arrays in its repr).
df_scaled

array([[ 2.60353414,  3.88461972,  3.0623706 ],
       [-0.39457536, -0.28304001, -0.24670566],
       [-0.35876737, -0.26720527, -0.21727296],
       ...,
       [ 0.34198949,  0.527178  ,  0.00557461],
       [-0.3440732 , -0.26343096, -0.26352434],
       [-0.32573808, -0.25065471, -0.26352434]])

In [32]:
# Wrap the standardized array (StandardScaler output; no PCA is applied here)
# back into a DataFrame. Column labels and index are taken from `df`, the
# frame that was scaled, so they cannot drift from a hand-copied list and
# rows stay aligned with the source data.
df_new = pd.DataFrame(data=df_scaled, columns=df.columns, index=df.index)
df_new.head()

Unnamed: 0,income_yr_2021,vehicle_cnt,station_cnt
0,2.603534,3.88462,3.062371
1,-0.394575,-0.28304,-0.246706
2,-0.358767,-0.267205,-0.217273
3,-0.188374,-0.209615,-0.141589
4,-0.347541,-0.262195,-0.246706


In [36]:
# Cluster on the scaled feature columns only. Dropping "class" (when present)
# keeps this cell safe to re-run: once the label column has been added to
# df_new, fitting on the whole frame would feed the previous cluster labels
# back in as an extra feature and change the result.
features = df_new.drop(columns="class", errors="ignore")

# Initialize the K-means model (fixed random_state for reproducible labels)
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and obtain each row's cluster label in a single pass
predictions = model.fit_predict(features)

# Add the predicted class column
df_new["class"] = predictions
df_new.head()

Unnamed: 0,income_yr_2021,vehicle_cnt,station_cnt,class
0,2.603534,3.88462,3.062371,2
1,-0.394575,-0.28304,-0.246706,0
2,-0.358767,-0.267205,-0.217273,0
3,-0.188374,-0.209615,-0.141589,0
4,-0.347541,-0.262195,-0.246706,0


In [37]:
# Plot the clusters over the three scaled features.
# The cluster label is cast to str on a plot-only copy so Plotly treats it as
# a category (discrete colors + legend) instead of a continuous color scale.
# Marker size is intentionally NOT mapped to the label: the label is a nominal
# id, and mapping it to size gives cluster 0 zero-size (invisible) markers.
plot_df = df_new.astype({"class": str})
fig = px.scatter_3d(
    plot_df,
    x='income_yr_2021',
    y='vehicle_cnt',
    z='station_cnt',
    color="class",
    symbol="class",
    title="K-means clusters of counties (standardized features)",
    width=800,
)
# Pin the legend to the top-left corner of the figure
fig.update_layout(legend=dict(x=0, y=1))
fig.show()