### Exploring K-Means  & Comparing it to DBSCAN 


In [9]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.cluster import silhouette_score
#from adspy_shared_utilities import plot_labelled_scatter
import matplotlib.pyplot as plt

In [13]:
# need version 1.4.1 to get read_pickle method to work
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [14]:
# get location from clean pickel file
#cln_pkl_loc = Path('cleanweathersmall.pkl')
# load it onto df
df = pd.read_pickle(cln_pkl_loc)
# quick look at the df
df.sample(3)

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1296325,KANW0,2022-08-07 13:00:00,16.4,14.6,89.0,0.0,20.0,13.0,1019.0
1792290,KDNS0,2022-08-02 18:00:00,32.0,28.1,80.0,0.0,180.0,22.3,1009.0
3015252,KPPO0,2022-01-26 12:00:00,-19.6,-22.0,81.0,0.0,260.0,13.0,1031.0


### Apply K-Means Algorithm to create clusters

### Create clusters by grouping all stations by month and by hours based on datestamps

In [17]:
# loop thru all twelve months:
K_Means_list = [] # = pd.DataFrame()
for mth in range(1,13):
    # loop thru all 24 hours
    for hr in range(24):
        # query df based on month and time
        tmp_df = df.query(f'time.dt.hour == {hr} and time.dt.month == {mth}')
        # remove time variable from df
        tmp_df = tmp_df.loc[: , tmp_df.columns != 'time']
        # calculate mean of such datafrme
        tmp_df = tmp_df.groupby('station').mean()
        # drop null values
        tmp_df.dropna(inplace=True)
        # if we get a result, calculate K-Means clusters
        if not tmp_df.empty:
            # rescale the data to zero mean and unit variance
            scaler = StandardScaler()
            scaler.fit(tmp_df)
            X_scaled = scaler.transform(tmp_df)
            # apply K-Means clustering; start with known cluster count from DBSCAN, 5
            kmeans = KMeans(n_clusters = 5, random_state = 0)
            kmeans.fit(X_scaled)
            # run the K-Means algorithm
            clusters = kmeans.labels_
            # attach clusters to df
            tmp_df['KMeans_cluster'] = clusters
            # add month and hour as variables
            tmp_df['hr'], tmp_df['mth'] = hr, mth
            # calculate the silhouette score; only if they're not outliers
            tmp_df['silhouette_score'] = -1
            if set(clusters) != {-1}:
                tmp_df['silhouette_score'] = silhouette_score(X_scaled, clusters)                
            # collect all dfs
            K_Means_list.append(tmp_df)
            
# collect all DBSCAN dfs into one
KMeans_df = pd.concat(K_Means_list)
# reset the index; we can query the 'station' as a column much easier
KMeans_df.reset_index(inplace=True)
tmp_df

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,wdir,wspd,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [3]:
# quick distribution of the clusters; it looks evenly
# KMeans_df.groupby('KMeans_cluster').count()[['station']]
KMeans_df.KMeans_cluster.hist();

NameError: name 'KMeans_df' is not defined

### Select Clusters Where AA Shows up

In [12]:
# list the clusters where KARB0 shows up; exclude the -1 (outliers)
AA_Clusters = KMeans_df.query('station == "KARB0"')['KMeans_cluster'].unique()
AA_Clusters

array([2, 3, 0, 1, 4], dtype=int32)

In [38]:
# what stations make up these clusters?
AA_near_stations = KMeans_df.query('KMeans_cluster in @AA_Clusters and silhouette_score > 0')['station'].unique()
len(AA_near_stations), # AA_near_stations

(607,)

### Since all FIVE clusters contain all stations, let's filter those with the highest Silhouette scores

In [163]:
# sort the scores in descending order
top_silh_scr = sorted(KMeans_df.query('KMeans_cluster in @AA_Clusters and silhouette_score > 0')['silhouette_score'].unique())[::-1]
# there are over 200 scores. The top [3] scores already produce over 500 matches (see cell below)!
top_silh_scr = top_silh_scr[0]
top_silh_scr

0.2968700650319063

In [164]:
# see how many stations are with the highest score(s)
AA_top_stations = KMeans_df.query('KMeans_cluster in @AA_Clusters and silhouette_score >= @top_silh_scr')['station'].unique()
len(AA_top_stations), # still a huge number

(279,)

### Let's compare & filter stations based on combined K-Means & DBSCAN clusters

In [165]:
# EXTREMELY likely that K-Means contains all station that DBSCAN has
# import df with clusters; let's compare these results for K-Means
DBS_Kmeans_df = pd.read_pickle('DBSCAN_clusters.pkl')

In [173]:
# combine it with K-Means df
DBS_Kmeans_df['KMeans_cluster'], DBS_Kmeans_df['KMeans_sil_cluster'] = KMeans_df['KMeans_cluster'], KMeans_df['silhouette_score']
# DBS_Kmeans_df.head()

In [167]:
# list the clusters where KARB0 shows up; exclude the -1 (outliers)
# ALSO, bring in ONLY those with the highest K-Means Silhouette scores
AA_Clusters = DBS_Kmeans_df.query('station == "KARB0" \
                            and DBSCAN_cluster != -1 \
                            and KMeans_sil_cluster >= @top_silh_scr')['KMeans_cluster'].unique()
AA_Clusters # ONLY need to review cluster 1

array([1], dtype=int32)

In [171]:
# what stations make up these clusters (for both clustering algorithms)?
AA_near_stations = DBS_Kmeans_df.query('KMeans_cluster in @AA_Clusters and KMeans_sil_cluster >= @top_silh_scr')['station'].unique()
len(AA_near_stations), AA_near_stations

(279,
 array(['0CO7B', '0FV1F', '4DUJO', '5RUZT', '6N2T2', '71261', '71270',
        '71283', '71298', '71307', '71352', '71368', '71433', '71438',
        '71439', '71462', '71465', '71527', '71538', '71573', '71623',
        '71631', '71633', '71634', '71704', '72013', '72014', '72019',
        '72420', '72428', '72520', '72521', '72524', '72525', '72526',
        '72530', '72531', '72532', '72533', '72534', '72535', '72536',
        '72537', '72538', '72539', '72542', '72543', '72544', '72545',
        '72547', '72548', '72628', '72634', '72635', '72636', '72637',
        '72638', '72639', '72641', '72642', '72643', '72645', '72648',
        '72744', '74455', '74467', '7Q717', '8HNVP', '93RHE', '9H92X',
        'ATA0X', 'CACQ0', 'FO7ZN', 'GUUD7', 'K4I30', 'K7B4X', 'KACQ0',
        'KADG0', 'KAEL0', 'KAFJ0', 'KAIO0', 'KAKR0', 'KAMN0', 'KANQ0',
        'KAOH0', 'KARB0', 'KARR0', 'KASW0', 'KATW0', 'KAUM0', 'KAWG0',
        'KAXA0', 'KAXV0', 'KAZO0', 'KBAX0', 'KBEH0', 'KBIV0', 'KBJJ0',


In [172]:
# export df
DBS_Kmeans_df.to_pickle('DBSCAN_KMeans_clusters.pkl')

In [190]:
# check and see that DBSCAN  stations are on K-Means
DBSCAN_AA_Clusters = DBS_Kmeans_df.query('station == "KARB0" and DBSCAN_cluster != -1')['DBSCAN_cluster'].unique()
DBSCAN_AA_near_stations = DBS_Kmeans_df.query('DBSCAN_cluster in @DBSCAN_AA_Clusters')['station'].unique()
len(DBSCAN_AA_near_stations), DBSCAN_AA_near_stations
len(set(AA_near_stations).intersection(set(DBSCAN_AA_near_stations))), set(AA_near_stations).intersection(set(DBSCAN_AA_near_stations))

(180,
 {'0CO7B',
  '0FV1F',
  '4DUJO',
  '5RUZT',
  '6N2T2',
  '71283',
  '71298',
  '71307',
  '71352',
  '71368',
  '71527',
  '71538',
  '71623',
  '71631',
  '71633',
  '71634',
  '71704',
  '72420',
  '72428',
  '72520',
  '72521',
  '72525',
  '72526',
  '72530',
  '72533',
  '72535',
  '72536',
  '72537',
  '72539',
  '72543',
  '72545',
  '72634',
  '72635',
  '72636',
  '72637',
  '72638',
  '72643',
  '72645',
  '7Q717',
  '8HNVP',
  '93RHE',
  '9H92X',
  'ATA0X',
  'FO7ZN',
  'GUUD7',
  'K7B4X',
  'KADG0',
  'KAEL0',
  'KAIO0',
  'KAKR0',
  'KAMN0',
  'KANQ0',
  'KAOH0',
  'KARB0',
  'KARR0',
  'KASW0',
  'KAUM0',
  'KAXV0',
  'KAZO0',
  'KBEH0',
  'KBIV0',
  'KBJJ0',
  'KBTL0',
  'KC290',
  'KC350',
  'KC620',
  'KC650',
  'KCFS0',
  'KCGF0',
  'KCNB0',
  'KCNC0',
  'KD950',
  'KDET0',
  'KDFI0',
  'KDKB0',
  'KDLL0',
  'KDPA0',
  'KEDJ0',
  'KEFT0',
  'KEKM0',
  'KENW0',
  'KETB0',
  'KFBL0',
  'KFFL0',
  'KFFX0',
  'KFKA0',
  'KFLD0',
  'KFPK0',
  'KFXY0',
  'KGGI0',
  'K

In [5]:
# # Review the clusters now:
# cluster_list = [f'Cluster {c}' for c in range(len(kmeans.cluster_centers_))]
# plot_labelled_scatter(X_scaled, kmeans.labels_, cluster_list, "K-Means Clusters" )

## Review AA station; which cluster does it fall under?

In [6]:
# # distribution of all clusters:
# df['K_cluster_21'].hist()
# plt.title("Distribution of All Clusters");

In [7]:
# # A distribution of clusters where KARB0 ("Ann Arbor") is part of
# df.query('station == "KARB0"')['K_cluster_21'].hist();
# plt.title("Distribution of Clusters in Ann Arbor");