### Exploring DBSCAN, K-Means, & Agglomerative Clustering to find the optimum number of clusters

In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import silhouette_score

In [2]:
# need version 1.4.1 to get read_pickle method to work
pd.__version__

'1.4.1'

In [3]:
# get location from clean pickel file
cln_pkl_loc = Path('cleanweathersmall.pkl')
# load it onto df
df = pd.read_pickle(cln_pkl_loc)
# quick look at the df
df.sample(3)

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
3526502,KY510,2022-07-12 11:00:00,14.6,14.6,100.0,0.0,260.0,7.6,1013.0
2706398,KMPZ0,2022-01-02 20:00:00,-11.0,-17.1,61.0,0.0,320.0,7.6,1027.0
727848,72551,2022-07-26 11:00:00,19.4,16.1,81.0,0.0,130.0,13.0,1014.7


### Collapse All Features -- Use Average

## Apply DBSCAN Algorithm to create clusters

### Create clusters by grouping all stations by month and by hours based on datestamps


In [4]:
# loop thru all twelve months:
DBSCAN_list = [] # = pd.DataFrame()
for mth in range(1,13):
    # loop thru all 24 hours
    for hr in range(24):
        # query df based on month and time
        tmp_df = df.query(f'time.dt.hour == {hr} and time.dt.month == {mth}')
        # remove time variable from df
        tmp_df = tmp_df.loc[: , tmp_df.columns != 'time']
        # calculate mean of such datafrme
        tmp_df = tmp_df.groupby('station').mean()
        # drop null values
        tmp_df.dropna(inplace=True)
        # if we get a result, calculate DBSCAN clusters
        if not tmp_df.empty:
            # rescale the data to zero mean and unit variance
            scaler = StandardScaler()
            scaler.fit(tmp_df)
            X_scaled = scaler.transform(tmp_df)
            # create the DBSCAN object; use all processors available
            dbscan = DBSCAN(n_jobs=-1)
            # run the DBSCAN algorithm
            clusters = dbscan.fit_predict(X_scaled)
            # attache clusters to df
            tmp_df['DBSCAN_cluster'] = clusters
            # add month and hour as variables
            tmp_df['hr'], tmp_df['mth'] = hr, mth
            # calculate the silhouette score; only if they're not outliers
            tmp_df['silhouette_score'] = -1
            if set(clusters) != {-1}:
                tmp_df['silhouette_score'] = silhouette_score(X_scaled, clusters)                
            # collect all dfs
            DBSCAN_list.append(tmp_df)
            
# collect all DBSCAN dfs into one
DBSCAN_df = pd.concat(DBSCAN_list)
# reset the index; we can query the 'station' as a column much easier
DBSCAN_df.reset_index(inplace=True)

### Select Clusters Where AA Belongs to

In [17]:
# list the clusters where KARB0 shows up; exclude the -1 (outliers)
AA_Clusters = DBSCAN_df.query('station == "KARB0" and DBSCAN_cluster != -1')['DBSCAN_cluster'].unique()

In [18]:
AA_Clusters

array([5, 3, 0, 1, 2])

In [19]:
# what stations make up these clusters?
AA_near_stations = DBSCAN_df.query('DBSCAN_cluster in @AA_Clusters and silhouette_score > 0')['station'].unique()
len(AA_near_stations), AA_near_stations

(31,
 array(['0CNUO', '20QWH', '6URQB', '71850', 'DCBG8', 'MUKMN', 'NCUQS',
        'UJHR7', 'VMWBN', 'UV7W2', 'V5792', '9W5OW', '71851', '8ZB0I',
        'I0EZ7', '4DUJO', '9H92X', 'ATA0X', 'CTGT0', 'MAU7O', '71273',
        '71667', 'CXQT0', 'L1YUU', 'LII7V', '71147', '71564', '71962',
        'SJZBK', '71849', 'GCHAU'], dtype=object))

In [8]:
# export df with clusters; we'll use these results for K-Means
DBSCAN_df.to_pickle('DBSCAN_clusters.pkl')