### Clustering to reduce features

In [122]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics.cluster import silhouette_score

In [106]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [107]:
df = pd.read_pickle(cln_pkl_loc)
df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0CNUO,6360,6360,6360,6360,6360,6360,6360,6360
0CO7B,6360,6360,6360,6360,6360,6360,6360,6360
0FV1F,6360,6360,6360,6360,6360,6360,6360,6360
1J1PJ,6360,6360,6360,6360,6360,6360,6360,6360
1JWST,6360,6360,6360,6360,6360,6360,6360,6360
...,...,...,...,...,...,...,...,...
V5792,6360,6360,6360,6360,6360,6360,6360,6360
VMWBN,6360,6360,6360,6360,6360,6360,6360,6360
XM44W,6360,6360,6360,6360,6360,6360,6360,6360
ZFZUV,6360,6360,6360,6360,6360,6360,6360,6360


### 1. shifting time by 24 hours for Ann Arbor

In [108]:
df_AA=df[df['station']=="KARB0"] #spliting off AA so I don't adjust its time.  

In [109]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,11.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,15.0,3.5,46.0,0.0,320.0,25.9,1017.6
1316516,KARB0,2022-09-22 20:00:00,16.1,4.5,46.0,0.0,320.0,18.4,1017.9
1316517,KARB0,2022-09-22 21:00:00,16.0,4.4,46.0,0.0,330.0,22.0,1019.0
1316518,KARB0,2022-09-22 22:00:00,14.0,2.9,47.0,0.0,330.0,22.0,1019.0


In [110]:
df_NOAA=df[df['station']!="KARB0"]

In [111]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24


In [112]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,11.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,15.0,3.5,46.0,0.0,320.0,25.9,1017.6
1316516,KARB0,2022-09-22 20:00:00,16.1,4.5,46.0,0.0,320.0,18.4,1017.9
1316517,KARB0,2022-09-22 21:00:00,16.0,4.4,46.0,0.0,330.0,22.0,1019.0
1316518,KARB0,2022-09-22 22:00:00,14.0,2.9,47.0,0.0,330.0,22.0,1019.0


In [113]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2022-01-02 00:00:00') & (df_AA24HR['time']<='2022-09-22 23:00:00')]

In [114]:
#droping extra columns that we have determined are not part of the model
df_AA24HR.drop(columns=['prcp','wdir','wspd'], inplace=True)

### 2. Clustering

In [116]:
#Scaling all values for kmeans - normalizing columns

scaler = StandardScaler() #read that standard scaler is probably best for knn vs min/max (first attempt)
scaler.fit(df_AA24HR.iloc[:,2:])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:])
scaled_df=pd.DataFrame(scaled, columns = df_AA24HR.iloc[:2,2:].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)


#testing
#scaled_df=scaled_df[scaled_df['time']<='2022-03-30 00:00:00']

In [117]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df


Unnamed: 0,station,total
0,KARB0,0
1,0CNUO,0
2,0CO7B,0
3,0FV1F,0
4,1J1PJ,0
...,...,...
568,UJHR7,0
569,UV7W2,0
570,V5792,0
571,VMWBN,0


In [118]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv') #station data to get distance for comparision
dfstations=pd.read_csv(station_loc)

In [119]:
scaled_df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0CNUO,6336,6336,6336,6336,6336
0CO7B,6336,6336,6336,6336,6336
0FV1F,6336,6336,6336,6336,6336
1J1PJ,6336,6336,6336,6336,6336
1JWST,6336,6336,6336,6336,6336
...,...,...,...,...,...
UJHR7,6336,6336,6336,6336,6336
UV7W2,6336,6336,6336,6336,6336
V5792,6336,6336,6336,6336,6336
VMWBN,6336,6336,6336,6336,6336


In [143]:
#the actual kmeans
knn = KMeans(n_clusters=4) #need to tune this value and try other clustering (hierarchical) to find stable values
for t in tqdm(scaled_df.time.unique()): #looping through every hour
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:]
    #print(t)
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    dfhour['cluster']=y_kmeans #finding clusters
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0] #finding cluster AA is in
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1 #setting a threshold
    result_df=pd.concat([result_df, df_resulthr[['station','total']]]).groupby(['station']).sum().reset_index() #joining back to my results df.  I wasn't sure how else to do this.  
result_df

100%|██████████| 6336/6336 [09:26<00:00, 11.18it/s]


Unnamed: 0,station,total
0,0CNUO,3362
1,0CO7B,3609
2,0FV1F,3331
3,1J1PJ,3371
4,1JWST,3371
...,...,...
568,UJHR7,1929
569,UV7W2,1997
570,V5792,2003
571,VMWBN,1887


In [146]:

    #result_df.sort_values(by='total',ascending=False).head(60)
pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False).head(26) #top 20 locations (top will be AA)

Unnamed: 0,station,total,id,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,12672,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,42.223,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
208,KARV0,7484,KARV0,Minocqua / Woodruff,US,WI,,KARV,45.9279,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
472,KPNT0,7310,KPNT0,Pontiac / Cayuga,US,IL,,KPNT,40.9244,-88.6239,203.0,America/Chicago,2006-09-26,2022-09-22,2011-03-18,2022-04-24,2015-01-01,2022-01-01,268.175428
471,KPNM0,7166,KPNM0,Princeton / Princetown [Misspelling],US,MN,,KPNM,45.5599,-93.6082,299.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,543.149574
470,KPMV0,6991,KPMV0,Plattsmouth / Mynard,US,NE,,KPMV,40.9484,-95.9174,367.0,America/Chicago,2006-11-21,2022-09-22,2006-11-22,2022-04-24,2009-01-01,2022-01-01,636.208367
207,KARR0,6625,KARR0,Aurora / Sugar Grove,US,IL,,KARR,41.7719,-88.4757,217.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,245.469652
205,KAQP0,6345,KAQP0,Appleton,US,MN,,KAQP,45.2275,-96.0043,311.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2007-01-01,2022-01-01,647.081592
469,KPLN0,6289,KPLN0,Pellston,US,MI,,KPLN,45.5709,-84.7967,220.0,America/Detroit,1973-01-01,2022-09-22,1973-01-01,2022-04-24,2000-01-01,2021-01-01,237.016818
202,KANQ0,6010,KANQ0,Angola / Flint,US,IN,,KANQ,41.6397,-85.0835,303.0,America/Indiana/Indianapolis,2020-01-14,2022-09-22,2020-01-14,2022-04-24,2020-01-01,2022-01-01,79.84114
219,KAZO0,5909,KAZO0,Kalamazoo / Colonial Acres Mobile Home Park,US,MI,,KAZO,42.2344,-85.5516,266.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,92.642774


In [142]:
knn = KMeans(n_clusters=8)
dfhour=scaled_df[scaled_df['time']=='2022-08-30 10:00:00'].copy()
knndata=dfhour.iloc[:,2:]
knn.fit(knndata)
clusters=knn.labels_
silhouette_score(knndata, clusters)

0.28413939677617917