### Clustering to reduce features

In [153]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics.cluster import silhouette_score

In [175]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [176]:
df = pd.read_pickle(cln_pkl_loc)
df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0CNUO,8784,8784,8784,8784,8784,8784,8784,8784
0CO7B,8784,8784,8784,8784,8784,8784,8784,8784
0FV1F,8784,8784,8784,8784,8784,8784,8784,8784
1J1PJ,8784,8784,8784,8784,8784,8784,8784,8784
1JWST,8784,8784,8784,8784,8784,8784,8784,8784
...,...,...,...,...,...,...,...,...
V5792,8784,8784,8784,8784,8784,8784,8784,8784
VMWBN,8784,8784,8784,8784,8784,8784,8784,8784
XM44W,8784,8784,8784,8784,8784,8784,8784,8784
ZFZUV,8784,8784,8784,8784,8784,8784,8784,8784


### 1. shifting time by 24 hours for Ann Arbor

In [177]:
df_AA=df[df['station']=="KARB0"] #spliting off AA so I don't adjust its time.  

In [178]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
2485872,KARB0,2021-10-15 00:00:00,17.8,16.7,93.0,0.0,230.0,5.4,1012.2
2485873,KARB0,2021-10-15 01:00:00,17.2,15.5,90.0,0.0,260.0,7.6,1012.7
2485874,KARB0,2021-10-15 02:00:00,16.1,14.5,90.0,0.0,0.0,0.0,1012.9
2485875,KARB0,2021-10-15 03:00:00,16.7,14.3,86.0,0.0,350.0,9.4,1012.5
2485876,KARB0,2021-10-15 04:00:00,16.1,15.0,93.0,0.0,220.0,7.6,1013.2
...,...,...,...,...,...,...,...,...,...
2494651,KARB0,2022-10-15 19:00:00,11.1,-0.1,46.0,0.0,210.0,24.1,1013.2
2494652,KARB0,2022-10-15 20:00:00,11.1,-0.1,46.0,0.0,220.0,24.1,1013.7
2494653,KARB0,2022-10-15 21:00:00,10.0,-0.5,48.0,0.0,220.0,20.5,1013.8
2494654,KARB0,2022-10-15 22:00:00,8.0,-1.3,52.0,0.0,220.0,9.0,1014.0


In [179]:
df_NOAA=df[df['station']!="KARB0"]

In [180]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24


In [181]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
2485872,KARB0,2021-10-15 00:00:00,17.8,16.7,93.0,0.0,230.0,5.4,1012.2
2485873,KARB0,2021-10-15 01:00:00,17.2,15.5,90.0,0.0,260.0,7.6,1012.7
2485874,KARB0,2021-10-15 02:00:00,16.1,14.5,90.0,0.0,0.0,0.0,1012.9
2485875,KARB0,2021-10-15 03:00:00,16.7,14.3,86.0,0.0,350.0,9.4,1012.5
2485876,KARB0,2021-10-15 04:00:00,16.1,15.0,93.0,0.0,220.0,7.6,1013.2
...,...,...,...,...,...,...,...,...,...
2494651,KARB0,2022-10-15 19:00:00,11.1,-0.1,46.0,0.0,210.0,24.1,1013.2
2494652,KARB0,2022-10-15 20:00:00,11.1,-0.1,46.0,0.0,220.0,24.1,1013.7
2494653,KARB0,2022-10-15 21:00:00,10.0,-0.5,48.0,0.0,220.0,20.5,1013.8
2494654,KARB0,2022-10-15 22:00:00,8.0,-1.3,52.0,0.0,220.0,9.0,1014.0


In [186]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2021-10-16 00:00:00') & (df_AA24HR['time']<='2022-10-14 23:00:00')]

In [187]:
#droping extra columns that we have determined are not part of the model
df_AA24HR.drop(columns=['prcp','wdir','wspd'], inplace=True)

### 2. Clustering

In [198]:
#Scaling all values for kmeans - normalizing columns

scaler = StandardScaler() #read that standard scaler is probably best for knn vs min/max (first attempt)
scaler.fit(df_AA24HR.iloc[:,2:])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:])
scaled_df=pd.DataFrame(scaled, columns = df_AA24HR.iloc[:2,2:].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)


#testing
#scaled_df=scaled_df[scaled_df['time']<='2022-03-30 00:00:00']

In [199]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df


Unnamed: 0,station,total
0,KARB0,0
1,0CNUO,0
2,0CO7B,0
3,0FV1F,0
4,1J1PJ,0
...,...,...
888,QHA0T,0
889,SJZBK,0
890,UJHR7,0
891,UV7W2,0


In [200]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv') #station data to get distance for comparision
dfstations=pd.read_csv(station_loc)

In [201]:
scaled_df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0CNUO,8736,8736,8736,8736,8736
0CO7B,8736,8736,8736,8736,8736
0FV1F,8736,8736,8736,8736,8736
1J1PJ,8736,8736,8736,8736,8736
1JWST,8736,8736,8736,8736,8736
...,...,...,...,...,...
QHA0T,8736,8736,8736,8736,8736
SJZBK,8736,8736,8736,8736,8736
UJHR7,8736,8736,8736,8736,8736
UV7W2,8736,8736,8736,8736,8736


In [202]:
#the actual kmeans
knn = KMeans(n_clusters=4) #need to tune this value and try other clustering (hierarchical) to find stable values
for t in tqdm(scaled_df.time.unique()): #looping through every hour
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:]
    #print(t)
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    dfhour['cluster']=y_kmeans #finding clusters
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0] #finding cluster AA is in
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1 #setting a threshold
    result_df=pd.concat([result_df, df_resulthr[['station','total']]]).groupby(['station']).sum().reset_index() #joining back to my results df.  I wasn't sure how else to do this.  
result_df

100%|██████████| 8736/8736 [16:48<00:00,  8.66it/s]


Unnamed: 0,station,total
0,0CNUO,962
1,0CO7B,1162
2,0FV1F,1139
3,1J1PJ,1008
4,1JWST,897
...,...,...
888,QHA0T,1514
889,SJZBK,1807
890,UJHR7,1266
891,UV7W2,1363


In [207]:

    #result_df.sort_values(by='total',ascending=False).head(60)
result=pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False) #top 20 locations (top will be AA)
#result.head(50)
result.to_csv('kmeansclustering2.csv')

In [206]:
knn = KMeans(n_clusters=5)
dfhour=scaled_df[scaled_df['time']=='2022-08-30 10:00:00'].copy()
knndata=dfhour.iloc[:,2:]
knn.fit(knndata)
clusters=knn.labels_
silhouette_score(knndata, clusters)

0.2840511977997185

In [151]:
result

Unnamed: 0,station,total,id,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,12672,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,42.223,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
208,KARV0,7484,KARV0,Minocqua / Woodruff,US,WI,,KARV,45.9279,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
472,KPNT0,7310,KPNT0,Pontiac / Cayuga,US,IL,,KPNT,40.9244,-88.6239,203.0,America/Chicago,2006-09-26,2022-09-22,2011-03-18,2022-04-24,2015-01-01,2022-01-01,268.175428
471,KPNM0,7166,KPNM0,Princeton / Princetown [Misspelling],US,MN,,KPNM,45.5599,-93.6082,299.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,543.149574
470,KPMV0,6991,KPMV0,Plattsmouth / Mynard,US,NE,,KPMV,40.9484,-95.9174,367.0,America/Chicago,2006-11-21,2022-09-22,2006-11-22,2022-04-24,2009-01-01,2022-01-01,636.208367
207,KARR0,6625,KARR0,Aurora / Sugar Grove,US,IL,,KARR,41.7719,-88.4757,217.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,245.469652
205,KAQP0,6345,KAQP0,Appleton,US,MN,,KAQP,45.2275,-96.0043,311.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2007-01-01,2022-01-01,647.081592
469,KPLN0,6289,KPLN0,Pellston,US,MI,,KPLN,45.5709,-84.7967,220.0,America/Detroit,1973-01-01,2022-09-22,1973-01-01,2022-04-24,2000-01-01,2021-01-01,237.016818
202,KANQ0,6010,KANQ0,Angola / Flint,US,IN,,KANQ,41.6397,-85.0835,303.0,America/Indiana/Indianapolis,2020-01-14,2022-09-22,2020-01-14,2022-04-24,2020-01-01,2022-01-01,79.84114
219,KAZO0,5909,KAZO0,Kalamazoo / Colonial Acres Mobile Home Park,US,MI,,KAZO,42.2344,-85.5516,266.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,92.642774
