### Clustering to reduce features

In [221]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [3]:
df = pd.read_pickle(cln_pkl_loc)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3657000 entries, 0 to 3656999
Data columns (total 9 columns):
 #   Column   Dtype         
---  ------   -----         
 0   station  object        
 1   time     datetime64[ns]
 2   temp     float64       
 3   dwpt     float64       
 4   rhum     float64       
 5   prcp     float64       
 6   wdir     float64       
 7   wspd     float64       
 8   pres     float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 279.0+ MB


### 1. shifting time by 24 hours for Ann Arbor

In [4]:
df_AA=df[df['station']=="KARB0"] #spliting off AA so I don't adjust its time.  

In [222]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [223]:
df_NOAA=df[df['station']!="KARB0"]

In [224]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24


In [225]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [226]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2022-01-02 00:00:00') & (df_AA24HR['time']<='2022-09-22 23:00:00')]

### 2. Clustering

In [232]:
#Scaling all values for kmeans - normalizing columns

scaler = StandardScaler() #read that standard scaler is probably best for knn vs min/max (first attempt)
scaler.fit(df_AA24HR.iloc[:,2:9])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:9])
scaled_df=pd.DataFrame(scaled, columns = df.iloc[:2,2:9].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)
scaled_df.head()

#testing
scaled_df=scaled_df[scaled_df['time']<='2022-03-30 00:00:00']

In [236]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df


Unnamed: 0,station,total
0,KARB0,0
1,0CNUO,0
2,0CO7B,0
3,0FV1F,0
4,1J1PJ,0
...,...,...
568,UJHR7,0
569,UV7W2,0
570,V5792,0
571,VMWBN,0


In [237]:
#the actual kmeans
knn = KMeans(n_clusters=15) #need to tune this value and try other clustering (hierarchical) to find stable values
for t in scaled_df.time.unique(): #looping through every hour
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:9]
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    dfhour['cluster']=y_kmeans #finding clusters
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0] #finding cluster AA is in
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1 #setting a threshold
    result_df=pd.concat([result_df, df_resulthr[['station','total']]]).groupby(['station']).sum().reset_index() #joining back to my results df.  I wasn't sure how else to do this.  
result_df

Unnamed: 0,station,total
0,0CNUO,51
1,0CO7B,103
2,0FV1F,86
3,1J1PJ,108
4,1JWST,83
...,...,...
568,UJHR7,62
569,UV7W2,69
570,V5792,59
571,VMWBN,51


In [238]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv') #station data to get distance for comparision
dfstations=pd.read_csv(station_loc)
    #result_df.sort_values(by='total',ascending=False).head(60)
pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False).head(21) #top 20 locations (top will be AA)

Unnamed: 0,station,total,id,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,2089,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,42.223,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
208,KARV0,639,KARV0,Minocqua / Woodruff,US,WI,,KARV,45.9279,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
209,KASW0,493,KASW0,Warsaw / Monoquet,US,IN,,KASW,41.2747,-85.8401,259.0,America/Indiana/Indianapolis,2006-10-28,2022-09-22,2009-09-05,2022-04-24,2013-01-01,2022-01-01,126.490533
207,KARR0,464,KARR0,Aurora / Sugar Grove,US,IL,,KARR,41.7719,-88.4757,217.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,245.469652
472,KPNT0,456,KPNT0,Pontiac / Cayuga,US,IL,,KPNT,40.9244,-88.6239,203.0,America/Chicago,2006-09-26,2022-09-22,2011-03-18,2022-04-24,2015-01-01,2022-01-01,268.175428
471,KPNM0,446,KPNM0,Princeton / Princetown [Misspelling],US,MN,,KPNM,45.5599,-93.6082,299.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,543.149574
465,KPHN0,440,KPHN0,Port Huron / Marysville,US,MI,,KPHN,42.911,-82.5289,198.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-25,2007-01-01,2022-01-01,78.157743
200,KAMN0,430,KAMN0,Alma / Eugene,US,MI,,KAMN,43.3221,-84.6879,230.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,89.732372
475,KPQN0,425,KPQN0,Pipestone,US,MN,,KPQN,43.9833,-96.3003,529.0,America/Chicago,2006-01-01,2022-08-02,2006-01-01,2022-04-24,2012-01-01,2022-01-01,645.894729
484,KRDK0,424,KRDK0,Red Oak / Villa Village Mobile Home Park,US,IA,,KRDK,41.0108,-95.2589,319.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2011-01-01,2022-01-01,601.622813


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testknn['cluster']=y_kmeans
