### Clustering to reduce features

In [221]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [3]:
df = pd.read_pickle(cln_pkl_loc)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3657000 entries, 0 to 3656999
Data columns (total 9 columns):
 #   Column   Dtype         
---  ------   -----         
 0   station  object        
 1   time     datetime64[ns]
 2   temp     float64       
 3   dwpt     float64       
 4   rhum     float64       
 5   prcp     float64       
 6   wdir     float64       
 7   wspd     float64       
 8   pres     float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 279.0+ MB


### 1. shifting time by 24 hours for Ann Arbor

In [4]:
df_AA=df[df['station']=="KARB0"] #spliting off AA so I don't adjust its time.  

In [222]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [223]:
df_NOAA=df[df['station']!="KARB0"]

In [224]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24


In [225]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [226]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2022-01-02 00:00:00') & (df_AA24HR['time']<='2022-09-22 23:00:00')]

### 2. Clustering

In [228]:
#Scaling all values for kmeans - normalizing columns

scaler = StandardScaler() #read that standard scaler is probably best for knn vs min/max (first attempt)
scaler.fit(df_AA24HR.iloc[:,2:9])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:9])
scaled_df=pd.DataFrame(scaled, columns = df.iloc[:2,2:9].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)
scaled_df.head()

#testing
#scaled_df=scaled_df[scaled_df['time']<='2022-03-30 00:00:00']

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310184,KARB0,2022-01-02 00:00:00,1.226448,1.102791,-0.541609,-0.00018,-1.469087,0.301447,-0.594109
1310185,KARB0,2022-01-02 01:00:00,1.082588,1.034509,-0.327593,0.000811,-1.469087,-0.274202,-0.594109
1310186,KARB0,2022-01-02 02:00:00,1.010658,1.034509,-0.113576,0.000811,-1.64817,-1.46814,-0.469047
1310187,KARB0,2022-01-02 03:00:00,0.866797,0.958641,0.10044,0.000811,-1.469087,-0.466085,-0.343984
1310188,KARB0,2022-01-02 04:00:00,0.722937,0.958641,0.635481,0.000811,-1.64817,-1.46814,-0.343984


In [229]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df


Unnamed: 0,station,total
0,KARB0,0
1,0CNUO,0
2,0CO7B,0
3,0FV1F,0
4,1J1PJ,0
...,...,...
568,UJHR7,0
569,UV7W2,0
570,V5792,0
571,VMWBN,0


In [None]:
#the actual kmeans
knn = KMeans(n_clusters=20) #set clusters to 20 to get 20-30 clustered with AA  Need to try others
for t in scaled_df.time.unique(): #looping through every hour
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:9]
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    dfhour['cluster']=y_kmeans #finding clusters
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0] #finding cluster AA is in
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1 #setting a threshold
    result_df=pd.concat([result_df, df_resulthr[['station','total']]]).groupby(['station']).sum().reset_index() #joining back to my results df.  I wasn't sure how else to do this.  
result_df

In [None]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv') #station data to get distance for comparision
dfstations=pd.read_csv(station_loc)
    #result_df.sort_values(by='total',ascending=False).head(60)
pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False).head(21) #top 20 locations (top will be AA)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testknn['cluster']=y_kmeans
