### Clustering to reduce features

In [124]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

In [2]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [3]:
df = pd.read_pickle(cln_pkl_loc)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3657000 entries, 0 to 3656999
Data columns (total 9 columns):
 #   Column   Dtype         
---  ------   -----         
 0   station  object        
 1   time     datetime64[ns]
 2   temp     float64       
 3   dwpt     float64       
 4   rhum     float64       
 5   prcp     float64       
 6   wdir     float64       
 7   wspd     float64       
 8   pres     float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 279.0+ MB


### 1. shifting time by 24 hours for Ann Arbor

In [4]:
df_AA=df[df['station']=="KARB0"]

In [5]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [29]:
df_NOAA=df[df['station']!="KARB0"]

In [30]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24)


In [138]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
1310160,KARB0,2022-01-01 00:00:00,5.0,2.9,86.0,0.0,120.0,15.0,1005.0
1310161,KARB0,2022-01-01 01:00:00,5.0,2.9,86.0,0.0,120.0,11.2,1005.4
1310162,KARB0,2022-01-01 02:00:00,5.0,2.9,86.0,0.0,120.0,5.4,1005.3
1310163,KARB0,2022-01-01 03:00:00,5.0,2.9,86.0,0.0,0.0,0.0,1004.9
1310164,KARB0,2022-01-01 04:00:00,5.6,3.3,85.0,0.0,0.0,0.0,1004.9
...,...,...,...,...,...,...,...,...,...
1316515,KARB0,2022-09-22 19:00:00,16.1,2.8,41.0,0.0,329.0,26.6,1017.0
1316516,KARB0,2022-09-22 20:00:00,16.1,2.8,41.0,0.0,328.0,26.3,1017.2
1316517,KARB0,2022-09-22 21:00:00,15.8,2.6,41.0,0.0,329.0,25.2,1017.4
1316518,KARB0,2022-09-22 22:00:00,15.7,2.1,40.0,0.0,332.0,24.8,1017.8


In [156]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2022-01-02 00:00:00') & (df_AA24HR['time']<='2022-09-22 23:00:00')]

### 2. Clustering

In [197]:
#Scaling all values for kmeans

scaler = MinMaxScaler()
scaler.fit(df_AA24HR.iloc[:,2:9])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:9])
scaled_df=pd.DataFrame(scaled, columns = df.iloc[:2,2:9].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)
scaled_df.head()

#testing
scaled_df=scaled_df[scaled_df['time']<='2022-01-30 00:00:00']

In [198]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df


Unnamed: 0,station,total
0,KARB0,0
1,0CNUO,0
2,0CO7B,0
3,0FV1F,0
4,1J1PJ,0
...,...,...
568,UJHR7,0
569,UV7W2,0
570,V5792,0
571,VMWBN,0


In [199]:
#the actual kmeans
knn = KMeans(n_clusters=20)
for t in scaled_df.time.unique():
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:9]
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    dfhour['cluster']=y_kmeans
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0]
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1
    result_df=pd.concat([result_df, df_resulthr[['station','total']]]).groupby(['station']).sum().reset_index()
result_df

Unnamed: 0,station,total
0,0CNUO,0
1,0CO7B,0
2,0FV1F,0
3,1J1PJ,0
4,1JWST,0
...,...,...
568,UJHR7,0
569,UV7W2,0
570,V5792,0
571,VMWBN,0


In [212]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv')
dfstations=pd.read_csv(station_loc)
    #result_df.sort_values(by='total',ascending=False).head(60)
pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False).head(21)

Unnamed: 0,station,total,id,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,673,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,42.223,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
208,KARV0,229,KARV0,Minocqua / Woodruff,US,WI,,KARV,45.9279,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
461,KPDC0,226,KPDC0,Prairie Du Chien / Selchs Mobile Home Park,US,WI,,KPDC,43.0193,-91.1237,201.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2007-01-01,2022-01-01,379.982598
435,KOEB0,223,KOEB0,Coldwater / Countryside Mobile Court,US,MI,,KOEB,41.9336,-85.0523,292.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-25,2008-01-01,2022-01-01,70.093076
209,KASW0,213,KASW0,Warsaw / Monoquet,US,IN,,KASW,41.2747,-85.8401,259.0,America/Indiana/Indianapolis,2006-10-28,2022-09-22,2009-09-05,2022-04-24,2013-01-01,2022-01-01,126.490533
496,KRYM0,209,KRYM0,Ray S Miller Aaf / Camp Ripley Junction,US,MN,,KRYM,46.0912,-94.3605,351.0,America/Chicago,2012-07-13,2022-09-22,2014-02-02,2022-04-24,,,590.699263
229,KBIE0,207,KBIE0,Beatrice / Hoyle North 77 Mobile Homes,US,NE,,KBIE,40.3013,-96.7541,403.0,America/Chicago,2006-01-01,2022-09-22,2006-01-02,2022-04-24,2008-01-01,2022-01-01,689.505441
195,KAHQ0,206,KAHQ0,Wahoo,US,NE,,KAHQ,41.2406,-96.5946,373.0,America/Chicago,2011-05-17,2022-09-22,2011-05-18,2022-04-24,2014-01-01,2022-01-01,667.059311
463,KPEX0,197,KPEX0,Paynesville,US,MN,,KPEX,45.3721,-94.7447,361.0,America/Chicago,2005-08-12,2022-09-22,2005-08-13,2022-04-24,2011-01-01,2022-01-01,590.821951
421,KMPZ0,188,KMPZ0,Mount Pleasant / Woodside Mobile Estates,US,IA,,KMPZ,40.9466,-91.5111,223.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2015-01-01,2022-01-01,411.734003


In [54]:
testknn['cluster']=y_kmeans

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testknn['cluster']=y_kmeans


In [56]:
testknn[testknn['cluster']==14]

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,cluster
1310185,KARB0,2022-01-02 01:00:00,-2.8,-5.6,81.0,0.7,30.0,16.6,1010.2,14
184441,71352,2022-01-02 01:00:00,1.7,0.2,90.0,0.0,40.0,7.0,1010.1,14
591481,72530,2022-01-02 01:00:00,5.0,3.3,89.0,0.0,40.0,18.4,1004.0,14
648721,72539,2022-01-02 01:00:00,3.9,0.1,76.0,0.0,50.0,11.2,1005.6,14
763201,72635,2022-01-02 01:00:00,1.7,0.5,92.0,0.0,40.0,14.8,1006.4,14
769561,72636,2022-01-02 01:00:00,2.2,-0.6,82.0,0.0,40.0,13.0,1006.2,14
782281,72638,2022-01-02 01:00:00,0.0,-2.9,81.0,0.0,50.0,9.4,1008.1,14
795001,72640,2022-01-02 01:00:00,3.3,1.2,86.0,0.3,30.0,18.4,1006.0,14
807721,72642,2022-01-02 01:00:00,-1.7,-3.9,85.0,0.0,30.0,14.8,1007.2,14
1252921,KAIO0,2022-01-02 01:00:00,-6.0,-6.0,100.0,0.0,40.0,20.5,1011.0,14


In [116]:
testknn2=df_AA24HR[df_AA24HR['time']=='2022-01-15 8:00:00']

In [117]:
testdata=testknn2.iloc[:,2:9]
knn = KMeans(n_clusters=20)
knn.fit(testdata)
y_kmeans = knn.predict(testdata)
testknn2['cluster']=y_kmeans
AAcluster=testknn2.loc[testknn2.station=='KARB0','cluster'].values[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testknn2['cluster']=y_kmeans


6

In [118]:
supertest=testknn2[testknn2['cluster']==AAcluster].merge(testknn[testknn['cluster']==14], left_on='station', right_on='station')

In [119]:
supertest

Unnamed: 0,station,time_x,temp_x,dwpt_x,rhum_x,prcp_x,wdir_x,wspd_x,pres_x,cluster_x,time_y,temp_y,dwpt_y,rhum_y,prcp_y,wdir_y,wspd_y,pres_y,cluster_y
0,KARB0,2022-01-15 08:00:00,-11.1,-15.7,69.0,0.0,50.0,20.5,1028.5,19,2022-01-02 01:00:00,-2.8,-5.6,81.0,0.7,30.0,16.6,1010.2,14
1,KCLI0,2022-01-15 08:00:00,-8.0,-13.1,67.0,0.0,50.0,18.4,1023.0,19,2022-01-02 01:00:00,-2.0,-5.0,80.0,0.0,30.0,5.4,1010.0,14
2,KGOV0,2022-01-15 08:00:00,-9.0,-12.0,79.0,0.0,50.0,22.3,1022.2,19,2022-01-02 01:00:00,-0.9,-2.2,91.0,0.0,50.0,7.6,1008.7,14
3,KLDM0,2022-01-15 08:00:00,-4.7,-10.7,63.0,0.0,50.0,11.2,1021.0,19,2022-01-02 01:00:00,0.8,-2.6,78.0,0.0,30.0,7.6,1007.0,14
