### Clustering to reduce features

In [1]:
import os
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics.cluster import silhouette_score

In [2]:
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
cln_pkl_loc = os.path.join(ROOT_DIR, 'data','cleanweathersmall.pkl')

In [3]:
df = pd.read_pickle(cln_pkl_loc)
df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0CNUO,8784,8784,8784,8784,8784,8784,8784,8784
0CO7B,8784,8784,8784,8784,8784,8784,8784,8784
0FV1F,8784,8784,8784,8784,8784,8784,8784,8784
1J1PJ,8784,8784,8784,8784,8784,8784,8784,8784
1JWST,8784,8784,8784,8784,8784,8784,8784,8784
...,...,...,...,...,...,...,...,...
V5792,8784,8784,8784,8784,8784,8784,8784,8784
VMWBN,8784,8784,8784,8784,8784,8784,8784,8784
XM44W,8784,8784,8784,8784,8784,8784,8784,8784
ZFZUV,8784,8784,8784,8784,8784,8784,8784,8784


### 1. shifting time by 24 hours for Ann Arbor

In [4]:
df_AA=df[df['station']=="KARB0"] #spliting off AA so I don't adjust its time.  

In [5]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
2485872,KARB0,2021-10-15 00:00:00,17.8,16.7,93.0,0.0,230.0,5.4,1012.2
2485873,KARB0,2021-10-15 01:00:00,17.2,15.5,90.0,0.0,260.0,7.6,1012.7
2485874,KARB0,2021-10-15 02:00:00,16.1,14.5,90.0,0.0,0.0,0.0,1012.9
2485875,KARB0,2021-10-15 03:00:00,16.7,14.3,86.0,0.0,350.0,9.4,1012.5
2485876,KARB0,2021-10-15 04:00:00,16.1,15.0,93.0,0.0,220.0,7.6,1013.2
...,...,...,...,...,...,...,...,...,...
2494651,KARB0,2022-10-15 19:00:00,11.1,-0.1,46.0,0.0,210.0,24.1,1013.2
2494652,KARB0,2022-10-15 20:00:00,11.1,-0.1,46.0,0.0,220.0,24.1,1013.7
2494653,KARB0,2022-10-15 21:00:00,10.0,-0.5,48.0,0.0,220.0,20.5,1013.8
2494654,KARB0,2022-10-15 22:00:00,8.0,-1.3,52.0,0.0,220.0,9.0,1014.0


In [6]:
df_NOAA=df[df['station']!="KARB0"]

In [7]:
df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NOAA['time']=df_NOAA['time']+timedelta(hours=24) #adds 24


In [8]:
df_AA

Unnamed: 0,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres
2485872,KARB0,2021-10-15 00:00:00,17.8,16.7,93.0,0.0,230.0,5.4,1012.2
2485873,KARB0,2021-10-15 01:00:00,17.2,15.5,90.0,0.0,260.0,7.6,1012.7
2485874,KARB0,2021-10-15 02:00:00,16.1,14.5,90.0,0.0,0.0,0.0,1012.9
2485875,KARB0,2021-10-15 03:00:00,16.7,14.3,86.0,0.0,350.0,9.4,1012.5
2485876,KARB0,2021-10-15 04:00:00,16.1,15.0,93.0,0.0,220.0,7.6,1013.2
...,...,...,...,...,...,...,...,...,...
2494651,KARB0,2022-10-15 19:00:00,11.1,-0.1,46.0,0.0,210.0,24.1,1013.2
2494652,KARB0,2022-10-15 20:00:00,11.1,-0.1,46.0,0.0,220.0,24.1,1013.7
2494653,KARB0,2022-10-15 21:00:00,10.0,-0.5,48.0,0.0,220.0,20.5,1013.8
2494654,KARB0,2022-10-15 22:00:00,8.0,-1.3,52.0,0.0,220.0,9.0,1014.0


In [9]:
#removing first day and last day 
#(manual current but could be automated)
df_AA24HR=pd.concat([df_AA,df_NOAA])
df_AA24HR=df_AA24HR[(df_AA24HR['time']>='2021-10-16 00:00:00') & (df_AA24HR['time']<='2022-10-14 23:00:00')]

In [10]:
#droping extra columns that we have determined are not part of the model
df_AA24HR.drop(columns=['prcp','wdir','wspd'], inplace=True)

### 2. Clustering

In [11]:
#Scaling all values for kmeans - normalizing columns

scaler = StandardScaler() #read that standard scaler is probably best for knn vs min/max (first attempt)
scaler.fit(df_AA24HR.iloc[:,2:])
scaled=scaler.fit_transform(df_AA24HR.iloc[:,2:])
scaled_df=pd.DataFrame(scaled, columns = df_AA24HR.iloc[:2,2:].columns)
#normalized = preprocessing.normalize(df_AA24HR.iloc[:,2:9])
#normalized
#testknn=df_AA24HR[df_AA24HR['time']=='2022-01-02 1:00:00']
scaled_df=df_AA24HR.iloc[:,0:2].merge(scaled_df, left_index=True, right_index=True)


#testing
#scaled_df=scaled_df[scaled_df['time']<='2022-03-30 00:00:00']
scaled_df.dropna(inplace=True)

In [22]:
#creating a results DF
result_df=pd.DataFrame(scaled_df.station.unique(), columns=['station'])
result_df['total']=0
result_df['silhouette total']=0
result_df


Unnamed: 0,station,total,silhouette total
0,KARB0,0,0
1,0CNUO,0,0
2,0CO7B,0,0
3,0FV1F,0,0
4,1J1PJ,0,0
...,...,...,...
888,QHA0T,0,0
889,SJZBK,0,0
890,UJHR7,0,0
891,UV7W2,0,0


In [23]:
station_loc = os.path.join(ROOT_DIR, 'station_analysis','AAstation.csv') #station data to get distance for comparision
dfstations=pd.read_csv(station_loc)

In [24]:
scaled_df.groupby('station').count()

Unnamed: 0_level_0,time,temp,dwpt,rhum,pres
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0CNUO,8736,8736,8736,8736,8736
0CO7B,8736,8736,8736,8736,8736
0FV1F,8736,8736,8736,8736,8736
1J1PJ,8736,8736,8736,8736,8736
1JWST,8736,8736,8736,8736,8736
...,...,...,...,...,...
QHA0T,8736,8736,8736,8736,8736
SJZBK,8736,8736,8736,8736,8736
UJHR7,8736,8736,8736,8736,8736
UV7W2,8736,8736,8736,8736,8736


In [26]:
#the actual kmeans
knn = KMeans(n_clusters=4) #need to tune this value and try other clustering (hierarchical) to find stable values
for t in tqdm(scaled_df.time.unique()): #looping through every hour
    dfhour=scaled_df[scaled_df['time']==t].copy()
    knndata=dfhour.iloc[:,2:]
    #print(t)
    knn.fit(knndata)
    y_kmeans = knn.predict(knndata)
    clusters=knn.labels_
    dfhour['cluster']=y_kmeans #finding clusters
    AAcluster=dfhour.loc[dfhour.station=='KARB0','cluster'].values[0] #finding cluster AA is in
    df_resulthr=dfhour[dfhour['cluster']==AAcluster].copy()
    df_resulthr['total']=1
    df_resulthr['silhouette total']=silhouette_score(knndata, clusters) #setting a threshold
    result_df=pd.concat([result_df, df_resulthr[['station','total','silhouette total']]]).groupby(['station']).sum().reset_index() #joining back to my results df.  I wasn't sure how else to do this.  
#result_df

100%|██████████| 8736/8736 [21:11<00:00,  6.87it/s]


In [28]:
result_df['silhouette station average']=result_df['silhouette total']/result_df['total']

In [30]:

    #result_df.sort_values(by='total',ascending=False).head(60)
result=pd.merge(result_df,dfstations,left_on='station', right_on='id').sort_values(by='total',ascending=False) #top 20 locations (top will be AA)
result.head(50)
#result.to_csv('kmeansclustering2.csv')

Unnamed: 0,station,total,silhouette total,silhouette station average,id,name,country,region,wmo,icao,...,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,8736,2638.021689,0.301971,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,...,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
416,KMML0,6413,1952.779165,0.304503,KMML0,Marshall,US,MN,,KMML,...,-95.8244,361.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2008-01-01,2022-01-01,627.132888
325,KGAF0,5351,1633.464052,0.305263,KGAF0,Grafton,US,ND,,KGAF,...,-97.3709,251.0,America/Chicago,2010-11-18,2022-09-22,2010-11-19,2022-04-24,2014-01-01,2022-01-01,787.526654
323,KFSW0,5326,1623.894513,0.304899,KFSW0,Fort Madison / Summit (Historical),US,IA,,KFSW,...,-91.3268,220.0,America/Chicago,2006-01-03,2022-09-22,2006-01-05,2022-04-24,2014-01-01,2021-01-01,408.045768
208,KARV0,5310,1622.952282,0.305641,KARV0,Minocqua / Woodruff,US,WI,,KARV,...,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
422,KMQB0,5308,1623.16894,0.305797,KMQB0,Macomb / Good Hope,US,IL,,KMQB,...,-90.6524,215.0,America/Chicago,2006-02-01,2022-09-22,2006-02-15,2022-04-24,2012-01-01,2022-01-01,377.657133
418,KMNN0,5275,1611.187047,0.305438,KMNN0,Marion / Logan (Historical),US,OH,,KMNN,...,-83.0635,303.0,America/New_York,2006-01-01,2022-09-22,2006-01-01,2022-04-21,2006-01-01,2022-01-01,116.402149
520,KTNU0,5272,1606.483724,0.30472,KTNU0,Newton / Coal Siding (Historical),US,IA,,KTNU,...,-93.0217,291.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2011-01-01,2022-01-01,479.185382
524,KTVC0,5263,1607.049025,0.305348,KTVC0,Traverse City / Treverse City [Misspelling] / ...,US,MI,,KTVC,...,-85.5824,190.0,America/Detroit,1973-01-01,2022-09-22,1973-01-02,2022-04-24,1981-01-01,2022-01-01,196.855534
85,72420,5201,1592.025374,0.3061,72420,Mansfield / Amoy,US,OH,72420.0,KMFD,...,-82.5166,395.0,America/New_York,2005-01-01,2022-09-22,1948-08-01,2022-09-14,1948-01-01,2022-01-01,115.837645


In [18]:
knn = KMeans(n_clusters=5)
dfhour=scaled_df[scaled_df['time']=='2022-08-30 10:00:00'].copy()
knndata=dfhour.iloc[:,2:]
knn.fit(knndata)
clusters=knn.labels_
silhouette_score(knndata, clusters)

0.28295356741725497

In [21]:
result.head(26)

Unnamed: 0,station,total,id,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,milesfromstat
206,KARB0,8736,KARB0,Ann Arbor / Pittsfield,US,MI,,KARB,42.223,-83.7456,256.0,America/Detroit,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2006-01-01,2022-01-01,0.0
416,KMML0,6411,KMML0,Marshall,US,MN,,KMML,44.4517,-95.8244,361.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2008-01-01,2022-01-01,627.132888
325,KGAF0,5340,KGAF0,Grafton,US,ND,,KGAF,48.4047,-97.3709,251.0,America/Chicago,2010-11-18,2022-09-22,2010-11-19,2022-04-24,2014-01-01,2022-01-01,787.526654
208,KARV0,5336,KARV0,Minocqua / Woodruff,US,WI,,KARV,45.9279,-89.7309,497.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2009-01-01,2022-01-01,392.465081
323,KFSW0,5327,KFSW0,Fort Madison / Summit (Historical),US,IA,,KFSW,40.6593,-91.3268,220.0,America/Chicago,2006-01-03,2022-09-22,2006-01-05,2022-04-24,2014-01-01,2021-01-01,408.045768
422,KMQB0,5301,KMQB0,Macomb / Good Hope,US,IL,,KMQB,40.5201,-90.6524,215.0,America/Chicago,2006-02-01,2022-09-22,2006-02-15,2022-04-24,2012-01-01,2022-01-01,377.657133
418,KMNN0,5287,KMNN0,Marion / Logan (Historical),US,OH,,KMNN,40.6163,-83.0635,303.0,America/New_York,2006-01-01,2022-09-22,2006-01-01,2022-04-21,2006-01-01,2022-01-01,116.402149
524,KTVC0,5267,KTVC0,Traverse City / Treverse City [Misspelling] / ...,US,MI,,KTVC,44.7416,-85.5824,190.0,America/Detroit,1973-01-01,2022-09-22,1973-01-02,2022-04-24,1981-01-01,2022-01-01,196.855534
520,KTNU0,5266,KTNU0,Newton / Coal Siding (Historical),US,IA,,KTNU,41.6744,-93.0217,291.0,America/Chicago,2006-01-01,2022-09-22,2006-01-01,2022-04-24,2011-01-01,2022-01-01,479.185382
85,72420,5221,72420,Mansfield / Amoy,US,OH,72420.0,KMFD,40.8214,-82.5166,395.0,America/New_York,2005-01-01,2022-09-22,1948-08-01,2022-09-14,1948-01-01,2022-01-01,115.837645


In [20]:
y

NameError: name 'y' is not defined