In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice

In [3]:
# Load the raw weather readings from a CSV file in the working directory.
data = pd.read_csv('minute_weather.csv')

In [4]:
# (rows, columns) of the raw dataset.
data.shape

(144353, 13)

In [5]:
# Peek at the first five rows to see the available columns and value formats.
data.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [6]:
# Down-sample the raw data: keep only rows whose rowID is a multiple of 10.
every_tenth = (data['rowID'] % 10).eq(0)
sampled_df = data.loc[every_tenth]
sampled_df.shape

(14436, 13)

In [7]:
# Summary statistics of the numeric columns, transposed so that each
# column of the sample becomes one row of the table.
sampled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,14436.0,72175.0,41674.585781,0.0,36087.5,72175.0,108262.5,144350.0
air_pressure,14436.0,917.110966,3.122374,906.4,915.3,917.1,918.9,926.8
air_temp,14436.0,60.084925,12.209394,38.48,48.92,60.44,70.7,90.68
avg_wind_direction,14428.0,148.572775,97.858588,0.0,52.0,169.0,211.0,359.0
avg_wind_speed,14428.0,2.695079,1.955484,0.0,1.3,2.2,3.6,23.4
max_wind_direction,14428.0,150.643887,94.168904,0.0,61.0,177.0,215.0,359.0
max_wind_speed,14428.0,3.327253,2.311952,0.1,1.7,2.7,4.4,24.5
min_wind_direction,14428.0,152.816329,102.253091,0.0,49.0,167.0,209.0,359.0
min_wind_speed,14428.0,2.04632,1.654196,0.0,0.9,1.6,2.8,21.6
rain_accumulation,14435.0,0.000482,0.005989,0.0,0.0,0.0,0.0,0.21


In [8]:
# Shape of the subset of sampled rows whose rain_accumulation is exactly 0.
sampled_df.loc[sampled_df['rain_accumulation'].eq(0)].shape

(14265, 13)

In [9]:
# Shape of the subset of sampled rows whose rain_duration is exactly 0.
sampled_df.loc[sampled_df['rain_duration'].eq(0)].shape

(14185, 13)

In [10]:
# Remove the two rain columns. The two preceding cells show they are 0 for
# almost every sampled row -- presumably why they are excluded.
# drop() returns a new frame instead of mutating in place, and
# errors='ignore' keeps the cell re-runnable: the original `del` statements
# raised KeyError when the cell was executed a second time.
sampled_df = sampled_df.drop(
    columns=['rain_accumulation', 'rain_duration'],
    errors='ignore',
)

In [11]:
# Discard every row that contains a missing value, recording the row count
# before and after so the number of removed rows can be reported.
rows_before = len(sampled_df)
sampled_df = sampled_df.dropna()
rows_after = len(sampled_df)

In [12]:
# Number of rows removed by dropna() in the previous cell.
rows_before - rows_after

8

In [13]:
# Columns that remain after dropping the rain columns and incomplete rows.
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [14]:
# Columns used as clustering inputs. rowID, the timestamp and the two
# min_wind_* columns are not part of this list.
features = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

In [15]:
# Restrict the cleaned sample to the feature columns, in the order given
# by `features`.
select_df = sampled_df.loc[:, features]

In [16]:
# Confirm that only the chosen feature columns are present.
select_df.columns

Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [17]:
# Display the feature frame (pandas abbreviates the middle rows of a long frame).
select_df

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
0,912.3,64.76,97.0,1.2,106.0,1.6,60.5
10,912.3,62.24,144.0,1.2,167.0,1.8,38.5
20,912.2,63.32,100.0,2.0,122.0,2.5,58.3
30,912.2,62.60,91.0,2.0,103.0,2.4,57.9
40,912.2,64.04,81.0,2.6,88.0,2.9,57.4
...,...,...,...,...,...,...,...
144310,918.1,46.04,203.0,2.6,215.0,3.4,76.4
144320,918.0,45.14,190.0,3.4,196.0,4.1,76.3
144330,917.8,45.14,174.0,2.5,193.0,4.0,79.0
144340,917.7,44.96,170.0,1.4,179.0,1.6,77.6


In [18]:
# Standardize each feature column with StandardScaler (default settings)
# so that all features are on a comparable scale for clustering.
# The result X is a NumPy array, not a DataFrame.
scaler = StandardScaler()
X = scaler.fit_transform(select_df)
X

array([[-1.54148305,  0.38235007, -0.52703154, ..., -0.47409954,
        -0.74712302,  0.44492709],
       [-1.54148305,  0.17592945, -0.04673002, ...,  0.17369513,
        -0.66061303, -0.34354476],
       [-1.57351361,  0.26439543, -0.496374  , ..., -0.30418618,
        -0.35782806,  0.3660799 ],
       ...,
       [ 0.22019791, -1.2247819 ,  0.25984542, ...,  0.44980433,
         0.29099687,  1.10796023],
       [ 0.18816735, -1.23952623,  0.2189687 , ...,  0.30113015,
        -0.74712302,  1.05778475],
       [ 0.12410622, -1.25427056,  0.24962624, ...,  0.41794558,
        -0.22806307,  1.20114327]])

In [19]:
# Cluster the standardized observations with k-means, k = 12.
# random_state pins the random centroid initialisation. Without it every
# execution produces different centers and a different cluster numbering,
# so the notebook cannot be reproduced. The seed value itself is arbitrary.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(X)  # fit() returns the estimator itself, so model is kmeans
print("model\n", model)



model
 KMeans(n_clusters=12)


In [20]:
# Centroids of the fitted clusters: one row per cluster, one value per
# feature, expressed in the standardized units of X (not the original units).
centers = model.cluster_centers_
centers

array([[ 4.78488966e-04,  9.02192459e-01, -1.11082270e+00,
        -5.56276597e-01, -1.00960106e+00, -5.91965889e-01,
        -6.47220366e-01],
       [-1.15600777e+00, -1.11240450e+00,  4.79343758e-01,
         4.50884394e-01,  5.95918728e-01,  4.70004795e-01,
         1.30180976e+00],
       [ 6.90955913e-02,  5.75329349e-01,  1.51794547e+00,
        -6.40896111e-01,  1.80164042e+00, -5.92712743e-01,
        -6.29757536e-01],
       [ 5.64397592e-02,  5.25163931e-01, -1.02198379e+00,
         7.92702483e-01, -9.56799929e-01,  7.64905572e-01,
        -7.56796066e-01],
       [-1.03900176e-01,  6.20116292e-01,  5.19478907e-01,
         5.79943632e-01,  6.25980520e-01,  5.21983767e-01,
        -2.35501518e-01],
       [-7.57498193e-03, -8.55563316e-01,  7.41705426e-01,
        -5.42280513e-01,  9.54529595e-01, -5.35512795e-01,
         1.12004342e+00],
       [-1.41705892e+00, -9.73551693e-01,  1.99424750e-01,
         2.43619070e+00,  2.70220628e-01,  2.47082310e+00,
         9.4718912