In [1]:
import pandas as pd
from scikit_posthocs import posthoc_tukey

In [2]:
df = pd.read_csv('data/miso_wind_data.csv')
df.head()

Unnamed: 0,Market Day,Hour Ending,MWh
0,1/1/2020,1,12161.42
1,1/1/2020,2,13090.65
2,1/1/2020,3,13597.16
3,1/1/2020,4,13638.77
4,1/1/2020,5,13320.79


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Market Day   8784 non-null   object
 1   Hour Ending  8784 non-null   int64 
 2   MWh          8784 non-null   object
dtypes: int64(1), object(2)
memory usage: 206.0+ KB


In [4]:
df['MWh'] = df['MWh'].astype('float')

In [5]:
df['Market Day'] = pd.to_datetime(df['Market Day'])
df.set_index('Market Day', inplace=True)

In [6]:
spring = df['3-1-2020': '5-31-2020'][['MWh']]
summer = df['6-1-2020': '8-31-2020'][['MWh']]
fall = df['9-1-2020': '11-30-2020'][['MWh']]
winter = df['12-1-2020':][['MWh']]

In [7]:
spring.columns = ['spring']
summer.columns = ['summer']
fall.columns = ['fall']
winter.columns = ['winter']

spring.reset_index(inplace=True, drop=True)
summer.reset_index(inplace=True, drop=True)
fall.reset_index(inplace=True, drop=True)
winter.reset_index(inplace=True, drop=True)

In [8]:
tukey_df = pd.concat([spring.iloc[:744], summer.iloc[:744], fall.iloc[:744], winter.iloc[:744]], axis=1)
tukey_df.head()

Unnamed: 0,spring,summer,fall,winter
0,13207.85,12868.09,5153.18,9500.17
1,12951.7,12018.25,5065.62,9628.47
2,12743.27,11392.95,5118.42,9640.06
3,12761.31,11709.88,5070.59,9163.2
4,12457.18,11582.99,5236.84,8834.51


In [9]:
melted = tukey_df.melt(var_name='groups', value_name='values')
melted.head()

Unnamed: 0,groups,values
0,spring,13207.85
1,spring,12951.7
2,spring,12743.27
3,spring,12761.31
4,spring,12457.18


In [10]:
posthoc_tukey(melted, group_col='groups', val_col='values')

Unnamed: 0,spring,summer,fall,winter
spring,1.0,0.001649,0.9,0.003585
summer,0.001649,1.0,0.016828,0.001
fall,0.9,0.016828,1.0,0.001
winter,0.003585,0.001,0.001,1.0


In [11]:
tukey_df.mean()

spring    8629.941344
summer    7829.922083
fall      8480.562460
winter    9383.298374
dtype: float64

We can see the difference in meanbetween most groups is significant, with winter having the strongest wind power, and the summer the weakest. The only groups to not have a significant different are spring and fall. We can see these two have almost the same MWh value around 8500. We used the Tukey test to test for significant differences between multiple groups.