In [1]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas import set_option
# Keep DataFrame output short: show at most 6 rows before pandas truncates.
pd.set_option('display.max_rows', 6)

# Stats Libraries
import statsmodels.api as sm
import scipy
import scipy.stats  # later cells call scipy.stats.* directly, so import the submodule explicitly

%matplotlib inline

In [4]:
# Load the raw Squamish wind observations (comma-separated) and preview them.
wind = pd.read_csv(
    'Squamish.Wind.2010-2015.csv',
    sep=',',
)
wind

Unnamed: 0,datetime,windspeed,winddir,windgust,temperature,pressure
0,6/16/10 1:00,18,225,21,131,15
1,6/16/10 1:03,16,227,21,138,15
2,6/16/10 1:06,15,227,20,142,15
...,...,...,...,...,...,...
438124,4/21/15 3:48,11,222,15,150,8
438125,4/21/15 3:51,8,196,10,143,9
438126,4/21/15 3:54,8,196,10,143,9


## I would like to test whether wind speed and wind gust are statistically different from each other. I hypothesize that they are not: they followed very similar patterns when I graphed them, and their values are always quite close to each other.

In [20]:
## Resampling the data
# Re-read the CSV with the timestamp column as the index so that the rows can
# be grouped by time.
wind2 = pd.read_csv('Squamish.Wind.2010-2015.csv', sep=',', index_col='datetime')

# The index holds date strings such as '6/16/10 1:00'. `unit=` describes
# numeric epoch offsets and does not apply to strings, so it is not passed.
wind2.index = pd.to_datetime(wind2.index)

# Average every column over consecutive 20-day bins. The old `how='mean'`
# keyword of resample() was deprecated and later removed from pandas; the
# aggregation is chained on the resampler instead.
wind_resampled = wind2.resample('20D').mean()

# (number of 20-day bins, number of measurement columns)
wind_resampled.shape

(89, 5)

In [30]:
# Inspect the resampled frame; a bin without usable data shows up as a row of NaN.
wind_resampled

Unnamed: 0_level_0,windspeed,winddir,windgust,temperature,pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-16 01:00:00,9.867171,206.252030,12.528562,141.413627,16.229456
2010-07-06 01:00:00,10.932289,187.367884,13.271708,174.686889,15.140886
2010-07-26 01:00:00,8.824489,186.088640,10.719733,178.046187,12.987618
...,...,...,...,...,...
2015-03-02 01:00:00,2.333333,240.666667,4.666667,78.666667,17.333333
2015-03-22 01:00:00,,,,,
2015-04-11 01:00:00,13.497645,229.993721,16.288854,142.574568,16.249608


In [31]:
# Discard every 20-day bin (row) that has a missing value in any column,
# then preview the first five remaining rows.
wind_resampled = wind_resampled.dropna(axis=0, how='any')
wind_resampled.head(5)

Unnamed: 0_level_0,windspeed,winddir,windgust,temperature,pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-16 01:00:00,9.867171,206.25203,12.528562,141.413627,16.229456
2010-07-06 01:00:00,10.932289,187.367884,13.271708,174.686889,15.140886
2010-07-26 01:00:00,8.824489,186.08864,10.719733,178.046187,12.987618
2010-08-15 01:00:00,9.112571,189.895532,11.324135,162.768944,12.309023
2010-09-24 01:00:00,7.416236,201.459552,10.095525,127.993976,16.790878


In [32]:
# Pull out the two series that will be compared: wind speed and wind gust.
speed, gust = wind_resampled['windspeed'], wind_resampled['windgust']

In [34]:
# Shapiro-Wilk normality test on the resampled wind speed.
# The p-value is < 0.05, so we reject normality and will not rely on a t-test.
scipy.stats.shapiro(speed)

(0.9393665194511414, 0.0005879071541130543)

In [35]:
# Shapiro-Wilk normality test on the resampled wind gust.
# The p-value is < 0.05, so we reject normality and will not rely on a t-test.
scipy.stats.shapiro(gust)

(0.9574524164199829, 0.0068333568051457405)

Because neither sample passes the normality test, we won't use a t-test; instead we'll use the non-parametric Kruskal-Wallis test.

In [38]:
# Kruskal-Wallis H-test (non-parametric) comparing wind speed against wind gust.
# NOTE(review): speed and gust are columns taken from the same rows of
# wind_resampled, so the samples are paired, whereas Kruskal-Wallis assumes
# independent groups. A paired test such as scipy.stats.wilcoxon may be more
# appropriate here - confirm.
scipy.stats.mstats.kruskalwallis(speed,gust)

(52.798079281247965, 3.6966257909208721e-13)

The p-value from the Kruskal-Wallis test is < 0.05, so we can reject the null hypothesis that the medians of the two groups are equal. The medians of the two groups are therefore significantly different, and we can say that there is a statistically significant difference between the wind gust and the wind speed. This contradicts my initial hypothesis that the two would not be statistically different.