# Import Libraries

In [1]:
from scipy import stats

# Normality Tests

## Shapiro-Wilk Test

Assumptions:
- Observation are independent and identically distributed

Hypothesis:
- Ho: The sample is Gaussian (normally) distributed
- Ha: The sample does not have a Gaussian distribution

**Always refer to documentation as well! It is your best friend**

In [3]:
#initialize data
data = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]

#get stat and pvalue
w_stat, p_value = stats.shapiro(data)

#print stats
print('w_stat=%.3f, p_value=%.3f' % (w_stat, p_value))

#if statement for hypothesis test
if p_value < 0.05:
    print('Probably Gaussian')
else:
    print('Probaly not Gaussian')

w_stat=0.895, p_value=0.193
Probaly not Gaussian


## D'Agostino's K^2 Test

Assumptions:
- independed and identically distributed observations

Hypothesis:
- The sample has a Gaussian distribution
- The samples does not have a Gaussian distribution



In [4]:
#using data from above
test_stat, p_value = stats.normaltest(data)

#print stats
print('w_stat=%.3f, p_value=%.3f' % (w_stat, p_value))

#if statement for hypothesis test
if p_value < 0.05:
    print('Probably Gaussian')
else:
    print('Probaly not Gaussian')

w_stat=0.895, p_value=0.183
Probaly not Gaussian


  k, _ = kurtosistest(a, axis)


## Anderson-Dariling Test

Assumptions:
- Independent and identically distributed observations

Hypothesis:
- The sample is Gaussian distribution
- The sample is not Gaussian distribution

In [8]:
# data from above

result = stats.anderson(data) #get result

print('stat=%.3f' % (result.statistic))

for i in range(len(result.critical_values)):
    sl, cv = result.significance_level[i], result.critical_values[i] #get significance level and critical value
    if result.statistic < cv:
        print('Probably Gaussian at the %.1f%% level' % (sl))
    else:
        print('Probably not Gaussian at the %.1f%% level' % (sl))

stat=0.424
Probably Gaussian at the 15.0% level
Probably Gaussian at the 10.0% level
Probably Gaussian at the 5.0% level
Probably Gaussian at the 2.5% level
Probably Gaussian at the 1.0% level


# Correlation Tests

## Pearson's Correlation Coefficient (r)

Test for Linear relationship

Assumptions:
- Independent and identically distributed observations
- normally distributed observations
- same variance in each observation

Hypothesis:
- The samples are independent
- The samples are not independent

In [10]:
# initialize data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]

#get test stat and pvalue
test_stat, p_value = stats.pearsonr(data1, data2)

print('stat=%.3f, p=%.3f' % (test_stat, p_value))
if p_value > 0.05:
 print('Probably independent')
else:
 print('Probably dependent')

stat=0.688, p=0.028
Probably dependent


## Spearman's Rank Correlation

Test if two samples are in a monotonic relationship

Assumptions:
- independent and identically distributed observations in each sample
- Observations can be ranked

Hypothesis:
- The two sample are independent
- The two samples are not independent

In [11]:
#data initialized above

#get test_stat and pvalue
test_stat, p_value = stats.spearmanr(data1, data2)

print('stat=%.3f, p=%.3f' % (test_stat, p_value))
if p_value > 0.05:
 print('Probably independent')
else:
 print('Probably dependent')

stat=0.855, p=0.002
Probably dependent


## Kendall's Rank Correlation

Tests if two samples have a monotonic relationship

Assumptions:
- independent and identically distributed observations
- observations in each sample can be ranked

Hypothesis:
- The two sample are independent
- The two samples are not independent

In [14]:
# data from above

#get stat and p_value
test_stat, p_value = stats.kendalltau(data1, data2)

print('stat=%.3f, p=%.3f' % (test_stat, p_value))
if p_value > 0.05:
 print('Probably independent')
else:
 print('Probably dependent')

stat=0.733, p=0.002
Probably dependent


## Chi-Squared Test

Two categorical variables independent or not

Assumptions:
- observations used in contingency table are independent
- >= 25 examples in each cell of contingency table

Hypothesis:
- The samples are independent
- There is a dependency between samples; the samples are not independent

In [15]:
#initialize table
table = [[10, 20, 30],[6,  9,  17]]

#get stats
stat, p, dof, expected = stats.chi2_contingency(table)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably independent')
else:
 print('Probably dependent')

stat=0.272, p=0.873
Probably independent


# Stationary Tests

## Augmented Dickey-Fuller Unit Root Test

Test to determine if time series has a unit root; trend or generally autoregressive

Assumptions:
- Temporally ordered observations

Hypothesis:
- A unit root is present (series is non-stationary)
- A unit root is not present (series is stationary)

In [17]:
# Example of the Augmented Dickey-Fuller unit root test

#import module
from statsmodels.tsa.stattools import adfuller

data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

#get stats
stat, p, lags, obs, crit, t = adfuller(data)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably not Stationary')
else:
 print('Probably Stationary')

stat=0.992, p=0.994
Probably not Stationary


## Kwiatkowski-Phillips-Schmidt-Shin

Time series stationary or not

Assumptions:
- temporally ordered observations

Hypothesis:
- The time series is trend-stationary
- The time series is not trend stationary

In [18]:
#import module
from statsmodels.tsa.stattools import kpss

data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

#get results
stat, p, lags, crit = kpss(data)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably Stationary')
else:
 print('Probably not Stationary')

stat=0.594, p=0.023
Probably not Stationary


# Parametric Statistical Hypothesis Tests

Comparing data samples

## Student's t-test

Test - are means of two independent samples significantly different

Assumptions:
- independent and identically distributed observations
- normally distributed observations
- same variance for each observation

Hypothesis:
- The means of the samples are equal
- The means of the samples are not equal

In [None]:
# Import module
from scipy.stats import ttest_ind

#initialize data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

#get results
stat, p = ttest_ind(data1, data2)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

## Paired Student's t-test

Tests if means of two paired samples are significantly different

Assumptions:
- independent and identically distributed observations
- normally distributed observations
- same variance in observations
- observations are paired in each sample

Hypothesis:
- The means are equal
- The means are not equal

In [2]:
# import library
from scipy.stats import ttest_rel

#data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

#get results
stat, p = ttest_rel(data1, data2)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=-0.334, p=0.746
Probably the same distribution


## ANOVA

Tests if means of >= two independent samples are significantly different

Assumptions:
- independent and identically distributed
- normally distributed observations
- same variance for each observation

Hypothesis:
- The means fo the samples are equal
- The means are not equal

In [3]:
#import module
from scipy.stats import f_oneway

#initialize ddata
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]

#get results
stat, p = f_oneway(data1, data2, data3)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=0.096, p=0.908
Probably the same distribution


## Repeated Measures of ANOVA Test

Tests if means of >= two paires samples are significantly different

Assumptions:
- independent and identically distributed observations
- normally distributed
- same variance
- paired observations

Hypothesis:
- The means are equal
- The means are not equal


*Not supported in Python at the moment*

# Non-Parametric Statisitical Hypothesis Tests

## Mann-Whitney U Test

Test if distributions of two independent samples are equal or not

Assumptions:
- indpendent and identically distributed observations
- observations can be ranked

Hypothesis:
- Distributions are equal
- distributions are not equal

In [4]:

# Import module
from scipy.stats import mannwhitneyu

#decalre data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

#get results
stat, p = mannwhitneyu(data1, data2)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=40.000, p=0.473
Probably the same distribution


## Wilcoxon Signed-Rank Test

Tests if distributions of two paired samples are equal or not

Assumptions:
- independent and identically distributed
- observations can be ranked
- observatiosn are paired across each sample

In [5]:

# Example of the Wilcoxon Signed-Rank Test
#import module
from scipy.stats import wilcoxon

#data 
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

# get results
stat, p = wilcoxon(data1, data2)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=21.000, p=0.557
Probably the same distribution


## Kruskal-Wallis H Test

Tests distributions of two or more independent samples are equal

Assumptions:
- independent and identically distributed
- obcervations can be ranked

Hypothesis:
- Distributions are equal
- Distributions are not equal

In [6]:
#import library
from scipy.stats import kruskal

#decalre data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

#get results
stat, p = kruskal(data1, data2)

#print resutlts
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=0.571, p=0.450
Probably the same distribution


## Friedman Test

Tests if distributions of >= two paired samples are equal or not

Assumptions:
- independent and identically distributed
- observations can be ranked
- paired observations across each sample

Hypothesis:
- The distributions are equal
- The distributions are not equal

In [None]:
#import library
from scipy.stats import friedmanchisquare

#declare data
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]

#get results
stat, p = friedmanchisquare(data1, data2, data3)

#print results
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')