## Test the difference between 2 groups

###  t-test assesses whether the means of two groups are statistically different from each other. This analysis is appropriate whenever you want to compare the means of two group

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model

  from pandas.core import datetools


In [2]:
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, mannwhitneyu

In [3]:
import scipy.stats as stats

In [4]:
# Load the data into a pandas dataframe
iris = sns.load_dataset("iris")
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## One-sided t-test
### null hypothesis: expected mean value of sepal length = 17
### reject H0 if p is less than 0.05

In [5]:
t_statistic, p_value = ttest_1samp(iris["sepal_length"], 17)

In [6]:
print "one-sample t-test", p_value

one-sample t-test 1.35472732419e-170


In [None]:
# p_value < 0.05 => alternative hypothesis:
# data deviate significantly from the hypothesis that the mean
# is 17 at the 5% level of significance

### t-tests can be applied when the data are normally distributed (these are patrametric tests)
### so test for normality

In [7]:
import scipy

In [8]:
shapiro_results = scipy.stats.shapiro(iris["sepal_length"])

In [9]:
print(shapiro_results)

(0.9760899543762207, 0.010180278681218624)


In [None]:
##reject H0

### in case of non-normal data distributions
###  non-parametric tests

In [10]:
# one sample wilcoxon-test
z_statistic, p_value = wilcoxon(iris["sepal_length"] - 17)
print "one-sample wilcoxon-test", p_value

one-sample wilcoxon-test 2.24134140611e-26


### two-sample t-test
### null hypothesis: the two groups have the same mean

In [11]:
t_statistic, p_value = ttest_ind(iris["sepal_length"], iris["petal_length"])

In [12]:
print "two-sample t-test", p_value

two-sample t-test 2.85710406958e-31


In [13]:
# two-sample wilcoxon test (non-parametric)
# a.k.a Mann Whitney U
u, p_value = mannwhitneyu(iris["sepal_length"], iris["petal_length"])
print "two-sample wilcoxon-test", p_value

two-sample wilcoxon-test 8.51265073376e-27


### paired t-test

In [14]:
# paired t-test: doing two measurments on the same experimental unit
# e.g., before and after a treatment
t_statistic, p_value = ttest_1samp(iris["sepal_length"] - iris["petal_length"], 0)

In [15]:
# p < 0.05 => alternative hypothesis:
# the difference in mean is not equal to 0
print "paired t-test", p_value

paired t-test 1.79962922773e-50


In [16]:
# alternative to paired t-test when data has an ordinary scale or when not
# normally distributed
z_statistic, p_value = wilcoxon(iris["sepal_length"] - iris["petal_length"])

print "paired wilcoxon-test", p_value

paired wilcoxon-test 2.26115054884e-26
