In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats as st


def std_sample_mean(s_population, n):
    """Standard deviation of the sample mean (standard error) for samples of size n.

    The population standard deviation ``s_population`` is divided by the
    square root of the sample size ``n``.
    """
    root_n = np.sqrt(n)
    return s_population / root_n


def ci(mean, std, confidence):
    '''Return the (lower, upper) bounds of the central interval of the normal
    distribution N(mean, std) that covers the fraction ``confidence``.

    ``confidence`` is the two-sided level, e.g. .95 for a 95% interval.
    '''
    # The probability left out of the interval is split evenly between the two
    # tails, so the upper bound sits at the quantile confidence + tail.
    tail = (1 - confidence) / 2
    z_critical = st.norm.ppf(confidence + tail)
    half_width = z_critical * std
    return mean - half_width, mean + half_width


def ci_t(mean, std, df, confidence):
    '''Return the (lower, upper) bounds of a confidence interval centred on
    ``mean`` with scale ``std``, using Student's t distribution with ``df``
    degrees of freedom.

    ``confidence`` is the two-sided level, e.g. .95 for a 95% interval.
    '''
    # Half of the excluded probability goes in each tail, so the critical
    # value is the quantile at confidence + tail.
    tail = (1 - confidence) / 2
    t_crit = st.t.ppf(confidence + tail, df)
    half_width = t_crit * std
    return mean - half_width, mean + half_width


def r2(t, df):
    """Coefficient of determination r^2 = t^2 / (t^2 + df), computed from the
    t-statistic ``t`` of a t-test and its degrees of freedom ``df``.
    """
    t_squared = t**2
    return t_squared / (t_squared + df)


## t-test of two independent samples

In [32]:
# Load the meal-price data (path is relative to the notebook's directory).
filename = "../data/Food Prices.csv"
food = pd.read_csv(filename)

# Show the column dtypes as the cell output.
food.dtypes

Average meal prices at restaurants in Gettysburg ($)      int64
Average meal prices at restaurants in Wilma ($)         float64
dtype: object

In [33]:
# Preview the first rows of the meal-price table.
food.head()

Unnamed: 0,Average meal prices at restaurants in Gettysburg ($),Average meal prices at restaurants in Wilma ($)
0,9,11.0
1,5,10.0
2,6,12.0
3,11,9.0
4,8,8.0


In [34]:
# Look each town's price column up once and reuse it below.
prices_getty = food['Average meal prices at restaurants in Gettysburg ($)']
prices_wilma = food['Average meal prices at restaurants in Wilma ($)']

# Sample means.
m_getty, m_wilma = prices_getty.mean(), prices_wilma.mean()

# Sample sizes: count() only counts non-missing values, so no explicit
# dropna() is needed.
n_getty, n_wilma = prices_getty.count(), prices_wilma.count()

# Sample standard deviations (pandas default).
std_getty, std_wilma = prices_getty.std(), prices_wilma.std()

for pair in ((n_getty, n_wilma), (m_getty, m_wilma), (std_getty, std_wilma)):
    print(*pair)

18 14
8.944444444444445 11.142857142857142
2.6451336499586917 2.1788191176076888


In [46]:
# Observed difference between the two sample means (Gettysburg minus Wilma).
m_diff = m_getty - m_wilma

In [47]:
# Standard error of the difference between the two sample means, built from
# each sample's own variance (no pooling).
var_m_getty = std_getty**2 / n_getty
var_m_wilma = std_wilma**2 / n_wilma
std_m_diff = np.sqrt(var_m_getty + var_m_wilma)
print(std_m_diff)

0.8531100847677227


In [48]:
# t-statistic: the observed difference in means divided by its standard error.
t = m_diff / std_m_diff
print(t)

-2.5769390582356815


In [49]:
# Degrees of freedom taken as n1 + n2 - 2, then the 97.5th percentile of the
# t distribution, i.e. the two-tailed critical value at alpha = .05.
# NOTE(review): the standard error used for t is the unpooled form
# sqrt(s1^2/n1 + s2^2/n2), while n1 + n2 - 2 is the df of the pooled-variance
# test; scipy's ttest_ind(equal_var=False) uses the Welch-Satterthwaite df
# instead - confirm which is intended.
df = n_getty + n_wilma - 2
t_critical = st.t.ppf(.975, df)
print(t_critical)

2.0422724563012373


In [50]:
# Two-sided p-value: twice the tail area beyond |t|.
# The survival function of |t| is used so the result stays valid whatever the
# sign of t; doubling the CDF is only correct for a negative t and would give
# a "p-value" above 1 for a positive one.
p = 2 * st.t.sf(abs(t), df)
print(p)

0.01512946515275131


### Conclusion: Since p < .05, we reject the null hypothesis that the two population means are the same

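### Alternatively, we can use `scipy.stats.ttest_ind()`: it gives the same t-statistic and nearly the same p-value (with `equal_var=False` it uses the Welch–Satterthwaite degrees of freedom rather than n1 + n2 - 2)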

In [51]:
# Welch's t-test (equal_var=False) on the raw samples. Missing values are
# dropped from the Wilma column before it is passed to the test.
sample_getty = food['Average meal prices at restaurants in Gettysburg ($)']
sample_wilma = food['Average meal prices at restaurants in Wilma ($)'].dropna()

t, p = st.ttest_ind(sample_getty, sample_wilma, equal_var=False)
print(t, p)

-2.576939058235681 0.015153519382495312


## Another t-test of two independent samples

In [55]:
# Summary statistics of the two samples: means, sizes, standard deviations.
m_a, m_b = 33.5, 31.2
n_a, n_b = 6, 5
std_a, std_b = 8.89, 10.16

# Degrees of freedom for two independent samples: n_a + n_b - 2.
df = n_a + n_b - 2

In [58]:
# Observed difference in means and its unpooled standard error.
m_diff = m_a - m_b
var_m_a = std_a**2 / n_a
var_m_b = std_b**2 / n_b
std_diff = np.sqrt(var_m_a + var_m_b)

In [59]:
# t-statistic: the observed difference in means divided by its standard error.
t = m_diff / std_diff
print(t)

0.39551177686805394


In [60]:
# 97.5th percentile of the t distribution with df degrees of freedom, i.e. the
# two-tailed critical value at alpha = .05, to compare against |t|.
t_critical = st.t.ppf(.975, df)
print(t_critical)

2.2621571627409915


## Another t-test of two independent samples

In [66]:
# Load the shoes data (path is relative to the notebook's directory).
# Note that this reuses the name `filename` from the earlier load cell.
filename = "../data/shoes.csv"
shoes = pd.read_csv(filename)

# Show the column dtypes as the cell output.
shoes.dtypes

shoes_females    float64
shoes_males        int64
dtype: object

In [67]:
# Preview the first rows of the shoes table.
shoes.head()

Unnamed: 0,shoes_females,shoes_males
0,90.0,4
1,28.0,120
2,30.0,5
3,10.0,3
4,5.0,10


In [68]:
# Look each group's column up once and reuse it below.
shoes_m = shoes['shoes_males']
shoes_f = shoes['shoes_females']

# Sample means.
m_m, m_f = shoes_m.mean(), shoes_f.mean()

# Sample sizes (count() only counts non-missing values) and degrees of freedom.
n_m, n_f = shoes_m.count(), shoes_f.count()
df = n_m + n_f - 2

# Sample standard deviations (pandas default).
std_m, std_f = shoes_m.std(), shoes_f.std()

for pair in ((n_m, n_f), (m_m, m_f), (std_m, std_f)):
    print(*pair)

11 7
18.0 33.142857142857146
34.27243790569909 31.360423952430722


In [69]:
# The difference is taken as females minus males.
m_diff = m_f - m_m

# Unpooled standard error of that difference: each group's variance of the
# mean is computed separately, then summed.
var_m_males = std_m**2 / n_m
var_m_females = std_f**2 / n_f
std_m_diff = np.sqrt(var_m_males + var_m_females)

print(std_m_diff)

15.725088769901236


In [70]:
# t-statistic: the observed difference in means divided by its standard error.
t = m_diff / std_m_diff
print(t)

0.9629743503795974


In [71]:
# 97.5th percentile of the t distribution with df degrees of freedom, i.e. the
# two-tailed critical value at alpha = .05, to compare against |t|.
t_critical = st.t.ppf(.975, df)
print(t_critical)

2.1199052992210112


In [72]:
# 95% confidence interval for the difference in means, from the t distribution
# with df degrees of freedom and the standard error std_m_diff.
ci_t(m_diff, std_m_diff, df, .95)

(-18.192841871177293, 48.478556156891585)

In [73]:
# Effect size: coefficient of determination r^2 = t^2 / (t^2 + df).
r2(t, df)

0.05478242400037163

In [78]:
# Standalone lookup: 97.5th percentile of the t distribution with 5 degrees of
# freedom (the two-tailed critical value at alpha = .05).
st.t.ppf(.975, 5)

2.5705818366147395

## Problem Set 11

In [89]:
# Sample sizes and the degrees of freedom they contribute.
n_x, n_y = 18, 25
df_x, df_y = n_x - 1, n_y - 1
df_diff = df_x + df_y

# Sample means and their observed difference.
m_x, m_y = 3.8, 2.1
m_diff = m_x - m_y

# Standard error of the difference, from the given pooled variance.
var_pooled = 0.13
std_m_diff = np.sqrt(var_pooled / n_x + var_pooled / n_y)

# Test statistic and the 95th-percentile critical value
# (a one-tailed test at alpha = .05).
t = m_diff / std_m_diff
t_critical = st.t.ppf(.95, df_diff)

for template, value in (("t = {:.3f}", t), ("t critical = {:.3f}", t_critical)):
    print(template.format(value))


t = 15.253
t critical = 1.683


In [90]:
# Sample sizes and the degrees of freedom they contribute.
n_x, n_y = 52, 57
df_x, df_y = n_x - 1, n_y - 1
df_diff = df_x + df_y

# Sample means and their observed difference.
m_x, m_y = 12, 8
m_diff = m_x - m_y

# Standard error of the difference, from the given pooled variance.
var_pooled = 5.1
std_m_diff = np.sqrt(var_pooled / n_x + var_pooled / n_y)

# Value subtracted from the observed difference before standardising -
# presumably the difference stated under the null hypothesis; verify against
# the problem statement.
shift = 3
t = (m_diff - shift) / std_m_diff

# Two-tailed critical value at alpha = .05.
t_critical = st.t.ppf(.975, df_diff)

for template, value in (("t = {:.3f}", t), ("t critical = {:.3f}", t_critical)):
    print(template.format(value))


t = 2.309
t critical = 1.982


In [92]:
# Sample sizes and the degrees of freedom they contribute.
n_x, n_y = 207, 220
df_x, df_y = n_x - 1, n_y - 1
df_diff = df_x + df_y

# Sample means and their observed difference.
m_x, m_y = 35.8, 31.6
m_diff = m_x - m_y

# Pooled variance: the two sums of squares combined over the total df.
ss1, ss2 = 481, 322
var_pooled = (ss1 + ss2) / df_diff

# Standard error of the difference in means.
std_m_diff = np.sqrt(var_pooled / n_x + var_pooled / n_y)

# Test statistic and the 99.5th-percentile critical value
# (two-tailed at alpha = .01).
t = m_diff / std_m_diff
t_critical = st.t.ppf(.995, df_diff)

report = (
    ("pooled variance = {:.3f}", var_pooled),
    ("t = {:.3f}", t),
    ("t critical = {:.3f}", t_critical),
)
for template, value in report:
    print(template.format(value))


pooled variance = 1.889
t = 31.555
t critical = 2.587


In [96]:
# Standalone lookup: 95th percentile of the t distribution with 23 degrees of
# freedom (the one-tailed critical value at alpha = .05).
st.t.ppf(.95, 23)

1.7138715277470473

In [104]:
# Raw observations of the two independent samples.
x = np.array([2, -3, 5, 4, 7])
y = np.array([10, 13, 15, 10])

# Sum of squared deviations from each sample's own mean.
ss1 = ((x - x.mean())**2).sum()
ss2 = ((y - y.mean())**2).sum()

# Sample sizes and combined degrees of freedom.
n_x, n_y = len(x), len(y)
df_diff = n_x + n_y - 2

# Observed difference in means.
m_diff = x.mean() - y.mean()

# Pooled variance and the standard error of the difference built from it.
var_pooled = (ss1 + ss2) / df_diff
std_m_diff = np.sqrt(var_pooled / n_x + var_pooled / n_y)

# Test statistic and the 99.5th-percentile critical value
# (two-tailed at alpha = .01).
t = m_diff / std_m_diff
t_critical = st.t.ppf(.995, df_diff)

report = (
    ("pooled variance = {:.3f}", var_pooled),
    ("t = {:.3f}", t),
    ("t critical = {:.3f}", t_critical),
)
for template, value in report:
    print(template.format(value))

print(ss1, ss2)
print(std_m_diff)

pooled variance = 10.857
t = -4.072
t critical = 3.499
58.0 18.0
2.210365192839022


In [108]:
# Sample sizes and the degrees of freedom they contribute.
n_x, n_y = 10, 10
df_x, df_y = n_x - 1, n_y - 1
df_diff = df_x + df_y

# Sample means and their observed difference.
m_x, m_y = 10, 7
m_diff = m_x - m_y

# Given standard error of the difference and given pooled standard deviation.
std_m_diff = .94
std_pooled = 2.33

# Test statistic against the two-tailed critical value at alpha = .05.
t_critical = st.t.ppf(.975, df_diff)
t = m_diff / std_m_diff

print("t = {:.3f}".format(t))
print("t critical = {:.3f}".format(t_critical))

# Effect sizes: Cohen's d (difference in pooled-SD units), then r^2 derived
# from the t-statistic and its degrees of freedom.
cohen_d = m_diff / std_pooled
print("Cohen's d = {:.3f}".format(cohen_d))

r_2 = r2(t, df_diff)
print("Coefficient of determination r^2 = {:.3f}".format(r_2))

t = 3.191
t critical = 2.101
Cohen's d = 1.288
Coefficient of determination r^2 = 0.361
