In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f

## 1. Write a Python code to implement Hotelling’s T2 test of a mean vector. (For p-value calculation, python package can be used).

In [2]:
def hotelling_t_test(data, u0, alpha):  # u0 : null hypothesis  / # alpha : significance level
    
    # Calculate Hotelling's T square and Test statisitc
    n = len(data)
    p = len(data.columns)
    sample_mean_v = data.mean()
    mean_diff = sample_mean_v - u0
    cov_v = data.cov()
    cov_inv = np.linalg.inv(cov_v)
    T_square = len(data) * (mean_diff.T @ cov_inv @ mean_diff)
    Test_stat = T_square * ((n-p) / ((n-1)*p))
    
    # Calculate critical region
    c_region = f.ppf(1-alpha, p, n-p, loc=0, scale=1)
    
    # p-value
    p_value = f.sf(Test_stat, p, n-p)
    
    # Hypothesis Testing
    if Test_stat > c_region :
        print("Test Statistic = {:.3f}".format(Test_stat))
        print("p-value = {}".format(p_value))
        print("Hotelling's T^2 = {:.3f}".format(T_square))
        print("Reject the null hypothesis")
    else :
        print("Test Statistic = {:.3f}".format(Test_stat))
        print("p-value = {}".format(p_value))
        print("Hotelling's T^2 = {:.3f}".format(T_square))
        print("Do not reject the null hypothesis")


## 2. Using the data ‘college.dat’ (described in Table 5.2 of the textbook)

In [3]:
# 데이터 불러오기

college = pd.read_csv('college.dat', header = None, delim_whitespace = True)
college.columns = ['social', 'verbal', 'science']
college

Unnamed: 0,social,verbal,science
0,468,41,26
1,428,39,26
2,514,53,21
3,547,67,33
4,614,61,27
...,...,...,...
82,614,70,23
83,527,49,30
84,474,41,16
85,441,47,26


## 2-a.

In [4]:
# use code in #1

u0 = np.array([500,50,30]) # null hypothesis
alpha = 0.05
hotelling_t_test(college, u0, alpha)

Test Statistic = 72.706
p-value = 2.8280970624648144e-23
Hotelling's T^2 = 223.310
Reject the null hypothesis


## 2-b.

In [5]:
# use python function

from statsmodels.stats import multivariate as mv

u0 = np.array([500,50,30]) # null hypothesis
Ddf = college - u0
mv.test_mvmean(Ddf)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 72.70563859508101
pvalue = 2.828097062464791e-23
df = (3, 84)
t2 = 223.3101756848917
distr = 'F'
tuple = (72.70563859508101, 2.828097062464791e-23)

## 2-c. Determine the lengths and directions for the axes of the 95% confidence ellipsoid for 𝜇.

In [6]:
# Direction for the axes of the 95% confidence ellipsoid for mu

S = college.cov()
eigvals, eigvecs = np.linalg.eig(S)

n = len(college)
p = len(college.columns)
F = f.ppf(1-alpha, p, n-p, loc=0, scale=1)
axes_pos_list = []
axes_neg_list = []
length_list = []
for i in range(len(college.columns)):
    axes_positive = np.sqrt(eigvals[i]) * np.sqrt( (((n-1)*p) / (n*(n-p))) * F ) * eigvecs[i]
    axes_negative = - np.sqrt(eigvals[i]) * np.sqrt( (((n-1)*p) / (n*(n-p))) * F ) * eigvecs[i]
    length = np.sqrt(eigvals[i]) * np.sqrt( (((n-1)*p) / (n*(n-p))) * F ) * 2
    axes_pos_list.append(axes_positive)
    axes_neg_list.append(axes_negative)
    length_list.append(length)
    print("Direction for Axes {} : {}, {}".format(i+1, axes_pos_list[i], axes_neg_list[i]))
print("Length of the confidence region : {}".format(length_list))

Direction for Axes 1 : [-23.5853725   -2.46154906  -0.88530442], [23.5853725   2.46154906  0.88530442]
Direction for Axes 2 : [-0.25579154  2.45938874 -0.02368372], [ 0.25579154 -2.45938874  0.02368372]
Direction for Axes 3 : [-0.04505214  0.00669323  1.18162262], [ 0.04505214 -0.00669323 -1.18162262]
Length of the confidence region : [47.45999510189589, 4.945536654922297, 2.365000216451611]


## d. Find the simultaneous confidence interval for 𝜇1 − 2𝜇2 + 𝜇3.

In [7]:
a = np.array([1, -2, 1])
n = len(college)
p = len(college.columns)
F = f.ppf(1-alpha, p, n-p, loc=0, scale=1)
np.sqrt( (((n-1)*p) / (n*(n-p))) * F * (a.T @ S @ a) )

lower = a.T @ college.mean() - np.sqrt( (((n-1)*p) / (n*(n-p))) * F * (a.T @ S @ a) )
upper = a.T @ college.mean() + np.sqrt( (((n-1)*p) / (n*(n-p))) * F * (a.T @ S @ a) )

print("Simultaneous confidence interval : lower = {}, upper = {}".format(lower, upper))

Simultaneous confidence interval : lower = 422.05125244523714, upper = 462.6154142214296


## 3. Four measurements of the response stiffness on each of 30 boards are listed in data ‘stiff.dat’ (described in Table 4.3 of the textbook). The measures are repeated in the sense that they were made one after another.

## a. Find the simultaneous confidence interval for 𝜇1 + 2𝜇2 − 𝜇3 − 2𝜇4.

In [8]:
# 3. stiff 데이터 읽기

stiff = pd.read_csv('stiff.dat', header=None, delim_whitespace=True)
stiff.columns = [0,1,2,3,4]
stiff_Adj = stiff.drop(4,axis = 1)
stiff_Adj.head()

Unnamed: 0,0,1,2,3
0,1889,1651,1561,1778
1,2403,2048,2087,2197
2,2119,1700,1815,2222
3,1645,1627,1110,1533
4,1976,1916,1614,1883


In [9]:
S = stiff_Adj.cov()
a = np.array([1, 2, -1, -2])
n = len(stiff_Adj)
p = len(stiff_Adj.columns)
alpha = 0.05
F = f.ppf(1-alpha, p, n-p, loc=0, scale=1)

lower = a.T @ stiff_Adj.mean() - np.sqrt( (((n-1)*p) / (n*(n-p))) * F * (a.T @ S @ a) )
upper = a.T @ stiff_Adj.mean() + np.sqrt( (((n-1)*p) / (n*(n-p))) * F * (a.T @ S @ a) )

print("Simultaneous confidence interval : lower = {}, upper = {}".format(lower, upper))

Simultaneous confidence interval : lower = 123.10266537479504, upper = 769.0973346252039


## b. Repeat (a) under large sample assumption (i.e. n-p is large enough).

In [10]:
from scipy.stats import chi2

S = stiff_Adj.cov()
a = np.array([1, 2, -1, -2])
n = len(stiff_Adj)
p = len(stiff_Adj.columns)
alpha = 0.05
Chi2 = chi2.ppf(1-alpha, 4, loc=0, scale=1)

lower = a.T @ stiff_Adj.mean() - np.sqrt(Chi2) * np.sqrt( (a.T @ S @ a) / n )
upper = a.T @ stiff_Adj.mean() + np.sqrt(Chi2) * np.sqrt( (a.T @ S @ a) / n )

print("Simultaneous confidence interval : lower = {}, upper = {}".format(lower, upper))

Simultaneous confidence interval : lower = 161.68206247064114, upper = 730.5179375293578


## 4.Write a Python code to test the hypothesis

In [11]:
def profile_analysis_test(sample, C, alpha):
    
    # Calculate T square & Test statistic
    n = len(sample)
    S = sample.cov()
    sample_mean = sample.mean()
    df1 = C.shape[0] 
    df2 = n - C.shape[0] 
    t2 = n * (C @ sample.mean()).T @ np.linalg.inv(C @ S @ C.T) @ (C @ sample.mean()) # T square
    test_stat = t2 * ((n - df1) / ((n-1) * df1)) # Test statisitic
    
    # Calculate critical region
    c_region = f.ppf(1-alpha, df1, df2, loc=0, scale=1) * ( ((n-1) * df1) / (n - df1) )
    
    # p-value
    p_value = f.sf(test_stat, df1, df2)
    
    # Hypothesis Testing
    if t2 > c_region :
        print("Test statistic = {:.3f}".format(test_stat))
        print("t2 = {:.3f}".format(t2))
        print("p-value = {}".format(p_value))
        print("Reject the null hypothesis")
    else :
        print("Test statistic = {:.3f}".format(test_stat))
        print("t2 = {:.3f}".format(t2))
        print("p-value = {}".format(p_value))
        print("Do not reject the null hypothesis")

## 5. Four measurements of the response stiffness on each of 30 boards are listed in data ‘stiff.dat’ (described in Table 4.3 of the textbook). The measures are repeated in the sense that they were made one after another.

## a. Perform a test for flat means using your Python code in #4.

In [12]:
# Flat mean testing

C = np.array([[1,-1,0,0], [0,1,-1,0], [0,0,1,-1]])

profile_analysis_test(stiff_Adj, C, 0.05)

Test statistic = 79.051
t2 = 254.721
p-value = 1.7219150170540008e-13
Reject the null hypothesis


## b. Perform a test for linear trend using your Python code in #4.

In [13]:
# Linear trend testing

C = np.array([[1,-2,1,0], [0,1,-2, 1]])

profile_analysis_test(stiff_Adj, C, 0.05)

Test statistic = 87.080
t2 = 180.380
p-value = 9.560459481929117e-13
Reject the null hypothesis


## c. Repeat (a) and (b) using a command in a Python package

In [14]:
# Profile Analysis

C = pd.DataFrame([[1,-1,0,0], [0,1,-1,0], [0,0,1,-1]])
flat = stiff_Adj.dot(C.T)
flat.columns = ["x0-x1", "x1-x2", "x2-x3"]

mv.test_mvmean(flat)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 79.05140875138369
pvalue = 1.721915017054007e-13
df = (3, 27)
t2 = 254.72120597668078
distr = 'F'
tuple = (79.05140875138369, 1.721915017054007e-13)

In [15]:
# Linear Trend

C = pd.DataFrame([[1,-2,1,0], [0,1,-2, 1]])
flat = stiff_Adj.dot(C.T)
flat.columns = ["x1-2*x2+x3", "x2-2*x3+x4"]

mv.test_mvmean(flat)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 87.08000779315768
pvalue = 9.560459481929198e-13
df = (2, 28)
t2 = 180.38001614296948
distr = 'F'
tuple = (87.08000779315768, 9.560459481929198e-13)