## 1. Write Python code to test the profile-analysis hypothesis $H_0: C(\mu_1 - \mu_2) = 0$

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f

In [2]:
def profile_analysis(df1, df2, alpha, C):
    """Two-sample profile analysis via Hotelling's T^2 on contrasts.

    Tests H0: C @ (mu1 - mu2) = 0 using the pooled covariance matrix (i.e. the
    two populations are assumed to share a covariance matrix) and prints the
    F-scaled test statistic, T^2, the p-value and the decision.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        One row per observation, one column per variable. Both frames must
        use the same column labels, since means and covariances are combined
        label-wise.
    alpha : float
        Significance level, strictly between 0 and 1.
    C : array-like, shape (q, p) or (p,)
        Contrast matrix. A single contrast may be given as a 1-D vector; it is
        treated as one row.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``alpha`` is not in (0, 1) or there are too few observations for
        the F reference distribution (n1 + n2 - q - 1 <= 0).
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    # Promote a 1-D contrast to a single-row matrix so q and the quadratic
    # form below are well defined without the caller having to reshape.
    C = np.atleast_2d(np.asarray(C, dtype=float))
    q = C.shape[0]

    n1, n2 = len(df1), len(df2)
    dof_pooled = n1 + n2 - 2
    dof_denom = n1 + n2 - q - 1
    if dof_denom <= 0:
        raise ValueError("not enough observations: need n1 + n2 - q - 1 > 0")

    mean_diff = np.asarray(df1.mean() - df2.mean(), dtype=float)
    # Pooled covariance (equal-covariance assumption).
    S_p = np.asarray(
        ((n1 - 1) * df1.cov() + (n2 - 1) * df2.cov()) / dof_pooled,
        dtype=float,
    )

    contrast_diff = C @ mean_diff
    contrast_cov = (1 / n1 + 1 / n2) * (C @ S_p @ C.T)
    # Solve the linear system instead of forming an explicit inverse.
    t2 = float(contrast_diff @ np.linalg.solve(contrast_cov, contrast_diff))
    test_stat = (dof_denom / (q * dof_pooled)) * t2

    # Critical value and p-value from the F(q, n1 + n2 - q - 1) distribution.
    c_region = f.ppf(1 - alpha, q, dof_denom, loc=0, scale=1)
    p_value = float(f.sf(test_stat, q, dof_denom))

    print("Test statistic = {:.3f}".format(test_stat))
    print("t2 = {:.3f}".format(t2))
    print("p-value = {}".format(p_value))
    if test_stat > c_region:
        print("Reject the null hypothesis")
    else:
        print("Do not reject the null hypothesis")

## 2. Table 6.9 contains measurements on the carapaces of 24 female and 24 male turtles. The data is in file ‘turtle.dat’.

In [3]:
# Load the data: whitespace-delimited text file with no header row.
# sep=r'\s+' replaces delim_whitespace=True, which is deprecated in pandas >= 2.2.

turtle = pd.read_csv('turtle.dat', header=None, sep=r'\s+')
turtle.columns = ['t1', 't2', 't3', 'sex']

# Split into female and male data sets, keeping only the three measurements.

turtle_female = turtle.loc[turtle['sex'] == "female", ['t1', 't2', 't3']].reset_index(drop=True)
turtle_male = turtle.loc[turtle['sex'] == "male", ['t1', 't2', 't3']].reset_index(drop=True)

### a. Are the profiles parallel? (Use your code in #1)

In [5]:
# Successive-difference contrasts (t1 - t2 and t2 - t3): equal contrast means
# in both groups correspond to parallel profiles.
C = np.array([
    [1, -1, 0],
    [0, 1, -1],
])
profile_analysis(turtle_female, turtle_male, alpha=0.05, C=C)

Test statistic = 7.544
t2 = 15.423
p-value = 0.0014947765346784248
Reject the null hypothesis


### b. Are the profiles coincident? (Use your code in #1)

In [6]:
# Single contrast that sums the three measurements, written directly as a
# 1 x 3 matrix so the function sees one contrast row.
C = np.array([[1, 1, 1]])

profile_analysis(turtle_female, turtle_male, alpha=0.05, C=C)

Test statistic = 24.965
t2 = 24.965
p-value = 8.894702339275939e-06
Reject the null hypothesis


### c. Repeat (a) and (b) using Python packages.

In [7]:
# Data preparation: turn the three measurements into the two successive
# differences (t1 - t2, t2 - t3) and attach the sex label again.

# Integer column labels so that DataFrame.dot can align them with C.T's index.
diff = turtle.drop(columns=['sex']).set_axis([0, 1, 2], axis=1)
C = pd.DataFrame([[1, -1, 0], [0, 1, -1]])

diff_F = diff.dot(C.T).set_axis(['t12', 't23'], axis=1)

para = diff_F.join(turtle[['sex']])
para.head()

Unnamed: 0,t12,t23,sex
0,17,43,female
1,19,46,female
2,17,44,female
3,19,44,female
4,21,44,female


In [8]:
# (a) Are the profiles parallel?
# One-way MANOVA of the two contrast variables on sex.

from statsmodels.multivariate.manova import MANOVA

maov = MANOVA.from_formula(
    't12 + t23 ~ sex',
    data=para,
)
print(maov.mv_test())

                   Multivariate linear model
                                                               
---------------------------------------------------------------
       Intercept         Value  Num DF  Den DF  F Value  Pr > F
---------------------------------------------------------------
          Wilks' lambda  0.0175 2.0000 45.0000 1260.1630 0.0000
         Pillai's trace  0.9825 2.0000 45.0000 1260.1630 0.0000
 Hotelling-Lawley trace 56.0072 2.0000 45.0000 1260.1630 0.0000
    Roy's greatest root 56.0072 2.0000 45.0000 1260.1630 0.0000
---------------------------------------------------------------
                                                               
---------------------------------------------------------------
             sex           Value  Num DF  Den DF F Value Pr > F
---------------------------------------------------------------
             Wilks' lambda 0.7489 2.0000 45.0000  7.5439 0.0015
            Pillai's trace 0.2511 2.0000 45.0000  7.5439 0.

In [9]:
# (b) Are the profiles coincident?

from scipy import stats
from scipy.stats import ttest_ind

In [10]:
# Univariate response: row-wise total t1 + t2 + t3, computed separately per sex.

is_female = turtle['sex'] == 'female'
is_male = turtle['sex'] == 'male'
female = turtle.loc[is_female].iloc[:, :3].sum(axis=1)
male = turtle.loc[is_male].iloc[:, :3].sum(axis=1)

In [11]:
# (1) Equal-variance check (Levene's test, median-centered, which is scipy's default).
# The p-value is below 0.05, so the equal-variance assumption does not hold.
stats.levene(female, male, center='median')

LeveneResult(statistic=6.045379899395555, pvalue=0.017766075695295522)

In [12]:
# (2) Two-sample t-test on the summed measurements.
# equal_var=True (pooled variance) is kept even though Levene's test rejected
# equal variances: the pooled test is what reproduces the p-value of the
# coincidence test computed with profile_analysis in part (b).
stats.ttest_ind(a=female, b=male, equal_var=True)

Ttest_indResult(statistic=4.996482809354179, pvalue=8.894702339275784e-06)

### d. Test the equality of the two population mean vectors at $\alpha = 0.05$ using the one-way MANOVA approach (use a Python package).

In [13]:
# One-way MANOVA of the three raw measurements on sex: tests equality of the
# female and male mean vectors.
maov = MANOVA.from_formula(
    't1+t2+t3 ~ sex',
    data=turtle,
)
print(maov.mv_test())

                   Multivariate linear model
                                                               
---------------------------------------------------------------
       Intercept         Value  Num DF  Den DF  F Value  Pr > F
---------------------------------------------------------------
          Wilks' lambda  0.0144 3.0000 44.0000 1001.2534 0.0000
         Pillai's trace  0.9856 3.0000 44.0000 1001.2534 0.0000
 Hotelling-Lawley trace 68.2673 3.0000 44.0000 1001.2534 0.0000
    Roy's greatest root 68.2673 3.0000 44.0000 1001.2534 0.0000
---------------------------------------------------------------
                                                               
---------------------------------------------------------------
             sex           Value  Num DF  Den DF F Value Pr > F
---------------------------------------------------------------
             Wilks' lambda 0.3886 3.0000 44.0000 23.0782 0.0000
            Pillai's trace 0.6114 3.0000 44.0000 23.0782 0.