In [1]:
import scipy
import numpy as np
import pandas as pd

In [2]:
ds = [54,55, 63, 60, 61, 58, 51, 53, 64, 67, 70, 70, 67, 62, 58, 61, 54, 50, 48, 64, 57, 47, 47, 42]

In [3]:
A_labels = ["A1", "A2", "A3"]
B_labels = [f"B{i}" for i in range(1,5)]
counter = 0;
data = []
for a in A_labels:
    for b in B_labels:
        data.append([a,b,ds[counter]])
        counter +=1 
        data.append([a,b,ds[counter]])
        counter +=1 

In [4]:
df = pd.DataFrame(data, columns= ["LabelA","LabelB", "val"])

In [51]:
df

Unnamed: 0,LabelA,LabelB,val
0,A1,B1,54
1,A1,B1,55
2,A1,B2,63
3,A1,B2,60
4,A1,B3,61
5,A1,B3,58
6,A1,B4,51
7,A1,B4,53
8,A2,B1,64
9,A2,B1,67


In [83]:
def f_two_way(df: pd.DataFrame, factor1: str, factor2: str, target: str):
    """
    両側検定を仮定 
    
    -----------------
        df: DataFrame
            因子２つとそれに対応するt_columnが入ったデータフレーム
            下記構造であることが期待される。

            *****************************
            |factor1|factor2|target|
            -----------------
            |a1|b2|t1|
            |a2|b2|t2|

            |an|b2|t3|
            *****************************
            
        factor1: str
            因子１とするカラム名

        factor2: str
            因子2とするカラム名

        target: str
            
            
    exception
    ---------
        [factor1, factor2]ごとの試行回数が２以上であり、かつ均一

    return
    ------
        {
            "interraction": {"F:", "df","Prob": }
            target1: {"F:", "df","Prob": }
            target2: {"F:", "df","Prob": }
        }
    """
    from scipy import stats
    
    assert target in df, f'{target} is not found'
    assert factor1 in df, f'{factor1} is not found'
    assert factor2 in df, f'{factor2} is not found'
    assert df.groupby(by = [factor1, factor2]).count()[target].nunique() == 1, f"This function assumes that samples are the same among each parameter specification"

    a = df[factor1].nunique()
    b = df[factor2].nunique()
    r = df.groupby(by = [factor1, factor2]).count()[target][0]
    n = a*b*r

    assert r >= 2, f"number of samples per parameter must be more than 2"
    assert n == df.shape[0], f"something is wrong, n = {n} and number of samples is {df.shape[0]}"

    ct = (df[target].sum()**2)/n
    
    ## 平方和
    S_T = (df[target]**2).sum() -ct
    S_A_OR_B = (df.groupby(by = [factor1, factor2])[target].sum()**2).sum()/r - ct
    SE = S_T - S_A_OR_B
    SA = (df.groupby(by = factor1)[target].sum()**2).sum()/(b*r) - ct
    SB = (df.groupby(by = factor2)[target].sum()**2).sum()/(a*r) - ct
    S_A_and_B = S_A_OR_B - SA - SB
    
    ## 平均平方 (不偏分散)
    VA = SA / (a-1)
    VB = SB / (b-1)
    V_A_and_B = S_A_and_B/((a-1)*(b-1))
    VE = SE/(a*b*(r-1))

    ## F値 (上から、交互作用がないと仮定したときのF値,)
    fval_interaction = V_A_and_B / VE
    fval_1 = VA / VE
    fval_2 = VB / VE

    fs = [fval_interaction, fval_1, fval_2]
    labels = ["interfaction", factor1, factor2]
    degrees = [(a-1)*(b-1), a-1, b-1]
    ps = [ min(stats.f.cdf(f, degree, a*b*(r-1)), stats.f.sf(f, degree, a*b*(r-1))) * 2 for f, degree in zip(fs, degrees)]
    
    
    
    ret = {}
    
    for l,f,d,p in zip(labels,fs,degrees,ps):
        ret[l] = {}
        ret[l]["prob"] = p
        ret[l]["f"] = f
        ret[l]["degree"] = (d, a*b*(r-1))

    return ret


In [84]:
f_two_way(df,"LabelA", "LabelB", "val")

{'interfaction': {'prob': 0.07579837057968339,
  'f': 0.22174226061919233,
  'degree': (6, 12)},
 'LabelA': {'prob': 0.00031820517843342505,
  'f': 19.775377969762417,
  'degree': (2, 12)},
 'LabelB': {'prob': 0.021034664169601827,
  'f': 5.865370770338289,
  'degree': (3, 12)}}

In [89]:
(df.groupby(by = ["LabelA", "LabelB"]).mean()).max()

val    70.0
dtype: float64