In [1]:
%mkdir -p emperical_eq_fp

In [2]:
import pandas as pd

In [3]:
from scipy.stats import ttest_ind, mannwhitneyu, brunnermunzel, median_test

from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from scipy.stats import norm

def hodges_lehmann(vals, alpha=0.01):

    A, B = vals

    n = len(A)
    m = len(B)

    M = list(sorted([a - b for a in A for b in B]))

    # https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
    # half of sample size +- z score for CI * pooled std of sample size
    # expected U H0, sd U H0
    ZN = norm.ppf(1 - alpha / 2)  # one of tails
    EUH0 = n * m / 2
    SUH0 = (n * m * (n + m + 1) / 12)**0.5

    L = EUH0 - ZN * SUH0
    U = EUH0 + ZN * SUH0

    # for python
    lower = int(round(L) - 1)
    upper = int(round(U) - 1)

    # for boundaries
    lower = max(lower, 0)
    upper = min(upper, len(M) - 1)

    lower = M[lower]
    upper = M[upper]

    return (lower, upper)

In [5]:
def pseuodo_log(x, pseudo_count=1):
    x = np.array(x) + pseudo_count
    x = np.log(x)
    return x

In [8]:
ROWS = []

# for sample size
for SIZE in [15, 25, 50, 100]: 

    RES = {}

    t = tqdm(range(100000))

    VALS = {}

    for ATT in t:

        np.random.seed(ATT)

        DRAW = True

        while DRAW:
            S = np.random.negative_binomial(np.random.randint(1, 10),
                                            np.random.uniform(),
                                            size=SIZE *2)
            S = S.astype('float').tolist()

            if len(set(S)) > 1:
                DRAW = False

        S1 = S[:SIZE]
        S2 = S[SIZE:]

        U, p1 = mannwhitneyu(S1, S2)
        try:
            p1_0 = median_test(S1, S2).pvalue
        except:
            p1_0 = 1
        
        try:
            p1_1 = brunnermunzel(S1, S2).pvalue
        except:
            p1_1 = 1

        s, p2 = ttest_ind(S1, S2, equal_var=True)
        s, p3 = ttest_ind(S1, S2, equal_var=False)

        s, p4 = ttest_ind(pseuodo_log(S1), pseuodo_log(S2), equal_var=True) #  eq. OLS solution
        s, p5 = ttest_ind(pseuodo_log(S1), pseuodo_log(S2), equal_var=False)

        ci = CI = hodges_lehmann([S1, S2], 0.05)
        strict05 = np.sign(ci[0]) == np.sign(ci[1]) and (ci[0] != 0
                                                         and ci[1] != 0)

        ci = CI = hodges_lehmann([S1, S2], 0.01)
        strict01 = np.sign(ci[0]) == np.sign(ci[1]) and (ci[0] != 0
                                                         and ci[1] != 0)

        ci = CI = hodges_lehmann([S1, S2], 0.001)
        strict001 = np.sign(ci[0]) == np.sign(ci[1]) and (ci[0] != 0
                                                          and ci[1] != 0)

        d = (2 * U) / (SIZE**2) - 1
        d = abs(d)

        RES[ATT] = {
            'MWU': p1,
            'MT': p1_0,
            'BM': p1_1,
            'TT': p2,
            'TTW': p3,
            'PL-TT': p4,
            'PL-TTW': p5,
            'HL-NO_0.05': strict05,
            'HL-NO_0.01': strict01,
            'HL-NO_0.001': strict001,
            'd': d
        }

    RES = pd.DataFrame(RES).T

    for r in ['MWU', 'MT', 'BM', 'TT', 'TTW', 'PL-TT', 'PL-TTW']:
        RES[f'{r}_0.05'] = RES[r] <= 0.05
        RES[f'{r}_0.01'] = RES[r] <= 0.01
        RES[f'{r}_0.001'] = RES[r] <= 0.001

    RES['CD_0.15'] = RES['d'] >= 0.15
    RES['CD_0.33'] = RES['d'] >= 0.33
    RES['CD_0.47'] = RES['d'] >= 0.47

    RES['CD_0.15_0.05'] = RES['CD_0.15'] & RES['MWU_0.05']
    RES['CD_0.33_0.05'] = RES['CD_0.33'] & RES['MWU_0.05']
    RES['CD_0.47_0.05'] = RES['CD_0.47'] & RES['MWU_0.05']

    ROW = pd.DataFrame(RES.iloc[:, 7:].astype('int').sum().T.drop('d'),
                       columns=[f"n={SIZE}"])

    ROWS.append(ROW)

100%|████████████████████████████████████████████████████████████████| 100000/100000 [14:07<00:00, 117.95it/s]


In [18]:
RESF = pd.concat(ROWS, axis=1)

ORDER = [
    'TT_0.05', 'TTW_0.05', 'PL-TT_0.05', 'PL-TTW_0.05', 'MWU_0.05', 'BM_0.05',
    'MT_0.05', 'HL-NO_0.05'
]
ORDER += [
    'TT_0.01', 'TTW_0.01', 'PL-TT_0.01', 'PL-TTW_0.01', 'MWU_0.01', 'BM_0.01',
    'MT_0.01', 'HL-NO_0.01'
]
ORDER += [
    'TT_0.001', 'TTW_0.001', 'PL-TT_0.001', 'PL-TTW_0.001', 'MWU_0.001',
    'BM_0.001', 'MT_0.001', 'HL-NO_0.001'
]
ORDER += [
    'CD_0.15', 'CD_0.33', 'CD_0.47', 'CD_0.15_0.05', 'CD_0.33_0.05',
    'CD_0.47_0.05'
]


RESF.loc[ORDER].to_excel('emperical_eq_fp/emperical_fp_100k_eq.xlsx')

RESF.loc[ORDER]

Unnamed: 0,n=15,n=25,n=50,n=100
TT_0.05,4793,4869,4895,4926
TTW_0.05,4572,4794,4884,4920
PL-TT_0.05,4893,5059,4964,4960
PL-TTW_0.05,4818,5032,4953,4958
MWU_0.05,4558,4863,4922,4834
BM_0.05,5283,5268,5067,4903
MT_0.05,1792,2872,3119,3626
HL-NO_0.05,1793,1458,1076,816
TT_0.01,925,932,900,919
TTW_0.01,821,863,892,914
