In [4]:
import numpy as np
from statsmodels.stats.power import FTestAnovaPower

np.random.seed(42)  # For reproducibility
n_runs = 5
sd = 0.1

no_prompt = np.random.normal(loc=0.5, scale=sd, size=n_runs).clip(0, 1)
prompt1 = np.random.normal(loc=0.6, scale=sd, size=n_runs).clip(0, 1)
prompt2 = np.random.normal(loc=0.7, scale=sd, size=n_runs).clip(0, 1)
prompt3 = np.random.normal(loc=0.75, scale=sd, size=n_runs).clip(0, 1)

print("No Prompt:", no_prompt)
print("Prompt 1:", prompt1)
print("Prompt 2:", prompt2)
print("Prompt 3:", prompt3)

No Prompt: [0.54967142 0.48617357 0.56476885 0.65230299 0.47658466]
Prompt 1: [0.5765863  0.75792128 0.67674347 0.55305256 0.654256  ]
Prompt 2: [0.65365823 0.65342702 0.72419623 0.50867198 0.52750822]
Prompt 3: [0.69377125 0.64871689 0.78142473 0.65919759 0.60876963]


In [10]:
# Calculate means and SD
data = [no_prompt, prompt1, prompt2, prompt3]
means = [np.mean(group) for group in data]
sds = [np.std(group, ddof=1) for group in data]

print("Means:", means)
print("SDs:", sds)

# Effect size (f)
grand_mean = np.mean(means)
pooled_sd = np.sqrt(np.mean([sd**2 for sd in sds]))
f = np.sqrt(np.mean([(m - grand_mean)**2 for m in means]) / len(means)) / pooled_sd # Cohen’s f
print(f"Effect size (f): {f:.3f}")  # e.g., 6.25 (huge due to low SD)

# Power analysis
sample_size = FTestAnovaPower().solve_power(effect_size=f, alpha=0.05, power=0.8, k_groups=4)
print(f"Runs per prompt (pooled): {sample_size:.0f}")


Means: [0.5459002974325087, 0.6437119249072426, 0.6134923349602497, 0.6783760181232674]
SDs: [0.07082322188687894, 0.08211426041162091, 0.09198370331800733, 0.06509334847919003]
Effect size (f): 0.312
Runs per prompt (pooled): 116
