In [1]:
from scipy.stats import levene, shapiro, ttest_rel, spearmanr, permutation_test
from itertools import permutations
import pandas as pd
import numpy as np
from tqdm import tqdm
import pingouin as pg
import matplotlib.pyplot as plt
import scienceplots
import seaborn as sns
import statsmodels.api as sm
plt.style.use(['science', 'ieee'])

In [2]:
import scipy

In [3]:
scipy.__version__

'1.11.1'

In [4]:
df = pd.read_csv("../results/participants.csv")

In [5]:
# Signed change in F1 per participant (Before minus After): a negative value
# means the score was higher after than before.
df["diff_F1"] = df["Before"].sub(df["After"])

In [6]:
# Rows whose Group label is "A" form the treatment group.
treatment_group = df.loc[df["Group"].eq("A")]

In [7]:
treatment_group

Unnamed: 0,Participant,Total-Size,Before,After,Group,diff_F1
0,1,113,0.72,0.83,A,-0.11
2,3,93,0.9,0.97,A,-0.07
3,4,85,0.88,0.93,A,-0.05
7,8,108,0.87,0.95,A,-0.08
8,9,73,0.73,0.91,A,-0.18
9,10,78,0.79,0.82,A,-0.03
13,11,79,0.57,0.81,A,-0.24


In [8]:
# Rows whose Group label is "P" form the control (placebo) group.
control_group = df.loc[df["Group"].eq("P")]

In [9]:
# Paired F1 scores of the treatment group as plain Python lists; both lists
# keep the row order of `treatment_group`, so index i refers to the same participant.
f1s_before = treatment_group["Before"].tolist()
f1s_after = treatment_group["After"].tolist()

In [10]:
print(f"Mean F1 Before (Treatment Group): {np.mean(f1s_before)}")

Mean F1 Before (Treatment Group): 0.78


In [11]:
# NOTE(review): np.std defaults to ddof=0 (population SD), while the Hedges' g
# cell uses var(ddof=1) -- confirm which convention the reported SD should follow.
print(f"Std. F1 Before (Treatment Group): {np.std(f1s_before)}")

Std. F1 Before (Treatment Group): 0.10875923606099591


In [12]:
print(f"Mean F1 After (Treatment Group): {np.mean(f1s_after)}")

Mean F1 After (Treatment Group): 0.8885714285714287


In [13]:
# NOTE(review): np.std defaults to ddof=0 (population SD), while the Hedges' g
# cell uses var(ddof=1) -- confirm which convention the reported SD should follow.
print(f"Std. F1 After (Treatment Group): {np.std(f1s_after)}")

Std. F1 After (Treatment Group): 0.06197431989566817


## Statistical tests on Treatment (or Action or Experiment) Group

In [14]:
def perm_levene(*samples):
    """Statistic callback for ``permutation_test``: Levene's statistic.

    Every positional sample is forwarded unchanged to ``scipy.stats.levene``;
    only the test statistic is returned and the analytical p-value is dropped.
    """
    outcome = levene(*samples)
    return outcome.statistic

def prem_ttest_rel(*samples):
    """Statistic callback for ``permutation_test``: paired-samples t statistic.

    The samples are forwarded unchanged to ``scipy.stats.ttest_rel``; only the
    t statistic is returned and the analytical p-value is dropped.

    The name is spelled "prem" (sic); it is kept as-is because the
    permutation-test cells call the helper by this name.
    """
    outcome = ttest_rel(*samples)
    return outcome.statistic

In [15]:
# Homoscedasticity check for the treatment group: permutation test built on
# the Levene statistic, permuting observations within each before/after pair.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html
exp_levene_res = permutation_test(
    data=(f1s_before, f1s_after),
    statistic=perm_levene,
    permutation_type="samples",
    alternative="two-sided",
)

In [16]:
# H0: The Levene test tests the null hypothesis that all input samples are from populations with equal variances.
# p > 0.05 => cannot reject H0, the samples may satisfy homoscedasticity
exp_levene_res.pvalue

0.71875

In [17]:
# Testing for normality using the Shapiro-Wilk test for F1s before and after
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
normality_f1s_before = shapiro(f1s_before)

In [18]:
# H0: The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
# p > 0.05 => cannot reject H0, the sample may be normally distributed
normality_f1s_before

ShapiroResult(statistic=0.9058297276496887, pvalue=0.3677538335323334)

In [19]:
normality_f1s_after = shapiro(f1s_after)

In [20]:
# H0: The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
# p > 0.05 => cannot reject H0 (a non-significant result does not prove H0), the sample may be normally distributed
normality_f1s_after

ShapiroResult(statistic=0.872991681098938, pvalue=0.19711165130138397)

In [21]:
# H0: the F1 scores of the treatment group do not differ between the two
# rounds. Paired permutation test (two-sided) on the paired t statistic.
exp_t_stat_res = permutation_test(
    data=(f1s_before, f1s_after),
    statistic=prem_ttest_rel,
    permutation_type="samples",
    alternative="two-sided",
)

In [22]:
exp_t_stat_res.statistic

-3.7999999999999967

In [23]:
exp_t_stat_res.pvalue

0.015625

In [24]:
# p ≤ 0.05, H0 can be rejected => increase is significant

In [25]:
# Calculating Hedges' g for the treatment group: absolute difference of the
# round means divided by the pooled standard deviation weighted by (n_i - 1).
# NOTE(review): no small-sample bias-correction factor is applied here --
# confirm that the uncorrected value is the one meant to be reported.
data = treatment_group.rename(columns={'Before': 'Round 1', 'After': 'Round 2'})
# Long format: one row per (participant, round) with the F1 score in 'Value'.
group_exp_plot = pd.melt(data, id_vars='Participant', value_vars=['Round 1', 'Round 2'],
                         var_name='Round', value_name='Value')
n = group_exp_plot.groupby('Round').count()
n1 = n.loc['Round 1', 'Value']
n2 = n.loc['Round 2', 'Value']
dof = n1 + n2 - 2
variances = group_exp_plot.groupby('Round').var(ddof=1)
var1 = variances.loc['Round 1', 'Value']
var2 = variances.loc['Round 2', 'Value']
# The grouped index holds the string labels 'Round 1'/'Round 2', so the last
# row must be taken positionally with .iloc; a bare `[-1]` relies on the
# deprecated positional fallback of Series.__getitem__.
diff_mean = abs(group_exp_plot.groupby('Round').mean().diff()['Value'].iloc[-1])
s_pooled_star = np.sqrt((((n1 - 1) * var1) + ((n2 - 1) * var2)) / dof)
hedgess_g = diff_mean / s_pooled_star
hedgess_g

1.1356150550392803

## Statistical tests on Placebo (or Control) Group

In [26]:
# Paired F1 scores of the control group as plain Python lists; both lists
# keep the row order of `control_group`, so index i refers to the same participant.
ctrl_f1s_before = control_group["Before"].tolist()
ctrl_f1s_after = control_group["After"].tolist()

In [27]:
# Homoscedasticity check for the control group: permutation test built on
# the Levene statistic, permuting observations within each before/after pair.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html
ctrl_levene_res = permutation_test(
    data=(ctrl_f1s_before, ctrl_f1s_after),
    statistic=perm_levene,
    permutation_type="samples",
    alternative="two-sided",
)

In [28]:
# H0: The Levene test tests the null hypothesis that all input samples are from populations with equal variances.
# p > 0.05 => cannot reject H0, the samples may satisfy homoscedasticity
print(ctrl_levene_res.statistic)
print(ctrl_levene_res.pvalue)

0.09374999999999886
0.6875


In [29]:
# Testing for normality using the Shapiro-Wilk test for F1s before and after
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
normality_ctrl_f1s_before = shapiro(ctrl_f1s_before)

In [30]:
# H0: The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
# p > 0.05 => cannot reject H0, the sample may be normally distributed
normality_ctrl_f1s_before

ShapiroResult(statistic=0.9584856629371643, pvalue=0.8056417107582092)

In [31]:
normality_ctrl_f1s_after = shapiro(ctrl_f1s_after)

In [32]:
# H0: The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
# p > 0.05 => cannot reject H0, the sample may be normally distributed
normality_ctrl_f1s_after

ShapiroResult(statistic=0.9196439385414124, pvalue=0.46663081645965576)

In [33]:
# H0: the F1 scores of the control group do not differ between the two
# rounds. Paired permutation test (two-sided) on the paired t statistic.
ctrl_t_stat_res = permutation_test(
    data=(ctrl_f1s_before, ctrl_f1s_after),
    statistic=prem_ttest_rel,
    permutation_type="samples",
    alternative="two-sided",
)

In [34]:
print(ctrl_t_stat_res.statistic)
print(ctrl_t_stat_res.pvalue)

-2.497480745059397
0.0625


In [35]:
# p > 0.05, H0 cannot be rejected => increase is NOT significant

In [36]:
print(f"Mean F1 Before (Control Group): {np.mean(ctrl_f1s_before)}")

Mean F1 Before (Control Group): 0.8328571428571427


In [37]:
# NOTE(review): np.std defaults to ddof=0 (population SD), while the Hedges' g
# cell uses var(ddof=1) -- confirm which convention the reported SD should follow.
print(f"Std F1 Before (Control Group): {np.std(ctrl_f1s_before)}")

Std F1 Before (Control Group): 0.03806546455564064


In [38]:
print(f"Mean F1 After (Control Group): {np.mean(ctrl_f1s_after)}")

Mean F1 After (Control Group): 0.8571428571428571


In [39]:
# NOTE(review): np.std defaults to ddof=0 (population SD), while the Hedges' g
# cell uses var(ddof=1) -- confirm which convention the reported SD should follow.
print(f"Std F1 After (Control Group): {np.std(ctrl_f1s_after)}")

Std F1 After (Control Group): 0.03149343955006944


In [40]:
treatment_group

Unnamed: 0,Participant,Total-Size,Before,After,Group,diff_F1
0,1,113,0.72,0.83,A,-0.11
2,3,93,0.9,0.97,A,-0.07
3,4,85,0.88,0.93,A,-0.05
7,8,108,0.87,0.95,A,-0.08
8,9,73,0.73,0.91,A,-0.18
9,10,78,0.79,0.82,A,-0.03
13,11,79,0.57,0.81,A,-0.24


In [41]:
spearmanr(treatment_group["Total-Size"], treatment_group["After"])

SignificanceResult(statistic=0.39285714285714296, pvalue=0.38331687042697266)

In [42]:
spearmanr(control_group["Total-Size"], control_group["After"])

SignificanceResult(statistic=-0.18018749253911182, pvalue=0.6990457740939315)

In [43]:
# Inputs of the correlation permutation test for the treatment group:
# x is the Total-Size column and y the signed F1 change (Before minus After).
# `y` is read as a notebook-level global by `statistic1d`.
x, y = treatment_group["Total-Size"], treatment_group["diff_F1"]

In [44]:
def statistic1d(x):
    """Spearman's rho between ``x`` and the notebook-level ``y``.

    Single-sample statistic for
    ``permutation_test(..., permutation_type='pairings')``: only ``x`` is
    passed in, while ``y`` is looked up in the notebook namespace at call
    time, so ``y`` must be defined before this function is called.

    Parameters
    ----------
    x : array-like
        Values to correlate against ``y``.

    Returns
    -------
    float
        The Spearman rank-correlation coefficient.
    """
    # `.statistic` is the documented result attribute and matches the other
    # statistic helpers in this notebook; `.correlation` is a legacy alias.
    return spearmanr(x, y).statistic

In [45]:
# Permutation test of the Spearman correlation between x and y: the pairings
# of x with the fixed y are permuted; n_resamples=np.inf requests all of them.
res = permutation_test(
    data=(x,),
    statistic=statistic1d,
    permutation_type="pairings",
    n_resamples=np.inf,
    alternative="two-sided",
)

In [46]:
res.statistic

0.03571428571428572

In [47]:
res.pvalue

0.9634920634920635

In [None]:
# H0: The change in F1 between before and after the feedback loop is independent of the number of annotations (rho = 0)

