In [None]:
import numpy as np
import scipy.stats as stats
from scipy.stats import sem, t

# --- Effect Size Function ---
def paired_cohens_d_and_ci(a, b, confidence=0.95):
    """
    Computes Cohen's d for paired samples and its confidence interval.

    Parameters:
        a (array-like): First set of paired observations.
        b (array-like): Second set of paired observations.
        confidence (float): Confidence level for the interval (default is 0.95).

    Returns:
        d (float): Cohen's d effect size.
        ci_low (float): Lower bound of the confidence interval.
        ci_high (float): Upper bound of the confidence interval.
    """
    diff = np.array(a) - np.array(b)
    n = len(diff)
    mean_diff = np.mean(diff)
    std_diff = np.std(diff, ddof=1)

    if std_diff == 0:
        return 0.0 if mean_diff == 0 else np.nan, np.nan, np.nan

    d = mean_diff / std_diff
    se = sem(diff)
    t_crit = t.ppf(1 - (1 - confidence)/2, n - 1)
    ci_low = (mean_diff - t_crit * se) / std_diff
    ci_high = (mean_diff + t_crit * se) / std_diff

    return d, ci_low, ci_high


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu
from scipy.stats import ttest_rel, t, sem, shapiro # Import shapiro

# Load the scores (update file paths as needed)
scores_boruta = pd.read_csv('../Elements/Boruta_RusBoost/AcrossSplits/scores.csv')
scores_Multi = pd.read_csv('../Elements/Multi_SMOTE_LogReg/AcrossSplits/scores.csv')


# Extract F1-scores
f1_boruta = scores_boruta["F1-score"][:-2]
f1_logreg = scores_Multi["F1-score"][:-2]

# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_boruta) != len(f1_logreg):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test (for paired/related samples)
stat_t, p_t = ttest_rel(f1_boruta, f1_logreg)

print("paired t test = {:.4f}, p-value = {:.4f}".format(stat_t, p_t))
if p_t < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


paired t test = 2.2656, p-value = 0.0861
Fail to reject the null hypothesis (no significant difference between the two models).


In [10]:
d, ci_low, ci_high = paired_cohens_d_and_ci(f1_boruta, f1_logreg)
print(d)


1.0132220175170246


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu
from scipy.stats import ttest_rel

# Load the scores (update file paths as needed)
scores_Rus = pd.read_csv('../Elements/RusBoost/AcrossSplits/scores.csv')
scores_boruta = pd.read_csv('../Elements/Boruta_RusBoost/AcrossSplits/scores.csv')


# Extract F1-scores
f1_boruta = scores_boruta["F1-score"][:-2]
f1_Rus = scores_Rus["F1-score"][:-2]

# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_boruta) != len(f1_Rus):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test (for paired/related samples)
w_stat, p_value_wilcoxon = ttest_rel(f1_boruta, f1_Rus)

print("paired t test = {:.4f}, p-value = {:.4f}".format(w_stat, p_value_wilcoxon))
if p_value_wilcoxon < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


paired t test = 1.0716, p-value = 0.3442
Fail to reject the null hypothesis (no significant difference between the two models).


In [14]:
d,  ci_low, ci_up = paired_cohens_d_and_ci(f1_boruta, f1_Rus)
print(f"Cohen's d: {d:.3f}")
print(f"95% CI: [{ci_low:.4f}, {ci_up:.4f}]")

Cohen's d: 0.479
95% CI: [-0.7624, 1.7209]


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu

# Load the scores (update file paths as needed)
scores_boruta = pd.read_csv('../Elements/Boruta_RusBoost/AcrossSplits/scores.csv')
scores_Log = pd.read_csv('../Elements/SMOTE_LogReg/AcrossSplits/scores.csv')


# Extract F1-scores
f1_boruta = scores_boruta["F1-score"][:-2]
f1_Log = scores_Log["F1-score"][:-2]

# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_boruta) != len(f1_Log):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test (for paired/related samples)
w_stat, p_value_wilcoxon = ttest_rel(f1_boruta, f1_Log)

print("T test: w-statistic = {:.4f}, p-value = {:.4f}".format(w_stat, p_value_wilcoxon))
if p_value_wilcoxon < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


T test: w-statistic = 1.6410, p-value = 0.1761
Fail to reject the null hypothesis (no significant difference between the two models).


In [18]:
d,  ci_low, ci_up = paired_cohens_d_and_ci(f1_boruta, f1_Log)
print(f"Cohen's d: {d:.3f}")
print(f"95% CI: [{ci_low:.4f}, {ci_up:.4f}]")

Cohen's d: 0.734
95% CI: [-0.5078, 1.9755]


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu

# Load the scores (update file paths as needed)
scores_Multi = pd.read_csv('../Elements/Multi_SMOTE_LogReg/AcrossSplits/scores.csv')
scores_Rus = pd.read_csv('../Elements/RusBoost/AcrossSplits/scores.csv')


# Extract F1-scores
f1_multi = scores_Multi["F1-score"][:-2]
f1_Rus = scores_Rus["F1-score"][:-2]

# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_multi) != len(f1_Rus):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test (for paired/related samples)
w_stat, p_value_wilcoxon = ttest_rel(f1_multi, f1_Rus)

print("paired T test: w-statistic = {:.4f}, p-value = {:.4f}".format(w_stat, p_value_wilcoxon))
if p_value_wilcoxon < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


paired T test: w-statistic = -1.4927, p-value = 0.2098
Fail to reject the null hypothesis (no significant difference between the two models).


In [2]:
f1_Rus

0    0.771
1    0.804
2    0.831
3    0.720
4    0.792
Name: F1-score, dtype: float64

In [23]:
d, ci_low, ci_up = paired_cohens_d_and_ci(f1_multi, f1_Rus)
print(f"Cohen's d: {d:.3f}")
print(f"95% CI: [{ci_low:.4f}, {ci_up:.4f}]")

Cohen's d: -0.668
95% CI: [-1.9092, 0.5741]


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu

# Load the scores (update file paths as needed)
scores_Multi = pd.read_csv('../Elements/Multi_SMOTE_LogReg/AcrossSplits/scores.csv')
scores_Log = pd.read_csv('../Elements/SMOTE_LogReg/AcrossSplits/scores.csv')


# Extract F1-scores
f1_multi = scores_Multi["F1-score"][:-2]
f1_Log = scores_Log["F1-score"][:-2]

# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_multi) != len(f1_Log):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test (for paired/related samples)
w_stat, p_value_wilcoxon = ttest_rel(f1_multi, f1_Log)

print("paired T  test: w-statistic = {:.4f}, p-value = {:.4f}".format(w_stat, p_value_wilcoxon))
if p_value_wilcoxon < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


paired T  test: w-statistic = -1.3061, p-value = 0.2616
Fail to reject the null hypothesis (no significant difference between the two models).


In [27]:
d, ci_low, ci_up = paired_cohens_d_and_ci(f1_multi, f1_Log)
print(f"Cohen's d: {d:.3f}")
print(f"95% CI: [{ci_low:.4f}, {ci_up:.4f}]")

Cohen's d: -0.584
95% CI: [-1.8258, 0.6575]


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu

# Load the scores (update file paths as needed)
scores_Rus = pd.read_csv('../Elements/RusBoost/AcrossSplits/scores.csv')
scores_Log = pd.read_csv('../Elements/SMOTE_LogReg/AcrossSplits/scores.csv')


# Extract F1-scores
f1_Rus = scores_Rus["F1-score"][:-2]
f1_Log = scores_Log["F1-score"][:-2]




# Significance level
alpha = 0.05

# Ensure the lengths match for paired test
if len(f1_Rus) != len(f1_Log):
    raise ValueError("The two samples must have the same length for the Wilcoxon signed-rank test.")

# Perform paired t test(for paired/related samples)
w_stat, p_value_wilcoxon = ttest_rel(f1_Rus, f1_Log)

print("paired t test = {:.4f}, p-value = {:.4f}".format(w_stat, p_value_wilcoxon))
if p_value_wilcoxon < alpha:
    print("Reject the null hypothesis (significant difference between the two models).")
else:
    print("Fail to reject the null hypothesis (no significant difference between the two models).")


paired t test = 1.0584, p-value = 0.3496
Fail to reject the null hypothesis (no significant difference between the two models).


In [32]:
d,  ci_low, ci_up = paired_cohens_d_and_ci(f1_Rus, f1_Log)
print(f"Cohen's d: {d:.3f}")
print(f"95% CI: [{ci_low:.4f}, {ci_up:.4f}]")

Cohen's d: 0.473
95% CI: [-0.7683, 1.7150]


In [33]:
from scipy.stats import shapiro

scores_boruta = pd.read_csv('../Elements/Boruta_RusBoost/AcrossSplits/scores.csv')
stat, p_value = shapiro(scores_boruta)
print(f"Shapiro-Wilk test W={stat:.4f}, p={p_value:.4f}")



Shapiro-Wilk test W=nan, p=1.0000


In [2]:
import pandas as pd
scores_Rus = pd.read_csv('../Elements/RusBoost/AcrossSplits/scores.csv')
f1_Rus = scores_Rus["F1-score"][:-2]
f1_Rus

0    0.771
1    0.804
2    0.831
3    0.720
4    0.792
Name: F1-score, dtype: float64

In [4]:
import pandas as pd
from scipy.stats import ttest_rel, wilcoxon, shapiro
from itertools import combinations
from statsmodels.stats.multitest import multipletests
import numpy as np

file_list = [
    '../Elements/RusBoost/AcrossSplits/scores.csv',
    '../Elements/RusBoost/MinimumSet/scores.csv',
    '../Elements/RusBoost/FirstOrder/scores.csv',
    '../Elements/RusBoost/TwoFeatures/scores.csv'
]

def load_f1_scores(filepath):
    df = pd.read_csv(filepath)["F1-score"].iloc[:5].values 
    return df

def cohens_d_paired(x, y):
    diff = x - y
    mean_diff = np.mean(diff)
    sd_diff = np.std(diff, ddof=1)  # campionaria
    return mean_diff / sd_diff

alpha = 0.05
file_pairs = list(combinations(file_list, 2))

p_values = []
test_used = []
results = []

for f1_path, f2_path in file_pairs:
    f1_1 = load_f1_scores(f1_path)
    f1_2 = load_f1_scores(f2_path)

    if len(f1_1) != len(f1_2):
        print(f"[ERRORE] Lunghezze diverse tra {f1_path} e {f2_path}, salto il confronto.\n")
        continue

    diff = f1_1 - f1_2
    stat_shapiro, p_shapiro = shapiro(diff)

    if p_shapiro > alpha:
        # differenze normali → paired t-test
        stat, p_val = ttest_rel(f1_1, f1_2)
        test = "Paired t-test"
    else:
        # differenze non normali → Wilcoxon
        stat, p_val = wilcoxon(f1_1, f1_2)
        test = "Wilcoxon signed-rank test"

    d = cohens_d_paired(f1_1, f1_2)

    p_values.append(p_val)
    test_used.append(test)
    results.append((f1_path, f2_path, stat, p_val, d))

# Correzione Holm-Bonferroni
reject, pvals_corrected, _, _ = multipletests(p_values, alpha=alpha, method='holm')

for i, (f1_path, f2_path, stat, p_val, d) in enumerate(results):
    print(f"Confronto tra:\n  {f1_path}\n  {f2_path}")
    print(f"Test usato: {test_used[i]}")
    print(f"Statistic = {stat:.4f}, p-value originale = {p_val:.4f}, p-value corretto Holm = {pvals_corrected[i]:.4f}")
    print(f"Cohen's d (effetto) = {d:.4f}")
    if reject[i]:
        print("Risultato: Differenza significativa dopo correzione Holm (rifiuto H0)\n")
    else:
        print("Risultato: Nessuna differenza significativa dopo correzione Holm (non rifiuto H0)\n")


Confronto tra:
  ../Elements/RusBoost/AcrossSplits/scores.csv
  ../Elements/RusBoost/MinimumSet/scores.csv
Test usato: Paired t-test
Statistic = -0.7939, p-value originale = 0.4717, p-value corretto Holm = 1.0000
Cohen's d (effetto) = -0.3550
Risultato: Nessuna differenza significativa dopo correzione Holm (non rifiuto H0)

Confronto tra:
  ../Elements/RusBoost/AcrossSplits/scores.csv
  ../Elements/RusBoost/FirstOrder/scores.csv
Test usato: Paired t-test
Statistic = -0.7775, p-value originale = 0.4803, p-value corretto Holm = 1.0000
Cohen's d (effetto) = -0.3477
Risultato: Nessuna differenza significativa dopo correzione Holm (non rifiuto H0)

Confronto tra:
  ../Elements/RusBoost/AcrossSplits/scores.csv
  ../Elements/RusBoost/TwoFeatures/scores.csv
Test usato: Paired t-test
Statistic = 2.8910, p-value originale = 0.0445, p-value corretto Holm = 0.2096
Cohen's d (effetto) = 1.2929
Risultato: Nessuna differenza significativa dopo correzione Holm (non rifiuto H0)

Confronto tra:
  ../Ele

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import sem, t

file_list = [
    '../Elements/RusBoost/AcrossSplits/scores.csv',
    '../Elements/RusBoost/MinimumSet/scores.csv',
    '../Elements/RusBoost/FirstOrder/scores.csv',
    '../Elements/RusBoost/TwoFeatures/scores.csv'
]

def load_f1_scores(filepath):
    df = pd.read_csv(filepath)["F1-score"].iloc[:5].values 
    return df

def cohens_d_paired(x, y):
    diff = x - y
    mean_diff = np.mean(diff)
    sd_diff = np.std(diff, ddof=1)  # campionaria
    return mean_diff / sd_diff

alpha = 0.05

minimumset = load_f1_scores(file_list[1])
all = load_f1_scores(file_list[0])
print(cohens_d_paired(minimumset, all))

# --- Effect Size Function ---
def paired_cohens_d_and_ci(a, b, confidence=0.95):
    """
    Computes Cohen's d for paired samples and its confidence interval.

    Parameters:
        a (array-like): First set of paired observations.
        b (array-like): Second set of paired observations.
        confidence (float): Confidence level for the interval (default is 0.95).

    Returns:
        d (float): Cohen's d effect size.
        ci_low (float): Lower bound of the confidence interval.
        ci_high (float): Upper bound of the confidence interval.
    """
    diff = np.array(a) - np.array(b)
    n = len(diff)
    mean_diff = np.mean(diff)
    std_diff = np.std(diff, ddof=1)

    if std_diff == 0:
        return 0.0 if mean_diff == 0 else np.nan, np.nan, np.nan

    d = mean_diff / std_diff
    se = sem(diff)
    t_crit = t.ppf(1 - (1 - confidence)/2, n - 1)
    ci_low = (mean_diff - t_crit * se) / std_diff
    ci_high = (mean_diff + t_crit * se) / std_diff

    return d, ci_low, ci_high

paired_cohens_d_and_ci(minimumset, all)

0.3550225616304195


(0.3550225616304195, -0.886641436573347, 1.596686559834186)

In [7]:
paired_cohens_d_and_ci( all, minimumset)

(-0.3550225616304195, -1.596686559834186, 0.886641436573347)