In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the
# feature array and the saved model loaded later in this notebook are read
# from paths underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import joblib

# Locations of the normalized feature array and the saved K-means model
adhd_features_path = "/content/drive/MyDrive/Dataset/model/normalized_adhd_features.npy"
model_path = "/content/drive/MyDrive/Dataset/model/final/Kmeans_TBR_Model.pkl"

# Read both artifacts from Drive
normalized_adhd_features = np.load(adhd_features_path)
kmeans = joblib.load(model_path)

# One cluster index per sample, plus the number of cluster centers in the model
clusters = kmeans.predict(normalized_adhd_features)
optimal_k = kmeans.cluster_centers_.shape[0]

# The ANOVA and Tukey tests below need exactly ONE value per sample so that
# the flattened values line up with `clusters` (one label per sample).
# Flattening a multi-column array would produce more values than labels, so
# fail early with a clear message instead of running on misaligned data.
if normalized_adhd_features.size != len(clusters):
    raise ValueError(
        "Expected exactly one feature value per sample, got array of shape "
        f"{normalized_adhd_features.shape} for {len(clusters)} cluster labels"
    )
feature_values = normalized_adhd_features.ravel()

# Extract features for each cluster (1-D array of values per cluster index)
cluster_features = [feature_values[clusters == i] for i in range(optimal_k)]

# Perform ANOVA across all clusters
f_val, p_val = stats.f_oneway(*cluster_features)
print(f"ANOVA results: F={f_val:.3f}, p={p_val:.4f}")

# Tukey's HSD test: pairwise comparisons of cluster means at FWER = 0.05
tukey = pairwise_tukeyhsd(
    endog=feature_values,
    groups=clusters,
    alpha=0.05
)
print(tukey.summary())

# Assign labels based on statistical significance
if p_val < 0.05:
    # Labels in order of ascending cluster mean
    # (inverse relationship between TBR and attention)
    attention_labels = ["High Attention", "Mid Attention", "Low Attention"]

    # The labelling scheme only makes sense with one label per cluster.
    # Indexing a fixed three positions would raise IndexError for fewer
    # clusters and silently leave clusters unlabeled for more.
    if len(cluster_features) != len(attention_labels):
        raise ValueError(
            f"Attention labelling is defined for {len(attention_labels)} clusters, "
            f"but the model produced {len(cluster_features)}"
        )

    # Get mean values for each cluster
    cluster_means = [np.mean(features) for features in cluster_features]

    # Order clusters from low to high mean value
    sorted_indices = np.argsort(cluster_means)

    # Map cluster index -> label; keys are converted to plain ints so the
    # mapping is keyed by ordinary Python integers rather than NumPy scalars.
    label_mapping = {
        int(cluster): label
        for cluster, label in zip(sorted_indices, attention_labels)
    }

    print("\nStatistically significant cluster labels:")
    for cluster, label in label_mapping.items():
        print(f"Cluster {cluster}: {label} (mean TBR = {cluster_means[cluster]:.3f})")
else:
    # Make the skipped labelling visible instead of printing nothing.
    print(f"\nANOVA not significant (p = {p_val:.4g}); cluster labels were not assigned.")

ANOVA results: F=6955.708, p=0.0000
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1  -1.1532   0.0 -1.1949 -1.1115   True
     0      2   1.2992   0.0  1.2541  1.3443   True
     1      2   2.4523   0.0  2.4035  2.5011   True
---------------------------------------------------

Statistically significant cluster labels:
Cluster 1: High Attention (mean TBR = -1.111)
Cluster 0: Mid Attention (mean TBR = 0.043)
Cluster 2: Low Attention (mean TBR = 1.342)


In [None]:
# 1. Get number of groups and total observations.
#    `.size` counts every value that enters the sums of squares below, which
#    keeps the count consistent with the flattened per-cluster arrays.
n_groups = optimal_k
n_total = normalized_adhd_features.size

# 2. Overall grand mean
grand_mean = np.mean(normalized_adhd_features)

# 3. Calculate SS_between and SS_total
#    SS_between: per-cluster size times squared distance of its mean from the grand mean.
ss_between = sum(
    len(cluster) * (np.mean(cluster) - grand_mean) ** 2
    for cluster in cluster_features
)
#    SS_total: vectorised instead of a Python-level loop over every element.
ss_total = np.sum((normalized_adhd_features - grand_mean) ** 2)

# 4. Effect size (eta squared) = share of total variance explained by cluster membership
eta_squared = ss_between / ss_total

# 5. Degrees of freedom
df_between = n_groups - 1
df_within = n_total - n_groups

# 6. Print results
print(f"\nDegrees of freedom (within): {df_within}")
print(f"Effect size (η²): {eta_squared:.4f}")



Degrees of freedom (within): 3369
Effect size (η²): 0.8050


In [None]:
from itertools import combinations

def cohen_d(x, y):
    """Return Cohen's d for samples ``x`` and ``y``.

    The difference of the sample means (``x`` minus ``y``) is divided by the
    pooled standard deviation, which is built from the two sample variances
    (``ddof=1``) weighted by their degrees of freedom.
    """
    size_x, size_y = len(x), len(y)
    dof = size_x + size_y - 2
    weighted_var = (size_x - 1) * np.var(x, ddof=1) + (size_y - 1) * np.var(y, ddof=1)
    mean_gap = np.mean(x) - np.mean(y)
    return mean_gap / np.sqrt(weighted_var / dof)

# Effect size for every unordered pair of clusters (i < j), in index order
print("\nPairwise Cohen's d:")
for i in range(optimal_k):
    for j in range(i + 1, optimal_k):
        d = cohen_d(cluster_features[i], cluster_features[j])
        print(f"Cluster {i} vs. Cluster {j}: Cohen's d = {d:.4f}")


Pairwise Cohen's d:
Cluster 0 vs. Cluster 1: Cohen's d = 2.9739
Cluster 0 vs. Cluster 2: Cohen's d = -2.9692
Cluster 1 vs. Cluster 2: Cohen's d = -4.7842


In [None]:
# Report the unrounded Tukey HSD statistics.
# The values come from the result object's public attributes rather than the
# private `_results_table`, whose entries are display-rounded (p-values read
# from it print as exactly 0).
print("\nExact Tukey HSD p-values:")
group_labels = tukey.groupsunique
# Pairs are the upper triangle of the group-by-group matrix: (0,1), (0,2), (1,2), ...
# This is the same ordering as the rows of the Tukey summary table.
first_idx, second_idx = np.triu_indices(len(group_labels), k=1)
pair_stats = zip(first_idx, second_idx, tukey.meandiffs, tukey.pvalues, tukey.confint)
for g1, g2, meandiff, p_adj, (lower, upper) in pair_stats:
    group1, group2 = group_labels[g1], group_labels[g2]
    print(f"{group1} vs. {group2}: mean diff = {meandiff:.4f}, p = {p_adj:.6g}, CI = [{lower:.4f}, {upper:.4f}]")


Exact Tukey HSD p-values:
0 vs. 1: mean diff = -1.1532, p = 0, CI = [-1.1949, -1.1115]
0 vs. 2: mean diff = 1.2992, p = 0, CI = [1.2541, 1.3443]
1 vs. 2: mean diff = 2.4523, p = 0, CI = [2.4035, 2.5011]


In [None]:
# Per-cluster descriptive statistics: mean, sample SD and size
print("\nCluster Means and Standard Deviations:")
for i, cluster in enumerate(cluster_features):
    # ddof=1 gives the sample (n - 1) standard deviation
    mean, std = cluster.mean(), cluster.std(ddof=1)
    print(f"Cluster {i}: Mean = {mean:.4f}, SD = {std:.4f}, n = {len(cluster)}")


Cluster Means and Standard Deviations:
Cluster 0: Mean = 0.0426, SD = 0.3390, n = 1542
Cluster 1: Mean = -1.1106, SD = 0.4512, n = 1028
Cluster 2: Mean = 1.3417, SD = 0.5820, n = 802
