In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics

In [None]:
# Aggregate raw experiment outputs into one CSV of mean scores per method.
#
# Expected layout: ../results/<method>/<experiment file>, where every
# experiment file is a pickled sequence of dict-like records exposing
# 'dataset_name', 'predicted_labels' and 'original_labels'.
results_root = '../results/'
output_dir = './experiment_results/'

# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before spending time on the metric computation.
os.makedirs(output_dir, exist_ok=True)

for folder in sorted(os.listdir(results_root)):
    method_dir = os.path.join(results_root, folder)
    if not os.path.isdir(method_dir):
        # Stray files next to the method folders are not experiments.
        continue

    # One record per (experiment, dataset); the frame is built once at the
    # end instead of being grown with pd.concat inside the loop.
    records = []
    for exp in sorted(os.listdir(method_dir)):
        path = os.path.join(method_dir, exp)
        # NOTE: allow_pickle=True can execute arbitrary code while loading;
        # only point this cell at result files you produced yourself.
        exp_result = np.load(path, allow_pickle=True)

        for run in exp_result:
            original = run['original_labels']
            predicted = run['predicted_labels']
            records.append({
                'dataset_name': run['dataset_name'],
                'RI': metrics.rand_score(
                    labels_true=original, labels_pred=predicted),
                'ARI': metrics.adjusted_rand_score(
                    labels_true=original, labels_pred=predicted),
                'NMI': metrics.normalized_mutual_info_score(
                    labels_true=original, labels_pred=predicted),
            })

    if not records:
        # Nothing to average for this method; do not write an empty CSV.
        continue

    method_df = pd.DataFrame(records, columns=['dataset_name', 'RI', 'ARI', 'NMI'])

    # Average every metric over all experiments of the same dataset.
    method_df = method_df.groupby('dataset_name').mean()
    method_df.reset_index().to_csv(os.path.join(output_dir, folder + '_results.csv'))

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math


def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,
                width=6, textspace=1, reverse=False, filename=None, **kwargs):
    """
    Draw a critical-difference style diagram of average ranks with matplotlib.

    A horizontal rank axis is drawn near the top of the figure. Every method
    is connected to its position on the axis by an elbow line and labelled
    with its name: the first half of the sorted methods on the left side, the
    remaining ones on the right side.

    Args:
        avranks: Sequence of average ranks, one per method.
        names: Sequence of method names, indexed like ``avranks``.
        cd: Critical difference. When truthy and ``cdmethod`` is None, thick
            horizontal bars are drawn across groups of methods whose ranks
            differ by at most ``cd``.
        cdmethod: Index into ``avranks``. When given together with ``cd``, an
            interval of ``avranks[cdmethod] +/- cd`` is drawn on the axis
            instead of the group bars.
        lowv: Lowest rank shown on the axis. Defaults to
            ``min(1, floor(min(avranks)))``.
        highv: Highest rank shown on the axis. Defaults to
            ``max(len(avranks), ceil(max(avranks)))``.
        width: Figure width, passed to ``figsize``.
        textspace: Margin kept free on each side of the axis; the axis spans
            ``width - 2 * textspace``.
        reverse: If True, methods are sorted in descending rank order and the
            axis direction is flipped.
        filename: If truthy, the figure is rendered to this file through an
            Agg canvas.
        **kwargs: Forwarded to the canvas ``print_figure`` call when
            ``filename`` is given.

    Returns:
        None. The figure is created with ``plt.figure`` and is not returned.

    Raises:
        ImportError: If matplotlib cannot be imported.
    """
    try:
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_agg import FigureCanvasAgg
    except ImportError:
        raise ImportError("Function graph_ranks requires matplotlib.")

    width = float(width)
    textspace = float(textspace)

    def nth(l, n):
        """Return the n-th element of every item of l (n may be negative)."""
        n = lloc(l, n)
        return [a[n] for a in l]

    def lloc(l, n):
        """Turn a negative index n into a positive one using len(l[0])."""
        if n < 0:
            return len(l[0]) + n
        else:
            return n

    def mxrange(lr):
        """Yield index tuples for the nested ranges described by lr.

        Every entry of lr is an argument list for range(); a bare int is
        treated as a one-element argument list.
        """
        if not len(lr):
            yield ()
        else:
            # it can work with single numbers
            index = lr[0]
            if isinstance(index, int):
                index = [index]
            for a in range(*index):
                for b in mxrange(lr[1:]):
                    yield tuple([a] + list(b))

    def print_figure(fig, *args, **kwargs):
        """Render fig through a FigureCanvasAgg, forwarding all arguments."""
        canvas = FigureCanvasAgg(fig)
        canvas.print_figure(*args, **kwargs)

    sums = avranks

    # Sort by rank, remembering the original position so the names can be
    # reordered the same way.
    tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)
    ssums = nth(tempsort, 0)
    sortidx = nth(tempsort, 1)
    nnames = [names[x] for x in sortidx]

    # Default axis bounds cover at least 1..len(avranks).
    if lowv is None:
        lowv = min(1, int(math.floor(min(ssums))))
    if highv is None:
        highv = max(len(avranks), int(math.ceil(max(ssums))))

    # Vertical position of the rank axis (same units as width/height).
    cline = 0.4

    k = len(sums)

    lines = None

    # Vertical space reserved for the group bars; stays 0 when none are drawn.
    linesblank = 0
    scalewidth = width - 2 * textspace

    def rankpos(rank):
        """Map a rank value to its horizontal position on the axis."""
        if not reverse:
            a = rank - lowv
        else:
            a = highv - rank
        # NOTE(review): divides by zero when highv == lowv (e.g. a single
        # method with rank 1 and default bounds).
        return textspace + scalewidth / (highv - lowv) * a

    distanceh = 0.25

    if cd and cdmethod is None:
        # get pairs of non significant methods

        def get_lines(sums, hsd):
            """Return index pairs (i, j), i < j, of ranks within hsd of each
            other, dropping pairs covered by a longer such pair."""
            # get all pairs
            lsums = len(sums)
            allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]
            # remove not significant
            notSig = [(i, j) for i, j in allpairs
                      if abs(sums[i] - sums[j]) <= hsd]
            # keep only longest

            def no_longer(ij_tuple, notSig):
                """True if no pair in notSig covers ij_tuple while extending
                it on at least one side."""
                i, j = ij_tuple
                for i1, j1 in notSig:
                    if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):
                        return False
                return True

            longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]

            return longest

        lines = get_lines(ssums, cd)
        linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1

        # add scale
        distanceh = 0.25
        # Push the axis down to leave room above it.
        cline += distanceh

    # calculate height needed height of an image
    minnotsignificant = max(2 * 0.2, linesblank)
    height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant

    fig = plt.figure(figsize=(width, height))
    fig.set_facecolor('white')
    ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
    ax.set_axis_off()

    # All drawing helpers take coordinates in width/height units and scale
    # them into the 0..1 axes range with these factors.
    hf = 1. / height  # height factor
    wf = 1. / width

    def hfl(l):
        """Scale a list of vertical coordinates into axes units."""
        return [a * hf for a in l]

    def wfl(l):
        """Scale a list of horizontal coordinates into axes units."""
        return [a * wf for a in l]


    # Upper left corner is (0,0).
    # The white diagonal is presumably only there to establish the data
    # range - NOTE(review): confirm it is still needed given the explicit
    # limits below.
    ax.plot([0, 1], [0, 1], c="w")
    ax.set_xlim(0, 1)
    ax.set_ylim(1, 0)  # inverted limits: y grows downwards

    def line(l, color='k', **kwargs):
        """
        Draw a polyline through l, a list of (x, y) points given in
        width/height units. Extra keyword arguments go to ax.plot.
        """
        ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)

    def text(x, y, s, *args, **kwargs):
        """Draw string s at (x, y), given in width/height units, at size 18."""
        ax.text(wf * x, hf * y, s, size=18, *args, **kwargs)

    # The rank axis itself.
    line([(textspace, cline), (width - textspace, cline)], linewidth=1.7)

    bigtick = 0.1
    smalltick = 0.05

    # Ticks every 0.5 between lowv and highv: long ones at whole numbers,
    # short ones at the halves.
    tick = None
    for a in list(np.arange(lowv, highv, 0.5)) + [highv]:
        tick = smalltick
        if a == int(a):
            tick = bigtick
        line([(rankpos(a), cline - tick / 2),
              (rankpos(a), cline)],
             linewidth=1.7)

    # Numeric labels at whole ranks. `tick` still holds the value set for the
    # last tick of the loop above (the one at highv).
    for a in range(lowv, highv + 1):
        text(rankpos(a), cline - tick / 2 - 0.05, str(a),
             ha="center", va="bottom")

    k = len(ssums)

    # Horizontal offsets of the name label (a) and of the end of its
    # connecting line (b), measured from the edge of the axis.
    a = 0.4
    b = a + 0.1

    # First half of the sorted methods: labels on the left side, each one
    # 0.24 lower than the previous.
    for i in range(math.ceil(k / 2)):
        chei = cline + minnotsignificant + i * 0.24
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace + b, chei)],
             linewidth=1.7)
        text(textspace + a, chei, nnames[i], ha="right", va="center")

    # Second half: labels on the right side, with the last method placed
    # highest.
    for i in range(math.ceil(k / 2), k):
        chei = cline + minnotsignificant + (k - i - 1) * 0.24
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace + scalewidth - b, chei)],
             linewidth=1.7)
        text(textspace + scalewidth - a, chei, nnames[i],
             ha="left", va="center")

    if cd and cdmethod is None:
        # upper scale
        # NOTE(review): begin/end are computed here but never used in this
        # branch, so no bar showing the size of cd itself is drawn.
        if not reverse:
            begin, end = rankpos(lowv), rankpos(lowv + cd)
        else:
            begin, end = rankpos(highv), rankpos(highv - cd)

        # no-significance lines
        def draw_lines(lines, side=0.05, height=0.1):
            """Draw one thick bar per (l, r) pair, starting 0.2 below the
            axis and stepping down by `height` for each further bar."""
            start = cline + 0.2
            for l, r in lines:
                line([(rankpos(ssums[l]) - side, start),
                      (rankpos(ssums[r]) + side, start)],
                     linewidth=2.5)
                start += height

        draw_lines(lines)

    elif cd:
        # Interval of +/- cd around the selected method, drawn on the axis
        # with a short vertical cap at each end.
        begin = rankpos(avranks[cdmethod] - cd)
        end = rankpos(avranks[cdmethod] + cd)
        line([(begin, cline), (end, cline)],
             linewidth=2.5)
        line([(begin, cline + bigtick / 2),
              (begin, cline - bigtick / 2)],
             linewidth=2.5)
        line([(end, cline + bigtick / 2),
              (end, cline - bigtick / 2)],
             linewidth=2.5)

    if filename:
        print_figure(fig, filename, **kwargs)


import os
import matplotlib.pyplot as plt


def compute_CD(avranks, n, alpha="0.05", test="nemenyi"):
    """
    Return the critical difference for the Nemenyi or Bonferroni-Dunn test.

    The value is ``q * sqrt(k * (k + 1) / (6 * n))`` where ``k`` is the number
    of compared methods (``len(avranks)``) and ``q`` is looked up in a table
    of critical values.

    Args:
        avranks: Average ranks, one per method. Only its length is used.
        n: Number of datasets the ranks were averaged over.
        alpha: Significance level, 0.05 or 0.1. Accepted both as the original
            strings ("0.05", "0.1") and as numbers.
        test: "nemenyi" for the two-tailed Nemenyi test or "bonferroni-dunn"
            for the Bonferroni-Dunn test.

    Returns:
        The critical difference as a float. For ``k < 2`` the tabulated
        critical value is 0, so the result is 0.

    Raises:
        KeyError: If the (test, alpha) combination is not tabulated.
        IndexError: If there are more methods than the table covers
            (20 for Nemenyi, 10 for Bonferroni-Dunn).
    """
    k = len(avranks)
    # Critical values indexed by the number of methods k (entries 0 and 1 are
    # placeholders so that the list index equals k).
    critical_values = {
        ("nemenyi", "0.05"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,
                              2.849705, 2.94832, 3.030879, 3.101730, 3.163684,
                              3.218654, 3.268004, 3.312739, 3.353618, 3.39123,
                              3.426041, 3.458425, 3.488685, 3.517073,
                              3.543799],
        ("nemenyi", "0.1"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,
                             2.588521, 2.692732, 2.779884, 2.854606, 2.919889,
                             2.977768, 3.029694, 3.076733, 3.119693, 3.159199,
                             3.195743, 3.229723, 3.261461, 3.291224, 3.319233],
        ("bonferroni-dunn", "0.05"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,
                                      2.638, 2.690, 2.724, 2.773],
        ("bonferroni-dunn", "0.1"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,
                                     2.394, 2.450, 2.498, 2.539],
    }

    # Normalise alpha so that 0.05, "0.05" and "0.050" all hit the same key.
    # Anything that is not a number is passed through and rejected below.
    try:
        alpha_key = "{:g}".format(float(alpha))
    except (TypeError, ValueError):
        alpha_key = alpha

    try:
        q = critical_values[(test, alpha_key)]
    except KeyError:
        supported = sorted(critical_values)
        raise KeyError(
            "no critical values for test={!r}, alpha={!r}; supported "
            "combinations: {}".format(test, alpha, supported)) from None

    if k >= len(q):
        raise IndexError(
            "critical values for test={!r} are tabulated for at most {} "
            "methods, got {}".format(test, len(q) - 1, k))

    cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5
    return cd

def stat_plots(methods, path, metric="RI", alpha="0.1"):
    """
    Rank the methods on every dataset and show a critical-difference diagram.

    Args:
        methods: Iterable of CSV file names, one per method. The file name
            without its extension is used as the label in the diagram.
        path: Directory that contains the CSV files.
        metric: Column to compare. Higher values are ranked better
            (rank 1 is the highest value on a dataset). Defaults to "RI".
        alpha: Significance level forwarded to ``compute_CD``.

    Raises:
        ValueError: If no scores could be loaded, or if the files do not all
            contain the same number of rows.

    Rows are paired by position, so every CSV must list the same datasets in
    the same order.
    """
    def cd_diagram_process(df, rank_ascending=False):
        # Rank across methods (columns) within each dataset (row).
        return df.rank(ascending=rank_ascending, axis=1)

    # One column of scores per method; differing lengths raise ValueError.
    scores = pd.DataFrame({
        method: pd.read_csv(os.path.join(path, method))[metric].values
        for method in methods
    })
    if scores.empty:
        raise ValueError("stat_plots: no scores to compare in {!r}".format(path))

    mean_ranks = cd_diagram_process(scores).mean().sort_values()

    # Label each method by its file name without the extension.
    names = [os.path.splitext(method)[0] for method in mean_ranks.index.values]

    avranks = mean_ranks.values
    # The critical difference depends on the number of datasets actually
    # compared, so take it from the data instead of assuming a fixed count.
    n_datasets = len(scores)
    cd = compute_CD(avranks, n_datasets, alpha)
    graph_ranks(avranks, names, cd=cd, width=5, textspace=0.1)
    plt.show()

In [None]:
path = './experiment_results/'
# Only the per-method CSV files are inputs (the directory may also hold
# checkpoints or other artefacts); sorting makes the run reproducible because
# os.listdir returns entries in arbitrary order.
result_files = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
stat_plots(result_files, path)

In [5]:
import os
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

def wilcoxon_vs_baseline(baseline_file: str,
                         folder: str,
                         metric: str = "RI",
                         alpha: float = 0.01,
                         alternative: str = "greater",
                         print_table: bool = True) -> pd.DataFrame:
    """
    Compare every result CSV in ``folder`` against a baseline CSV with the
    Wilcoxon signed-rank test and Holm-corrected p-values.

    Args:
        baseline_file: File name of the baseline CSV inside ``folder``.
        folder: Directory holding one CSV per algorithm.
        metric: Column to compare in every CSV.
        alpha: Family-wise significance level used for the Holm correction.
        alternative: Passed to ``scipy.stats.wilcoxon``; the tested
            differences are ``algorithm - baseline``.
        print_table: If True, print the summary table.

    Returns:
        A DataFrame with one row per compared algorithm (win / tie / loss
        counts, test statistic, raw and Holm-adjusted p-value, rejection
        flag), sorted by adjusted p-value. It is empty when ``folder``
        contains no CSV other than the baseline.

    Raises:
        ValueError: If a CSV has a different number of rows than the baseline.

    NOTE(review): values are paired by row position only; this assumes every
    CSV lists the same datasets in the same order - confirm for new inputs.
    """
    # --- read baseline once --------------------------------------------------
    base_df = pd.read_csv(os.path.join(folder, baseline_file))
    base_vals = base_df[metric].values
    algorithms, p_raw, w_stat, n_pos, n_zero, n_neg = [], [], [], [], [], []

    # --- loop through other CSVs --------------------------------------------
    # Sorted so the processing order does not depend on the file system.
    for csv_name in sorted(os.listdir(folder)):
        if csv_name == baseline_file or not csv_name.endswith(".csv"):
            continue

        algo_df = pd.read_csv(os.path.join(folder, csv_name))
        algo_vals = algo_df[metric].values

        # sanity-check equal length
        if len(algo_vals) != len(base_vals):
            raise ValueError(f"{csv_name}: length {len(algo_vals)} "
                             f"≠ baseline length {len(base_vals)}")

        # Wilcoxon signed-rank
        stat, p = wilcoxon(algo_vals, base_vals, alternative=alternative)

        # win / tie / loss counts
        diff = algo_vals - base_vals
        n_pos.append((diff > 0).sum())
        n_zero.append((diff == 0).sum())
        n_neg.append((diff < 0).sum())

        algorithms.append(csv_name.replace(".csv", ""))   # nice name
        w_stat.append(stat)
        p_raw.append(p)

    # --- Holm–Bonferroni adjustment -----------------------------------------
    if p_raw:
        reject, p_holm, _, _ = multipletests(p_raw, alpha=alpha, method="holm")
    else:
        # Nothing to compare: a multiple-testing correction over zero
        # p-values is not meaningful, so return an empty table instead.
        reject, p_holm = [], []

    # --- summary table -------------------------------------------------------
    results = pd.DataFrame({
        "Algorithm"  : algorithms,
        "n_pos"      : n_pos,
        "n_zero"     : n_zero,
        "n_neg"      : n_neg,
        "W_stat"     : w_stat,
        "p_raw"      : p_raw,
        "p_Holm"     : p_holm,
        f"Reject_H0@α={alpha}": reject
    }).sort_values("p_Holm")

    if print_table:
        print("\nWilcoxon vs baseline ({})  —  Holm–Bonferroni α = {}\n"
              .format(alternative, alpha))
        print(results.to_string(index=False, float_format="%.4g"))

    return results

In [None]:
# Test every method's RI against the baseline CSV; the returned table is the
# cell's displayed output.
baseline_file = "kmeans_results.csv"
folder = "./experiment_results"
wilcoxon_vs_baseline(baseline_file=baseline_file, folder=folder, metric="RI")