In [None]:
# Directory prefix (with trailing slash) that is string-concatenated in front
# of the relative 'output/...' paths when the pickled results are opened.
here = '1_Method/'

### Evaluation

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics.cluster import contingency_matrix

def clustering_report(y_true, y_pred):
    """
    Compute clustering metrics between ground-truth labels and cluster labels.

    Pairs in which either label is missing (NA) are dropped before scoring.

    Parameters
    ----------
    y_true : array-like (pd.Series, list, np.ndarray)
        Ground-truth class labels (can be strings or ints).
    y_pred : array-like
        Cluster assignments (can be strings or ints). Must have the same
        length as `y_true`; labels are paired by position.

    Returns
    -------
    dict with keys: 'NMI', 'ACC', 'Purity', 'ARI'
        Each value is a Python float.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or if no non-NA pairs remain.
    """
    # Use a fresh positional index so the two series pair up element by
    # element regardless of whatever index the inputs carried.
    y_true = pd.Series(y_true).reset_index(drop=True)
    y_pred = pd.Series(y_pred).reset_index(drop=True)

    # Without this check, index alignment in the mask below would quietly
    # pair up only part of the data instead of reporting the mismatch.
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})."
        )

    # Drop pairs with NA
    mask = y_true.notna() & y_pred.notna()
    y_true = y_true[mask].to_numpy()
    y_pred = y_pred[mask].to_numpy()

    if y_true.size == 0:
        raise ValueError("No valid (non-NA) label pairs to compare.")

    # Contingency table: rows=true classes, cols=predicted clusters
    C = contingency_matrix(y_true, y_pred)  # shape: [n_true, n_pred]
    N = C.sum()

    # Purity: each cluster contributes the count of its majority true class
    purity = C.max(axis=0).sum() / N

    # ACC: best one-to-one matching of clusters to classes (maximum total
    # count on matched cells), divided by the number of samples.
    try:
        from scipy.optimize import linear_sum_assignment
    except ImportError:
        # Greedy fallback if SciPy isn't available: repeatedly take the
        # largest remaining cell and retire its row and column. Counts are
        # non-negative, so -1 safely marks retired cells, and exactly
        # min(n_true, n_pred) picks are possible.
        C_work = C.astype(float)  # astype returns a copy; C stays intact
        acc_sum = 0.0
        for _ in range(min(C.shape)):
            i, j = np.unravel_index(np.argmax(C_work), C_work.shape)
            acc_sum += C[i, j]
            C_work[i, :] = -1
            C_work[:, j] = -1
        acc = acc_sum / N
    else:
        # Only the import is guarded: a genuine failure in the solver should
        # surface rather than silently degrade to the greedy approximation.
        row_ind, col_ind = linear_sum_assignment(-C)  # negate to maximize
        acc = C[row_ind, col_ind].sum() / N

    # NMI & ARI
    nmi = normalized_mutual_info_score(y_true, y_pred, average_method="arithmetic")
    ari = adjusted_rand_score(y_true, y_pred)

    return {"NMI": float(nmi), "ACC": float(acc), "Purity": float(purity), "ARI": float(ari)}


def evaluate(results):
    """
    Build a table of clustering metrics, one row per result.

    Parameters
    ----------
    results : iterable of dict
        Each entry is read as follows:
        - result['params'] : mapping turned into a one-row DataFrame
          (presumably scalar hyper-parameter values -- confirm with the
          code that produces the results).
        - result['name'] : stored in the 'name' column.
        - result['df']['adj'] : passed to `clustering_report` as y_true.
        - result['output']['topics'] : passed to `clustering_report` as
          y_pred.

    Returns
    -------
    pd.DataFrame
        The parameter columns plus 'name', 'NMI', 'ACC', 'Purity', 'ARI'.
        Every row keeps the label 0 (the index is not reset). An empty
        DataFrame is returned when `results` is empty.
    """
    frames = []
    for result in results:
        row = pd.DataFrame(result['params'], index=[0])
        row['name'] = result['name']
        metrics = clustering_report(result['df']['adj'], result['output']['topics'])
        for key in ('NMI', 'ACC', 'Purity', 'ARI'):
            row[key] = metrics[key]
        frames.append(row)

    # pd.concat raises on an empty list, so keep the empty-input result.
    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies
    # the accumulated table on every iteration.
    return pd.concat(frames)

In [None]:
def search(results, params):
    """
    Filter `results` down to the entries compatible with `params`.

    An entry is kept unless its own 'params' mapping contains one of the
    requested keys with a value that does not equal the requested one.
    Keys absent from an entry's 'params' (or a missing 'params' mapping
    altogether) never exclude it. Input order is preserved.
    """
    def _compatible(entry):
        own = entry.get('params', {})
        for key, wanted in params.items():
            # Only a key that is present AND different disqualifies the entry.
            if key in own and not (own[key] == wanted):
                return False
        return True

    return [entry for entry in results if _compatible(entry)]


# Analysis

In [None]:
import pickle as pkl

# Load previously pickled result collections from `here + 'output/'`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# NOTE(review): the file suffixes (2, 2, 3) do not match the run numbers in
# the variable names (run1, run1, run2) -- confirm this mapping is intended.
with open(here + 'output/results_kmean2.pkl', 'rb') as f:
    results_kmean_run1 = pkl.load(f)

with open(here + 'output/results_hdbscan2.pkl', 'rb') as f:
    results_hdbscan_run1 = pkl.load(f)

# Not referenced again in the visible part of this file.
with open(here + 'output/results_hdbscan3.pkl', 'rb') as f:
    results_hdbscan_run2 = pkl.load(f)

In [None]:
# Stack the metric tables for the results loaded from results_kmean2.pkl and
# results_hdbscan2.pkl into one report. pd.concat is called without
# ignore_index, so the row labels of the two tables are kept as-is.
eval_run_1 = pd.concat([evaluate(results_kmean_run1), evaluate(results_hdbscan_run1)])