# In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from TELF.post_processing.ArcticFox import ClusteringAnalyzer

# ──────────────────────────────────────────────────────────────────────────────
# 1) Factor‐based clustering
# ──────────────────────────────────────────────────────────────────────────────
# Three-term vocabulary shared by the factor-based and hierarchical examples.
vocab = ["apple", "banana", "cherry"]

# W has 3 rows (one per vocab entry) and 2 columns; H has 2 rows and
# 6 columns (one per document in df_factor below).
# Presumably W is words x factors and H is factors x documents -- confirm
# against the ClusteringAnalyzer documentation.
W = np.array(
    [
        [0.6, 0.1],
        [0.3, 0.5],
        [0.1, 0.4],
    ]
)
H = np.array(
    [
        [5, 1, 2, 4, 1, 3],
        [1, 4, 5, 2, 5, 3],
    ]
)

# Toy corpus: six short documents, each paired with a year.
df_factor = pd.DataFrame(
    {
        "clean_title_abstract": [
            "apple banana apple",
            "banana cherry banana",
            "cherry apple cherry",
            "banana banana apple",
            "cherry cherry banana",
            "apple cherry banana",
        ],
        "year": [2020, 2021, 2020, 2022, 2021, 2022],
    }
)

# Run the analyzer with the factor matrices and report what it returned.
factor_analyzer = ClusteringAnalyzer(
    top_n_words=2,
    out_dir="example_out/factor",
    archive_subdir="archive_factor",
)
out1 = factor_analyzer.analyze(
    df=df_factor,
    W=W,
    H=H,
    vocab=vocab,
)
print("1) Factor‐based clustering wrote:", out1)


# ──────────────────────────────────────────────────────────────────────────────
# 2) Label‐based clustering
# ──────────────────────────────────────────────────────────────────────────────
# Same documents as df_factor, plus a precomputed cluster label per row.
# DataFrame.assign returns a new frame, so df_factor itself is left untouched.
df_labels = df_factor.assign(predicted_cluster=[0, 1, 1, 0, 1, 0])

# Here the analyzer is told which column carries the labels instead of
# being given W/H matrices.
label_analyzer = ClusteringAnalyzer(
    top_n_words=3,
    out_dir="example_out/labels",
    archive_subdir="archive_labels",
)
out2 = label_analyzer.analyze(
    df=df_labels,
    cluster_col="predicted_cluster",
)
print("2) Label‐based clustering wrote:", out2)


# ──────────────────────────────────────────────────────────────────────────────
# 3) Pass‐through (no clustering)
# ──────────────────────────────────────────────────────────────────────────────
# A frame with text only: no factor matrices and no label column are supplied.
df_none = pd.DataFrame(
    {
        "clean_title_abstract": [
            "just some text",
            "more text here",
            "even more free text",
        ],
    }
)

# analyze() receives nothing but the DataFrame in this mode.
none_analyzer = ClusteringAnalyzer(
    top_n_words=5,
    out_dir="example_out/none",
    archive_subdir="archive_none",
)
out3 = none_analyzer.analyze(df=df_none)
print("3) Pass‐through wrote:", out3)


# ──────────────────────────────────────────────────────────────────────────────
# 4) Hierarchical HNMFk clustering
# ──────────────────────────────────────────────────────────────────────────────
# We'll create a dummy HNMFk‐like object with one leaf node covering the entire df.
class DummyHNMFkModel:
    """Minimal stand-in for an HNMFk model that only exposes its nodes.

    The object stores whatever ``nodes`` value it is given and hands the
    very same object back from :meth:`traverse_nodes`.
    """

    def __init__(self, nodes) -> None:
        """Remember ``nodes`` without copying or validating it."""
        self._nodes = nodes

    def traverse_nodes(self):
        """Return the stored ``nodes`` object unchanged."""
        return self._nodes

# define one leaf node with W, H and all row indices
# Location recorded as the save path of the single leaf node.
node_dir = Path("example_out/hnmfk/k=2")

# One leaf node that reuses the W/H matrices from the factor-based example
# and references every row position of df_factor.
nodes = [
    dict(
        leaf=True,
        W=W,
        H=H,
        signature=None,  # not used when W/H are provided
        probabilities=None,
        original_indices=list(range(df_factor.shape[0])),
        node_save_path=str(node_dir.joinpath("cluster_for_k=2.csv")),
    )
]
hnmfk_model = DummyHNMFkModel(nodes)

# Hand the dummy model to the analyzer in place of explicit W/H arguments.
hnmfk_analyzer = ClusteringAnalyzer(
    top_n_words=2,
    out_dir="example_out/hnmfk",
    archive_subdir="archive_hnmfk",
)
out4 = hnmfk_analyzer.analyze(
    df=df_factor,
    hnmfk_model=hnmfk_model,
    vocab=vocab,
    process_parents=False,
    skip_completed=True,
)
print("4) Hierarchical HNMFk clustering wrote:", out4)
