# Block pattern discovery (unsupervised)

Goal: detect candidate row-order blocks using feature patterns (no labels).
We search for abrupt changes in rolling statistics across rows.

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd


# Directory holding the feature CSVs; passed to load_features below. The
# path is relative, so it is resolved against the current working directory.
DATA_DIR = Path("../data")


def load_features(data_dir: Path) -> pd.DataFrame:
    """Read the train and test feature files and stack them into one frame.

    Both CSVs are read from ``data_dir`` and indexed by ``respondent_id``.
    A helper column ``__is_train__`` (1 for training rows, 0 for test rows)
    records which file each row came from. Training rows come first in the
    returned frame, followed by test rows.
    """
    sources = (
        ("training_set_features.csv", 1),
        ("test_set_features.csv", 0),
    )
    frames = []
    for filename, is_train in sources:
        frame = pd.read_csv(data_dir / filename, index_col="respondent_id")
        frame["__is_train__"] = is_train
        frames.append(frame)
    return pd.concat(frames, axis=0)


def rolling_diff(series: pd.Series, window: int) -> pd.Series:
    """Return the absolute row-to-row change of a trailing rolling mean.

    The mean covers the last ``window`` rows and is produced once at least
    ``window // 2`` observations are available in that window; positions
    without a usable pair of consecutive means are NaN.
    """
    half_window = window // 2
    smoothed = series.rolling(window=window, min_periods=half_window).mean()
    step = smoothed.diff()
    return step.abs()


def top_category_share(series: pd.Series) -> pd.Series:
    """Flag the rows that hold the most frequent non-missing value.

    Despite the name, the result is a per-row indicator rather than a share:
    1.0 where the row equals the most common value, 0.0 everywhere else
    (missing entries included). Averaging it over a window yields the share.
    If the series has no non-missing values, an all-NaN series on the same
    index is returned.
    """
    has_values = series.notna().any()
    if not has_values:
        return pd.Series(np.nan, index=series.index)

    counts = series.value_counts(dropna=True)
    most_common = counts.idxmax()
    is_most_common = series == most_common
    return is_most_common.astype(float)


def _normalized_change(diff: pd.Series) -> pd.Series:
    """Scale one rolling-difference series so columns are comparable.

    The series is divided by its 95th percentile and missing positions are
    set to 0. When that percentile is 0 (more than 95% of the steps are
    zero), the maximum is used as the scale instead: dividing by the bare
    ``1e-9`` guard would inflate the few non-zero steps by about nine orders
    of magnitude and let a single sparse column dominate the total. A series
    with no positive step contributes all zeros.
    """
    zeros = pd.Series(0.0, index=diff.index)
    if not diff.notna().any():
        return zeros

    scale = diff.quantile(0.95)
    if not scale > 0:
        scale = diff.max()
    if not scale > 0:
        return zeros

    # The small constant is kept so scores are unchanged whenever the
    # 95th percentile is positive.
    return (diff / (scale + 1e-9)).fillna(0.0)


def aggregate_change_scores(
    X: pd.DataFrame, window: int = 200, top_n: int = 25
) -> pd.DataFrame:
    """Score every row by how sharply rolling statistics change there.

    Three kinds of signal are turned into rolling-mean differences (see
    ``rolling_diff``), normalized per signal, and summed:

    * each non-object column cast to float, except the ``__is_train__``
      helper column;
    * for each object column, the indicator from ``top_category_share``;
    * the per-row fraction of missing values across all columns of ``X``
      (the helper column is included in that denominator).

    Args:
        X: Feature frame; rows are scored in their current order.
        window: Rolling window length passed to ``rolling_diff``.
        top_n: Number of highest-scoring rows to return.

    Returns:
        A frame with a single ``change_score`` column, sorted descending and
        truncated to ``top_n`` rows; its index labels are those of ``X``.
    """
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [
        col
        for col in X.select_dtypes(exclude=["object"]).columns
        if col != "__is_train__"  # helper flag, not a feature
    ]

    change = pd.Series(0.0, index=X.index)

    # numeric: rolling mean changes
    for col in num_cols:
        diff = rolling_diff(X[col].astype(float), window)
        change = change + _normalized_change(diff)

    # categorical: rolling changes in top-category share
    for col in cat_cols:
        diff = rolling_diff(top_category_share(X[col]), window)
        change = change + _normalized_change(diff)

    # missingness changes (all columns)
    miss = X.isna().mean(axis=1)
    change = change + _normalized_change(rolling_diff(miss, window))

    scores = pd.DataFrame({"change_score": change})
    return scores.sort_values("change_score", ascending=False).head(top_n)


# Stack train + test features and swap respondent_id for a 0..n-1 positional
# index, so the index labels of the scored rows are row positions.
X_full = load_features(DATA_DIR).reset_index(drop=True)

candidate_breaks = aggregate_change_scores(
    X_full,
    window=200,
    top_n=30,
)
candidate_breaks

## Create block IDs from candidate breakpoints

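You can adjust the breakpoint list based on the candidate indices above. The candidates are ordered by score, not by row position, and neighbouring rows can both rank highly, so consider keeping only one index per cluster.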

In [None]:
def assign_blocks(n_rows: int, breaks: list[int]) -> pd.Series:
    """Label each row position with the number of the block it falls in.

    Each breakpoint ``b`` starts a new block at row ``b``. Breakpoints
    outside ``1 .. n_rows - 1`` are ignored, and the order of ``breaks``
    does not matter.

    Args:
        n_rows: Number of rows to label.
        breaks: Row positions at which a new block begins.

    Returns:
        An integer Series named ``block_id`` of length ``n_rows`` on a
        default 0..n-1 index, with consecutive block numbers starting at 0.
    """
    # Deduplicate as well as sort: a repeated breakpoint would otherwise
    # open an empty block and leave a gap in the block numbering.
    cuts = np.asarray(sorted({b for b in breaks if 0 < b < n_rows}), dtype=int)

    # Row r belongs to block k when exactly k breakpoints are <= r.
    positions = np.arange(n_rows)
    labels = np.searchsorted(cuts, positions, side="right").astype(int)
    return pd.Series(labels, name="block_id")


# Example: take the five highest-scoring candidate rows as breakpoints
top_breaks = candidate_breaks.head(5).index.tolist()
block_id = assign_blocks(n_rows=len(X_full), breaks=top_breaks)
block_id.value_counts().sort_index()

## Optional: quick missingness map for a few blocks

This helps visualize how missingness changes across detected blocks.

In [None]:
# Missing-value rate per block, limited to the first 20 columns of X_full.
sample_cols = list(X_full.columns[:20])
miss_by_block = (
    X_full.loc[:, sample_cols]
    .isna()
    .groupby(block_id)
    .mean()
)
miss_by_block