# Block pattern discovery (unsupervised)

Goal: detect candidate row-order blocks using feature patterns (no labels).
We search for abrupt changes in rolling statistics across rows.

In [1]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd


DATA_DIR = Path("../data")


def load_features(data_dir: Path) -> pd.DataFrame:
    train = pd.read_csv(data_dir / "training_set_features.csv", index_col="respondent_id")
    test = pd.read_csv(data_dir / "test_set_features.csv", index_col="respondent_id")
    train = train.copy()
    test = test.copy()
    train["__is_train__"] = 1
    test["__is_train__"] = 0
    full = pd.concat([train, test], axis=0)
    return full


def rolling_diff(series: pd.Series, window: int) -> pd.Series:
    roll = series.rolling(window=window, min_periods=window // 2).mean()
    diff = roll.diff().abs()
    return diff


def top_category_share(series: pd.Series) -> pd.Series:
    if series.dropna().empty:
        return pd.Series(np.nan, index=series.index)
    top = series.value_counts(dropna=True).idxmax()
    return (series == top).astype(float)


def aggregate_change_scores(
    X: pd.DataFrame, window: int = 200, top_n: int = 25
) -> pd.DataFrame:
    num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

    # exclude helper column from numeric stats
    if "__is_train__" in num_cols:
        num_cols.remove("__is_train__")

    change = pd.Series(0.0, index=X.index)

    # numeric: rolling mean changes
    for col in num_cols:
        diff = rolling_diff(X[col].astype(float), window)
        if diff.notna().any():
            norm = diff / (diff.quantile(0.95) + 1e-9)
            change = change.add(norm.fillna(0.0), fill_value=0.0)

    # categorical: rolling changes in top-category share
    for col in cat_cols:
        share = top_category_share(X[col])
        diff = rolling_diff(share, window)
        if diff.notna().any():
            norm = diff / (diff.quantile(0.95) + 1e-9)
            change = change.add(norm.fillna(0.0), fill_value=0.0)

    # missingness changes (all columns)
    miss = X.isna().mean(axis=1)
    diff = rolling_diff(miss, window)
    if diff.notna().any():
        norm = diff / (diff.quantile(0.95) + 1e-9)
        change = change.add(norm.fillna(0.0), fill_value=0.0)

    out = pd.DataFrame({"change_score": change})
    out = out.sort_values("change_score", ascending=False)
    return out.head(top_n)


X_full = load_features(DATA_DIR)
X_full = X_full.reset_index(drop=True)

candidate_breaks = aggregate_change_scores(X_full, window=200, top_n=30)
candidate_breaks

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,change_score
17676,26.115285
12184,25.644517
109,25.341919
31358,25.195328
34823,24.907154
45047,24.863731
51689,24.766945
47998,24.665532
43421,24.629819
14624,24.494097


## Refine breakpoints and check train/test boundary

We filter candidates to enforce a minimum spacing so we do not over-segment noise, and we also check the train/test boundary index.

In [2]:
def select_breaks(candidates: pd.DataFrame, min_gap: int = 500, max_breaks: int = 8) -> list[int]:
    chosen: list[int] = []
    for idx in candidates.index.tolist():
        if all(abs(idx - c) >= min_gap for c in chosen):
            chosen.append(int(idx))
        if len(chosen) >= max_breaks:
            break
    return sorted(chosen)


train_size = pd.read_csv(DATA_DIR / "training_set_features.csv").shape[0]
train_test_boundary = train_size

print("Train size:", train_size)
print("Boundary index (train->test):", train_test_boundary)

near_boundary = candidate_breaks.loc[
    (candidate_breaks.index >= train_test_boundary - 500)
    & (candidate_breaks.index <= train_test_boundary + 500)
]
print("Candidates near boundary:\n", near_boundary)

filtered_breaks = select_breaks(candidate_breaks, min_gap=800, max_breaks=8)
print("Filtered breaks (min_gap=800):", filtered_breaks)

Train size: 26707
Boundary index (train->test): 26707
Candidates near boundary:
        change_score
26676     24.284725
Filtered breaks (min_gap=800): [109, 12184, 17676, 31358, 34823, 45047, 47998, 51689]


## Label impact by detected blocks (train only)

We use the detected breaks (plus the train/test boundary) to define blocks, then check how label rates vary by block on the training portion only.

In [3]:
y = pd.read_csv(DATA_DIR / "training_set_labels.csv", index_col="respondent_id")

def assign_blocks(n_rows: int, breaks: list[int]) -> pd.Series:
    breaks = sorted([b for b in breaks if 0 < b < n_rows])
    labels = np.zeros(n_rows, dtype=int)
    current = 0
    for i, b in enumerate(breaks):
        labels[current:b] = i
        current = b
    labels[current:] = len(breaks)
    return pd.Series(labels, name="block_id")

def build_selected_breaks(
    breaks: list[int], boundary: int, min_block_start: int = 500
) -> list[int]:
    cleaned = [b for b in breaks if b >= min_block_start]
    cleaned = sorted(set(cleaned + [boundary]))
    return cleaned


selected_breaks = build_selected_breaks(filtered_breaks, train_test_boundary, min_block_start=500)
print("Selected breaks:", selected_breaks)

block_id_full = assign_blocks(len(X_full), selected_breaks)
block_id_train = block_id_full.iloc[:train_size]

block_sizes = block_id_train.value_counts().sort_index().rename("n")
block_rates = y.groupby(block_id_train).mean()
block_rates = block_rates.join(block_sizes)

overall = y.mean()
for col in y.columns:
    block_rates[f"delta_{col}"] = block_rates[col] - overall[col]

block_rates

Selected breaks: [12184, 17676, 26707, 31358, 34823, 45047, 47998, 51689]


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine,n,delta_h1n1_vaccine,delta_seasonal_vaccine
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.214215,0.467334,12184,0.001762,0.001726
1,0.209213,0.4685,5492,-0.00324,0.002891
2,0.212047,0.461521,9031,-0.000406,-0.004087


## Top drifting features around a breakpoint

For each selected breakpoint, we compare a window before vs after to see which features shifted most.

In [4]:
def top_drift_features(X: pd.DataFrame, break_idx: int, window: int = 200, top_n: int = 8) -> pd.DataFrame:
    start = max(0, break_idx - window)
    end = min(len(X), break_idx + window)
    left = X.iloc[start:break_idx]
    right = X.iloc[break_idx:end]

    num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    if "__is_train__" in num_cols:
        num_cols.remove("__is_train__")

    rows = []

    for col in num_cols:
        l = left[col].astype(float)
        r = right[col].astype(float)
        if l.notna().any() and r.notna().any():
            diff = float(abs(l.mean() - r.mean()))
            scale = float(l.std() + r.std() + 1e-9)
            rows.append({"feature": col, "type": "numeric", "score": diff / scale})

    for col in cat_cols:
        l = left[col]
        r = right[col]
        if l.dropna().empty or r.dropna().empty:
            continue
        top = pd.concat([l, r]).value_counts(dropna=True).idxmax()
        l_share = float((l == top).mean())
        r_share = float((r == top).mean())
        rows.append({"feature": col, "type": "categorical", "score": abs(l_share - r_share)})

    miss = X.isna().mean(axis=1)
    miss_diff = float(abs(miss.iloc[start:break_idx].mean() - miss.iloc[break_idx:end].mean()))
    rows.append({"feature": "__missing_rate__", "type": "missingness", "score": miss_diff})

    out = pd.DataFrame(rows).sort_values("score", ascending=False)
    return out.head(top_n)


for b in filtered_breaks:
    print("\nBreakpoint:", b)
    display(top_drift_features(X_full, b, window=200, top_n=8))


Breakpoint: 109


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
5,behavioral_wash_hands,numeric,0.132197
8,behavioral_touch_face,numeric,0.087554
18,opinion_seas_vacc_effective,numeric,0.072492
3,behavioral_avoidance,numeric,0.069586
11,chronic_med_condition,numeric,0.066395
15,opinion_h1n1_vacc_effective,numeric,0.063883
9,doctor_recc_h1n1,numeric,0.060524
21,household_adults,numeric,0.057918



Breakpoint: 12184


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
15,opinion_h1n1_vacc_effective,numeric,0.110779
17,opinion_h1n1_sick_from_vacc,numeric,0.101099
4,behavioral_face_mask,numeric,0.099646
8,behavioral_touch_face,numeric,0.085971
14,health_insurance,numeric,0.075077
0,h1n1_concern,numeric,0.07177
3,behavioral_avoidance,numeric,0.070461
6,behavioral_large_gatherings,numeric,0.069638



Breakpoint: 17676


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
22,household_children,numeric,0.080296
27,income_poverty,categorical,0.07
20,opinion_seas_sick_from_vacc,numeric,0.069755
31,hhs_geo_region,categorical,0.065
23,age_group,categorical,0.055
32,census_msa,categorical,0.045
3,behavioral_avoidance,numeric,0.044642
11,chronic_med_condition,numeric,0.043876



Breakpoint: 31358


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
29,rent_or_own,categorical,0.155
6,behavioral_large_gatherings,numeric,0.109493
2,behavioral_antiviral_meds,numeric,0.095301
7,behavioral_outside_home,numeric,0.091164
14,health_insurance,numeric,0.088945
31,hhs_geo_region,categorical,0.085
23,age_group,categorical,0.08
16,opinion_h1n1_risk,numeric,0.072183



Breakpoint: 34823


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
11,chronic_med_condition,numeric,0.095119
8,behavioral_touch_face,numeric,0.081099
26,sex,categorical,0.08
4,behavioral_face_mask,numeric,0.078568
19,opinion_seas_risk,numeric,0.069661
3,behavioral_avoidance,numeric,0.065905
30,employment_status,categorical,0.065
12,child_under_6_months,numeric,0.063379



Breakpoint: 45047


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
11,chronic_med_condition,numeric,0.082954
1,h1n1_knowledge,numeric,0.063444
10,doctor_recc_seasonal,numeric,0.055784
15,opinion_h1n1_vacc_effective,numeric,0.055569
6,behavioral_large_gatherings,numeric,0.053314
8,behavioral_touch_face,numeric,0.05072
28,marital_status,categorical,0.05
27,income_poverty,categorical,0.05



Breakpoint: 47998


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
11,chronic_med_condition,numeric,0.12431
9,doctor_recc_h1n1,numeric,0.090666
0,h1n1_concern,numeric,0.066563
28,marital_status,categorical,0.06
1,h1n1_knowledge,numeric,0.05274
21,household_adults,numeric,0.050184
25,race,categorical,0.05
13,health_worker,numeric,0.045376



Breakpoint: 51689


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


Unnamed: 0,feature,type,score
8,behavioral_touch_face,numeric,0.140575
10,doctor_recc_seasonal,numeric,0.097112
26,sex,categorical,0.085
16,opinion_h1n1_risk,numeric,0.083645
13,health_worker,numeric,0.071545
27,income_poverty,categorical,0.065
1,h1n1_knowledge,numeric,0.061257
28,marital_status,categorical,0.055


## Create block IDs from candidate breakpoints

You can adjust the breakpoint list based on the candidate indices above.

In [5]:
def assign_blocks(n_rows: int, breaks: list[int]) -> pd.Series:
    breaks = sorted([b for b in breaks if 0 < b < n_rows])
    labels = np.zeros(n_rows, dtype=int)
    current = 0
    for i, b in enumerate(breaks):
        labels[current:b] = i
        current = b
    labels[current:] = len(breaks)
    return pd.Series(labels, name="block_id")


# Example: use top 5 candidate breakpoints
top_breaks = candidate_breaks.index[:5].tolist()
block_id = assign_blocks(len(X_full), top_breaks)
block_id.value_counts().sort_index()

block_id
0      109
1    12075
2     5492
3    13682
4     3465
5    18592
Name: count, dtype: int64

## Optional: quick missingness map for a few blocks

This helps visualize how missingness changes across detected blocks.

In [6]:
sample_cols = X_full.columns[:20].tolist()
miss_by_block = X_full[sample_cols].isna().groupby(block_id).mean()
miss_by_block

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0.018349,0.0,0.0,0.018349,0.0,0.009174,0.0,0.009174,0.0,0.055046,0.055046,0.009174,0.018349,0.018349,0.40367,0.018349,0.0,0.0,0.0,0.009174
1,0.002899,0.004224,0.002567,0.008861,0.000745,0.001159,0.00323,0.003395,0.004638,0.079669,0.079669,0.03619,0.031056,0.030642,0.461863,0.015652,0.014741,0.014907,0.017971,0.019793
2,0.003642,0.005827,0.003095,0.007465,0.000364,0.001639,0.00346,0.002549,0.00346,0.085033,0.085033,0.032775,0.025856,0.025127,0.460306,0.012564,0.0122,0.013292,0.015295,0.015477
3,0.004093,0.004385,0.002704,0.007163,0.000731,0.0019,0.00307,0.002558,0.005409,0.081859,0.081859,0.038956,0.033548,0.033182,0.456878,0.015714,0.015787,0.015641,0.017907,0.021269
4,0.002597,0.004618,0.003463,0.008081,0.001154,0.001732,0.002309,0.002309,0.004329,0.080808,0.080808,0.036075,0.032035,0.029726,0.460606,0.015296,0.013564,0.013853,0.017316,0.019913
5,0.002958,0.004249,0.002851,0.007799,0.000699,0.001398,0.002743,0.003496,0.004948,0.079873,0.079873,0.033724,0.02926,0.028292,0.457509,0.014038,0.013985,0.013716,0.016566,0.017642
