In [38]:
!pip install antropy --quiet
!pip install PyWavelets --quiet


In [41]:
# Install/upgrade the CrunchDAO CLI into the kernel environment.
%pip install crunch-cli --upgrade --quiet --progress-bar off
# Set up the workspace for the "structural-break" competition (downloads the data files).
# NOTE(review): the last argument looks like a personal setup token - confirm it is
# expired or rotated before sharing or committing this notebook.
!crunch setup-notebook structural-break vpZXd9oUlwMy6GhLU36kHqPf

Note: you may need to restart the kernel to use updated packages.
crunch-cli, version 7.4.0
delete /kaggle/working/.crunchdao
you appear to have never submitted code before
data/X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data/X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data/y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data/y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
                                                                                
---
Success! Your environment has been correctly setup.
Next recommended actions:
1. Load the Crunch 

## Setup

In [62]:
import os
import typing

# Import your dependencies
import joblib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import sklearn.metrics
from scipy.stats import wasserstein_distance  # 1D Earth Mover's Distance

import warnings
from scipy.signal import welch

warnings.filterwarnings("ignore", category=UserWarning)

In [63]:
import crunch

# Load the Crunch Toolings
# Note: this rebinds the name `crunch` from the imported module to the object returned
# by load_notebook(); later cells call crunch.load_data() and crunch.test() on it.
crunch = crunch.load_notebook()

loaded inline runner with module: <module '__main__'>

cli version: 7.4.0
available ram: 31.35 gb
available cpu: 4 core
----


In [64]:
# Load the data simply
# Unpacks the three objects returned by crunch.load_data(): training series,
# training labels and test series.
X_train, y_train, X_test = crunch.load_data()

data/X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data/X_train.parquet: already exists, file length match
data/X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data/X_test.reduced.parquet: already exists, file length match
data/y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data/y_train.parquet: already exists, file length match
data/y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
data/y_test.reduced.parquet: already exists, file length match


In [9]:
# Quick sanity check: size of the training frame and class balance of the labels.
print(X_train.shape)
print(y_train.value_counts())


(23715734, 2)
structural_breakpoint
False    7092
True     2909
Name: count, dtype: int64


In [17]:
# len(X_test) is reported here as the number of test datasets.
print("Number of datasets:", len(X_test))

Number of datasets: 101


### 🔥 main.py (train & infer)

In [96]:
%%writefile main.py


import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, ks_2samp, mannwhitneyu, wasserstein_distance
from scipy.signal import welch, hilbert
from statsmodels.tsa.stattools import acf, pacf
# ======= Refactor-friendly train() and infer() =======
import os
import joblib
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress convergence warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)  # suppress welch nperseg warnings

# optional libs - safe to import if installed
# When a library cannot be imported its name is bound to None; the helpers below
# check for None and then return NaN (or use a fallback) instead of raising.
# NOTE(review): if a library is present at training time but missing at inference
# time, the affected features silently change - confirm both environments match.
try:
    import antropy as ant
except Exception:
    ant = None
try:
    import pywt
except Exception:
    pywt = None

# --- helper funcs (robust + small fixes) ---
# --- Advanced feature engineering helpers ---
def compute_permutation_entropy(x, m=3, tau=1):
    """Normalized permutation entropy of ``x`` computed with antropy.

    Parameters
    ----------
    x : sequence of numbers
    m : embedding order passed to antropy as ``order``.
    tau : delay passed to antropy as ``delay``.

    Returns
    -------
    float
        The entropy, or NaN when antropy is unavailable, when ``x`` has fewer
        than ``m + 1`` samples, or when the antropy call raises.
    """
    # Guard clauses: missing optional dependency first, then too-short input.
    if ant is None:
        return np.nan
    if len(x) < (m + 1):
        return np.nan

    try:
        entropy = ant.perm_entropy(x, order=m, delay=tau, normalize=True)
        return float(entropy)
    except Exception:
        return np.nan

def compute_multiscale_entropy(x, scales=(2,3,5)):
    """Mean sample entropy of ``x`` over several down-sampled copies.

    For each scale ``s`` the series is decimated with ``x[::s]`` (every s-th
    sample, no averaging) and antropy's sample entropy is computed on the
    result. Scales that leave 10 or fewer samples are skipped.

    Parameters
    ----------
    x : 1-D array-like supporting slicing.
    scales : iterable of positive ints, the decimation steps.

    Returns
    -------
    float
        Mean of the finite per-scale entropies, or NaN when antropy is
        unavailable or no scale produced a finite value.
    """
    if ant is None:
        return np.nan

    vals = []
    for s in scales:
        xs = x[::s]
        if len(xs) <= 10:
            continue
        try:
            val = float(ant.sample_entropy(xs))
        except Exception:
            # Best-effort feature: a failing scale is skipped, not fatal.
            continue
        # A single inf/NaN result would otherwise poison the mean over all
        # scales, so only finite values are kept.
        if np.isfinite(val):
            vals.append(val)

    return float(np.mean(vals)) if vals else np.nan

def spectral_features(x, fs=1.0):
    """Return ``(spectral_entropy, spectral_centroid)`` of ``x``.

    The centroid is always computed from a Welch PSD estimated here. The
    entropy is taken from antropy (``normalize=True``) when available; if
    antropy is missing or raises, it is computed from the same Welch PSD and
    divided by ``log(number_of_bins)`` so that both paths return a value on
    the same normalized scale.

    Parameters
    ----------
    x : 1-D array-like signal.
    fs : sampling frequency forwarded to ``welch`` and antropy.

    Returns
    -------
    tuple[float, float]
    """
    # safe nperseg
    nperseg = min(256, max(8, len(x)))
    f, Pxx = welch(x, fs=fs, nperseg=nperseg)

    se = None
    if ant is not None:
        try:
            se = float(ant.spectral_entropy(x, sf=fs, method='welch', normalize=True))
        except Exception:
            se = None  # fall through to the PSD-based fallback below

    if se is None:
        P = Pxx / (Pxx.sum() + 1e-12)
        se = float(-np.sum(P * np.log(P + 1e-12)))
        # Normalize by the maximum possible entropy; the ratio does not depend
        # on the logarithm base, so it matches antropy's normalized output.
        if P.size > 1:
            se /= float(np.log(P.size))

    centroid = float(np.sum(f * Pxx) / (np.sum(Pxx) + 1e-12))
    return se, centroid

def wavelet_energy(x, wavelet='db1', level=3):
    """Mean and standard deviation of per-band wavelet energies.

    ``x`` is decomposed with ``pywt.wavedec``; the energy of a band is the sum
    of its squared coefficients. Empty bands are ignored.

    Returns
    -------
    tuple[float, float]
        ``(mean_energy, std_energy)``, or ``(nan, nan)`` when PyWavelets is
        unavailable or the decomposition raises.
    """
    if pywt is None:
        return np.nan, np.nan

    try:
        bands = pywt.wavedec(x, wavelet, level=level)
        band_energy = []
        for band in bands:
            if len(band) > 0:
                band_energy.append(np.sum(band**2))
        mean_energy = float(np.mean(band_energy))
        std_energy = float(np.std(band_energy))
        return mean_energy, std_energy
    except Exception:
        return np.nan, np.nan

def lz_complexity(x):
    """Lempel-Ziv complexity of the sign pattern of ``x`` around its mean.

    The series is centered on its NaN-aware mean and reduced to integer signs
    before being handed to ``ant.lziv_complexity``.

    Returns NaN when antropy is unavailable or the computation raises.
    NOTE(review): antropy's defaults are used (no ``normalize`` argument is
    passed), so the value may scale with series length - confirm.
    """
    if ant is None:
        return np.nan

    try:
        centered = x - np.nanmean(x)
        symbols = np.sign(centered).astype(int)
        return float(ant.lziv_complexity(symbols))
    except Exception:
        return np.nan

def robust_stats(arr):
    """NaN-aware location and spread summary of ``arr``.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'median'``, ``'std'`` and ``'iqr'`` (75th minus
        25th percentile), each a Python float.
    """
    summary = {}
    summary['mean'] = float(np.nanmean(arr))
    summary['median'] = float(np.nanmedian(arr))
    summary['std'] = float(np.nanstd(arr))

    upper_quartile = np.nanpercentile(arr, 75)
    lower_quartile = np.nanpercentile(arr, 25)
    summary['iqr'] = float(upper_quartile - lower_quartile)
    return summary

# --- cleaned main builder ---
def build_advanced_features(X_df, y_series):
    """
    Build the per-series feature matrix used for training.

    X_df: MultiIndex DataFrame (id, time) with columns ['value','period']
    y_series: pd.Series indexed by id
    Returns: X_features (DataFrame indexed by id), y_features (Series)

    Each id's rows are passed to extract_features_series(), the same function
    infer() uses, so training and inference features cannot drift apart.
    Series for which it returns None (fewer than 8 usable samples before or
    after the candidate break) are skipped. Infinite feature values are
    replaced by NaN and all NaN are then filled with 0.
    """
    rows = []
    ids = []

    for sid, df in X_df.groupby(level='id', sort=False):
        feats = extract_features_series(df)
        if feats is None:
            # too-short segment on either side: no features for this id
            continue
        rows.append(feats)
        ids.append(sid)

    X_features = pd.DataFrame(rows, index=ids)
    X_features.index.name = 'id'
    X_features = X_features.replace([np.inf, -np.inf], np.nan).fillna(0)
    # Keep only the labels of the ids that produced features, in the same order.
    y_features = y_series.loc[X_features.index].astype(int)

    return X_features, y_features


def _safe_feature(compute):
    """Return ``float(compute())``, or NaN if the computation raises."""
    try:
        return float(compute())
    except Exception:
        # Best-effort feature; Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        return np.nan


def _mean_instantaneous_frequency(x):
    """Mean first difference of the unwrapped Hilbert phase of ``x``."""
    return np.mean(np.diff(np.unwrap(np.angle(hilbert(x)))))


def extract_features_series(df):
    """
    df: DataFrame with columns ['value','period'] for a single id
    returns: dict of features (same keys as build_advanced_features),
             or None when either segment has fewer than 8 non-null values.

    Rows with a null 'value' are dropped; period == 0 is the "before" segment
    and period == 1 the "after" segment. Features are mostly after-minus-before
    differences. Distance, test, ACF and Hilbert features fall back to NaN if
    their computation raises.
    """
    df = df.dropna(subset=['value'])
    before = df.loc[df['period']==0, 'value'].to_numpy(dtype=float)
    after  = df.loc[df['period']==1, 'value'].to_numpy(dtype=float)

    # skip too short segments
    if len(before) < 8 or len(after) < 8:
        return None

    b = robust_stats(before)
    a = robust_stats(after)

    feats = {
        'b_mean': b['mean'], 'a_mean': a['mean'],
        'b_median': b['median'], 'a_median': a['median'],
        'b_std': b['std'], 'a_std': a['std'],
        'b_iqr': b['iqr'], 'a_iqr': a['iqr'],
    }

    feats['mean_diff']   = feats['a_mean'] - feats['b_mean']
    feats['median_diff'] = feats['a_median'] - feats['b_median']
    feats['std_diff']    = feats['a_std'] - feats['b_std']
    feats['iqr_diff']    = feats['a_iqr'] - feats['b_iqr']
    # small epsilon keeps the ratio finite when a segment is constant
    feats['std_ratio']   = (feats['a_std'] + 1e-9) / (feats['b_std'] + 1e-9)

    # complexity
    feats['pe_m3_t1_diff'] = compute_permutation_entropy(after, m=3, tau=1) - compute_permutation_entropy(before, m=3, tau=1)
    feats['pe_m5_t1_diff'] = compute_permutation_entropy(after, m=5, tau=1) - compute_permutation_entropy(before, m=5, tau=1)
    feats['mse_diff']      = compute_multiscale_entropy(after) - compute_multiscale_entropy(before)

    # distances/tests
    feats['emd'] = _safe_feature(lambda: wasserstein_distance(before, after))
    feats['ks_stat'] = _safe_feature(lambda: ks_2samp(before, after).statistic)
    feats['mw_p'] = _safe_feature(lambda: mannwhitneyu(before, after).pvalue)

    # acf energy diff
    def _acf_energy(x):
        coeffs = np.nan_to_num(acf(x, nlags=20, fft=True))
        return np.sum(coeffs**2)

    feats['acf_energy_diff'] = _safe_feature(lambda: _acf_energy(after) - _acf_energy(before))

    # spectral & wavelet
    se_b, cent_b = spectral_features(before)
    se_a, cent_a = spectral_features(after)
    feats['spectral_entropy_diff']  = se_a - se_b
    feats['spectral_centroid_diff'] = cent_a - cent_b

    # only the mean band energy is used; the std is discarded
    we_b_mean, _ = wavelet_energy(before)
    we_a_mean, _ = wavelet_energy(after)
    feats['wavelet_energy_diff'] = we_a_mean - we_b_mean

    # hilbert instantaneous freq diff
    feats['hilbert_freq_diff'] = _safe_feature(
        lambda: _mean_instantaneous_frequency(after) - _mean_instantaneous_frequency(before)
    )

    # compression
    feats['lz_diff'] = lz_complexity(after) - lz_complexity(before)

    return feats

# train() relies on build_advanced_features (defined above).
# The base models and the cross-validation loop are defined inline inside train().

def train(X_train: "pd.DataFrame", y_train: "pd.Series", model_directory_path: str):
    """
    Crunch-style train function.
    - X_train: MultiIndex DataFrame (id,time) with columns ['value','period']
    - y_train: Series indexed by id
    - model_directory_path: directory to save model.joblib

    Steps: build per-id features, report 5-fold out-of-fold ROC-AUC for each
    base model (diagnostics only), fit a stacking ensemble of isotonic-
    calibrated base models with a LightGBM meta learner, and dump an artifact
    dict to ``<model_directory_path>/model.joblib``. Returns None.
    """
    # Model libraries are imported inside the function, next to their only use.
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    import lightgbm as lgb
    import xgboost as xgb
    from sklearn.neural_network import MLPClassifier

    os.makedirs(model_directory_path, exist_ok=True)
    rng_seed = 42

    # 1) Feature extraction (deterministic)
    X_features, y_features = build_advanced_features(X_train, y_train)
    print(f"[train] Features built: {X_features.shape}, labels: {y_features.value_counts().to_dict()}")

    # 2) Base models, all seeded. The MLP is wrapped in a scaler pipeline so
    #    the fitted scaler is saved together with it.
    pos = int(y_features.sum())
    neg = len(y_features) - pos
    # negatives per positive, used as XGBoost's scale_pos_weight
    neg_pos_ratio = neg / (pos + 1e-9)

    base_models = [
        ("lgb", lgb.LGBMClassifier(
            n_estimators=400, learning_rate=0.05, num_leaves=31,
            subsample=0.8, colsample_bytree=0.8, random_state=rng_seed,
            class_weight="balanced"
        )),
        ("xgb", xgb.XGBClassifier(
            n_estimators=400, learning_rate=0.05, max_depth=5,
            subsample=0.8, colsample_bytree=0.8,
            eval_metric="logloss", use_label_encoder=False,
            random_state=rng_seed, scale_pos_weight=neg_pos_ratio
        )),
        ("rf", RandomForestClassifier(
            n_estimators=300, max_depth=10, n_jobs=-1, random_state=rng_seed,
            class_weight="balanced"
        )),
        ("mlp", Pipeline([
            ("scaler", StandardScaler()),
            ("mlp", MLPClassifier(hidden_layer_sizes=(64,), max_iter=300,
                                  random_state=rng_seed))
        ])),
        # you may add a logistic wrapper if you like
    ]

    # 3) Per-base-model CV to compute OOF preds (GroupKFold).
    #    X_features has one row per id, so every group is a single row and the
    #    split is a plain 5-way partition of the ids.
    gkf = GroupKFold(n_splits=5)
    groups = X_features.index
    # store OOF predictions for stacking diagnostics
    oof_preds_per_model = {}

    for name, model in base_models:
        print(f"[train] CV for base model: {name}")
        fold_preds = np.zeros(len(X_features))
        for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_features, y_features, groups)):
            X_tr, X_val = X_features.iloc[tr_idx], X_features.iloc[val_idx]
            y_tr, y_val = y_features.iloc[tr_idx], y_features.iloc[val_idx]

            # fit
            model.fit(X_tr, y_tr)
            proba = model.predict_proba(X_val)[:, 1]
            fold_preds[val_idx] = proba
            print(f"   fold {fold} | {name} ROC-AUC: {roc_auc_score(y_val, proba):.4f}")

        oof_score = roc_auc_score(y_features, fold_preds)
        print(f" [train] {name} OOF ROC-AUC: {oof_score:.4f}")
        oof_preds_per_model[name] = fold_preds.copy()

    # 4) Wrap each base model in an isotonic calibrator. Nothing is fitted
    #    here: StackingClassifier clones and refits its estimators, so fitting
    #    the models or calibrators beforehand would be discarded work.
    calibrated_estimators = [
        (name, CalibratedClassifierCV(model, method="isotonic", cv=3))
        for name, model in base_models
    ]

    # 5) Meta learner: LightGBM (non-linear)
    meta = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=15,
                              subsample=0.8, colsample_bytree=0.8, random_state=rng_seed,
                              class_weight="balanced")

    # 6) Build stacking classifier with passthrough (so original features are also available)
    stack = StackingClassifier(
        estimators=calibrated_estimators,
        final_estimator=meta,
        stack_method="predict_proba",
        passthrough=True,
        cv=5,  # internal cv inside sklearn stacking for safety
        n_jobs=-1
    )

    # Fit final stacking on full training features
    stack.fit(X_features, y_features)
    # This score is computed on the rows the model was fitted on, so it is an
    # optimistic in-sample figure, not an out-of-fold estimate.
    train_auc = roc_auc_score(y_features, stack.predict_proba(X_features)[:, 1])
    print(f"[train] Final stacking in-sample ROC-AUC (optimistic, not OOF): {train_auc:.4f}")

    # 7) Save artifact: include feature columns and pipeline
    artifact = {
        "feature_cols": list(X_features.columns),
        "stacking_model": stack,
        "oof_per_model": oof_preds_per_model,
        # key name kept for compatibility; the value is the in-sample score above
        "train_oof_score": float(train_auc),
        "random_seed": rng_seed
    }
    model_path = os.path.join(model_directory_path, "model.joblib")
    joblib.dump(artifact, model_path)
    print("[train] Saved artifact to", model_path)

def infer(X_test, model_directory_path):
    """
    Crunch-style infer generator. Yield once, then yield probabilities for each test series (one df at a time).
    X_test: iterable of pd.DataFrame (single-series DataFrames) as Crunch expects.
    model_directory_path: directory containing the model.joblib written by train().

    Yields None once after the model is loaded, then one float per series:
    the predicted probability of the positive class, or 0.0 when the series
    is too short for feature extraction.
    """
    import os, joblib
    import numpy as np
    import pandas as pd

    # joblib artifacts are pickle-based: only load files produced by train().
    artifact = joblib.load(os.path.join(model_directory_path, "model.joblib"))
    feature_cols = artifact["feature_cols"]
    stack = artifact["stacking_model"]

    # signal ready
    yield

    if X_test is None:
        # safety check
        return

    # process each test DF (single series) and extract features then predict
    for df in X_test:
        feats = extract_features_series(df)
        if feats is None:
            yield 0.0
            continue
        x = pd.DataFrame([feats]).reindex(columns=feature_cols)
        # Same cleaning as at training time: infinities become NaN, NaN become 0.
        # Without the inf replacement an infinite feature would reach the model.
        x = x.replace([np.inf, -np.inf], np.nan).fillna(0)
        proba = float(stack.predict_proba(x)[0, 1])
        yield proba




Overwriting main.py
[LightGBM] [Info] Number of positive: 1939, number of negative: 4728
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6057
[LightGBM] [Info] Number of data points in the train set: 6667, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 1939, number of negative: 4728
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6054
[LightGBM] [Info] Number of data points in the train set: 6667, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.00000

## Local testing

To make sure your `train()` and `infer()` functions are working properly, you can call the `crunch.test()` function, which reproduces the cloud environment locally. <br />
Even if it is not perfect, it should give you a quick idea if your model is working properly.

In [None]:
# Runs train() and infer() locally through the Crunch tooling loaded earlier.
crunch.test(
    # Uncomment to disable the train
    #force_first_train=False,

    # Uncomment to disable the determinism check
    # no_determinism_check=True,
)

23:10:22 forbidden library: feature_builder  (request to whitelist: https://hub.crunchdao.com/competitions/structural-break/resources/whitelisted-libraries?requestAlias=feature_builder)
23:10:22 
23:10:22 started
23:10:22 running local test
23:10:22 internet access isn't restricted, no check will be done
23:10:22 
23:10:23 starting unstructured loop...
23:10:23 executing - command=train


data/X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data/X_train.parquet: already exists, file length match
data/X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data/X_test.reduced.parquet: already exists, file length match
data/y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data/y_train.parquet: already exists, file length match
data/y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
data/y_test.reduced.parquet: already exists, file length match


## Results

Once the local tester is done, you can preview the result stored in `data/prediction.parquet`.

In [80]:
# Requires a completed crunch.test() run, which stores data/prediction.parquet;
# if that file is missing this read raises FileNotFoundError.
prediction = pd.read_parquet("data/prediction.parquet")
prediction

FileNotFoundError: [Errno 2] No such file or directory: 'data/prediction.parquet'

### Local scoring

You can call the function that the system uses to estimate your score locally.

In [86]:
# Load the targets
target = pd.read_parquet("data/y_test.reduced.parquet")["structural_breakpoint"]

# Call the scoring function
# `prediction` is defined by the previous cell; run that cell successfully first.
sklearn.metrics.roc_auc_score(
    target,
    prediction,
)

NameError: name 'prediction' is not defined

# Submit your Notebook

To submit your work, you must:
1. Download your Notebook from Kaggle
2. Upload it to the platform
3. Create a run to validate it

### >> https://hub.crunchdao.com/competitions/structural-break/submit/notebook

![Download and Submit Notebook](https://raw.githubusercontent.com/crunchdao/competitions/refs/heads/master/documentation/animations/download-and-submit-notebook-on-kaggle.gif)