In [None]:
# pip install pyarrow==6.0.1
# pip show fastparquet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy, normaltest, jarque_bera
from statsmodels.tsa.stattools import adfuller
import tqdm

import tensorflow as tf
from tensorflow.keras import layers, models, Input
from tensorflow.keras.metrics import AUC
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Load the three local parquet files used throughout the notebook
X_train, X_train_features, y_train = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "data/X_train.parquet",
        "data/X_train_features.parquet",
        "data/y_train.parquet",
    )
)

In [None]:
# Number of missing entries per column of the feature table
missing_values = X_train_features.isna().sum()
# Show only the columns that contain at least one missing entry
missing_values.loc[missing_values.gt(0)]

In [None]:
# X_train_features[X_train_features['ratio_median']\
#          .isnull()][['value_median_pre', 'value_median_post', 'ratio_median']]

In [None]:
# Fill the gaps of every incomplete column with that column's median
columns_with_gaps = missing_values.index[missing_values > 0]
for col in columns_with_gaps:
    column_values = X_train_features[col]
    X_train_features[col] = column_values.fillna(column_values.median())

## FCN (fully connected network)

In [None]:
# Encode the labels as 0/1 integers with an explicit cast instead of relying on
# the dtype that `DataFrame.replace` happens to infer.
y_values = y_train.astype(int).values

# Train-validation split on the *unscaled* features. The seed is fixed so that
# re-running the notebook reproduces the same split.
X_raw_tr, X_raw_val, y_tr, y_val = train_test_split(
    X_train_features.values, y_values,
    test_size=0.2, stratify=y_values, random_state=42,
)

# Fit the scaler on the training rows only, so that validation statistics do not
# leak into the preprocessing, then apply it to both parts.
scaler = StandardScaler().fit(X_raw_tr)
X_tr = scaler.transform(X_raw_tr)
X_val = scaler.transform(X_raw_val)

# Scaled copy of the full table; kept because the sanity-check cell that follows
# inspects `X_values`.
X_values = scaler.transform(X_train_features.values)

In [None]:
# Sanity checks: neither the scaled features nor the labels may contain NaN or inf
assert not np.any(np.isnan(X_values)), "NaNs in input features"
assert not np.any(np.isnan(y_values)), "NaNs in labels"
assert not np.isinf(X_values).any(), "Infs in input features"
assert not np.isinf(y_values).any(), "Infs in labels"

In [None]:
# Define model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def build_model(input_dim, learning_rate=0.01, dropout_rate=0.5):
    """Build and compile the fully connected binary classifier.

    Architecture: Dense(128) -> BatchNorm -> LeakyReLU -> Dropout ->
    Dense(64) -> BatchNorm -> LeakyReLU -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.
        learning_rate: Adam learning rate (default 0.01, the value previously
            hard-coded).
        dropout_rate: Dropout rate after the first hidden block (default 0.5,
            the value previously hard-coded).

    Returns:
        A compiled Keras model trained with binary cross-entropy and reporting
        an AUC metric named 'auc' (so the history keys are 'auc' / 'val_auc').
    """
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Dropout(dropout_rate),
        layers.Dense(64),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Dense(1, activation='sigmoid'),  # Output probability
    ])
    # clipnorm bounds the gradient norm of each update step.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=[AUC(name='auc')])
    return model

# Stop once validation AUC has not improved for `patience` epochs and roll the
# model back to the best weights seen so far.
early_stop = EarlyStopping(
    monitor='val_auc',          # metric to monitor
    patience=10,                # epochs to wait before stopping
    restore_best_weights=True,  # roll back to best model
    mode='max',                 # AUC: higher is better
    verbose=1
)

# Halve the learning rate after 5 epochs without validation-AUC improvement.
# `mode='max'` is set explicitly (as for EarlyStopping): the default 'auto'
# infers the direction from the metric name and may treat 'val_auc' as a
# quantity to minimise, which would cut the learning rate while AUC is rising.
lr_schedule = ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, mode='max')

# Build and train
model = build_model(X_tr.shape[1])
history = model.fit(X_tr, y_tr,
                    validation_data=(X_val, y_val),
                    epochs=100,
                    batch_size=128,
                    callbacks=[early_stop, lr_schedule],
                    verbose=2)

In [None]:
# Score the validation split with the trained model and report ROC-AUC
y_pred = np.ravel(model.predict(X_val))
auc_score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC: {auc_score:.4f}")

In [None]:
# Plot train & evaluation metrics
def plot_training_metrics(history):
    """Draw training vs. validation curves for loss and AUC side by side.

    Reads the 'loss', 'val_loss', 'auc' and 'val_auc' entries of
    ``history.history`` and shows one figure with two panels.
    """
    logs = history.history
    epochs = range(1, len(logs['loss']) + 1)

    fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # (axes, title, y-label, ((history key, legend label, colour), ...))
    panels = (
        (loss_ax, 'Loss over Epochs', 'Binary Crossentropy Loss',
         (('loss', 'Training Loss', 'blue'),
          ('val_loss', 'Validation Loss', 'orange'))),
        (auc_ax, 'AUC over Epochs', 'ROC-AUC Score',
         (('auc', 'Training AUC', 'green'),
          ('val_auc', 'Validation AUC', 'red'))),
    )
    for ax, title, ylabel, curves in panels:
        for key, label, color in curves:
            ax.plot(epochs, logs[key], label=label, color=color)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)

    fig.tight_layout()
    plt.show()

plot_training_metrics(history)

In [None]:
# MAX AUC ~ 0.65

## TabNet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from torch.optim.lr_scheduler import StepLR
from torch.optim import Adam
import numpy as np

# TabNet classifier: Adam optimiser with a StepLR schedule and entmax masks
tabnet_settings = {
    "optimizer_fn": Adam,
    "optimizer_params": {"lr": 2e-2},
    "scheduler_fn": StepLR,
    "scheduler_params": {"step_size": 10, "gamma": 0.9},
    "mask_type": 'entmax',  # 'sparsemax' also available
}
clf = TabNetClassifier(**tabnet_settings)

# Training options; the validation split is evaluated on AUC under the name 'val'
fit_settings = {
    "eval_name": ['val'],
    "eval_metric": ['auc'],
    "max_epochs": 100,
    "patience": 10,
    "batch_size": 1024,
    "virtual_batch_size": 128,
    "num_workers": 0,
    "drop_last": False,
}
clf.fit(
    X_train=X_tr, y_train=y_tr.flatten(),
    eval_set=[(X_val, y_val.flatten())],
    **fit_settings
)

In [None]:
# MAX AUC ~ 0.65

## Siamese Neural Net

In [None]:
# Label-based slice (both ends inclusive) of the series and their labels
X_train_sample, y_train_sample = (
    frame.loc[slice(0, 10000)] for frame in (X_train, y_train)
)

In [None]:
# Number of missing entries per column of the sampled raw series
missing_values = X_train_sample.isna().sum()
# Show only the columns that contain at least one missing entry
missing_values.loc[missing_values.gt(0)]

In [None]:
def extract_features(segment, windows=(5, 10, 15, 20,
                                       30, 40, 50, 60, 70, 80, 90,
                                       100, 200, 300, 400, 500,
                                       600, 700, 800, 900, 1000)):
    """Summarise a 1-D series as a flat vector of hand-crafted statistics.

    For every window size the output contains, in this order:
      * 4 rolling statistics (mean of the rolling mean / std / skew / kurtosis),
      * 3 step statistics (mean step, std of steps, value range),
      * 16 window-independent statistics (quantiles, variance, autocorrelation,
        FFT magnitudes, change dynamics, turning-point counts).
    The layout (23 values per window) is unchanged from the previous version, so
    the window-independent values are repeated once per window.

    Args:
        segment: 1-D sequence of numbers (list, ndarray or pandas Series).
            Expected to hold at least two samples; an empty segment raises in
            the NumPy reductions.
        windows: Rolling-window sizes. A window longer than the segment is
            clipped to the segment length, so short segments yield a
            whole-segment statistic instead of NaN.

    Returns:
        np.ndarray of shape ``(23 * len(windows),)``.
    """
    values = np.asarray(segment, dtype=float)
    series = pd.Series(values)

    # Loop-invariant quantities: computed once instead of once per window.
    steps = np.diff(values)
    abs_steps = np.abs(steps)
    spectrum = np.abs(np.fft.fft(values))

    step_stats = [
        np.mean(steps),                   # trend
        np.std(steps),                    # volatility
        np.max(values) - np.min(values),  # range
    ]
    global_stats = [
        # Central tendency & dispersion
        np.median(values),
        np.percentile(values, 25),
        np.percentile(values, 75),
        np.var(values),
        np.ptp(values),  # peak-to-peak range

        # Autocorrelation
        series.autocorr(lag=1),
        series.autocorr(lag=5),

        # Frequency domain
        np.mean(spectrum),      # average FFT magnitude
        np.argmax(spectrum),    # dominant frequency index
        np.sum(spectrum[:10]),  # magnitude of the first 10 FFT bins

        # Change dynamics
        np.mean(abs_steps),  # mean absolute change
        np.max(abs_steps),   # max jump
        np.std(steps),       # change volatility

        # Complexity
        np.sum(np.diff(np.sign(steps)) != 0),  # number of turning points
        np.sum(steps > 0),                     # upward steps
        np.sum(steps < 0),                     # downward steps
    ]

    features = []
    for w in windows:
        # Clip the window to the segment length. Previously the clipped rolling
        # object was overwritten straight away by the unclipped one (missing
        # `else`), which produced all-NaN rolling statistics for short segments.
        rolling = series.rolling(window=min(w, len(values)))
        features.extend([
            rolling.mean().dropna().mean(),
            rolling.std().dropna().mean(),
            rolling.skew().dropna().mean(),
            rolling.kurt().dropna().mean(),
        ])
        features.extend(step_stats)
        features.extend(global_stats)
    return np.array(features)

In [None]:
def prepare_dataset(data):
    """Turn raw records into two scaled feature matrices and a label vector.

    Each record must provide 'period_0', 'period_1' and 'label'. Features are
    extracted from both periods with ``extract_features``; records whose two
    feature vectors differ in length are dropped. A single StandardScaler is
    fitted on the period_0 features and applied to both matrices.

    Returns:
        (X0, X1, y): scaled period_0 features, scaled period_1 features, labels.
    """
    kept = []
    for record in tqdm.tqdm(data):
        feats_0 = extract_features(record['period_0'])
        feats_1 = extract_features(record['period_1'])
        # keep only pairs with matching feature length
        if feats_0.shape[0] == feats_1.shape[0]:
            kept.append((feats_0, feats_1, record['label']))

    X0 = np.array([entry[0] for entry in kept])
    X1 = np.array([entry[1] for entry in kept])
    y = np.array([entry[2] for entry in kept])

    # statistics come from period_0 only and are reused for period_1
    scaler = StandardScaler()
    scaler.fit(X0)
    return scaler.transform(X0), scaler.transform(X1), y

In [None]:
def build_siamese_model(input_dim):
    """Build and compile a Siamese classifier over two feature vectors.

    Both inputs pass through the same encoder (shared weights). The absolute
    difference of the two 64-dimensional embeddings feeds a small dense head
    with a sigmoid output.

    Args:
        input_dim: Length of each input feature vector.

    Returns:
        A compiled Keras model taking ``[input_a, input_b]``, trained with
        binary cross-entropy and reporting an AUC metric named 'auc'.
    """
    def base_network():
        """Encoder applied to both inputs."""
        inp = Input(shape=(input_dim,))
        x = layers.Dense(256)(inp)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(0.3)(x)
        x = layers.Dense(128, activation='relu')(x)
        x = layers.Dropout(0.3)(x)
        x = layers.Dense(64, activation='relu')(x)
        return models.Model(inp, x)

    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))

    # One encoder instance called twice -> the two branches share weights.
    base = base_network()
    feat_a = base(input_a)
    feat_b = base(input_b)

    # Compare embeddings through their element-wise absolute difference
    merged = layers.Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))([feat_a, feat_b])
    x = layers.Dense(16, activation='relu')(merged)
    x = layers.Dense(8, activation='relu')(x)
    output = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=[input_a, input_b], outputs=output)
    # Name the metric explicitly (as build_model does): the plotting code reads
    # history['auc'] / history['val_auc'], and the 'AUC' string shortcut leaves
    # the name to Keras, which may add a suffix (e.g. 'auc_1') when several
    # models are built in one session.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])
    return model

In [None]:
# Sanity checks on the two Siamese feature matrices (X1 holds period_1
# features, not labels, so the messages name the matrices accordingly).
# NOTE(review): X0 / X1 are produced by `prepare_dataset` in the data-preparation
# cell further down; this cell can only run after that cell has been executed.
assert not np.isnan(X0).any(), "NaNs in period_0 features"
assert not np.isnan(X1).any(), "NaNs in period_1 features"
assert not np.any(np.isinf(X0)), "Infs in period_0 features"
assert not np.any(np.isinf(X1)), "Infs in period_1 features"

In [None]:
# Build one record per series id: the values of both periods plus the label
data = []
for i in tqdm.tqdm(range(10001)):
    df = X_train_sample.loc[i]
    label = y_train_sample.loc[i]
    data.append({'id': i,
                 'period_0': df[df['period'] == 0]['value'],
                 'period_1': df[df['period'] == 1]['value'],
                 'label': label['structural_breakpoint']})
print('GENERATE DATASET : DONE')
print('RUNNING DATA PREPARATION')
X0, X1, y = prepare_dataset(data)

print('PREPARE DATASET : DONE')
# The training labels get their own name: rebinding `y_train` here would replace
# the label DataFrame that a later cell still indexes with `.loc`.
# The seed is fixed so that re-running the notebook reproduces the same split.
X0_train, X0_val, X1_train, X1_val, y_sia_train, y_val = train_test_split(
    X0, X1, y, test_size=0.2, stratify=y, random_state=42)


def _fill_nans_with_medians(target, medians):
    """Replace NaNs in `target` (in place) with the per-column `medians`."""
    rows, cols = np.where(np.isnan(target))
    target[rows, cols] = medians[cols]


# Fill NaNs in BOTH feature matrices (previously only the period_1 matrices were
# imputed). Medians come from the training part only and are reused for the
# validation part; a column that is entirely NaN falls back to 0.
for train_part, val_part in ((X0_train, X0_val), (X1_train, X1_val)):
    column_medians = np.nan_to_num(np.nanmedian(train_part, axis=0), nan=0)
    _fill_nans_with_medians(train_part, column_medians)
    _fill_nans_with_medians(val_part, column_medians)

model = build_siamese_model(X0.shape[1])
history = model.fit([X0_train, X1_train], y_sia_train,
                    validation_data=([X0_val, X1_val], y_val),
                    epochs=100, batch_size=128, verbose=2)

In [None]:
# Score the validation pairs with the trained Siamese model and report ROC-AUC
y_pred = np.ravel(model.predict([X0_val, X1_val]))
auc_score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC: {auc_score:.4f}")

In [None]:
# Plot train & evaluation metrics
# `plot_training_metrics` is already defined in the FCN section above; reuse it
# instead of re-declaring an identical copy.
plot_training_metrics(history)

In [None]:
def cross_validate_siamese(X0, X1, y, k=5, epochs=20, batch_size=128):
    """Stratified k-fold cross-validation of the feature-based Siamese model.

    A fresh model from ``build_siamese_model`` is trained in every fold. NaNs in
    the feature matrices are imputed per fold with the medians of the training
    part, because NaN inputs would otherwise reach the model and
    ``roc_auc_score``.

    Args:
        X0, X1: Feature matrices of the two periods, one row per sample.
        y: Binary labels.
        k: Number of folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size.

    Returns:
        List with the validation ROC-AUC of every fold.
    """
    def _impute_fold(train_part, val_part):
        """Fill NaNs in both parts (in place) with training-column medians."""
        # a column that is entirely NaN falls back to 0
        medians = np.nan_to_num(np.nanmedian(train_part, axis=0), nan=0)
        for part in (train_part, val_part):
            rows, cols = np.where(np.isnan(part))
            part[rows, cols] = medians[cols]

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X0, y)):
        print(f"\n🔁 Fold {fold + 1}/{k}")

        # Index-array selection returns copies, so the caller's arrays are untouched.
        X0_train, X0_val = X0[train_idx], X0[val_idx]
        X1_train, X1_val = X1[train_idx], X1[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        _impute_fold(X0_train, X0_val)
        _impute_fold(X1_train, X1_val)

        model = build_siamese_model(X0.shape[1])
        model.fit([X0_train, X1_train], y_train,
                  validation_data=([X0_val, X1_val], y_val),
                  epochs=epochs, batch_size=batch_size, verbose=0)

        y_pred = model.predict([X0_val, X1_val]).flatten()
        auc = roc_auc_score(y_val, y_pred)
        print(f"✅ Fold {fold + 1} ROC-AUC: {auc:.4f}")
        auc_scores.append(auc)

    print(f"\n📊 Mean ROC-AUC over {k} folds: {np.mean(auc_scores):.4f}")
    return auc_scores

auc_scores = cross_validate_siamese(X0, X1, y, k=5, epochs=20)

## Auto Feature Extraction with 1D CNN

In [None]:
def build_feature_extractor(input_length):
    """Build a plain 1-D CNN encoder for a univariate series.

    NOTE(review): a later cell defines `build_feature_extractor` again; when the
    notebook runs top to bottom that later definition replaces this one.

    Args:
        input_length: Number of time steps of the (padded) input series.

    Returns:
        Keras model mapping ``(input_length, 1)`` to a 64-dimensional embedding.
    """
    inp = Input(shape=(input_length, 1))  # univariate time series
    # Mask on the raw input so the zero padding value is still recognisable.
    # (`input_shape` is not needed here: the Input layer already fixes the shape.)
    x = layers.Masking(mask_value=0.)(inp)
    # Normalise along the time axis. Normalising over the last axis (size 1)
    # subtracts every value from itself and leaves an all-zero signal.
    x = layers.LayerNormalization(axis=1)(x)

    x = layers.Conv1D(filters=64, kernel_size=7, activation='relu', padding='same')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)

    x = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)

    x = layers.Conv1D(filters=256, kernel_size=3, activation='relu', padding='same')(x)
    x = layers.GlobalAveragePooling1D()(x)  # reduces to fixed-size vector
    x = layers.Dense(64, activation='relu')(x)

    return models.Model(inp, x)

In [None]:
def build_feature_extractor(input_length):
    """Build a dilated 1-D CNN encoder with one residual block.

    Two Conv-BatchNorm-ReLU-MaxPool stages (dilation 1 and 2) are followed by a
    dilated convolution (dilation 4) that is added to a 1x1-convolution shortcut.
    The sum is average-pooled over time and projected to 64 units with dropout.

    Args:
        input_length: Number of time steps of the (padded) input series.

    Returns:
        Keras model mapping ``(input_length, 1)`` to a 64-dimensional embedding.
    """
    def conv_bn(tensor, filters, kernel_size, dilation_rate, activate=True):
        """Conv1D followed by BatchNorm and, optionally, ReLU."""
        out = layers.Conv1D(filters, kernel_size=kernel_size,
                            dilation_rate=dilation_rate, padding='same')(tensor)
        out = layers.BatchNormalization()(out)
        return layers.ReLU()(out) if activate else out

    inp = Input(shape=(input_length, 1))
    trunk = layers.Masking(mask_value=0., input_shape=(input_length, 1))(inp)

    # Blocks 1 and 2: convolution stage followed by 2x temporal down-sampling
    for filters, kernel_size, dilation_rate in ((64, 7, 1), (128, 5, 2)):
        trunk = conv_bn(trunk, filters, kernel_size, dilation_rate)
        trunk = layers.MaxPooling1D(pool_size=2)(trunk)

    # Block 3 (main path) and its residual shortcut, both fed by the block 2 output
    main_path = conv_bn(trunk, 256, 3, 4)
    shortcut = conv_bn(trunk, 256, 1, 1, activate=False)

    merged = layers.Add()([main_path, shortcut])
    pooled = layers.GlobalAveragePooling1D()(merged)
    embedding = layers.Dense(64, activation='relu')(pooled)
    embedding = layers.Dropout(0.3)(embedding)

    return models.Model(inp, embedding)

In [None]:
def build_siamese_raw_model(input_length_a, input_length_b):
    """Build and compile a two-branch classifier over raw (padded) series.

    Each input gets its own encoder from ``build_feature_extractor`` (two
    separate instances, so the branches do not share weights). The absolute
    difference of the two embeddings feeds a small dense head with a sigmoid
    output.

    Args:
        input_length_a: Padded length of the period_0 series.
        input_length_b: Padded length of the period_1 series.

    Returns:
        A compiled Keras model taking ``[input_a, input_b]``, trained with
        binary cross-entropy and reporting an AUC metric named 'auc'.
    """
    feature_extractor_a = build_feature_extractor(input_length_a)
    feature_extractor_b = build_feature_extractor(input_length_b)

    input_a = Input(shape=(input_length_a, 1))  # period_0
    input_b = Input(shape=(input_length_b, 1))  # period_1

    feat_a = feature_extractor_a(input_a)
    feat_b = feature_extractor_b(input_b)

    # Compare embeddings through their element-wise absolute difference
    merged = layers.Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))([feat_a, feat_b])
    x = layers.Dense(16, activation='relu')(merged)
    x = layers.Dense(8, activation='relu')(x)
    output = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=[input_a, input_b], outputs=output)
    # Name the metric explicitly (as build_model does): the plotting code reads
    # history['auc'] / history['val_auc'], and the 'AUC' string shortcut leaves
    # the name to Keras, which may add a suffix (e.g. 'auc_1') when several
    # models are built in one session.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])
    return model

In [None]:
def prepare_raw_data(data, segment_length_a='max', segment_length_b='max',
                     pad_len_a=None, pad_len_b=None):
    """Convert raw records into two padded float tensors and a label vector.

    Args:
        data: Iterable of records with 'period_0', 'period_1' and 'label'.
        segment_length_a: If an int not longer than the series, keep only the
            LAST that many period_0 values; any other value keeps the whole series.
        segment_length_b: If an int not longer than the series, keep only the
            FIRST that many period_1 values; any other value keeps the whole series.
        pad_len_a: Padded length for period_0. ``None`` falls back to the
            notebook-level ``max_len_a``, as before.
        pad_len_b: Padded length for period_1. ``None`` falls back to the
            notebook-level ``max_len_b``, as before.

    Returns:
        (X0, X1, y): X0 has shape (n, pad_len_a, 1) and is zero-padded at the
        front; X1 has shape (n, pad_len_b, 1) and is zero-padded at the end.
    """
    if pad_len_a is None:
        pad_len_a = max_len_a  # notebook-level setting
    if pad_len_b is None:
        pad_len_b = max_len_b  # notebook-level setting

    X0, X1, y = [], [], []
    for item in tqdm.tqdm(data):
        # Convert first, then slice: positional slicing on a plain array does not
        # depend on the index of the incoming pandas Series.
        p0 = np.asarray(item['period_0'], dtype='float32')
        p1 = np.asarray(item['period_1'], dtype='float32')
        if isinstance(segment_length_a, int) and segment_length_a <= len(p0):
            p0 = p0[-segment_length_a:]
        if isinstance(segment_length_b, int) and segment_length_b <= len(p1):
            p1 = p1[:segment_length_b]

        X0.append(p0.reshape(-1, 1))
        X1.append(p1.reshape(-1, 1))
        y.append(item['label'])

    # `pad_sequences` defaults to dtype='int32', which would truncate the float
    # series to integers, so the dtype is set explicitly.
    # Over-long series are cut on the same side that is padded: period_0 keeps
    # its last values, period_1 keeps its first values (the default
    # truncating='pre' would drop the start of period_1).
    X0 = pad_sequences(X0, maxlen=pad_len_a, dtype='float32',
                       padding='pre', truncating='pre', value=0.0)
    X1 = pad_sequences(X1, maxlen=pad_len_b, dtype='float32',
                       padding='post', truncating='post', value=0.0)

    return np.array(X0), np.array(X1), np.array(y)

In [None]:
# Read data from local parquet file
# X_train = pd.read_parquet("data/X_train.parquet")
# y_train = pd.read_parquet("data/y_train.parquet")

# Print max series lengths for period 0/1
# series_len = X_train.groupby(['id', 'period'])['value'].count().to_frame()
# idx = pd.IndexSlice
# print(series_len.loc[idx[:, 0], :]['value'].max(), series_len.loc[idx[:, 1], :]['value'].max())

In [None]:
data = []
# Padded lengths of the period_0 / period_1 inputs of the raw-series model
max_len_a = 2500
max_len_b = 1000

for i in tqdm.tqdm(range(10001)):
    df = X_train.loc[i]
    # Labels come from `y_train_sample`: `y_train` may have been rebound to a
    # NumPy array by the earlier train/validation split, which has no `.loc`.
    label = y_train_sample.loc[i]
    data.append({'id': i,
                 'period_0': df[df['period'] == 0]['value'],
                 'period_1': df[df['period'] == 1]['value'],
                 'label': label['structural_breakpoint']})

print('GENERATE DATASET : DONE')
print('RUNNING DATA PREPARATION')
X0_raw, X1_raw, y_raw = prepare_raw_data(data, segment_length_a='max', segment_length_b='max')

print('PREPARE DATASET : DONE')
# The seed is fixed so that re-running the notebook reproduces the same split.
X0_tr, X0_val, X1_tr, X1_val, y_tr, y_val = train_test_split(
    X0_raw, X1_raw, y_raw, test_size=0.2, stratify=y_raw, random_state=42)

model = build_siamese_raw_model(input_length_a=max_len_a, input_length_b=max_len_b)
history = model.fit([X0_tr, X1_tr], y_tr, validation_data=([X0_val, X1_val], y_val),
                     epochs=100, batch_size=128, verbose=2)

In [None]:
# Score the validation pairs with the raw-series model and report ROC-AUC
y_pred = np.ravel(model.predict([X0_val, X1_val]))
auc_score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC: {auc_score:.4f}")

In [None]:
# Plot train & evaluation metrics
# `plot_training_metrics` is already defined in the FCN section above; reuse it
# instead of re-declaring an identical copy.
plot_training_metrics(history)

In [None]:
def cross_validate_siamese(X0, X1, y, k=5, epochs=50, batch_size=512):
    """Stratified k-fold cross-validation of the raw-series Siamese model.

    NOTE(review): this re-uses the name of the feature-based cross-validation
    function defined earlier in the notebook and replaces it from here on.

    Args:
        X0: Padded period_0 tensor, shape (n, length_a, 1).
        X1: Padded period_1 tensor, shape (n, length_b, 1).
        y: Binary labels.
        k: Number of folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size.

    Returns:
        List with the validation ROC-AUC of every fold.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X0, y)):
        print(f"\n🔁 Fold {fold + 1}/{k}")

        X0_train, X0_val = X0[train_idx], X0[val_idx]
        X1_train, X1_val = X1[train_idx], X1[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Take the input lengths from the data: the previously hard-coded
        # (1000, 100) does not match tensors padded to other lengths.
        model = build_siamese_raw_model(X0.shape[1], X1.shape[1])
        model.fit([X0_train, X1_train], y_train,
                  validation_data=([X0_val, X1_val], y_val),
                  epochs=epochs, batch_size=batch_size, verbose=0)

        y_pred = model.predict([X0_val, X1_val]).flatten()
        auc = roc_auc_score(y_val, y_pred)
        print(f"✅ Fold {fold + 1} ROC-AUC: {auc:.4f}")
        auc_scores.append(auc)

    print(f"\n📊 Mean ROC-AUC over {k} folds: {np.mean(auc_scores):.4f}")
    return auc_scores

auc_scores = cross_validate_siamese(X0_raw, X1_raw, y_raw, k=5, epochs=20)