In [None]:
import os
from datetime import datetime
from functools import partial

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from scipy.signal import periodogram
from scipy.stats import ttest_ind, ks_2samp, levene
from scipy.stats import entropy, normaltest, jarque_bera
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from statsmodels.tsa.stattools import adfuller
from xgboost import XGBClassifier

In [None]:
# Read the pre-computed feature matrix and the training labels from parquet.
# Loading of the raw series ("data/X_train.parquet") is disabled in this notebook.
# NOTE(review): later cells pass `y_train.values` positionally, which assumes the
# rows of both frames are stored in the same order -- confirm.
features_path = "data/X_train_features.parquet"
labels_path = "data/y_train.parquet"

X_train_features = pd.read_parquet(features_path)
y_train = pd.read_parquet(labels_path)

# Quick sanity check of the loaded shapes.
print(X_train_features.shape, y_train.shape)

In [None]:
# X_test = pd.read_parquet("data/X_test.reduced.parquet")
# y_test = pd.read_parquet("data/y_test.reduced.parquet")

In [None]:
# Number of NaNs per feature column; the cell displays only the columns that
# actually contain missing entries.
missing_values = X_train_features.isna().sum()
missing_values[missing_values.gt(0)]

In [None]:
# Fill every column that contains NaNs with that column's own median.
# Relies on `missing_values` computed in the previous cell and overwrites the
# affected columns of `X_train_features` in place.
nan_columns = missing_values.index[(missing_values > 0).to_numpy()]
for name in nan_columns:
    column = X_train_features[name]
    X_train_features[name] = column.fillna(column.median())

## Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Estimate the mutual information between every feature column and the label.
# `y_train` is loaded with pd.read_parquet and is therefore a DataFrame, so
# `.values` is an (n, 1) column vector; flatten it to the 1-D array that
# scikit-learn expects (a column vector triggers a DataConversionWarning).
mi_scores = mutual_info_classif(
    X_train_features,
    y_train.values.ravel(),
    discrete_features=False,
    random_state=42,  # fixed seed: the estimator adds random noise to continuous features
)

# One row per feature, highest score first.
mi_df = (
    pd.DataFrame({'Feature': X_train_features.columns, 'MI Score': mi_scores})
    .sort_values(by='MI Score', ascending=False)
)

# Show the ten highest-scoring features (rich display as the cell output).
mi_df.head(10)

In [None]:
# Keep the k features with the highest mutual information with the label.
# The scorer is pinned to the same settings (discrete_features=False,
# random_state=42) as the scores computed in the previous cell; without a fixed
# seed the estimator's random noise makes the selected set change between runs.
# k is capped at the number of available columns so the selector stays valid on
# narrower feature tables.
k = min(300, X_train_features.shape[1])
mi_score_func = partial(mutual_info_classif, discrete_features=False, random_state=42)
selector = SelectKBest(mi_score_func, k=k)

# `.ravel()` turns the (n, 1) label column into the 1-D array the scorer expects.
X_selected = selector.fit_transform(X_train_features, y_train.values.ravel())

# Names of the retained columns, in their original column order.
selected_features = X_train_features.columns[selector.get_support()]
print("Selected Features:", selected_features.tolist())

In [None]:
# 5-fold cross-validated ROC AUC of an XGBoost classifier restricted to the
# columns in `selected_features`.
# NOTE(review): `selected_features` was chosen by fitting the selector on all
# training rows, so these fold scores may be optimistic -- confirm whether the
# selection should happen inside each fold.
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=8,
    subsample=1,
    colsample_bytree=1,
    random_state=15,
    n_jobs=-1,
)

print('RUNNING CROSS VALIDATION :')
roc_auc_scores = cross_val_score(
    estimator=model,
    X=X_train_features[selected_features],
    y=y_train.values,
    scoring='roc_auc',
    cv=5,
    verbose=1,
)

# Per-fold scores, then their mean and standard deviation.
print(roc_auc_scores)
print(roc_auc_scores.mean(), roc_auc_scores.std())

## Tree-based feature selection

In [None]:
# Fit an XGBoost classifier on every feature column and rank the columns by the
# fitted model's `feature_importances_`.
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    max_depth=6,
    random_state=15,
    n_jobs=-1,
)
print('TRAINING MODEL :')
model.fit(X_train_features, y_train.values)

# One row per feature, most important first; displayed as the cell output.
feat_imp = pd.DataFrame(
    {
        "feature": X_train_features.columns,
        "importance": model.feature_importances_,
    }
).sort_values(by="importance", ascending=False)
feat_imp

In [None]:
# Keep the names of the 150 highest-ranked features from the importance table.
# They are stored as a plain list of column names, the same type as the
# hard-coded list assigned in the "Train XGB model on best features" cell, so
# `best_features` has one consistent type throughout the notebook.
N_BEST_FEATURES = 150
best_features = feat_imp['feature'].head(N_BEST_FEATURES).tolist()
print(best_features)

# A snapshot of this list is hard-coded in the "Train XGB model on best
# features" cell further down, so it is not repeated here as a comment.

In [None]:
# 5-fold cross-validated ROC AUC of an XGBoost classifier restricted to the
# columns in `best_features`.
# NOTE(review): when `best_features` comes from the importance ranking above, it
# was derived from a model fitted on all training rows, so these fold scores may
# be optimistic -- confirm.
best_feature_cv_params = {
    'objective': 'binary:logistic',
    'n_estimators': 2000,
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 1,
    'colsample_bytree': 1,
    'random_state': 15,
    'n_jobs': -1,
}
model = XGBClassifier(**best_feature_cv_params)

print('RUNNING CROSS VALIDATION :')
roc_auc_scores = cross_val_score(
    estimator=model,
    X=X_train_features[best_features],
    y=y_train.values,
    scoring='roc_auc',
    cv=5,
    verbose=1,
)

# Per-fold scores, then their mean and standard deviation.
print(roc_auc_scores)
print(roc_auc_scores.mean(), roc_auc_scores.std())

In [None]:
# Train XGB model on best features
# The assignment below replaces any `best_features` value computed earlier in
# the notebook with a fixed list of feature names, so the cells that follow do
# not depend on re-running the importance ranking.
# NOTE(review): presumably a snapshot of an earlier importance ranking -- confirm.
# Every name must be a column of `X_train_features`; the column selection
# `X_train_features[best_features]` below fails otherwise.
best_features = ['value_max_jump_pre_600', 'pval_levene_40', 'value_mean_absolute_change_post_650', 'value_ptp_pre_150', 'value_kurtosis_post_850', 'value_mean_post', 'value_min_post_600', 'value_percentile_25_pre_750', 'value_avg_fft_mag_pre_20', 'pval_ks_2samp_200', 'value_downward_steps_pre_850', 'value_max_post', 'value_kurtosis_pre', 'value_percentile_25_pre_850', 'value_percentile_25_post_450', 'pval_levene_100', 'value_mean_absolute_change_pre_250', 'value_min_post_500', 'value_percentile_25_post_250', 'value_num_turning_points_post_800', 'value_max_post_300', 'value_avg_fft_mag_pre_350', 'value_ptp_post_50', 'value_percentile_25_pre_150', 'value_kurtosis_pre_600', 'value_std_post_60', 'value_sum_fft_mag_post_750', 'value_kurtosis_pre_700', 'value_mean_pre_550', 'value_std_pre_20', 'value_avg_fft_mag_post', 'value_volatility_pre_30', 'pval_ks_2samp_90', 'value_mean_absolute_change_post_50', 'pval_ks_2samp_950', 'value_auto_1_post', 'value_auto_2_post', 'value_slope_post_350', 'value_mean_absolute_change_pre_700', 'value_std_pre_700', 'value_kurtosis_post_600', 'value_min_pre_70', 'value_mean_absolute_change_pre_60', 'value_percentile_25_pre_80', 'value_avg_fft_mag_post_650', 'pval_ks_2samp_30', 'value_mean_pre_450', 'value_max_pre_300', 'value_auto_2_pre_900', 'value_percentile_25_post_90', 'value_percentile_25_pre', 'value_min_pre_950', 'value_mean_absolute_change_post_90', 'value_max_post_40', 'value_num_turning_points_pre_10', 'value_upward_steps_pre_950', 'value_ptp_post_250', 'value_avg_fft_mag_post_500', 'value_ptp_pre_250', 'value_std_post_10', 'value_ptp_post_600', 'value_downward_steps_post_200', 'value_sum_fft_mag_post_400', 'value_downward_steps_post', 'value_mean_post_500', 'value_auto_1_post_450', 'value_max_jump_pre_70', 'value_num_turning_points_pre_200', 'pval_levene_400', 'value_mean_post_650', 'value_percentile_25_post_150', 'value_skew_post_800', 'value_std_post_20', 'value_ptp_post_5', 'diff_max_jump', 'pval_ks_2samp_60', 
'value_mean_absolute_change_pre_450', 'value_auto_3_pre_350', 'ratio_mean_absolute_change', 'diff_mean', 'value_ptp_pre_500', 'pval_levene_30', 'value_num_turning_points_post_80', 'value_volatility_post_90', 'value_ptp_pre_700', 'value_upward_steps_post_90', 'value_upward_steps_pre_250', 'value_percentile_25_pre_250', 'value_volatility_post_300', 'value_num_turning_points_post_50', 'value_max_jump_pre_10', 'value_volatility_pre_20', 'value_dominant_freq_ind_post_800', 'value_ptp_pre_550', 'value_upward_steps_pre_70', 'value_std_pre_500', 'ratio_std_1000', 'pval_ks_2samp_150', 'pval_ks_2samp_500', 'value_ptp_pre_950', 'value_percentile_75_post_450', 'value_std_post_80', 'pval_ttest_ind_950', 'value_volatility_pre_700', 'value_ptp_pre_650', 'value_upward_steps_post', 'pval_ks_2samp_700', 'value_upward_steps_pre_60', 'value_auto_1_pre_800', 'ratio_mean_absolute_change_1000', 'value_skew_post_400', 'value_auto_3_post_550', 'value_mean_pre_350', 'value_num_turning_points_pre_250', 'pval_ks_2samp_5', 'pval_levene_80', 'value_auto_1_pre_700', 'value_downward_steps_post_500', 'value_trend_post_350', 'ratio_num_turning_points', 'value_volatility_pre_750', 'value_median_pre_250', 'value_trend_post_80', 'value_skew_post', 'value_max_pre_80', 'value_ptp_post_750', 'value_mean_absolute_change_post_60', 'value_avg_fft_mag_pre', 'value_mean_pre_800', 'value_median_pre_150', 'value_percentile_75_pre_600', 'ratio_median', 'pval_ks_2samp_20', 'value_skew_pre_40', 'diff_percentile_25', 'value_trend_pre_150', 'value_kurtosis_post_150', 'value_auto_2_post_550', 'pval_levene_750', 'value_upward_steps_pre_350', 'value_max_jump_pre', 'value_mean_absolute_change_pre_650', 'value_percentile_25_post_500', 'pval_ks_2samp_550', 'value_auto_2_post_300', 'value_max_jump_post_250', 'diff_num_turning_points_1000', 'value_ptp_pre', 'value_mean_absolute_change_pre_20', 'value_dominant_freq_ind_post_950']
# Fit a 500-tree XGBoost classifier on the `best_features` columns only;
# `max_depth` is not passed, so the library default applies.
# NOTE(review): the next cell rebinds `model` before anything is saved, so this
# fit looks like a baseline/sanity run -- confirm it is still needed.
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    random_state=15,
    n_jobs=-1,
)
print('TRAINING MODEL :')
model.fit(X_train_features[best_features], y_train.values)

In [None]:
# Fit the XGBoost classifier on the `best_features` columns with the
# hyperparameters the author settled on. This rebinds `model`, which is the
# object the following cell writes to disk.
final_params = {
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 1,
    'colsample_bytree': 1,
    'random_state': 15,
    'n_jobs': -1,
}
model = XGBClassifier(**final_params)

model.fit(X_train_features[best_features], y_train.values)

In [None]:
# Persist the current `model` object next to the notebook as 'model.joblib'.
model_path = os.path.join('./', 'model.joblib')
joblib.dump(model, model_path)

## Hyperparameter optimization (HPO)

In [None]:
# Optimize hyperparameters on the training data
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

# Exhaustive search space: 3 * 4 * 3 * 5 * 5 = 900 parameter combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.001, 0.01, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
}

# With cv=3 every combination is fitted three times (2700 fits in total), each
# scored by ROC AUC. Parallelism is handled by the grid search (n_jobs=-1), so
# the base estimator is limited to one thread: letting both layers use all cores
# causes thread contention that slows down the search.
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42, n_jobs=1),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

def optimize_model(X_train, y_train, search=None):
    """Fit a hyperparameter search and return its best result.

    Parameters
    ----------
    X_train : feature table passed unchanged to ``search.fit``.
    y_train : labels as a DataFrame, Series or array-like. A single-column
        2-D input (e.g. a one-column DataFrame) is flattened to 1-D.
    search : optional search object exposing ``fit``, ``best_estimator_``,
        ``best_params_`` and ``best_score_``. Defaults to the module-level
        ``grid_search`` defined in this cell, which is fitted in place.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)`` as reported by the search.
    """
    if search is None:
        # Backward-compatible default: use the grid search configured above.
        search = grid_search

    labels = np.asarray(y_train)
    if labels.ndim == 2 and labels.shape[1] == 1:
        # (n, 1) column vector -> (n,) so estimators receive 1-D labels.
        labels = labels.ravel()

    search.fit(X_train, labels)
    return search.best_estimator_, search.best_params_, search.best_score_

# Run the grid search on the full feature table and report the winning
# parameter set together with its mean cross-validated ROC AUC.
hpo_result = optimize_model(X_train_features, y_train)
best_model, best_params, best_score = hpo_result
print(best_params, best_score)

# Result recorded from an earlier run:
# best_params = {'colsample_bytree': 0.6,
#               'learning_rate': 0.001,
#               'max_depth': 7,
#               'n_estimators': 300,
#               'subsample': 0.8}
# NOTE(review): max_depth=7 is not among the max_depth values of the grid in
# this cell ([4, 6, 8, 10]), so this record presumably comes from a different
# grid -- confirm before relying on it.