In [6]:
!pip install optuna imblearn

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, f1_score
import xgboost as xgb
import optuna
from imblearn.combine import SMOTETomek
from joblib import Parallel, delayed

def generate_time_series_dataset(n_samples, n_features, imbalance_ratio):

    Args:
    n_samples: total rows (>= 10000)
    n_features: number of numeric features
    imbalance_ratio: majority:minority ratio (e.g., 10 -> 10:1)
    Returns:
    pd.DataFrame with columns: ['timestamp', 'f0',...,'f{n-1}', 'target']
    # timestamps at regular intervals
    timestamps = pd.date_range(start='2020-01-01', periods=n_samples, freq='h') # Changed 'H' to 'h'

    # latent signal that generates rare events
    t = np.arange(n_samples)
    seasonal = 0.5 * np.sin(2 * np.pi * t / 24) # daily seasonality
    trend = 0.0005 * t

    features = []
    for i in range(n_features):
        noise = np.random.normal(scale=1.0, size=n_samples)
        decay = np.exp(- (i+1)/20.0)
        feat = decay * (seasonal + trend) + 0.1 * np.random.normal(size=n_samples) + 0.3 * np.sin(2*np.pi*t/(24*(i+1))) + noise*0.2
        features.append(feat)
    X = np.vstack(features).T

    # Create a score that correlates with rare events
    score = (X[:, :3].sum(axis=1) + 0.5 * np.random.randn(n_samples) + 0.5 * seasonal + 0.1*t)

    # Convert score to probabilities via sigmoid
    def sigmoid(x):
        return 1 / (1 + np.exp(-x/3.0))

    base_prob = sigmoid(score)

    # We need an overall minority rate ~ 1/(imbalance_ratio+1)
    desired_minority_rate = 1.0 / (imbalance_ratio + 1.0)

    # scale base_prob so mean equals desired rate
    scaled_prob = base_prob * (desired_minority_rate / base_prob.mean())
    scaled_prob = np.clip(scaled_prob, 0, 1)

    y = np.random.binomial(1, scaled_prob)

    df = pd.DataFrame(X, columns=[f'f{i}' for i in range(n_features)])
    df['timestamp'] = timestamps
    df['target'] = y

    # shuffle not allowed for time-series; keep chronological order
    return df


# preprocessing.py
def create_features(df):
    # Example feature engineering: lag features, rolling stats
    df = df.copy()
    df = df.sort_values('timestamp')

    # create a few lags for first 3 features
    for f in ['f0', 'f1', 'f2']:
        df[f + '_lag1'] = df[f].shift(1)
        df[f + '_lag24'] = df[f].shift(24)
        df[f + '_rmean24'] = df[f].rolling(window=24, min_periods=1).mean()
    return df

# baseline_lr.py
RANDOM_SEED = 42

# generate data
df = generate_time_series_dataset(n_samples=20000, n_features=15, imbalance_ratio=9.0)

# fe
df = create_features(df)
# Drop rows with NaN values created by lag features
df.dropna(inplace=True)

features = [c for c in df.columns if c not in ['timestamp', 'target']]
X = df[features].values
y = df['target'].values

# scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

lr_metrics = []
for train_idx, test_idx in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(recall, precision)
    auc_roc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)

    lr_metrics.append({'auc_pr': auc_pr, 'auc_roc': auc_roc, 'f1': f1})

print('Logistic Regression (TimeSeriesSplit) metrics (mean +/- std):')
for k in ['auc_pr', 'auc_roc', 'f1']:
    vals = [m[k] for m in lr_metrics]
    print(k, np.mean(vals), np.std(vals))

# xgb_unbalanced.py

# reuse df, features, X_scaled, y from earlier generation and preprocessing
def evaluate_xgb_unbalanced(X, y, tscv):
    metrics = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]


        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)


        params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': 42,
        'verbosity': 0
        }
        bst = xgb.train(params, dtrain, num_boost_round=200)
        y_proba = bst.predict(dtest)
        y_pred = (y_proba > 0.5).astype(int)


        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        auc_pr = auc(recall, precision)
        auc_roc = roc_auc_score(y_test, y_proba)
        f1 = f1_score(y_test, y_pred)
        metrics.append({'auc_pr': auc_pr, 'auc_roc': auc_roc, 'f1': f1})
    return metrics


metrics_unbal = evaluate_xgb_unbalanced(X_scaled, y, tscv)
print('Unbalanced XGBoost metrics (mean +/- std):')
for k in ['auc_pr', 'auc_roc', 'f1']:
    vals = [m[k] for m in metrics_unbal]
    print(k, np.mean(vals), np.std(vals))

# xgb_optuna.py

# We'll optimize hyperparameters using a custom objective that performs TimeSeriesSplit

def objective(trial, X, y, tscv):
    param = {
    'verbosity': 0,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'hist',
    'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
    'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
    'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1.0),
    'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
    'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
    'max_depth': trial.suggest_int('max_depth', 3, 10),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
    # We'll not set scale_pos_weight here; instead use resampling for some trials
    }


    pr_scores = []

for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]


  # Apply SMOTE-Tomek on training fold only
        smt = SMOTETomek(random_state=42)
        X_res, y_res = smt.fit_resample(X_train, y_train)


        dtrain = xgb.DMatrix(X_res, label=y_res)
        dtest = xgb.DMatrix(X_test, label=y_test)


  # early stopping
        evallist = [(dtest, 'eval')]
        booster = xgb.train(param, dtrain, num_boost_round=1000, evals=evallist,
        early_stopping_rounds=30, verbose_eval=False)
        y_proba = booster.predict(dtest, ntree_limit=booster.best_ntree_limit)


        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        auc_pr = auc(recall, precision)
        pr_scores.append(auc_pr)
# We return mean PR AUC across folds
    return float(np.mean(pr_scores))


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, imblearn
Successfully installed colorlog-6.10.1 imblearn-0.0 optuna-4.6.0
Logistic Regression (TimeSeriesSplit) metrics (mean +/- std):
auc_pr 0.0969682187999805 0.011675083161486315
auc_roc 0.48087007765473055 0.01483468335271087
f1 0.12093821937865265 0.03865438655913782
Unbalanced XGBoost metrics (mean +/- std):
auc_pr 0.09933232575032924 0.009055501505288858
auc_roc 0.488593099426167 0.009952