In [1]:
import numpy as np
from typing import Literal, Tuple

def generate_camouflaged_dataset(
    core_pattern: Literal["diagonal_sine", "high_freq_sine", "poly_interaction", "xor", "spiral"],
    n_samples: int = 2000,
    n_features: int = 50,
    signal_dims: int = 2,
    noise: float = 0.05,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate a binary-classification dataset whose signal is hidden among noise features.

    A low-dimensional pattern (1 or 2 informative features, depending on
    ``core_pattern``) determines the label; every other column is standard-normal
    noise. Column order is shuffled so the informative features are not at fixed
    positions.

    Parameters
    ----------
    core_pattern:
        The underlying pattern. ``"high_freq_sine"`` uses 1 informative feature;
        ``"diagonal_sine"``, ``"poly_interaction"``, ``"xor"`` and ``"spiral"``
        use 2.
    n_samples:
        Number of rows.
    n_features:
        Total number of columns. Must be at least the number of informative
        features the pattern produces.
    signal_dims:
        Number of columns reserved for the signal. Must be at least the number
        of informative features the pattern produces; any reserved column beyond
        that stays pure noise (the signal is never duplicated or truncated).
    noise:
        Probability of flipping each label. For ``"spiral"`` it is additionally
        used as the standard deviation of Gaussian jitter on both coordinates.
    random_state:
        Seed for ``np.random.default_rng``.

    Returns
    -------
    (X, y):
        ``X`` has shape ``(n_samples, n_features)``; ``y`` has shape
        ``(n_samples,)`` with integer labels in ``{0, 1}``.

    Raises
    ------
    ValueError
        If ``core_pattern`` is unknown, or if ``signal_dims`` / ``n_features``
        is smaller than the number of informative features of the pattern.
    """
    rng = np.random.default_rng(random_state)

    # Step 1: Create the signal in low dimensions
    if core_pattern == "diagonal_sine":
        X_signal = rng.uniform(-2, 2, size=(n_samples, 2))
        y = (np.sin(X_signal[:, 0] + X_signal[:, 1]) > 0).astype(int)
    elif core_pattern == "high_freq_sine":
        X_signal = rng.uniform(-1, 1, size=(n_samples, 1))
        y = (np.sin(20 * X_signal[:, 0]) > 0).astype(int)
    elif core_pattern == "poly_interaction":
        X_signal = rng.uniform(-2, 2, size=(n_samples, 2))
        y = ((X_signal[:, 0] + X_signal[:, 1])**5 > 0).astype(int)
    elif core_pattern == "xor":
        X_signal = rng.uniform(-1, 1, size=(n_samples, 2))
        y = ((X_signal[:, 0] > 0) ^ (X_signal[:, 1] > 0)).astype(int)
    elif core_pattern == "spiral":
        theta = np.sqrt(rng.random(n_samples)) * 2 * np.pi
        r = 2 * theta + np.pi
        X1 = r * np.cos(theta) + rng.normal(0, noise, n_samples)
        X2 = r * np.sin(theta) + rng.normal(0, noise, n_samples)
        X_signal = np.vstack([X1, X2]).T
        y = (theta % (2 * np.pi) > np.pi).astype(int)
    else:
        raise ValueError(f"Unknown pattern: {core_pattern}")

    # The labels depend on *every* column of X_signal, so all of them must be
    # embedded. Reject configurations that would drop one instead of silently
    # producing labels that cannot be recovered from X.
    pattern_dims = X_signal.shape[1]
    if signal_dims < pattern_dims:
        raise ValueError(
            f"signal_dims={signal_dims} is too small: pattern '{core_pattern}' "
            f"needs {pattern_dims} signal dimension(s)"
        )
    if n_features < pattern_dims:
        raise ValueError(
            f"n_features={n_features} is too small: pattern '{core_pattern}' "
            f"needs {pattern_dims} signal dimension(s)"
        )

    # Step 2: Add label noise
    flip_mask = rng.random(n_samples) < noise
    y[flip_mask] = 1 - y[flip_mask]

    # Step 3: Embed the signal into high-dimensional noise.
    # Write exactly pattern_dims columns: slicing by signal_dims would broadcast
    # a 1-D signal into duplicate columns or fail when signal_dims exceeds the
    # signal width.
    X_full = rng.normal(0, 1, size=(n_samples, n_features))
    X_full[:, :pattern_dims] = X_signal

    # Shuffle feature order so the signal isn't always in the first columns
    perm = rng.permutation(n_features)
    X_full = X_full[:, perm]

    return X_full, y


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Build a "diagonal_sine" dataset (5000 rows, 50 columns, 2 signal columns) and
# compare an XGBoost classifier with logistic regression on it.
X, y = generate_camouflaged_dataset(
    "diagonal_sine",
    n_samples=5000,
    n_features=50,
    signal_dims=2,
)

xgb = XGBClassifier(eval_metric="logloss")
lr = LogisticRegression(max_iter=500)

# Report the mean of the 5-fold cross-validation scores for each estimator.
for label, estimator in (("XGBoost", xgb), ("Logistic Regression", lr)):
    fold_scores = cross_val_score(estimator, X, y, cv=5)
    print(f"{label} score:", fold_scores.mean())

XGBoost score: 0.9248000000000001
Logistic Regression score: 0.8552
