In [1]:
import pandas as pd
import numpy as np

In [None]:
# Notebook-wide configuration; later cells read these names.
SEED = 42  # answer to everything; passed as random_state to the generators
N_SAMPLES = 30000  # Number of samples requested from every generator
PUBLIC_PRIVATE_SPLIT = 0.3  # Fraction of samples used for the public dataset

# 2000 vs 8000
# NOTE(review): the note above was a bare (non-comment) line, which is a
# SyntaxError inside a code cell. Presumably it records public vs. private
# sample counts from an earlier configuration -- confirm or remove.
NOISE = 0.1  # Previously was 0.3
OUTPUT_PATH = "data.csv"  # Destination of the generated CSV file

In [None]:
from sklearn.datasets import (
    make_moons, make_circles, make_blobs, make_classification, make_hastie_10_2,
    make_friedman1, make_friedman2, make_friedman3, make_regression
)
from typing import Callable, List, Tuple

class ds:
    """A named, weighted dataset whose arrays are produced at construction time.

    Args:
        weight: Weight associated with this dataset; stored unchanged.
        name: Label for the dataset; stored unchanged.
        generator: Zero-argument callable returning an ``(X, y)`` pair. It is
            called exactly once, inside ``__init__``.

    Attributes:
        weight: The weight passed to the constructor.
        name: The name passed to the constructor.
        X: First element of the pair returned by ``generator``.
        y: Second element of the pair returned by ``generator``.

    Raises:
        ValueError: If ``generator`` does not return exactly two items
            (raised by the tuple unpacking).
    """

    def __init__(self, weight: float, name: str, generator: Callable[[], Tuple[np.ndarray, np.ndarray]]):
        # Materialize the data first; the result must unpack into two items.
        features, target = generator()

        self.weight: float = weight
        self.name: str = name
        self.X: np.ndarray = features
        self.y: np.ndarray = target


# Registry of the component datasets. Every entry is generated right here,
# because ``ds`` calls its generator on construction. All generators use the
# module-level N_SAMPLES and SEED, plus NOISE where it is passed explicitly.
DATASETS: List[ds] = [
    ds(weight=4, name="moon", generator=lambda: make_moons(n_samples=N_SAMPLES, noise=NOISE, random_state=SEED)),
    ds(weight=3, name="circle", generator=lambda: make_circles(n_samples=N_SAMPLES, noise=NOISE, factor=0.6, random_state=SEED)),
    # return_centers=False to avoid returning centers
    ds(weight=2, name="blob", generator=lambda: make_blobs(n_samples=N_SAMPLES, centers=3, n_features=2, random_state=SEED, return_centers=False)),  # type: ignore
    ds(weight=2, name="hastie", generator=lambda: make_hastie_10_2(n_samples=N_SAMPLES, random_state=SEED)),
    ds(weight=2, name="friedman1", generator=lambda: make_friedman1(n_samples=N_SAMPLES, noise=NOISE, random_state=SEED)),
    ds(weight=2, name="friedman2", generator=lambda: make_friedman2(n_samples=N_SAMPLES, noise=NOISE, random_state=SEED)),
    ds(weight=2, name="friedman3", generator=lambda: make_friedman3(n_samples=N_SAMPLES, noise=NOISE, random_state=SEED)),
    ds(weight=1, name="class", generator=lambda: make_classification(n_samples=N_SAMPLES, n_features=5, n_informative=3, n_redundant=1, random_state=SEED)),
    # coef=False to avoid returning coefficients
    ds(weight=1, name="reg", generator=lambda: make_regression(n_samples=N_SAMPLES, n_features=5, n_informative=3, noise=NOISE, random_state=SEED, coef=False)),  # type: ignore
]

In [None]:
# Concatenate the datasets' feature columns in DATASETS order
# (assumes every d.X is 2-D with the same number of rows -- TODO confirm).
X_combined = np.hstack([d.X for d in DATASETS])

# dtype=float is required here: the weights are written as ints, and an
# in-place true division on an integer array raises a casting error
# (float64 result cannot be stored back into an int64 array).
normalized_weights = np.array([d.weight for d in DATASETS], dtype=float)
normalized_weights /= normalized_weights.sum()  # weights now sum to 1

# Weighted sum of the per-dataset targets, using the normalized weights.
y_final = sum(d.y * w for d, w in zip(DATASETS, normalized_weights))

# Some small, random noise to the final target. It is drawn from a generator
# seeded with SEED so that re-running the notebook reproduces the same data
# (the global np.random state was never seeded).
noise_rng = np.random.default_rng(SEED)
y_final = y_final + noise_rng.normal(0, 0.05, size=N_SAMPLES)

In [None]:
# One column per combined feature, named n_0, n_1, ... in column order,
# followed by the "target" column holding y_final.
feature_columns = [f"n_{i}" for i in range(X_combined.shape[1])]
df = pd.DataFrame(X_combined, columns=feature_columns).assign(target=y_final)

# Write without the index so the CSV contains only features and target.
df.to_csv(OUTPUT_PATH, index=False)
print(f"Data saved to {OUTPUT_PATH}")

Data saved to data.csv
