In [9]:
import pandas as pd
import numpy as np

In [10]:
# Notebook-wide configuration.
SEED = 42 # answer to everything; passed as random_state to the dataset generators

N_SAMPLES = 30000
PUBLIC_PRIVATE_SPLIT = 0.3 # Fraction of samples used for the public dataset
TEST_TRAIN_SPLIT = 0.2 # Fraction of samples used for the test set

# Human-readable summary of the resulting split. The format spec is ".1f"
# (one decimal place); the earlier "1f" meant "minimum width 1" and therefore
# printed the default six decimals.
SPLIT_SUMMARY = (
    f"Private dataset: {(1 - PUBLIC_PRIVATE_SPLIT) * 100:.1f}%\n"
    f"Public train dataset: {PUBLIC_PRIVATE_SPLIT * (1 - TEST_TRAIN_SPLIT) * 100:.1f}%\n"
    f"Public test dataset: {PUBLIC_PRIVATE_SPLIT * TEST_TRAIN_SPLIT * 100:.1f}%"
)
print(SPLIT_SUMMARY)
print(f"Public train dataset size: {int(N_SAMPLES * PUBLIC_PRIVATE_SPLIT * (1 - TEST_TRAIN_SPLIT))}")

INITIAL_NOISE = 0.1 # Previously was 0.3
FINAL_NOISE = 0.05 # Added to the final target

Private dataset: 70.000000%
Public train dataset: 24.000000%
Public test dataset: 6.000000%
Public train dataset size: 7200


In [None]:
from sklearn.datasets import (
    make_moons, make_circles, make_blobs, make_classification, make_hastie_10_2,
    make_friedman1, make_friedman2, make_friedman3, make_regression
)
from typing import Callable

class ds:
    """A named, weighted dataset materialised from a generator callable.

    The generator is called exactly once, during construction, and its
    ``(X, y)`` result is stored as a DataFrame / Series whose column and
    series names are prefixed with the dataset's name.

    Two ``ds`` objects compare equal (and hash equal) when their names match,
    regardless of weight or contents.
    """

    def __init__(self, weight: float, name: str, generator: Callable[[], tuple[np.ndarray, np.ndarray]]):
        """Build the dataset by invoking ``generator``.

        Args:
            weight: Relative weight of this dataset; stored as-is.
            name: Identifier used for equality/hashing and as the prefix of
                the feature column names and of the target name.
            generator: Zero-argument callable returning ``(X, y)``. ``X`` must
                be two-dimensional (``X.shape[1]`` is read to name columns).
        """
        self.weight: float = weight
        self.name: str = name

        X: np.ndarray
        y: np.ndarray
        X, y = generator()
        # Feature columns are named "<name>_0", "<name>_1", ...
        self.X: pd.DataFrame = pd.DataFrame(X, columns=[f"{self.name}_{i}" for i in range(X.shape[1])])
        # Target is named "<name>_y". This must be an f-string; a plain string
        # would give every dataset the literal name "{self.name}_y".
        self.y: pd.Series = pd.Series(y, name=f"{self.name}_y")

    def __hash__(self):
        """Hash on the name only, consistent with ``__eq__``."""
        return hash(self.name)

    def __eq__(self, other):
        """Return True when ``other`` is a ``ds`` with the same name."""
        return isinstance(other, ds) and self.name == other.name

# Registry of the component datasets. Names must be unique: `ds` equality and
# hashing are based on the name, and the name prefixes the feature columns.
# Every generator is called with N_SAMPLES and random_state=SEED; the ones
# that accept a `noise` argument are given INITIAL_NOISE.
DATASETS: list[ds] = [
    ds(
        weight=4,
        name="moon",
        generator=lambda: make_moons(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED),
    ),
    ds(
        weight=3,
        name="circle",
        generator=lambda: make_circles(n_samples=N_SAMPLES, noise=INITIAL_NOISE, factor=0.6, random_state=SEED),
    ),
    ds(
        weight=2,
        name="blob",
        # return_centers=False so only (X, y) comes back, not the centers.
        generator=lambda: make_blobs(n_samples=N_SAMPLES, centers=3, n_features=2, random_state=SEED, return_centers=False),  # type: ignore
    ),
    ds(
        weight=2,
        name="hastie",
        generator=lambda: make_hastie_10_2(n_samples=N_SAMPLES, random_state=SEED),
    ),
    ds(
        weight=2,
        name="friedman1",
        generator=lambda: make_friedman1(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED),
    ),
    ds(
        weight=2,
        name="friedman2",
        generator=lambda: make_friedman2(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED),
    ),
    ds(
        weight=2,
        name="friedman3",
        generator=lambda: make_friedman3(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED),
    ),
    ds(
        weight=1,
        name="class",
        generator=lambda: make_classification(n_samples=N_SAMPLES, n_features=5, n_informative=3, n_redundant=1, random_state=SEED),
    ),
    ds(
        weight=1,
        name="reg",
        # coef=False so only (X, y) comes back, not the coefficients.
        generator=lambda: make_regression(n_samples=N_SAMPLES, n_features=5, n_informative=3, noise=INITIAL_NOISE, random_state=SEED, coef=False),  # type: ignore
    ),
]

In [None]:
# Place every dataset's feature frame side by side (column-wise) in one table.
df = pd.concat((dataset.X for dataset in DATASETS), axis="columns")

In [None]:
# Blend the per-dataset targets into a single target: a weighted sum in which
# each dataset's weight is first normalised so that all weights add up to 1.
normalized_weights = np.array([d.weight for d in DATASETS], dtype=np.float64) # float64 so the in-place division below works
normalized_weights /= normalized_weights.sum()
y_final = sum(d.y * w for d, w in zip(DATASETS, normalized_weights))

# Some small, random noise to the final target. It is drawn from a generator
# seeded with SEED (instead of the unseeded global NumPy state) so that a
# fresh "Restart & Run All" reproduces exactly the same dataset.
y_final += np.random.default_rng(SEED).normal(0, FINAL_NOISE, size=N_SAMPLES)
df['y'] = y_final

In [18]:
from datetime import datetime

# File name carries day-of-month and wall-clock time of the export
# (no month or year component).
timestamp = format(datetime.now(), "%d_%H-%M-%S")
output_path = f"data_{timestamp}.csv"

# Write the table without the index column, then report where it went.
df.to_csv(path_or_buf=output_path, index=False)
print(f"Data saved to {output_path}")

Data saved to data_09_15-42-21.csv
