In [1]:
import pandas as pd
import numpy as np

# DataFrame Creation

In [2]:
# --- Reproducibility -------------------------------------------------------
SEED = 42 # answer to everything
np.random.seed(SEED)  # seeds NumPy's legacy global RNG (used by the np.random.* calls in later cells)

# --- Split sizes -----------------------------------------------------------
# Fractions of the full dataset that go to the train / public / private splits.
TRAIN_FRAC = 0.3
PUBLIC_LEADERBOARD_FRAC = 0.2
PRIVATE_LEADERBOARD_FRAC = 0.5
# Compare with a tolerance: an exact `== 1` on a float sum fails for many
# perfectly valid fraction triples (e.g. 0.1 + 0.2 + 0.7).
assert abs(TRAIN_FRAC + PUBLIC_LEADERBOARD_FRAC + PRIVATE_LEADERBOARD_FRAC - 1.0) < 1e-9, "Fractions must sum to 1"
N_TRAIN_SAMPLES = 10000
# Total sample count such that the train split is N_TRAIN_SAMPLES rows.
# True division + truncation is used instead of float floor division, which
# under-counts when the fraction is not exactly representable
# (e.g. 10000 // 0.1 == 99999.0). The tiny epsilon absorbs rounding error in
# quotients that are mathematically whole numbers.
N_SAMPLES = int(N_TRAIN_SAMPLES / TRAIN_FRAC + 1e-9)
print(f"Total number of samples: {N_SAMPLES}")

PUBLIC_PRIVATE_SPLIT = 0.3 # Fraction of samples used for the public dataset
TEST_TRAIN_SPLIT = 0.2 # Fraction of samples used for the test set

# --- Column names ----------------------------------------------------------
ID_VARIABLE_NAME = "ID" # Name of the ID variable.
TARGET_VARIABLE_NAME = "y" # Name of the target variable.

# --- Noise / missingness ---------------------------------------------------
INITIAL_NOISE = 0.1 # Previously was 0.3
FINAL_NOISE = 0.05 # Added to the final target
NAN_PROB = 0.1 # Probability of a value being NaN

# --- Output ----------------------------------------------------------------
DATA_DIR = "data"

Total number of samples: 33333


In [3]:
from sklearn.datasets import (
    make_moons, make_circles, make_blobs, make_classification, make_hastie_10_2,
    make_friedman1, make_friedman2, make_friedman3, make_regression
)
from typing import Callable

class ds:
    """One generated component dataset together with its mixing weight.

    The generator is invoked exactly once, at construction time. Its feature
    matrix and target vector are kept both raw (``raw_X`` / ``raw_y``) and as
    labelled pandas objects (``X`` / ``y``).

    Parameters
    ----------
    weight:
        Weight stored on the instance as ``weight``.
    name:
        Identifier for the dataset. It prefixes the feature column names
        (``<name>_1``, ``<name>_2``, ...), names the target Series
        (``<name>_y``), and is the sole basis for equality and hashing.
    generator:
        Zero-argument callable returning ``(X, y)``; ``X`` must be
        two-dimensional because its second axis determines the column count.
    """

    def __init__(self, weight: float, name: str, generator: Callable[[], tuple[np.ndarray, np.ndarray]]):
        self.weight: float = weight
        self.name: str = name

        self.raw_X: np.ndarray
        self.raw_y: np.ndarray
        self.raw_X, self.raw_y = generator()

        # Feature columns are numbered from 1: "<name>_1", "<name>_2", ...
        feature_names = [f"{self.name}_{i+1}" for i in range(self.raw_X.shape[1])]
        self.X: pd.DataFrame = pd.DataFrame(self.raw_X, columns=feature_names)

        # Must be an f-string: without the prefix every instance's target
        # Series was literally named "{self.name}_y".
        self.y: pd.Series = pd.Series(self.raw_y, name=f"{self.name}_y")

    def __hash__(self):
        # Consistent with __eq__: identity is the dataset name only.
        return hash(self.name)

    def __eq__(self, other):
        return isinstance(other, ds) and self.name == other.name

# Make sure each has a unique name.
# Component datasets, declared as (weight, name, generator) rows and wrapped
# in `ds` objects in the listed order. Each generator is called once, inside
# `ds.__init__`, and every sklearn call is pinned to random_state=SEED.
DATASETS: list[ds] = [
    ds(weight, name, generator)
    for weight, name, generator in (
        (4, "moon",      lambda: make_moons(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED)),
        (3, "circle",    lambda: make_circles(n_samples=N_SAMPLES, noise=INITIAL_NOISE, factor=0.6, random_state=SEED)),
        (2, "blob",      lambda: make_blobs(n_samples=N_SAMPLES, centers=3, n_features=2, random_state=SEED, return_centers=False)), # type: ignore # return_centers=False to avoid returning centers
        (2, "hastie",    lambda: make_hastie_10_2(n_samples=N_SAMPLES, random_state=SEED)),
        (2, "friedman1", lambda: make_friedman1(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED)),
        (2, "friedman2", lambda: make_friedman2(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED)),
        (2, "friedman3", lambda: make_friedman3(n_samples=N_SAMPLES, noise=INITIAL_NOISE, random_state=SEED)),
        (1, "class",     lambda: make_classification(n_samples=N_SAMPLES, n_features=5, n_informative=3, n_redundant=1, random_state=SEED)),
        (1, "reg",       lambda: make_regression(n_samples=N_SAMPLES, n_features=5, n_informative=3, noise=INITIAL_NOISE, random_state=SEED, coef=False)), # type: ignore # coef=False to avoid returning coefficients
    )
]

In [5]:
# Assemble the wide frame: a 0..N_SAMPLES-1 ID column first, followed by the
# feature columns of every component dataset in DATASETS order.
feature_frames = [dataset.X for dataset in DATASETS]
id = pd.Series(data=np.arange(N_SAMPLES), name=ID_VARIABLE_NAME)
df = pd.concat([id, *feature_frames], axis=1)

In [6]:
# Scale the dataset weights so they sum to 1 (float64 so the division is exact-typed).
raw_weights = np.array([dataset.weight for dataset in DATASETS], dtype=np.float64)
normalized_weights = raw_weights / raw_weights.sum()

# Final target = weighted sum of every component target ...
y_final = sum(dataset.y * w for dataset, w in zip(DATASETS, normalized_weights))
# ... plus a little Gaussian noise with standard deviation FINAL_NOISE.
y_final = y_final + np.random.normal(0, FINAL_NOISE, size=N_SAMPLES)
df[TARGET_VARIABLE_NAME] = y_final

# Analysis

# CSV Creation

In [7]:
# Make some cells NaN with a given probability (NAN_PROB).

# The ID and target columns are excluded: the previous version sampled
# positions over *all* columns, which also blanked identifiers and targets.
protected_cols = [ID_VARIABLE_NAME, TARGET_VARIABLE_NAME]
nan_cols = [col for col in df.columns if col not in protected_cols]

# One independent draw per eligible cell, so each cell is NaN with probability
# exactly NAN_PROB. (Sampling positions with replacement hit some cells more
# than once, making the real NaN rate lower than the nominal one.)
nan_mask = np.random.random(size=(len(df), len(nan_cols))) < NAN_PROB

# Vectorised assignment instead of one `.loc` write per cell.
df[nan_cols] = df[nan_cols].mask(nan_mask)

n_cells = nan_mask.size       # number of cells eligible for blanking
n_nan = int(nan_mask.sum())   # number of cells actually blanked

# Sanity check: identifiers and targets must be untouched.
assert df[protected_cols].notna().all().all(), "ID/target columns must not contain NaN"

In [8]:
# Randomly reorder every column except the last one (the target), which stays
# in the final position.
cols = df.columns.tolist()
leading_cols, target_col = cols[:-1], cols[-1]
shuffled_cols = np.random.permutation(leading_cols).tolist()
df = df[shuffled_cols + [target_col]]

In [9]:
# TODO: Renaming step. The names should fit the theme. After renaming, some columns can be Zalgo'd. Currently all of them are.

In [10]:
# Zalgo the column names. Kinda evil but it adds a bit of fun to the dataset.

import random

# Zalgo character set
import unicodedata


def _combining_marks(first: int, last: int) -> list[str]:
    """Return the combining marks in the inclusive code point range [first, last].

    Code points whose Unicode general category is not a Mark (for example
    unassigned code points inside a block) are skipped, so only real
    combining characters end up in column names.
    """
    return [
        ch
        for ch in map(chr, range(first, last + 1))  # +1: Unicode block ends are inclusive
        if unicodedata.category(ch).startswith("M")
    ]


zalgo_up = _combining_marks(0x0300, 0x036F)    # Combining Diacritical Marks
zalgo_mid = _combining_marks(0x1AB0, 0x1AFF)   # Combining Diacritical Marks Extended
zalgo_down = _combining_marks(0x1DC0, 0x1DFF)  # Combining Diacritical Marks Supplement
zalgo_chars = zalgo_up + zalgo_mid + zalgo_down

def zalgoify(text: str, intensity: float = 0.5, max_marks: int = 3) -> str:
    """
    Decorate `text` with randomly chosen marks from `zalgo_chars`.

    Every original character is kept, in order. After each one, with
    probability `intensity`, between 1 and `max_marks` marks are appended.

    intensity: 0.0 (no zalgo) → 1.0 (every char zalgo'd)
    max_marks: maximum number of diacritic marks per affected character
    """
    pieces: list[str] = []
    for char in text:
        pieces.append(char)
        if random.random() < intensity:
            n_marks = random.randint(1, max_marks)
            pieces.extend(random.choice(zalgo_chars) for _ in range(n_marks))
    return "".join(pieces)

# Apply zalgo to the column names.
# zalgoify draws from the stdlib `random` module, which np.random.seed(SEED)
# does not touch; seed it here so the generated column names are the same on
# every run.
random.seed(SEED)
df.columns = [zalgoify(col) for col in df.columns]

In [11]:
# Carve the frame into three contiguous row ranges:
#   [0, train_end)            -> train
#   [train_end, public_end)   -> public leaderboard
#   [public_end, end of df)   -> private leaderboard (takes whatever remains)
N_PUBLIC_SAMPLES = int(N_SAMPLES * PUBLIC_LEADERBOARD_FRAC)
train_end = N_TRAIN_SAMPLES
public_end = train_end + N_PUBLIC_SAMPLES

train = df.iloc[:train_end]
public = df.iloc[train_end:public_end]
private = df.iloc[public_end:]

In [13]:
# Save the datasets to CSV files
from datetime import datetime
from pathlib import Path

# Day-of-month + time stamp, shared by the three files written in this run.
timestamp = datetime.now().strftime("%d_%H-%M-%S")

# Create the output directory up front; to_csv does not create missing folders.
output_dir = Path(DATA_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# `index` is a boolean switch; the header for the index column is set through
# `index_label`. Passing the name via `index` only enabled index output (any
# non-empty string is truthy) and left that column without a header.
# NOTE(review): df still carries the original ID column as well (shuffled
# among the features and renamed by the Zalgo step) — confirm whether it
# should be dropped before export.
for split_name, split_df in (("train", train), ("public", public), ("private", private)):
    split_df.to_csv(
        output_dir / f"{split_name}_{timestamp}.csv",
        index=True,
        index_label=ID_VARIABLE_NAME,
    )