In [7]:
from pathlib import Path

import numpy as np
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler

In [8]:
# Shared configuration for every synthetic dataset generated in this notebook.
seed = 30  # seeds both the scikit-learn generators and the NumPy generator below
n_samples = 500  # number of points per dataset

# Legacy NumPy generator, kept so the drawn values stay reproducible.
rng = np.random.RandomState(seed=seed)

In [9]:
# Root output directory, relative to the current working directory.
clusters_path = Path("./clusters")
# exist_ok=True keeps this cell safe to re-run once the directory exists.
clusters_path.mkdir(exist_ok=True)

In [10]:
# Subdirectory that receives the 2-D clustering archives.
clusters_2d_path = clusters_path / "2D"
# parents=True creates any missing ancestor directory as well, so this cell
# works even if the root output directory has not been created yet;
# exist_ok=True keeps it safe to re-run.
clusters_2d_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Synthetic 2-D toy datasets. Each one is stored as a (points, labels) pair.
noisy_circles = datasets.make_circles(
    n_samples=n_samples,
    factor=0.5,
    noise=0.05,
    random_state=seed,
)
noisy_moons = datasets.make_moons(
    n_samples=n_samples,
    noise=0.05,
    random_state=seed,
)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=seed)

# Random points with no labels (the second element is None).
# NOTE: this draws from the shared `rng`, so re-running this cell without
# re-creating `rng` first produces different points.
no_structure = (rng.rand(n_samples, 2), None)

# Anisotropically distributed data: blobs pushed through a fixed linear map.
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs with varied variances (one standard deviation per cluster).
varied = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state,
)

In [15]:
# Parameters applied to every dataset unless overridden per dataset below.
default_base = {"eps": 0.3}

# Maps dataset name -> (dataset, parameter overrides).
# Insertion order is the order in which the datasets are processed.
dataset_results = {
    "noisy_circles": (noisy_circles, {}),
    "noisy_moons": (noisy_moons, {}),
    "varied": (varied, {"eps": 0.18}),
    "aniso": (aniso, {"eps": 0.15}),
    # NOTE(review): the DBSCAN loop visible in this notebook reads only "eps",
    # so these three keys look unused there -- confirm whether another cell
    # consumes them before removing.
    "blobs": (blobs, {"min_samples": 7, "xi": 0.1, "min_cluster_size": 0.2}),
    "no_structure": (no_structure, {}),
}

In [19]:
# Cluster every dataset with DBSCAN and store the points, the reference labels
# and the predicted labels in one compressed archive per dataset.
for name, (dataset, algo_params) in dataset_results.items():
    # Per-dataset overrides take precedence over the shared defaults.
    params = {**default_base, **algo_params}

    # Standardise the features before clustering.
    X, y = dataset
    X = StandardScaler().fit_transform(X)

    # fit() returns the estimator itself, so construction and fitting chain.
    dbscan = cluster.DBSCAN(eps=params["eps"]).fit(X)
    y_pred = dbscan.labels_.astype(int)

    # NumPy appends ".npz" to the target because the name has no extension.
    # NOTE(review): for "no_structure" y is None, which NumPy stores as a
    # pickled object array; reading it back needs
    # np.load(..., allow_pickle=True) -- confirm consumers handle this.
    np.savez_compressed(clusters_2d_path / name, X=X, y=y, dbscan=y_pred)