# Performance Benchmark Data

In [None]:
# Reference: HPDBSCAN (highly parallel DBSCAN)
# https://www.researchgate.net/publication/301463871_HPDBSCAN_highly_parallel_DBSCAN
#
# For the benchmark files, eps is set to 0.01 and minPoints to 40.

In [22]:
import h5py
import numpy as np

In [28]:
# Open the Twitter sample HDF5 file read-only and list its top-level keys.
# `f` stays open on purpose: the following cells read datasets through it.
f = h5py.File('./data/twitterSmall.h5.h5', mode='r')
print(f.keys())


<KeysViewHDF5 ['Clusters', 'DBSCAN']>


In [29]:
# Read the 'Clusters' dataset into memory once so both statistics below work
# on the same array (presumably one cluster label per point — confirm against
# the data description).
cluster_labels = f['Clusters'][:]

# Total number of entries in the dataset.
print(len(cluster_labels))
# Number of distinct values. np.unique is vectorised, which avoids hashing
# millions of NumPy scalars one by one in a Python set.
print(len(np.unique(cluster_labels)))

3704351
5820


In [30]:
# Peek at the first ten rows of the 'DBSCAN' dataset to check its layout.
head_rows = f['DBSCAN'][:10]
print(head_rows)

[[53.544956 -2.767532]
 [51.623474 -0.134995]
 [51.948055 -2.069167]
 [51.080994 -1.331917]
 [52.204224 -2.230113]
 [51.60577  -0.347001]
 [53.579166  0.654444]
 [51.11222  -0.163057]
 [51.52723  -0.107052]
 [52.403286 -1.479962]]


In [31]:
import os

from numpy.random import default_rng

# DBSCAN parameters written into the header of every benchmark file.
BENCHMARK_EPS = 0.01
BENCHMARK_MIN_POINTS = 40
# Fixed seed so that "Restart & Run All" regenerates identical subsamples.
BENCHMARK_SEED = 0
BENCHMARK_DIR = "./data/benchmark"

dataset = f['DBSCAN']
# Load the points into memory once; converting inside the loop re-read the
# whole HDF5 dataset for every sample size.
all_points = np.asarray(dataset)

# A single seeded generator shared by all sizes keeps every file reproducible.
rng = default_rng(BENCHMARK_SEED)

# Make sure the target directory exists before any file is opened.
os.makedirs(BENCHMARK_DIR, exist_ok=True)

for size in [20000, 50000, 100000]:
    # Draw `size` distinct row indices (sampling without replacement).
    numbers = rng.choice(len(all_points), size=size, replace=False)
    X = all_points[numbers]
    out_path = os.path.join(BENCHMARK_DIR, "twitter_{}.txt".format(len(X)))
    with open(out_path, "w") as f_o:
        # File layout: point count, then "eps minPoints", then one line with
        # the two coordinates of each point.
        f_o.write("{} \n".format(len(X)))
        f_o.write("{} {}\n".format(BENCHMARK_EPS, BENCHMARK_MIN_POINTS))
        f_o.writelines("{} {}\n".format(row[0], row[1]) for row in X)

# Correctness Validation Data

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Reference clustering of the full 'DBSCAN' dataset with scikit-learn.
# NOTE(review): eps=3 / min_samples=2 differ from the eps=0.01 / minPoints=40
# written into the benchmark files — confirm which values are intended here.
dbscan_model = DBSCAN(eps=3, min_samples=2)
# fit() returns the estimator itself, so `clustering` is the fitted model.
clustering = dbscan_model.fit(f['DBSCAN'])

In [7]:
import os
import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

# Seed the legacy global NumPy RNG: make_circles, make_moons and
# np.random.rand below are called without an explicit random_state.
np.random.seed(0)

# ============
# Generate the six synthetic 2-D datasets. The order of the calls is kept
# fixed because several of them draw from the global RNG seeded above.
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropically distributed data: blobs pushed through a linear transform.
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs with varied variances.
varied = datasets.make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)

# ============
# Set up cluster parameters. Only "eps" is consumed by the export loop below;
# the remaining keys are carried along unchanged.
# ============
default_base = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 10,
    "n_clusters": 3,
    "min_samples": 20,
    "xi": 0.05,
    "min_cluster_size": 0.1,
}

# (data, per-dataset parameter overrides) pairs. Deliberately NOT named
# `datasets`: that name is the imported sklearn module, and rebinding it made
# this cell fail with AttributeError when it was run a second time.
dataset_configs = [
    (
        noisy_circles,
        {
            "damping": 0.77,
            "preference": -240,
            "quantile": 0.2,
            "n_clusters": 2,
            "min_samples": 20,
            "xi": 0.25,
        },
    ),
    (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}),
    (
        varied,
        {
            "eps": 0.18,
            "n_neighbors": 2,
            "min_samples": 5,
            "xi": 0.035,
            "min_cluster_size": 0.2,
        },
    ),
    (
        aniso,
        {
            "eps": 0.15,
            "n_neighbors": 2,
            "min_samples": 20,
            "xi": 0.1,
            "min_cluster_size": 0.2,
        },
    ),
    (blobs, {}),
    (no_structure, {}),
]

SIMPLE_DIR = "./data/simple"
# minPoints written to every validation file.
# NOTE(review): params["min_samples"] is not used for this value — confirm
# that a fixed 5 is intended for all datasets.
SIMPLE_MIN_POINTS = 5

# Make sure the target directory exists before any file is opened.
os.makedirs(SIMPLE_DIR, exist_ok=True)

for i_dataset, (sample, algo_params) in enumerate(dataset_configs):
    # Start from the defaults and apply the dataset-specific overrides.
    params = default_base.copy()
    params.update(algo_params)

    X, _labels = sample

    # Normalize the dataset for easier parameter selection.
    X = StandardScaler().fit_transform(X)

    # The point count in the file name follows the data instead of being
    # hard-coded, so it stays correct if n_samples is changed.
    out_path = os.path.join(
        SIMPLE_DIR, "simple_{}_{}.txt".format(i_dataset, len(X))
    )
    # The handle is called f_o (not f) so it does not overwrite the HDF5 file
    # handle `f` that other cells read from.
    with open(out_path, "w") as f_o:
        # File layout: point count, then "eps minPoints", then one line with
        # the two coordinates of each point.
        f_o.write("{} \n".format(len(X)))
        f_o.write("{} {}\n".format(params["eps"], SIMPLE_MIN_POINTS))
        f_o.writelines("{} {}\n".format(row[0], row[1]) for row in X)


<Figure size 1512x936 with 0 Axes>