In [50]:
# import numpy as np

# n_samples = 10_000
# n_features = 5
# data = np.random.rand(n_samples, n_features)

In [51]:
# import dask.array as da

# # Suppose 'data' is a numpy array of shape (n_samples, n_features).
# # Convert it into a Dask array with a chosen chunk size:
# ddata = da.from_array(data, chunks=(1000, data.shape[1]))


In [52]:
# conda install dask-ml  (or pip install dask-ml)
# from dask_ml.cluster import KMeans

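# Example usage:
# kmeans = KMeans(n_clusters=2, random_state=42)
# or, with scikit-learn's mini-batch variant (not provided by the import above;
# it needs `from sklearn.cluster import MiniBatchKMeans`):
# kmeans = MiniBatchKMeans(n_clusters=2, random_state=42, batch_size=1000)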


In [53]:
# import numpy as np
# import dask.array as da
# from dask_ml.cluster import KMeans  # dask-ml KMeans
# # from sklearn.metrics import pairwise_distances_argmin  # used later

# def recursive_bkmeans_dask(data, num_anchors, current_depth=0, max_depth=None):
#     """
#     Recursively partition data into clusters (balanced K-means style).
#     Returns a Python list of Dask array anchors (each anchor is data.mean(axis=0)).
#     """
#     if max_depth is None:
#         max_depth = int(np.ceil(np.log2(num_anchors)))
    
#     # Base cases
#     if num_anchors == 1 or data.shape[0] <= 1 or current_depth >= max_depth:
#         if data.shape[0] == 0:
#             return []
#         else:
#             # return a Python list with one Dask array (the centroid)
#             return [data.mean(axis=0)]

#     # Fit + predict with Dask-ML KMeans
#     clusterer = KMeans(n_clusters=2, random_state=42)
#     clusterer.fit(data)         # data is a Dask array
#     labels = clusterer.predict(data)

#     left = data[labels == 0]
#     right = data[labels == 1]

#     num_left = num_anchors // 2
#     num_right = num_anchors - num_left

#     # Recursively get anchors on left and right
#     anchors_left = recursive_bkmeans_dask(left, num_left, current_depth + 1, max_depth)
#     anchors_right = recursive_bkmeans_dask(right, num_right, current_depth + 1, max_depth)

#     # Return a Python list (concatenate the lists)
#     return anchors_left + anchors_right


In [54]:
# from sklearn.metrics import pairwise_distances_argmin

# def BKHK_dask(data, num_anchors):
#     """
#     data: a Dask array
#     Returns:
#         anchors: (num_anchors, n_features) NumPy array
#         assignments: (n_samples,) NumPy array of nearest-anchor indices
#     """
#     # Build the recursion in Python (no delayed)
#     anchors_list = recursive_bkmeans_dask(data, num_anchors)
#     # anchors_list is now a plain Python list of Dask arrays

#     # Convert each Dask array (anchor) into a NumPy array
#     anchors_np = [anchor_da.compute() for anchor_da in anchors_list]
#     # Possibly truncate if there are more anchors than needed
#     anchors_np = anchors_np[:num_anchors]
#     # Stack them together
#     anchors = np.stack(anchors_np, axis=0)  # shape = (num_anchors, n_features)

#     # Convert data to NumPy for final assignment
#     data_np = data.compute()  # shape = (n_samples, n_features)

#     # Assign each sample to the nearest anchor
#     assignments = pairwise_distances_argmin(data_np, anchors)

#     return anchors, assignments


In [55]:
# if __name__ == "__main__":
#     import numpy as np
#     import dask.array as da

#     n_samples = 10_000
#     n_features = 5
#     np_data = np.random.rand(n_samples, n_features)  # random data

#     # Convert to Dask array
#     ddata = da.from_array(np_data, chunks=(1000, n_features))

#     k = 16
#     anchors, assignments = BKHK_dask(ddata, k)
    
#     print("Anchors shape:", anchors.shape)          # (16, 5)
#     print("Assignments shape:", assignments.shape)  # (10000,)
#     print("First few assignments:", assignments[:10])


In [56]:
import numpy as np
import dask.array as da
from dask_ml.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin

def recursive_bkmeans_dask(data, num_anchors, current_depth=0, max_depth=None,
                           random_state=42):
    """Recursively bisect ``data`` with 2-means and return one centroid per leaf.

    At every level the rows are split into two groups by a 2-cluster KMeans
    fit; the anchor budget is divided between the groups (``num_anchors // 2``
    for label 0, the remainder for label 1) and each group is processed
    recursively. A leaf contributes the column-wise mean of its rows.

    Parameters
    ----------
    data : dask array, 2-D, with known chunk sizes
        Rows are samples, columns are features. Both the labels and the rows
        are materialised with ``.compute()`` at each split, so every level
        must fit in memory.
    num_anchors : int
        Anchor budget for this subtree; must be at least 1.
    current_depth : int, optional
        Depth of this call in the recursion (callers normally leave it at 0).
    max_depth : int or None, optional
        Depth at which recursion stops. Defaults to ``ceil(log2(num_anchors))``
        of the top-level call.
    random_state : int or None, optional
        Seed forwarded to every KMeans fit. Defaults to 42, the value that was
        previously hard-coded.

    Returns
    -------
    list
        Lazy centroids (each is ``data.mean(axis=0)`` of a leaf), label-0
        subtree first. The list can hold fewer than ``num_anchors`` entries,
        because a partition with no rows contributes nothing.

    Raises
    ------
    ValueError
        If ``num_anchors`` is smaller than 1.
    """
    # Reject a zero/negative budget before np.log2 turns it into -inf/NaN and
    # int() fails with an unrelated-looking conversion error.
    if num_anchors < 1:
        raise ValueError(f"num_anchors must be at least 1, got {num_anchors!r}")

    if max_depth is None:
        max_depth = int(np.ceil(np.log2(num_anchors)))

    n_rows = data.shape[0]
    if num_anchors == 1 or n_rows <= 1 or current_depth >= max_depth:
        # Leaf: no rows means no anchor; otherwise the (lazy) centroid.
        if n_rows == 0:
            return []
        return [data.mean(axis=0)]

    clusterer = KMeans(n_clusters=2, random_state=random_state)
    clusterer.fit(data)  # relies on data having known chunk sizes
    # Boolean-mask indexing on a dask array would leave the children with
    # unknown chunk sizes, so labels and rows are pulled into memory and the
    # split is done with NumPy.
    labels_np = clusterer.predict(data).compute()
    data_np = data.compute()

    num_left = num_anchors // 2
    num_right = num_anchors - num_left

    anchors = []
    for label, budget in ((0, num_left), (1, num_right)):
        part_np = data_np[labels_np == label]
        if part_np.shape[0] == 0:
            # An empty partition yields no anchors; skip building a zero-row
            # dask array only to have the child call return [].
            continue
        # Single-chunk dask array so the child call sees known chunk sizes.
        part_da = da.from_array(part_np, chunks=part_np.shape)
        anchors += recursive_bkmeans_dask(
            part_da, budget, current_depth + 1, max_depth, random_state
        )
    return anchors

def BKHK_dask(data, num_anchors):
    """Build anchors by recursive 2-means bisection and assign samples to them.

    Parameters
    ----------
    data : dask array, 2-D
        Rows are samples, columns are features. The whole array is
        materialised with ``.compute()`` for the final assignment step.
    num_anchors : int
        Maximum number of anchors to keep.

    Returns
    -------
    anchors : numpy.ndarray
        The computed anchors stacked row-wise; at most ``num_anchors`` rows.
    assignments : numpy.ndarray
        For every row of ``data``, the index of its nearest anchor as returned
        by ``pairwise_distances_argmin``.

    Raises
    ------
    ValueError
        If the recursion yields no anchors (for example, ``data`` has no rows).
    """
    anchors_list = recursive_bkmeans_dask(data, num_anchors)
    if not anchors_list:
        # np.stack([]) would fail with "need at least one array to stack";
        # report the actual cause instead.
        raise ValueError(
            "BKHK_dask produced no anchors; `data` must contain at least one sample"
        )

    # anchors_list holds lazy dask arrays. Truncate first so anchors beyond
    # the requested count are never computed, then materialise and stack.
    anchors = np.stack([a.compute() for a in anchors_list[:num_anchors]], axis=0)

    # Final assignment: nearest anchor for every sample.
    data_np = data.compute()
    assignments = pairwise_distances_argmin(data_np, anchors)
    return anchors, assignments

if __name__ == "__main__":
    # Demo: build k anchors for uniformly random points and report the shapes.
    n_samples = 10_000
    n_features = 5

    # Seeded generator so the demo data is identical on every run
    # (uniform on [0, 1), like the np.random.rand call it replaces).
    rng = np.random.default_rng(42)
    np_data = rng.random((n_samples, n_features))

    # Must have known chunk sizes
    ddata = da.from_array(np_data, chunks=(1000, n_features))

    k = 16
    anchors, assignments = BKHK_dask(ddata, k)

    # Sanity checks: no more than k anchors, one assignment per sample.
    assert anchors.shape[0] <= k and anchors.shape[1] == n_features
    assert assignments.shape == (n_samples,)

    print("Anchors shape:", anchors.shape)
    print("Assignments shape:", assignments.shape)


Anchors shape: (16, 5)
Assignments shape: (10000,)
