In [1]:
import dask.array as da
from dask.distributed import Client
from dask_ml.preprocessing import StandardScaler
from dask_ml.cluster import KMeans
import numpy as np
from dask_ml.metrics import pairwise_distances, pairwise_distances_argmin_min
from time import time
from timeit import default_timer as now
from dask_ml.datasets import make_blobs
import pandas as pd
from dask.distributed import SSHCluster
import matplotlib.pyplot as plt
import getpass
from sklearn.datasets import fetch_kddcup99
from itertools import product
from tqdm.auto import tqdm
import pickle, pathlib

In [2]:
def total_min_distance(data, centroids):
    distances = pairwise_distances(data, centroids, metric='sqeuclidean')
    return distances.min(axis=1).sum().compute()

def initial_candidate_selection(data, num_clusters, oversampling_factor):
    num_points, _ = data.shape
    seed_idx = np.random.choice(num_points, size=1)
    centroid_pool = data[seed_idx].compute()

    init_cost = total_min_distance(data, centroid_pool)
    num_rounds = int(np.log(init_cost + 1e-6))  # Avoid log(0)

    for _ in range(num_rounds):
        dist_sq = pairwise_distances(data, centroid_pool, metric='sqeuclidean').min(axis=1)
        prob_dist = (oversampling_factor * dist_sq / dist_sq.sum()).compute()
        #random_values = da.random.random(size=len(prob_dist), chunks=prob_dist.chunks)
        
        chunk_size = int(np.ceil(len(prob_dist) / 12))
        random_values = da.random.random(size=len(prob_dist), chunks=(chunk_size,))
        
        selected_mask = random_values < prob_dist
        selected_indices = da.where(selected_mask)[0].compute()
        new_centroids = data[sorted(selected_indices)].compute()
        centroid_pool = np.vstack([centroid_pool, new_centroids])

    return centroid_pool

def compute_assignment_weights(data, centroids):
    distances = pairwise_distances(data, centroids, metric='euclidean')
    closest_indices = da.argmin(distances, axis=1)
    counts = da.bincount(closest_indices, minlength=len(centroids)).compute()
    weight_sum = counts.sum()
    if weight_sum == 0:
        return np.ones(len(centroids)) / len(centroids)
    cluster_weights = counts / weight_sum
    return cluster_weights

def kmeans_plus_plus_weighted_init(data, cluster_weights, num_clusters):
    num_points, _ = data.shape
    seed_idx = np.random.choice(num_points, size=1)
    centers = data[seed_idx].compute()

    for _ in range(1, num_clusters):
        dist_sq = pairwise_distances(data, centers, metric='sqeuclidean').min(axis=1) * cluster_weights
        prob = dist_sq / dist_sq.sum()
        new_idx = np.random.choice(num_points, size=1, p=prob.compute())
        new_center = data[sorted(new_idx)].compute()
        centers = np.vstack([centers, new_center])

    return centers

def assign_to_centroids(data, centers):
    cluster_labels, _ = pairwise_distances_argmin_min(data, centers, metric='sqeuclidean')
    return cluster_labels

def weighted_centroid_update(data, cluster_labels, cluster_weights, num_clusters):
    updated = []
    for cluster_id in range(num_clusters):
        cluster_mask = cluster_labels == cluster_id
        cluster_data = data[cluster_mask]
        weight_subset = cluster_weights[cluster_mask]
        if cluster_data.shape[0] == 0:
            continue
        cluster_mean = da.average(cluster_data, axis=0, weights=weight_subset)
        updated.append(cluster_mean)
    return da.stack(updated)

def run_lloyds(data, cluster_weights, num_clusters, max_iter=100, tolerance=1e-8):
    centroids = kmeans_plus_plus_weighted_init(data, cluster_weights, num_clusters)

    for _ in range(max_iter):
        cluster_labels = assign_to_centroids(data, centroids).compute()
        new_centroids = weighted_centroid_update(data, cluster_labels, cluster_weights, num_clusters).compute()
        if da.allclose(new_centroids, centroids, atol=tolerance).compute():
            break
        centroids = new_centroids
    return cluster_labels, centroids

def run_distributed_kmeans(data, num_clusters, max_iter=100, tolerance=1e-8, oversample_factor=2):
    candidate_centroids = initial_candidate_selection(data, num_clusters, oversample_factor)
    cluster_weights = compute_assignment_weights(data, candidate_centroids)
    dask_centroids = da.from_array(candidate_centroids, chunks=(candidate_centroids.shape[0], candidate_centroids.shape[1]))
    cluster_labels, centroids = run_lloyds(dask_centroids, cluster_weights, num_clusters, max_iter, tolerance)
    
    for _ in range(max_iter):
        cluster_labels = assign_to_centroids(data, centroids)
        new_centroids = da.stack([data[cluster_labels == i].mean(axis=0) for i in range(num_clusters)]).compute()
        if da.allclose(centroids, new_centroids, atol=tolerance).compute():
            break
        centroids = new_centroids

    return cluster_labels, centroids

# whole data with adjustable fraction (80%)

In [3]:
# Load compressed dataset without headers
raw_data = pd.read_csv('kddcup.data.gz', header=None, compression='gzip')

# Define categorical column indices to exclude
exclude_indices = [1, 2, 3, 41]

# Filter out categorical columns by selecting only numerical ones
numerical_data = raw_data.drop(columns=raw_data.columns[exclude_indices])

# Determine number of rows and the proportion to retain
total_entries = numerical_data.shape[0]
sample_fraction = 0.8  # Adjust this to use a smaller portion of the dataset if needed
sample_count = int(sample_fraction * total_entries)

# Extract a subset of the numerical data
subset = numerical_data.iloc[:sample_count]

# Convert to NumPy array for further processing or conversion to Dask
data = subset.to_numpy()

# Free up memory by deleting the original DataFrame
del raw_data, numerical_data, subset


In [4]:
def evaluate_kmeans_performance(dataset, num_workers, threads_per_node, mem_cap, chunk_spec, repetitions):
    # Define the list of VM IPs to use for the cluster
    vm_ips = [
        "10.67.22.199",  # Acts as scheduler and worker
        "10.67.22.199",
        "10.67.22.138",
        "10.67.22.85"
    ]

    ssh_creds = {
        "username": "ungureanu",
        "password": getpass.getpass("SSH password:"),
        "known_hosts": None
    }

    # Start the SSH-based Dask cluster
    cluster = SSHCluster(
        hosts=vm_ips,
        connect_options=ssh_creds,
        remote_python="/opt/miniconda3/envs/dask-env/bin/python",
        scheduler_options={
            "port": 8786,
            "dashboard_address": ":8787",
        },
        worker_options={
            "n_workers": num_workers,
            "nthreads": threads_per_node,
            "memory_limit": mem_cap
        }
    )

    client = Client(cluster)
    print(client)
    print(f"\nRunning benchmark: workers={num_workers}, threads={threads_per_node}, "
          f"memory={mem_cap}, chunks={chunk_spec}")

    # Transform to Dask array with chunking
    chunked_data = da.from_array(dataset, chunks=(dataset.shape[0] // chunk_spec, dataset.shape[1]))

    # Apply normalization
    normalizer = StandardScaler(with_mean=True)
    processed_data = normalizer.fit_transform(chunked_data)
    del chunked_data  # Free memory

    processed_data = processed_data.persist()  # Avoid redundant computation

    n_clusters = 4

    # Run custom KMeans benchmarking
    timing = []
    for _ in range(repetitions):
        tic = now()
        labels, centroids = run_distributed_kmeans(processed_data, n_clusters, 2)
        toc = now()
        timing.append(toc - tic)

    mean_time = np.mean(timing)
    std_time = np.std(timing)


    print(f"KMeans Avg: {mean_time:.2f}s ± {std_time:.2f}s")

    # Collect benchmarking summary
    summary = {
        'workers': num_workers,
        'threads': threads_per_node,
        'memory_limit': mem_cap,
        'chunks': chunk_spec,
        'kmeans_mean_time': mean_time,
        'kmeans_std_time': std_time
    }

    client.close()
    cluster.close()

    return summary


In [11]:
# List of parameter sets for distributed workers
configurations = [
    #done{'workers': 1, 'threads': 4, 'mem': '7 GB'},    # full VM
    #done{'workers': 2, 'threads': 2, 'mem': '3.5 GB'},  # split
    #does not start{'workers': 3, 'threads': 1, 'mem': '2.3 GB'},  # one per VM
    #{'workers': 1, 'threads': 2, 'mem': '7 GB'},    # under-utilise
    #{'workers': 2, 'threads': 1, 'mem': '3.5 GB'},  # limited threads
    #does not start{'workers': 3, 'threads': 2, 'mem': '2.0 GB'},  # oversubscribe
    {'workers': 1, 'threads': 3, 'mem': '7 GB'},    # odd threading
]

chunk_options = [4, 6, 8, 10, 16]

out_dir = pathlib.Path("bench_runs")
out_dir.mkdir(exist_ok=True)

for cfg, chunks in product(configurations, chunk_options):

    # … run evaluate_kmeans_performance exactly as before …
    output = evaluate_kmeans_performance(
        dataset=data,
        num_workers=cfg["workers"],
        threads_per_node=cfg["threads"],
        mem_cap=cfg["mem"],
        chunk_spec=chunks,
        repetitions=3,
    )
    output.update(cfg, chunks=chunks)

    # unique filename per combo
    stamp = int(time() * 1000)          # millisecond timestamp
    fname = f"w{cfg['workers']}_t{cfg['threads']}_c{chunks}_{stamp}.pkl"
    pickle.dump(output, (out_dir / fname).open("wb"))

SSH password: ········


2025-07-06 20:31:23,108 - distributed.deploy.ssh - INFO - 2025-07-06 20:31:23,108 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2025-07-06 20:31:23,135 - distributed.deploy.ssh - INFO - 2025-07-06 20:31:23,134 - distributed.scheduler - INFO - State start
2025-07-06 20:31:23,136 - distributed.deploy.ssh - INFO - 2025-07-06 20:31:23,135 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-rfjjghf8', purging
2025-07-06 20:31:23,136 - distributed.deploy.ssh - INFO - 2025-07-06 20:31:23,136 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-whhioaa2', purging
2025-07-06 20:31:23,137 - distributed.deploy.ssh - INFO - 2025-07-06 20:31:23,136 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-626vyvrn', purging
2025-07-06 20:31:23,139 - di

<Client: 'tcp://10.67.22.199:8786' processes=2 threads=6, memory=13.04 GiB>

Running benchmark: workers=1, threads=3, memory=7 GB, chunks=4


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/

KMeans Avg: 18.86s ± 1.37s


SSH password: ········


2025-07-06 20:32:33,185 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,184 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2025-07-06 20:32:33,212 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,211 - distributed.scheduler - INFO - State start
2025-07-06 20:32:33,213 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,213 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-0qxh47rs', purging
2025-07-06 20:32:33,216 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,216 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.199:8786
2025-07-06 20:32:33,661 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,660 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.199:42041'
2025-07-06 20:32:33,861 - distributed.deploy.ssh - INFO - 2025-07-06 20:32:33,860 - distributed.nanny - INFO -   

<Client: 'tcp://10.67.22.199:8786' processes=2 threads=6, memory=13.04 GiB>

Running benchmark: workers=1, threads=3, memory=7 GB, chunks=6


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/

KMeans Avg: 16.22s ± 1.53s


SSH password: ········


2025-07-06 20:33:41,338 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:41,337 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2025-07-06 20:33:41,364 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:41,364 - distributed.scheduler - INFO - State start
2025-07-06 20:33:41,366 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:41,364 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-ufhdy96k', purging
2025-07-06 20:33:41,367 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:41,367 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.199:8786
2025-07-06 20:33:41,809 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:41,809 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.199:43619'
2025-07-06 20:33:42,064 - distributed.deploy.ssh - INFO - 2025-07-06 20:33:42,063 - distributed.nanny - INFO -   

<Client: 'tcp://10.67.22.199:8786' processes=2 threads=6, memory=13.04 GiB>

Running benchmark: workers=1, threads=3, memory=7 GB, chunks=8


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/

KMeans Avg: 16.18s ± 1.82s


SSH password: ········


2025-07-06 20:34:44,457 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:44,456 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2025-07-06 20:34:44,483 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:44,483 - distributed.scheduler - INFO - State start
2025-07-06 20:34:44,484 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:44,484 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-4t2xijpm', purging
2025-07-06 20:34:44,487 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:44,487 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.199:8786
2025-07-06 20:34:44,915 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:44,915 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.199:36231'
2025-07-06 20:34:45,130 - distributed.deploy.ssh - INFO - 2025-07-06 20:34:45,129 - distributed.nanny - INFO -   

<Client: 'tcp://10.67.22.199:8786' processes=2 threads=6, memory=13.04 GiB>

Running benchmark: workers=1, threads=3, memory=7 GB, chunks=10


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/

KMeans Avg: 15.31s ± 1.68s


SSH password: ········


2025-07-06 20:35:48,812 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:48,811 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2025-07-06 20:35:48,837 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:48,837 - distributed.scheduler - INFO - State start
2025-07-06 20:35:48,839 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:48,838 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-g4lxzqbk', purging
2025-07-06 20:35:48,841 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:48,841 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.199:8786
2025-07-06 20:35:49,284 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:49,283 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.199:34867'
2025-07-06 20:35:49,552 - distributed.deploy.ssh - INFO - 2025-07-06 20:35:49,551 - distributed.nanny - INFO -   

<Client: 'tcp://10.67.22.199:8786' processes=2 threads=6, memory=13.04 GiB>

Running benchmark: workers=1, threads=3, memory=7 GB, chunks=16


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/

KMeans Avg: 15.16s ± 1.22s


In [1]:
results_df = pd.DataFrame(results_log)

NameError: name 'pd' is not defined

In [2]:
results_df

NameError: name 'results_df' is not defined

# just to be sure it is saved

In [30]:
results_df.to_pickle(f'{workers}_workers_{threads}_threads_'+memory+f'_memory_{chunks}_chunks.pickle')  

In [7]:
client = Client("tcp://10.67.22.199:8786")
client.close()

In [14]:
import pickle

with open("bench_runs/w1_t2_c4_1751824905448.pkl", "rb") as f:
    obj = pickle.load(f)

print(type(obj))
print(obj)


<class 'dict'>
{'workers': 1, 'threads': 2, 'memory_limit': '7 GB', 'chunks': 4, 'kmeans_mean_time': 17.099651399592403, 'kmeans_std_time': 2.0737015355583686, 'mem': '7 GB'}
