In [2]:
import sys
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import torch
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Make the src/ directory two levels above the current working directory importable (needed for kmeansmm)
sys.path.insert(0, str(Path().resolve().parents[1]) + "/src/")

from kmeansmm import KMeansMM


pd.set_option("display.max_colwidth", None)

URL = "https://archive.ics.uci.edu/static/public/231/pamap2+physical+activity+monitoring.zip"

In [6]:
def collect_dataset() -> pd.DataFrame:
    """Download the zip archive at ``URL`` and load its first file as a DataFrame.

    The archive is fetched fully into memory, its first non-directory member is
    parsed with ``pd.read_csv``, and the parsed frame is also written to
    ``pamap2.csv`` in the current working directory.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the archive contains no file members.
        Exception: Any download, HTTP-status, unzip, parsing or write error is
            reported with a ``Failed: ...`` line and then re-raised unchanged.
    """
    try:
        print("Requesting Dataset... ", end=" ")
        response = requests.get(URL, timeout=160)
        # Surface 4xx/5xx answers here instead of handing an error page to ZipFile.
        response.raise_for_status()
        payload = response.content
        # Report the real download size (bytes -> KB).
        print("File size: ", len(payload) / 1024, "KB")

        with ZipFile(BytesIO(payload)) as myzip:
            # Directory entries cannot be parsed, so skip them when picking the member.
            members = [info for info in myzip.infolist() if not info.is_dir()]
            if not members:
                raise ValueError("Downloaded archive contains no files")
            # NOTE(review): the first file member is assumed to be CSV-readable —
            # confirm against the actual archive layout.
            with myzip.open(members[0]) as file:
                df = pd.read_csv(file)

        print("DataFrame Shape: ", df.shape)
        df.to_csv("pamap2.csv", index=False)

    except Exception as exc:
        print(f"Failed: {exc}")
        # Bare raise keeps the original traceback intact.
        raise

    else:
        # Only announce completion when the download and parse actually succeeded.
        print("Done")
        return df


# Download and parse the dataset, then print a summary of the resulting frame.
# The trailing "5.9s" is the author's recorded run time for this call.
DF = collect_dataset()  # 5.9s
DF.info()

Requesting Dataset...  Done


KeyboardInterrupt: 

In [None]:
# Search space for the optimiser: one (lower, upper) pair per hyperparameter.
# kmeans_objective truncates every value except `tol` to an int before use.
pbounds = dict(
    n_clusters=(2, 10),
    l=(0, 4),
    max_iter=(800, 2800),
    tol=(1e-4, 1e-1),
)


# Define the objective function to maximize (in this case, silhouette score)
def kmeans_objective(n_clusters, l, max_iter, tol):
    """Fit ``KMeansMM`` on the global ``X`` and return its silhouette score.

    The optimiser proposes every parameter as a float, so the discrete ones are
    truncated with ``int`` before being passed to the model.

    Args:
        n_clusters: Proposed number of clusters (truncated to int).
        l: Proposed value for the model's ``l`` argument (truncated to int).
        max_iter: Proposed iteration cap (truncated to int).
        tol: Proposed tolerance, passed through unchanged.

    Returns:
        The silhouette score of the predicted labels on ``X``, or ``-1.0``
        (the lowest possible silhouette value) when the model assigns fewer
        than two distinct labels.
    """
    # Convert once and reuse the same tensor for fitting and prediction.
    data = torch.Tensor(X)
    kmeans = KMeansMM(
        n_clusters=int(n_clusters),
        l=int(l),
        max_iter=int(max_iter),
        tol=tol,
    )
    kmeans.fit(data)
    labels = np.asarray(kmeans.predict(data))

    # silhouette_score raises ValueError for a single-label partition, which
    # would abort the whole optimisation run; penalise that candidate instead.
    if np.unique(labels).size < 2:
        return -1.0
    return silhouette_score(X, labels)


# Build the optimiser around the objective and its search space; the fixed
# random_state keeps the sequence of proposals repeatable.
optimizer = BayesianOptimization(f=kmeans_objective, pbounds=pbounds, random_state=101)

# Run the search: 10 initial points followed by 10 further iterations,
# then report the parameter set with the highest objective value.
optimizer.maximize(n_iter=10, init_points=10)
print("Best hyperparameters:", optimizer.max["params"])

In [None]:
# Fit the final model on X and plot the first two feature columns together with
# the fitted centroids and the model's decision regions.
# NOTE(review): the mesh prediction below assumes X has exactly two feature
# columns — confirm against where X is built.
cluterer = KMeansMM(n_clusters=6, l=-1, max_iter=1558, tol=1e-4)
y_pred = cluterer.fit_predict(torch.FloatTensor(X))
centroids = cluterer.centroids.numpy()

plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap="viridis")
plt.scatter(
    centroids[:, 0], centroids[:, 1], marker="+", s=200, c="red", label="Centroids"
)
# The "Centroids" label is only drawn once a legend is requested.
plt.legend()
h = 0.02  # mesh step size

# construct mesh
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# obtain labels per mesh point (reuse stored model)
# predict is given a float32 tensor, matching the tensor input used for
# fit_predict above; the result is normalised to NumPy before reshaping.
mesh_points = torch.as_tensor(np.c_[xx.ravel(), yy.ravel()], dtype=torch.float32)
Z = np.asarray(cluterer.predict(mesh_points)).reshape(xx.shape)

# put result into color plot
plt.title("KMeans-- Clustering")
plt.imshow(
    Z,
    interpolation="nearest",
    cmap="Set2",
    alpha=0.75,
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    aspect="auto",
    origin="lower",
)