In [None]:
import json
import pandas as pd

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("camera-location/311/918/0/coordinates.json", "r", encoding="utf-8") as f:
    coordinates = json.load(f)

In [None]:
# orient="index" turns every top-level JSON key into one row; the value
# columns are then labelled x, y and z.
df = pd.read_json(
    "camera-location/311/918/0/coordinates.json",
    orient="index",
).set_axis(["x", "y", "z"], axis=1)
df.head()

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean

x = df.values

# Elbow method: fit KMeans for 1..10 clusters and record the within-cluster
# sum of squares (the model's inertia) for each cluster count.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    model = KMeans(n_clusters=i, init="k-means++", random_state=42)
    cluster_labels = model.fit_predict(x)
    wcss.append(model.inertia_)

fig = plt.figure(figsize=(4, 4))
plt.plot(
    cluster_counts,
    wcss,
    linewidth=4,
    markersize=12,
    marker="o",
    color="green",
)
plt.xticks(np.arange(11))
plt.xlabel("Number of clusters")
plt.ylabel("wcss")
plt.show()
# Author's reading of the curve: 4

In [None]:
def plot_2d(axis, data, model, labels, n_clusters):
    """Scatter the first two columns of ``data`` coloured by cluster label.

    Parameters
    ----------
    axis : matplotlib Axes
        Axes the scatter is drawn on.
    data : 2-D array
        Samples; column 0 is plotted on the x axis and column 1 on the y axis.
    model : fitted clusterer
        Must expose ``cluster_centers_``; the centres are drawn as numbered
        white discs.
    labels : array
        Cluster label per row of ``data`` (must support ``astype``).
    n_clusters : int
        Number of clusters; scales the labels into the colormap and appears
        in the figure title.
    """
    ax2 = axis

    # Plot the samples that were passed in (not a notebook-level global) so
    # the function draws the same data the labels were computed for.
    colors = cm.nipy_spectral(labels.astype(float) / n_clusters)
    ax2.scatter(
        data[:, 0], data[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Draw white circles at the cluster centres ...
    centers = model.cluster_centers_
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # ... and write each centre's index inside its circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    # The title is set on the current pyplot figure.
    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )


In [None]:
def plot_silhouette(axis, data, labels, n_clusters):
    """Draw a silhouette plot of one clustering on ``axis``.

    Each cluster is shown as a horizontal band of its samples' silhouette
    coefficients, sorted ascending, with a dashed red line at the overall
    average. The average score is also printed.

    Parameters
    ----------
    axis : matplotlib Axes to draw on.
    data : samples the clustering was computed for.
    labels : cluster label per sample.
    n_clusters : number of clusters; labels ``0 .. n_clusters - 1`` are drawn.
    """
    gap = 10  # blank rows separating the bands of consecutive clusters

    # Silhouette coefficients can range over [-1, 1]; the view is fixed to
    # [-0.1, 1].
    axis.set_xlim([-0.1, 1])
    # Leave room for every sample plus one gap around each band.
    axis.set_ylim([0, len(data) + (n_clusters + 1) * gap])

    # Mean silhouette over all samples.
    silhouette_avg = silhouette_score(data, labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    per_sample_scores = silhouette_samples(data, labels)

    band_bottom = gap
    for cluster_id in range(n_clusters):
        # Scores of this cluster's members, smallest first.
        band_scores = np.sort(per_sample_scores[labels == cluster_id])
        band_height = band_scores.shape[0]
        band_top = band_bottom + band_height

        band_color = cm.nipy_spectral(float(cluster_id) / n_clusters)
        axis.fill_betweenx(
            np.arange(band_bottom, band_top),
            0,
            band_scores,
            facecolor=band_color,
            edgecolor=band_color,
            alpha=0.7,
        )

        # Cluster number, placed at the vertical middle of its band.
        axis.text(-0.05, band_bottom + 0.5 * band_height, str(cluster_id))

        # The next band starts one gap above this one.
        band_bottom = band_top + gap

    axis.set_title("The silhouette plot for the various clusters.")
    axis.set_xlabel("The silhouette coefficient values")
    axis.set_ylabel("Cluster label")

    # Dashed vertical marker at the average silhouette score.
    axis.axvline(x=silhouette_avg, color="red", linestyle="--")

    axis.set_yticks([])  # no y tick labels
    axis.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

    

In [None]:
# https://scikit-learn.org/1.5/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

import matplotlib.cm as cm
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # random_state is pinned to 42 so each clustering is reproducible.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(x)

    # One figure per candidate cluster count: silhouette bands on the left,
    # 2D scatter of the clusters on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

    plot_silhouette(ax1, x, cluster_labels, n_clusters)
    plot_2d(ax2, x, clusterer, cluster_labels, n_clusters)

    plt.show()

In [None]:
# Selected number of clusters: 4 (used for the final KMeans model below)

In [None]:
# Final model with 4 clusters; random_state is pinned for reproducibility.
model = KMeans(n_clusters=4, random_state=42)
df["assigned_label"] = model.fit_predict(x)

# Move the index into a "camera_id" column. The guard keeps the cell
# re-runnable: resetting again when the column already exists raises.
if "camera_id" not in df.columns:
    df = df.reset_index(names="camera_id")
df

In [None]:
# Silhouette bands and 2D scatter for the 4-cluster model.
cluster_labels = model.fit_predict(x)

# Side-by-side axes: silhouette on the left, scatter on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

plot_silhouette(ax1, x, cluster_labels, 4)
plot_2d(ax2, x, model, cluster_labels, 4)

In [None]:
# Boolean mask that is True for every index label already seen earlier in df.index.
df.index.duplicated()

In [None]:
def distance_to_cluster_centers(row, centers):
    """Return the Euclidean distance from one row to every cluster centre.

    ``row`` must expose ``.values`` (e.g. a pandas Series); ``centers`` is an
    iterable of points of the same length. The result is a list with one
    distance per centre, in the order the centres are given.
    """
    point = row.values
    return [euclidean(center, point) for center in centers]

# One "dist_cluster_<i>" column per cluster centre of the fitted model.
cluster_names = [f"dist_cluster_{i}" for i, _ in enumerate(model.cluster_centers_)]

# Row-wise distances; result_type="expand" spreads each returned list over
# one column per centre.
df[cluster_names] = df[["x", "y", "z"]].apply(
    distance_to_cluster_centers,
    axis=1,
    result_type="expand",
    args=(model.cluster_centers_,),
)

# Keep only the first row for any repeated index label.
df = df[~df.index.duplicated()].copy()
df.head(5)

In [None]:
def z_scores(cluster_columns):
    """Standardise every column of ``cluster_columns``.

    Each column is centred on its mean and divided by its standard deviation
    (pandas' ``Series.std``). The output column is named ``z_score_<suffix>``
    where ``<suffix>`` is the text after the last underscore of the input
    column name. Returns a DataFrame with one such column per input column.
    """
    standardised = []
    for column_name, values in cluster_columns.items():
        suffix = column_name.split("_")[-1]
        z_values = (values - values.mean()) / values.std()
        standardised.append(z_values.rename(f"z_score_{suffix}"))

    return pd.concat(standardised, axis=1)

# Select the distance columns by literal substring. The previous pattern
# 'dist_cluster*' was evaluated as a regular expression, where "r*" means
# "zero or more r", not a glob wildcard.
df_z = pd.concat([df, z_scores(df.filter(like="dist_cluster"))], axis=1)

# Keep only the first row for any repeated index label.
df_z = df_z.loc[~df_z.index.duplicated(), :].copy()
df_z.head(5)

In [None]:
def is_outlier(row, threshold=2.5):
    """Return 1 if the row's own-cluster z-score exceeds ``threshold``, else 0.

    ``row`` must provide ``"assigned_label"`` and a ``"z_score_<label>"``
    entry for that label; the absolute z-score is compared strictly against
    ``threshold``.
    """
    label = int(row["assigned_label"])
    own_z_score = row[f"z_score_{label}"]
    exceeds = np.abs(own_z_score) > threshold
    return int(exceeds)

def is_quantile_outlier(df, quantile=0.95):
    """Flag rows that lie unusually far from their own cluster centre.

    For every cluster, the ``quantile`` of the members' distance to that
    cluster's centre is used as a threshold; a row is an outlier when its
    own distance is strictly greater than its cluster's threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"assigned_label"`` and one ``dist_cluster_*`` column
        per cluster. The label is used as the positional index into those
        columns, so they must appear in cluster order (column 0 for label 0,
        and so on).
    quantile : float, default 0.95
        Per-cluster quantile used as the cut-off.

    Returns
    -------
    pandas.Series
        Boolean flags indexed like ``df``.

    Notes
    -----
    Only the ``df`` argument is read (no notebook-level frame), and the
    quantile is computed on the distance values alone, so non-numeric
    columns such as an id column do not interfere.
    """
    dist_names = [n for n in df.columns if "dist_cluster" in n]
    labels = df["assigned_label"].to_numpy().astype(int)

    # Distance of every row to the centre of the cluster it was assigned to.
    all_distances = df[dist_names].to_numpy()
    own_distance = all_distances[np.arange(len(df)), labels]

    # Per-cluster threshold, then broadcast back to one threshold per row.
    thresholds = pd.Series(own_distance).groupby(labels).quantile(quantile)
    row_threshold = thresholds.loc[labels].to_numpy()

    flags = own_distance > row_threshold

    # Sanity check: should be close to 1 - quantile.
    proportion = flags.sum() / len(flags) if len(flags) else 0.0
    print(f"proportion of outliers: {proportion}")
    return pd.Series(flags, index=df.index)

# Z-score based flagging, left disabled by the author with the note "this does not work":
# df_z["is_outlier"] = df_z.apply(is_outlier, axis=1)
# Flag outliers with the per-cluster distance-quantile rule instead.
df_z["is_outlier"] = is_quantile_outlier(df_z)
df_z.head()

In [None]:
fig, ax = plt.subplots()
# For each cluster, histogram only the z-scores of its own members, i.e. the
# standardised distance to the centre they were assigned to.
for i in range(len(model.cluster_centers_)):
    df_z.loc[df_z["assigned_label"] == i, f"z_score_{i}"].plot.hist(ax=ax)
ax.legend()

In [None]:
import plotly.express as px

# Number of flagged rows per cluster.
print(df_z.groupby("assigned_label")["is_outlier"].sum())

# The outlier flag lives on df_z (df has no "is_outlier" column).
outlier_mask = df_z["is_outlier"].astype(bool)

for cluster in range(len(model.cluster_centers_)):
    print(f"Showing outliers for cluster {cluster}")
    print(df_z[outlier_mask & (df_z["assigned_label"] == cluster)])

# Regular points, coloured by their assigned cluster.
fig = px.scatter_3d(
    df_z[~outlier_mask],
    x="x",
    y="y",
    z="z",
    color="assigned_label",
    opacity=0.5,
    width=800,
    height=800,
)

# Outliers as an extra, fully opaque layer. Plotly Express returns a Figure,
# so its traces (.data) are appended with add_traces. A string label makes
# the colour a discrete category instead of joining the numeric colour axis
# of the cluster labels.
fig.add_traces(
    px.scatter_3d(
        df_z[outlier_mask].assign(is_outlier="outlier"),
        x="x",
        y="y",
        z="z",
        color="is_outlier",
        opacity=1,
    ).data
)

fig.show()

In [None]:
# One row per KMeans cluster centre, in the x/y/z coordinates the model was fitted on.
cluster_df = pd.DataFrame(model.cluster_centers_, columns= ["x", "y", "z"])
# Constant category column; the 3D plot uses it for the centres' symbol and color grouping.
cluster_df["color"] = "red"
# NOTE(review): Series assignment aligns on index labels, so centre i receives the
# camera_id of the df_z row labelled i — it does not identify a camera belonging to
# that cluster. Confirm this is intended.
cluster_df["camera_id"] = df_z["camera_id"]
cluster_df

In [None]:
import plotly.express as px

# One Plasma colour per cluster. Keys are strings because the labels are
# cast to strings for plotting: Plotly Express applies color_discrete_map
# only to non-numeric colour columns; a numeric label column is drawn with a
# continuous colour scale and the map is ignored.
color_clusters = px.colors.sequential.Plasma[:len(model.cluster_centers_)]
color_map = {str(idx): color for idx, color in enumerate(color_clusters)}

# All points, semi-transparent, coloured by assigned cluster.
fig = px.scatter_3d(
    df_z.astype({"assigned_label": str}),
    x="x",
    y="y",
    z="z",
    color="assigned_label",
    opacity=0.3,
    width=800,
    height=800,
    color_discrete_map=color_map,
    category_orders={"assigned_label": list(color_map)},
)

# Outliers on top as open diamonds with a thicker outline.
fig.add_traces(
    px.scatter_3d(
        df_z[df_z["is_outlier"] == 1],
        x="x",
        y="y",
        z="z",
        opacity=1,
        symbol="is_outlier",
        symbol_sequence=["diamond-open"],
    ).update_traces(marker_line_width=2).data
)

# Cluster centres as "x" markers. The explicit map makes the "red" category
# actually render in red instead of the first default palette colour.
fig.add_traces(
    px.scatter_3d(
        cluster_df,
        x="x",
        y="y",
        z="z",
        symbol="color",
        symbol_sequence=["x"],
        color="color",
        color_discrete_map={"red": "red"},
    ).data
)

fig.show()