# Experiment with clustering for online word detection

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from xournalpp_htr.documents import XournalppDocument

## Load example data

In [None]:
# Example document taken from the repository's test data folder.
example_path = "../tests/data/2024-07-26_minimal.xopp"
xpp_doc = XournalppDocument(example_path)

## Settings

In [None]:
# Index into `xpp_doc.pages` of the page that is analysed in this notebook.
I_PAGE: int = 0

# Divisor applied to stroke coordinates and page dimensions before plotting.
DPI: int = 72

## Plot example data as-is without segmentation

In [None]:
# Select the page to work on and flatten the strokes of all its layers into
# a single list of dicts. Each dict holds the scaled coordinates of a stroke
# ("x", "y") and the mean of each coordinate ("x_mean", "y_mean").
page = xpp_doc.pages[I_PAGE]

all_strokes = []

for page_layer in page.layers:
    for raw_stroke in page_layer.strokes:
        xs = raw_stroke.x / DPI
        # The y axis is negated so that the page is not drawn upside down
        # (matplotlib's y axis points upwards by default).
        ys = (raw_stroke.y / DPI) * -1

        all_strokes.append(
            {
                "x": xs,
                "y": ys,
                "x_mean": xs.mean(),
                "y_mean": ys.mean(),
            }
        )

In [None]:
# Size the figure from the page meta data, scaled by the same DPI divisor
# that is applied to the stroke coordinates.
page_width = float(page.meta_data["width"]) / DPI
page_height = float(page.meta_data["height"]) / DPI
plt.figure(figsize=(page_width, page_height))

# Draw every stroke as a cloud of small black points.
for stroke_entry in all_strokes:
    plt.scatter(stroke_entry["x"], stroke_entry["y"], s=1, c="black")

# Equal aspect ratio keeps the handwriting undistorted.
plt.gca().set_aspect("equal")
plt.show()

## Attempt 1

Plot the mean of each stroke on top of the stroke itself.

In [None]:
# Upper bound on the squared distance between two stroke means for the two
# strokes to be treated as similar.
THRESHOLD = 0.1

In [None]:
# Size the figure from the page meta data, scaled by the DPI divisor.
figure_size = tuple(float(page.meta_data[key]) / DPI for key in ("width", "height"))
plt.figure(figsize=figure_size)

for stroke_entry in all_strokes:
    # Stroke points in black first, then the stroke's mean in red on top.
    plt.scatter(stroke_entry["x"], stroke_entry["y"], s=1, c="black")
    plt.scatter(stroke_entry["x_mean"], stroke_entry["y_mean"], c="red", s=1)

plt.gca().set_aspect("equal")
plt.show()

Next, I compute the pairwise squared distances between the stroke means; they are thresholded further below:

In [None]:
# Square matrix with one row and one column per stroke, pre-filled with +inf
# so that entries that are never computed cannot pass a `<=` threshold test.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0 and
# raises an AttributeError there.
n_strokes = len(all_strokes)
distances = np.full((n_strokes, n_strokes), np.inf)

In [None]:
# Fill the strict lower triangle (j < i) of `distances` with the squared
# Euclidean distance between the means of stroke i and stroke j. The diagonal
# and the upper triangle keep their initial value.
for i, stroke_i in enumerate(all_strokes):
    for j, stroke_j in enumerate(all_strokes[:i]):
        delta_x = stroke_i["x_mean"] - stroke_j["x_mean"]
        delta_y = stroke_i["y_mean"] - stroke_j["y_mean"]
        distances[i, j] = delta_x**2 + delta_y**2

In [None]:
# Smallest and largest computed distance. Entries that were never computed
# still hold the +inf placeholder, so they are filtered out with
# `np.isfinite`; comparing against -1.0 would keep them and always report
# `inf` as the maximum.
computed_distances = distances[np.isfinite(distances)]
computed_distances.min(), computed_distances.max()

In [None]:
# Show the distance matrix as an image, one pixel per (row, column) entry.
plt.figure()
plt.imshow(distances)
plt.show()

Now construct the clusters (inefficiently for now, just to get a first version running):

In [None]:
# Guard: the clustering below is an unfinished draft, so the cell aborts here
# and everything after this line is currently unreachable.
raise NotImplementedError("i need to implement that!")

# already_used[k] is True once stroke k has been put into some group.
already_used = np.zeros(len(all_strokes), dtype=bool)

# Each entry is a list of stroke indices that are treated as one word.
stroke_groups = []

# NOTE(review): seeding starts at index 1, so stroke 0 is never used as a seed
# and can only enter a group as a neighbour of a later stroke - confirm intent.
for i in range(1, len(all_strokes)):
    if already_used[i]:
        continue

    # Strokes whose entry in row i of `distances` is within THRESHOLD, plus
    # stroke i itself.
    # NOTE(review): if only the lower triangle of `distances` is filled, this
    # can only find neighbours j < i - verify against the cell that fills it.
    similar_strokes = [*np.where(distances[i] <= THRESHOLD)[0].tolist(), i]

    print(similar_strokes)

    stroke_groups.append(similar_strokes)

    # Mark every member so that it is not used as a seed later on.
    for j in similar_strokes:
        already_used[j] = True

To read: [1](https://stats.stackexchange.com/questions/148161/clustering-from-similarity-distance-matrix), [2](https://scikit-learn.org/dev/modules/generated/sklearn.cluster.SpectralClustering.html), [3](https://stats.stackexchange.com/questions/475687/clustering-given-distance-matrix-and-k-in-python).

The problem with this approach is that the thresholding is not scale-invariant, i.e. words written in larger handwriting will fail to be recognised as a single word because their strokes are too far apart.