In [1]:
import hdbscan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from ipywidgets import interact, Output
from IPython.display import clear_output

import sys
sys.path.append('..')
from src.band_plotters import DATA_DIRECTORY
from src.cluster_plotters import plot_cluster_ellipses, plot_groups

ModuleNotFoundError: No module named 'hdbscan'

In [2]:
!pip install hdbscan colorcet

Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting colorcet
  Downloading colorcet-3.0.1-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyct>=0.4.4
  Downloading pyct-0.5.0-py2.py3-none-any.whl (15 kB)
Collecting param>=1.7.0
  Downloading param-2.0.1-py3-none-any.whl (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.4/113.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hdbscan: filena

In [None]:
# Run configuration. The first three values are combined into the name of the
# fingerprint CSV that the load cell reads.
FLAT_ONLY = True  # keep only rows with horz_flat_seg > 0 and rerun t-SNE on them
PERPLEXITY = 30  # t-SNE perplexity; also part of the input file name
FINGERPRINT_LENGTH = 120  # number of fingerprint columns, named "0", "1", ...
FINGERPRINT_NAME = "all_k_branches_histogram_-8_to_8"
INPUT_NAME = (
    f"{FINGERPRINT_NAME}"
    f"_perplexity_{PERPLEXITY}"
    f"_length_{FINGERPRINT_LENGTH}.csv"
)

## Load Data

In [None]:
# Read the fingerprint table selected by INPUT_NAME, using its "ID" column as the index.
df = pd.read_csv(f"../fingerprints/{INPUT_NAME}", index_col="ID")
if FLAT_ONLY:
    # Restrict the frame to rows whose horz_flat_seg value is positive.
    df = df.loc[df["horz_flat_seg"] > 0]
df.head()

## Cluster

In [None]:
# Fingerprint feature columns are named with the strings "0" .. str(FINGERPRINT_LENGTH - 1).
fingerprint_cols = list(map(str, range(FINGERPRINT_LENGTH)))

In [None]:
# Cluster the rows on their fingerprint columns with HDBSCAN and record the
# resulting label of every row in a new "labels" column.
# NOTE(review): a Minkowski exponent p=0.2 (< 1) does not satisfy the triangle
# inequality, so it is not a true metric — confirm this is intentional and that
# the installed hdbscan/scikit-learn backend accepts it.
clusterer = hdbscan.HDBSCAN(
    algorithm='best',
    alpha=1.0,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    leaf_size=40,
    metric='minkowski',
    p=0.2,
    cluster_selection_method='leaf',
    min_cluster_size=4,
    min_samples=4,
)
clusterer.fit(df[fingerprint_cols])

df["labels"] = clusterer.labels_

In [None]:
def view_cluster(label):
    """Show the rows assigned to one cluster and plot each row's fingerprint.

    Parameters
    ----------
    label : int
        Value of the ``labels`` column to inspect (HDBSCAN uses ``-1`` for
        points it treats as noise).

    Displays at most the first 100 matching rows as a table, then draws one
    small line plot per matching row, with the row's index value as x-label.
    Nothing is plotted when no row carries ``label``.
    """
    members = df[df.labels == label]
    display(members.head(100))

    num_plots = len(members)
    if num_plots == 0:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        return

    # squeeze=False always returns a 2-D array of axes, so the indexing below
    # also works when the cluster contains a single row.
    fig, axes = plt.subplots(num_plots, 1, figsize=(4, 1 * num_plots), squeeze=False)
    x_values = np.linspace(0, FINGERPRINT_LENGTH, FINGERPRINT_LENGTH)

    for ax, (index, fingerprint) in zip(axes[:, 0], members[fingerprint_cols].iterrows()):
        ax.plot(x_values, fingerprint)
        ax.set_xlabel(index)


# The slider spans the labels actually present in the frame, so the last
# cluster stays reachable even when no row was labelled as noise (-1).
interact(view_cluster, label=(int(df.labels.min()), int(df.labels.max()), 1))

## Rerun TSNE (ONLY IF ON FLAT SUBSET)

In [None]:
if FLAT_ONLY:
    # The frame was reduced to the flat subset at load time, so the 2-D
    # embedding is recomputed on just those rows.
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`
    # — confirm against the installed version.
    tsne = manifold.TSNE(
        n_components=2,
        early_exaggeration=12.0,
        init="pca",
        learning_rate=100,
        random_state=0,
        perplexity=PERPLEXITY,
        n_iter=10000,
        verbose=2,
    )
    fingerprint_2d = tsne.fit_transform(df[fingerprint_cols])
    # Item assignment overwrites the fx/fy columns when they exist and creates
    # them otherwise; attribute assignment (df.fx = ...) would only set a plain
    # attribute, not a column, if the columns were missing.
    df["fx"], df["fy"] = fingerprint_2d[:, 0], fingerprint_2d[:, 1]

## Example Plots
I recommend plotting the groups before the cluster ellipses; otherwise the figure's axes might end up ranging from 0 to 1.

In [None]:
# Draw the rows grouped by "discovery" first, then overlay the cluster ellipses
# on the same axes (groups first, as recommended in the text above this cell).
ax = plot_groups(df, "discovery")
ax = plot_cluster_ellipses(df, ax=ax, color="black")
ax.legend()
# Finish with plt.show(), like the other plotting cells, so the cell does not
# echo the Legend object's repr as its output.
plt.show()

In [None]:
# Plot the rows grouped by "relative_id", then pass the resulting axes straight
# to the ellipse overlay so both layers share one figure.
ax = plot_cluster_ellipses(
    df,
    ax=plot_groups(df, "relative_id"),
    color="black",
)
plt.show()

In [None]:
# Plot the rows grouped by "segments", overlay the cluster ellipses on the same
# axes, then add the legend and render the figure.
ax = plot_cluster_ellipses(
    df,
    ax=plot_groups(df, "segments"),
    color="black",
)
ax.legend()
plt.show()