## Load Packages

In [17]:
# Install tempo-embeddings in editable mode from the local checkout
# (`..`, relative to the directory this notebook is run from).

%pip install -e ..

# Alternative: install from GitHub instead (the URL can also refer to a specific version or branch).
# %pip install --upgrade pip  # Required for properly resolving dependencies
# %pip uninstall -y tempo_embeddings  # Remove existing installation
# %pip install --upgrade git+https://github.com/Semantics-of-Sustainability/tempo-embeddings.git

Obtaining file:///Users/jose/Repos/tempo-embeddings
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tempo_embeddings
  Building editable for tempo_embeddings (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tempo_embeddings: filename=tempo_embeddings-0.0.1-0.editable-py3-none-any.whl size=9582 sha256=3c1f2b8effc8a8f1dd38bcd91828b9c3869b010bf105f7ced87f392e4f75f5c6
  Stored in directory: /private/var/folders/79/zf67ls7520x9m4mj7nx6q07w0000gp/T/pip-ephem-wheel-cache-lq970rru/wheels/de/25/96/d92b7a130b730e0ab67770d76841f36cb3d1f9cda32a4a539b
Successfully built tempo_embeddings
Installing collected packages: tempo_embeddings
  Attempting uninstall: tempo_embeddings
    Found existing installation: tempo_embeddings 0.0.1
    Uninstalling tempo_

In [18]:
# make sure installation has succeeded

import logging
from importlib import reload

# Configure the root logger for this notebook session.
# `force=True` (Python 3.8+) removes handlers that were installed earlier in the
# kernel before applying this configuration, which is what re-importing the
# `logging` module was used for previously.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
    # 24-hour clock: "%I" (12-hour) is ambiguous without an AM/PM marker.
    datefmt="%H:%M:%S",
    force=True,
)

In [19]:
# Load IPython's autoreload extension and trigger a reload of imported modules.
# NOTE(review): `now` looks like a one-off reload; confirm that this is intended
# rather than automatic reloading before every cell (`%autoreload 2`).
%load_ext autoreload
%autoreload now

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Detect Google Colab by probing for its runtime package: IN_COLAB is True when
# `google.colab` can be imported and False when the module is not found.
try:
    import google.colab  # noqa: F401
except ModuleNotFoundError:
    IN_COLAB = False
else:
    IN_COLAB = True

In [21]:
from pathlib import Path

from tempo_embeddings.text.corpus import Corpus

## Load Database Manager

The `db_path` parameter should point to the directory where the database is, so the original configuration and records are loaded. The database was created using the notebook `1_compute_embeddings_nl.ipynb`. If the given path does not exist, a new EMPTY database will be created there. Note that the cells below do not take a `db_path`: they connect to a Weaviate server whose host and port are chosen with the two widgets.

A bigger `batch_size` could make the search faster but if it is too big you might run out of memory.

In [22]:
from ipywidgets import widgets

# Radio buttons for choosing which Weaviate server to query. The selected label
# is looked up in KNOWN_HOSTS when the database connection is created.
host_selector = widgets.RadioButtons(
    options=["local", "Research Cloud"],
    value="Research Cloud",
    description="Weaviate Database Host",
    # Let the long description take its natural width, as the other widgets
    # in this notebook do, instead of the default fixed label width.
    style={"description_width": "initial"},
)
host_selector

RadioButtons(description='Weaviate Database Host', index=1, options=('local', 'Research Cloud'), value='Resear…

In [23]:
# Integer input for the port of the Weaviate server (8087 by default).
port_selector = widgets.IntText(
    value=8087,
    description="Weaviate Database Port",
    # Let the long description take its natural width, as the other widgets
    # in this notebook do, instead of the default fixed label width.
    style={"description_width": "initial"},
)
port_selector

IntText(value=8087, description='Weaviate Database Port')

In [24]:
import weaviate
from tempo_embeddings.embeddings.model import SentenceTransformerModelWrapper
from tempo_embeddings.embeddings.weaviate_database import WeaviateDatabaseManager
from tempo_embeddings.settings import DEFAULT_LANGUAGE_MODEL

# Addresses of the known Weaviate servers, keyed by the labels offered in `host_selector`.
# KNOWN_HOSTS = {"local": "localhost", "Research Cloud": "145.38.192.173"} # Server GPU!
KNOWN_HOSTS = {"local": "localhost", "Research Cloud": "145.38.187.187"}  # NEW Server Light (08-Oct-24)!

# Open the connection using the host and port currently selected in the widgets.
weaviate_client = weaviate.connect_to_local(
    host=KNOWN_HOSTS[host_selector.value],
    port=port_selector.value,
)
# Load the default language model wrapper that is handed to the database manager.
embedding_model = SentenceTransformerModelWrapper.from_pretrained(DEFAULT_LANGUAGE_MODEL)

db = WeaviateDatabaseManager(client=weaviate_client, model=embedding_model)

04:01:04 INFO:HTTP Request: GET http://145.38.187.187:8087/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
04:01:04 INFO:HTTP Request: GET http://145.38.187.187:8087/v1/meta "HTTP/1.1 200 OK"
04:01:04 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
04:01:08 INFO:HTTP Request: GET http://145.38.187.187:8087/v1/schema/TempoEmbeddings "HTTP/1.1 200 OK"


## Choose from the available Collections in the Database

In [25]:
from ipywidgets import widgets

# Collections currently available in the database, in the order the database reports them.
existing_colls = list(db.get_available_collections())

# Preselect the usual collections, but only those that exist in this database:
# SelectMultiple rejects values that are not among its options.
default_colls = [name for name in ("ANP", "StatenGeneraal") if name in existing_colls]

collection_selector = widgets.SelectMultiple(
    options=existing_colls,
    value=default_colls,
    description="Choose a Collection:",
    disabled=False,
    style={"description_width": "initial"},
)

print("\nCollection Sizes")
print("----------------")
# Column width for the collection names; `default=0` keeps the cell working
# when the database does not contain any collection.
max_len = max((len(collection) for collection in existing_colls), default=0) + 1
for collection in existing_colls:
    print(f"{collection:{max_len}}\t{db.get_collection_count(collection)}")

collection_selector

04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"
04:01:12 INFO:HTTP Request: POST http://145.38.187.187:8087/v1/graphql "HTTP/1.1 200 OK"



Collection Sizes
----------------
StatenGeneraal  	820215
Volkskrant      	781978
NRC             	884143
Trouw           	568898
AlgemeenDagblad 	1005229
ANP             	126466
Telegraaf       	631072


SelectMultiple(description='Choose a Collection:', index=(5, 0), options=('StatenGeneraal', 'Volkskrant', 'NRC…

## Create Sub-Corpus

To make the processing and visualization easier, we will create a new `Corpus` comprising only a subset of the original Collection. This corpus will contain only the records of interest. This is done by querying the database with keyword and metadata constraints. In this example, the following parameters can be set:

- **Filter Terms:** retrieve only passages that contain exactly the given keywords.
- **Year Range:** retrieve only the records whose year falls within the given range.
- **Neighbors:** This indicates how much to *expand* the search into more datapoints. The idea is to retrieve the *top_k* neighbors of the initially retrieved passages. Ideally this will give related passages that did not mention any of the keywords explicitly.

In [9]:
# Style shared by the widgets below: let each description take its natural width.
_description_style = {"description_width": "initial"}

# Range of years to search in.
# TODO: replace by SelectionRangeSlider
widget_year_range = widgets.IntRangeSlider(
    description="Year Range: ",
    min=1800,
    max=2020,
    step=1,
    value=(1950, 2000),
    style=_description_style,
    layout=widgets.Layout(width="400px"),
)

# Free-text field holding the comma separated search terms.
widget_terms = widgets.Text(
    description="Filter Terms (comma separated)",
    value="duurzaam",
    style=_description_style,
    layout=widgets.Layout(width="600px"),
)

# Number of neighbours used when the clusters are expanded later on.
widget_neighbors = widgets.IntSlider(
    description="Expand Neighborhood Size: ",
    min=0,
    max=10,
    value=5,
    style=_description_style,
    layout=widgets.Layout(width="400px"),
)

### Display the Widgets to choose the parameters

In [10]:
# Show the filter-term field followed by the year-range slider.
display(widget_terms, widget_year_range)

Text(value='duurzaam', description='Filter Terms (comma separated)', layout=Layout(width='600px'), style=TextS…

IntRangeSlider(value=(1950, 2000), description='Year Range: ', layout=Layout(width='400px'), max=2020, min=180…

### Execute the Search

There is no need to edit the code here: all parameters are read from the widget values.

In [11]:
# Unpack values from the widgets
year_from, year_to = widget_year_range.value
# Split the comma separated terms and drop empty entries (e.g. caused by a
# trailing comma or an empty text box) so that no empty string is used as a filter word.
FILTER_TERMS = [
    term for term in (raw.strip() for raw in widget_terms.value.split(",")) if term
]
# Execute Database Query
where_range = {"year_from": year_from, "year_to": year_to}  # NOTE(review): not used by the query below
print(f"Searching terms {FILTER_TERMS} between year {year_from} and {year_to}")
# Query every selected collection with the same constraints (`limit=10000` is
# passed for each collection) and merge the results into one corpus, starting
# from an empty Corpus.
corpus = sum(
    (
        db.get_corpus(
            collection,
            filter_words=FILTER_TERMS,
            year_from=year_from,
            year_to=year_to,
            include_embeddings=True,
            limit=10000,
        )
        for collection in collection_selector.value
    ),
    start=Corpus([]),
)
print(f"Found {len(corpus)} items that match!")

01:55:24 INFO:Dropping UMAP model while merging corpora .
01:55:24 INFO:Merging corpora with identical vectorizers, reusing it.


Searching terms ['duurzaam'] between year 1950 and 2000


01:55:35 INFO:Dropping UMAP model while merging corpora .
01:55:35 INFO:Merging corpora with identical vectorizers, reusing it.


Found 9485 items that match!


## Cluster the Corpus

In [12]:
# Compress the embeddings of the corpus. The call returns an array (two values
# per row in the output below), which is displayed as the cell result.
corpus.compress_embeddings()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


array([[ 6.21053743,  5.48949718],
       [ 7.7494874 ,  6.34135103],
       [ 9.12733364,  6.33163738],
       ...,
       [ 8.91673756,  9.95978069],
       [10.29898548, 11.33841419],
       [10.02689552,  4.10463333]])

In [20]:
# Make sure (TF-)IDF is computed on the full corpus
# NOTE(review): presumably this has to run before the clusters are labelled
# further down, so that they share this vectorizer; confirm.
corpus.fit_vectorizer()



### Choose Minimum Cluster Size

In [33]:
# Upper bound of the slider: a tenth of the corpus, but never below the lower
# bound of 5. Without the clamp, a corpus with fewer than 50 passages yields
# max < min, which the slider does not accept.
largest_min_cluster_size = max(5, len(corpus) // 10)

# Slider for the minimum cluster size used by the clustering step.
min_cluster_size_widget = widgets.IntSlider(
    # Preferred default is 50; fall back to the upper bound for small corpora.
    value=min(50, largest_min_cluster_size),
    min=5,
    max=largest_min_cluster_size,
    step=1,
    description="Minimum Cluster Size:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
    style={"description_width": "initial"},
)

In [34]:
# Display the slider; its current value is read when the clustering cell is run.
min_cluster_size_widget

IntSlider(value=50, continuous_update=False, description='Minimum Cluster Size:', max=948, min=5, style=Slider…

### Execute Clustering

In [35]:
# Arguments: min_cluster_size=10, cluster_selection_epsilon=0.1, ...
# See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html for full list

# e.g. min_samples=10, cluster_selection_epsilon=0.2, cluster_selection_method="leaf"
# Materialise the clusters as a list so they can be counted and iterated repeatedly.
clusters = list(
    corpus.cluster(
        min_cluster_size=min_cluster_size_widget.value, cluster_selection_epsilon=0.1
    )
)
print(
    f"Found {len(clusters)} clusters in the corpus. (min cluster size is {min_cluster_size_widget.value})"
)

# Stop early when the clustering is too fine-grained for the following steps.
if len(clusters) > 100:
    raise ValueError(
        "Seems like you have too many clusters! Try with a bigger value for min_cluster_size to avoid memory issues"
    )

# Size of the largest cluster; `default=0` avoids a ValueError from max()
# when no clusters were found at all.
max_cluster_size = max((len(c.passages) for c in clusters), default=0)

Found 28 clusters in the corpus. (min cluster size is 50)


  self.labels_, self.probabilities_ = tree_to_labels(


### Expand clusters

In [36]:
# Show the slider whose value is used as `k` when the clusters are expanded in the next cell.
display(widget_neighbors)

IntSlider(value=5, description='Expand Neighborhood Size: ', layout=Layout(width='400px'), max=10, style=Slide…

In [37]:
from tqdm import tqdm

# Expand every cluster with the neighbours the database returns for it
# (`k` is taken from the neighbourhood slider).
# The expanded cluster is written back into `clusters`: if `+=` produces a new
# object instead of modifying the cluster in place, the loop variable alone
# would be rebound and the list would silently keep the unexpanded clusters.
for index, cluster in enumerate(tqdm(clusters, unit="cluster", desc="Expanding")):
    cluster += db.neighbours(cluster, k=widget_neighbors.value)
    clusters[index] = cluster

Expanding:   0%|          | 0/28 [00:00<?, ?cluster/s]02:14:57 INFO:HTTP Request: POST http://145.38.192.173:8087/v1/graphql "HTTP/1.1 200 OK"
02:14:58 INFO:Merging corpora with identical UMAP models, reusing it.
Expanding:   4%|▎         | 1/28 [00:01<00:32,  1.21s/cluster]02:14:58 INFO:HTTP Request: POST http://145.38.192.173:8087/v1/graphql "HTTP/1.1 200 OK"
02:15:03 INFO:Merging corpora with identical UMAP models, reusing it.
Expanding:   7%|▋         | 2/28 [00:06<01:29,  3.45s/cluster]02:15:03 INFO:HTTP Request: POST http://145.38.192.173:8087/v1/graphql "HTTP/1.1 200 OK"
02:15:06 INFO:Merging corpora with identical UMAP models, reusing it.
Expanding:  11%|█         | 3/28 [00:08<01:17,  3.09s/cluster]02:15:06 INFO:HTTP Request: POST http://145.38.192.173:8087/v1/graphql "HTTP/1.1 200 OK"
02:15:07 INFO:Merging corpora with identical UMAP models, reusing it.
Expanding:  14%|█▍        | 4/28 [00:09<00:54,  2.29s/cluster]02:15:07 INFO:HTTP Request: POST http://145.38.192.173:8087/v1

### Label the Clusters (TF-IDF)
#### Load Stopwords to avoid including them in the Cluster Labels

In [24]:
# Stopword list shipped with the package, one word per line.
stopwords_file = Path("../tempo_embeddings/data/stopwords-filter-nl.txt")

# Read with an explicit encoding so the result does not depend on the platform
# default. NOTE(review): assumes the file is stored as UTF-8; confirm.
with open(stopwords_file.absolute(), "rt", encoding="utf-8") as f:
    stopwords = set(f.read().splitlines())

#### Iterate Clusters to assign labels and save a sample of Passages for inspection

In [26]:
import os

# Directory that receives one TSV file with sample passages per cluster.
# `exist_ok=True` replaces the separate existence check.
os.makedirs("clusters", exist_ok=True)

selected_metadata = ["year"]  # metadata columns kept per cluster for the plots
cluster_sample = 10  # number of passages written to each sample file
centroid_based_sample = True  # passed on to `cluster.sample()`

# Words excluded from labels and top words: the stopwords plus the search
# terms themselves. Built once instead of twice per cluster.
excluded_words = frozenset(stopwords | set(FILTER_TERMS))

all_clusters_records, df_cluster_labels, df_cluster_meta = [], [], []
for cluster in clusters:
    cluster.set_topic_label(exclude_words=excluded_words, n=5)
    df = cluster.to_dataframe()
    centroid = cluster.centroid()
    label = cluster.label
    cluster_size = len(cluster.passages)
    # Compute Cluster Stats as a Subcorpus
    top_words = " ".join(cluster.top_words(exclude_words=excluded_words, n=10))
    all_clusters_records.append(f"{label}\t{cluster_size}\t{centroid}\t{top_words}\n")
    df_cluster_labels.append(label)
    df_cluster_meta.append(df[selected_metadata])
    # Save a sample of the Cluster Passages in a File (`cluster_sample` passages)
    # Path separators inside a label would otherwise point into a non-existing sub-directory.
    safe_label = label.replace("; ", "_").replace("/", "-").replace("\\", "-")
    file_prefix = f"cluster_{year_from}_{year_to}_{safe_label}"
    df_sample = cluster.sample(cluster_sample, centroid_based_sample).to_dataframe()
    df_sample.to_csv(f"clusters/{file_prefix}.tsv", sep="\t", index=False)
    print(cluster)

Corpus('concept; veilig; verkeer; verkeersveiligheid; vervoersysteem', 367 passages)
Corpus('beleid; bezit; herstel; karakter; vorming', 2179 passages)
Corpus('aanpak; bouwen; kader; plan; project', 803 passages)
Corpus('land; nederland; verblijf; vreemdelingen; wijziging', 196 passages)
Corpus('begrip; term; woord; «duurzaam»; „duurzaam', 257 passages)
Corpus('Outliers', 1481 passages)
Corpus('aardgasbeleid; energie; energiebeleid; kernenergie; overwegende', 37 passages)
Corpus('bosbeheer; bossen; geproduceerd; hout; tropisch', 991 passages)
Corpus('gehuwden; gescheiden; gezamenlijke; huishouding; voeren', 744 passages)
Corpus('regering; schip; visbestanden; visserij; visserijbeleid', 63 passages)
Corpus('echtgenoot; gehuwde; gescheiden; pensioen; vrouw', 92 passages)
Corpus('beleid; gebruik; internationale; nederland; nederlandse', 139 passages)
Corpus('gezamenlijke; graad; huishouding; persoon; voert', 34 passages)
Corpus('lijden; ondraaglijk; patiënt; verzoek; weloverwogen', 54 pas

In [27]:
# Write one tab-separated line per cluster, preceded by a header line.
# The encoding is given explicitly because the cluster labels contain non-ASCII
# characters, so the result must not depend on the platform default.
# NOTE(review): the file name ends in .csv although the content is tab-separated;
# the name is kept unchanged for existing consumers.
with open(
    f"clusters/clusters_all_{year_from}_{year_to}.csv", "wt", encoding="utf-8"
) as f:
    f.write("Label\tSize\tCentroid\tTopWords\n")
    f.writelines(all_clusters_records)

### Plot Cluster Content Distributions

In [28]:
import logging

import matplotlib.pyplot as plt
from ipywidgets import interact

# Only let warnings and errors through from the "matplotlib" logger.
logging.getLogger("matplotlib").setLevel(logging.WARNING)


# TODO: Maybe use something better than pyplot so the histogram is prettier and more flexible
def plot_cluster_distribution(cluster_index, column_name, num_bins):
    """Show a histogram of one metadata column for a single cluster.

    Args:
        cluster_index: position of the cluster in `df_cluster_meta` and
            `df_cluster_labels`.
        column_name: name of the column of the cluster's metadata frame to plot.
        num_bins: passed to the histogram as `bins`.
    """
    cluster_frame = df_cluster_meta[cluster_index]
    title = df_cluster_labels[cluster_index]

    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(cluster_frame[column_name], edgecolor="black", bins=num_bins)
    ax.set_xlabel(f"{column_name}", fontsize=14)
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_ylabel("Frequency", fontsize=14)
    ax.set_title(f"{title}", fontsize=16)
    plt.show()


# Style shared by the drop-downs below: let each description take its natural width.
_dropdown_style = {"description_width": "initial"}

# One entry per cluster: the label is shown, the list index is the selected value.
cluster_selector = widgets.Dropdown(
    options=[(label, index) for index, label in enumerate(df_cluster_labels)],
    description="Choose a Cluster:",
    disabled=False,
    style=_dropdown_style,
)

# Would be better to choose a BIN_SIZE: so it plots per year, per decade, per 25 years, etc...
hist_bins_selector = widgets.Dropdown(
    options=list(range(1, 101)),
    value=10,
    description="Choose Number of Bins:",
    disabled=False,
    style=_dropdown_style,
)

# Metadata column to plot, chosen from the columns kept per cluster.
variable_selector = widgets.Dropdown(
    options=selected_metadata,
    description="Choose a Column to Plot:",
    disabled=False,
    style=_dropdown_style,
)

# Redraw the histogram whenever one of the three selections changes.
interact(
    plot_cluster_distribution,
    cluster_index=cluster_selector,
    column_name=variable_selector,
    num_bins=hist_bins_selector,
)

  from ipykernel.pylab.backend_inline import flush_figures


interactive(children=(Dropdown(description='Choose a Cluster:', options=(('concept; veilig; verkeer; verkeersv…

<function __main__.plot_cluster_distribution(cluster_index, column_name, num_bins)>

## Visualize Embeddings (All Clusters)

In [29]:
# Forget a visualizer left over from an earlier run of the next cell, if there is one.
globals().pop("visualizer", None)

In [32]:
import os

from bokeh.io import output_notebook
from bokeh.plotting import show

from tempo_embeddings.visualization.bokeh import BokehInteractiveVisualizer

# Render Bokeh output inside the notebook.
output_notebook()

# Metadata fields handed to the visualizer. The result of
# `corpus.metadata_fields()` used to be overwritten immediately by the fixed
# list; it is now used to keep only the preferred fields that the corpus
# reports. NOTE(review): assumes `metadata_fields()` yields field names; confirm.
preferred_fields = ["year", "date", "issue", "provenance"]
available_fields = set(corpus.metadata_fields())
# Fall back to the full preferred list when none of the fields is reported.
meta_fields = [
    field for field in preferred_fields if field in available_fields
] or preferred_fields

visualizer = BokehInteractiveVisualizer(
    *clusters, metadata_fields=meta_fields, width=1200, height=500
)

# NOTE(review): "*" accepts websocket connections from any origin. This is
# convenient on a remote notebook server but should be restricted where possible.
os.environ["BOKEH_ALLOW_WS_ORIGIN"] = "*"

show(visualizer.create_document)

02:14:03 INFO:Starting Bokeh server version 3.3.3 (running on Tornado 6.4)
02:14:03 INFO:User authentication hooks NOT provided (default user enabled)


02:14:03 INFO:WebSocket connection opened
02:14:03 INFO:ServerConnection created
