## Load Packages

In [1]:
# Install tempo-embeddings from the parent directory of this notebook (local checkout).

%pip install ..

# Alternative: install from GitHub instead of the local checkout.
# The GitHub URL can also refer to a specific version or branch.
# %pip install --upgrade pip  # Required for properly resolving dependencies
# %pip uninstall -y tempo_embeddings  # Remove existing installation
# %pip install --upgrade git+https://github.com/Semantics-of-Sustainability/tempo-embeddings.git

Processing /Users/jose/Repos/tempo-embeddings
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tempo_embeddings
  Building wheel for tempo_embeddings (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tempo_embeddings: filename=tempo_embeddings-0.0.1-py3-none-any.whl size=36886 sha256=42837a6b0cfd383e4a8eeef112120f7a880b553f2ba2cf71004ba8c4b9eea2d7
  Stored in directory: /Users/jose/Library/Caches/pip/wheels/de/25/96/d92b7a130b730e0ab67770d76841f36cb3d1f9cda32a4a539b
Successfully built tempo_embeddings
Installing collected packages: tempo_embeddings
  Attempting uninstall: tempo_embeddings
    Found existing installation: tempo_embeddings 0.0.1
    Uninstalling tempo_embeddings-0.0.1:
      Successfully uninstalled tempo_embeddings-0.0.1
Successfully installed tempo_embeddings-0.0.1
Note: you may need to restart the kernel to

In [2]:
# make sure installation has succeeded
import tempo_embeddings

from importlib import reload
import logging
# Reload the logging module before configuring it (kept from the original cell;
# presumably so that basicConfig() below takes effect even if logging was
# already configured in this kernel).
reload(logging)
# Use a 24-hour clock (%H). The previous 12-hour "%I" without an AM/PM marker
# printed e.g. 13:41 as 01:41, which made the log timestamps ambiguous.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S',
)

In [3]:
%load_ext autoreload

In [4]:
# Detect Google Colab: the import only succeeds when the google.colab module is available.
IN_COLAB = True
try:
    import google.colab
except ModuleNotFoundError:
    IN_COLAB = False

In [5]:
%autoreload now

from pathlib import Path
from tqdm import tqdm
from tempo_embeddings.text.corpus import Corpus
import ipywidgets as widgets

## Load Database Manager

The `db_path` parameter should point to the directory where the database is, so the original configuration and records are loaded. The database was created using the notebook `1_compute_embeddings_nl.ipynb`. If the given path does not exist, a new EMPTY database will be created there. 

A bigger `batch_size` could make the search faster but if it is too big you might run out of memory.

In [6]:
from tempo_embeddings.embeddings.weaviate_database import WeaviateDatabaseManager

# Create the database manager for the "weaviate_default_db" path and connect to it.
# NOTE(review): the original comment stated that only the ANP collection is loaded here
# because metadata field names diverge across datasets, but no collection is selected
# in this cell — confirm whether that restriction still applies.
db = WeaviateDatabaseManager(db_path="weaviate_default_db")
db.connect()


  warn(
01:41:14 ERROR:If the type is 'custom_model' or 'default' you should pass the model object under Key 'model'
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
01:41:14 INFO:<class 'weaviate.client.WeaviateClient'>
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/ready "HTTP/1.1 200 OK"
01:41:14 INFO:Weaviate Server Is Up: True
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/ready "HTTP/1.1 200 OK"


True

## Choose from the available Collections in the Database

In [7]:
# Ask the database which collections exist and offer them in a dropdown.
existing_colls = db.get_available_collections()
collection_selector = widgets.Dropdown(
    description='Choose a Collection:',
    options=existing_colls,
    style={'description_width': 'initial'},
    disabled=False,
)
collection_selector

Dropdown(description='Choose a Collection:', options=('AnpSgCorpus', 'AnpDuurzaam'), style=DescriptionStyle(de…

## Show number of records in the selected collection

In [8]:
# Remember the dropdown choice; later cells query this collection.
collection = collection_selector.value
print(
    f"\nCollection '{collection}' has {db.get_collection_count(collection)} records"
)

01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: POST http://localhost:8080/v1/graphql "HTTP/1.1 200 OK"



Collection 'AnpSgCorpus' has 271137 records


## Create Sub-Corpus

To make the processing and visualization easier, we will create a new `Corpus` comprising only a subset of the original Collection. This corpus will contain only the records of interest, obtained by querying the database with keyword and metadata constraints. In this example the query can be controlled with:

- **Filter Terms:** retrieve only passages that contain exactly the given keywords.
- **Year Range:** retrieve only the records which are inside the provided years
- **Neighbors:** This indicates how much to *expand* the search into more datapoints. The idea is to retrieve the *top_k* neighbors of the initially retrieved passages. Ideally this will give related passages that did not mention any of the keywords explicitly.

In [9]:
# Input controls for the sub-corpus query; their values are read by later cells.
widget_year_range = widgets.IntRangeSlider(
    description='Year Range: ',
    min=1900,
    max=2020,
    step=1,
    value=(1980, 1984),
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px'),
)
widget_terms = widgets.Text(
    description='Filter Terms (comma separated)',
    value="duurzaam",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='600px'),
)
widget_neighbors = widgets.IntSlider(
    description="Expand Neighborhood Size: ",
    min=0,
    max=10,
    value=5,
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px'),
)

### Display the Widgets to choose the parameters

In [10]:
# Render the three query controls, in the same order as before.
display(widget_terms, widget_year_range, widget_neighbors)

Text(value='duurzaam', description='Filter Terms (comma separated)', layout=Layout(width='600px'), style=TextS…

IntRangeSlider(value=(1980, 1984), description='Year Range: ', layout=Layout(width='400px'), max=2020, min=190…

IntSlider(value=5, description='Expand Neighborhood Size: ', layout=Layout(width='400px'), max=10, style=Slide…

### Execute the Search

There is no need to edit the code below manually: all parameters are taken from the widget values.

In [11]:
# Unpack values from the widgets
year_from, year_to = widget_year_range.value
# Split the comma-separated input and drop blank entries, so a stray or trailing
# comma (e.g. "duurzaam, ") does not add an empty string as a filter word.
# NOTE(review): with an empty text box the list is now [] instead of [""] —
# confirm how db.get_corpus treats an empty filter_words list.
FILTER_TERMS = [term for term in (raw.strip() for raw in widget_terms.value.split(",")) if term]
# Execute Database Query
where_range = {"year_from": year_from, "year_to": year_to}
print(f"Searching terms {FILTER_TERMS} between year {year_from} and {year_to}")
corpus = db.get_corpus(collection, filter_words=FILTER_TERMS, where_obj=where_range, include_embeddings=True, limit=10000)
print(f"Found {len(corpus)} items that match!")

01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


Searching terms ['duurzaam'] between year 1980 and 1984
Found 98 items that match!


### Execute the search for neighborhoods

For each `Passage` in the `Corpus` created with the search result, we will find *k* neighbors and add them to the original `Corpus`.

In [12]:
import statistics
def get_corpus_with_neighborhoods(collection, corpus, k_neighbors):
    """Expand ``corpus`` with the nearest neighbors of each of its passages.

    For every passage in ``corpus.passages`` the notebook-level ``db`` is asked for
    ``k_neighbors`` neighbors of the passage's embedding inside ``collection``.
    A new ``Corpus`` is built from the original passages followed by all retrieved
    neighbors, and its ``embeddings`` are set to ``db.compress_embeddings(...)``.

    Args:
        collection: name of the collection to search in.
        corpus: object exposing ``passages``; each passage needs an ``embedding``.
        k_neighbors: number of neighbors requested per passage. Values below 1
            skip the neighbor search entirely.

    Returns:
        The new ``Corpus`` (original passages + neighbors) with ``embeddings`` set.
    """
    all_neighbors = []
    all_distances = []
    # Nothing to query when no neighbors are requested (the slider allows 0).
    if k_neighbors >= 1:
        for p in corpus.passages:
            neighbors = db.query_vector_neighbors(collection, vector=p.embedding, k_neighbors=k_neighbors)
            for passage, distance in neighbors:
                all_neighbors.append(passage)
                all_distances.append(distance)
    print(f"Total Datapoints in the neighborhoods = {len(all_neighbors)}")
    # max()/min()/mean() raise on an empty sequence, so only report statistics
    # when at least one neighbor came back.
    if all_distances:
        print(f"Distance Info: Max = {max(all_distances)} | Min = {min(all_distances)} | Average = {statistics.mean(all_distances)}")
    else:
        print("Distance Info: no neighbors were retrieved")
    # Join original passages + new found neighbors
    # NOTE(review): neighbors are appended exactly as returned, so a passage can occur
    # more than once (the saved output shows a minimum distance of ~0, which suggests
    # the query passage itself may be returned) — confirm whether de-duplication is wanted.
    all_passages = corpus.passages + all_neighbors
    corpus = Corpus(all_passages)
    corpus.embeddings = db.compress_embeddings(corpus)
    print(corpus.embeddings.shape)
    return corpus

# Expand the search result with the neighbors of every passage; k comes from the slider.
# NOTE: `corpus` is replaced by its own expansion, so re-running this cell without
# re-running the search cell first expands the already expanded corpus again.
corpus = get_corpus_with_neighborhoods(collection, corpus, k_neighbors=widget_neighbors.value)

01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
01:41:14 INFO:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
01:41:14 INFO:HTTP Re

Total Datapoints in the neighborhoods = 490
Distance Info: Max = 0.4341684579849243 | Min = -1.0728836059570312e-06 | Average = 0.13021429022964165
UMAP( verbose=True)
Thu Jun  6 13:41:20 2024 Construct fuzzy simplicial set
Thu Jun  6 13:41:20 2024 Finding Nearest Neighbors
Thu Jun  6 13:41:21 2024 Finished Nearest Neighbor Search
Thu Jun  6 13:41:22 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Jun  6 13:41:24 2024 Finished embedding
(588, 2)


## Cluster the Corpus

### Choose Minimum Cluster Size

In [13]:
# Slider bounds: the upper bound is half the corpus size, but never below the
# lower bound. For a small corpus (fewer than 10 passages) len(corpus)//2 would
# drop under the minimum of 5 and produce invalid slider bounds.
_slider_min = 5
_slider_max = max(_slider_min, len(corpus) // 2)

min_cluster_size_widget = widgets.IntSlider(
    value=min(10, _slider_max),  # default of 10, capped to the valid range
    min=_slider_min,
    max=_slider_max,
    step=1,
    description='Minimum Cluster Size:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    style={'description_width': 'initial'}
)


In [14]:
min_cluster_size_widget

IntSlider(value=10, continuous_update=False, description='Minimum Cluster Size:', max=294, min=5, style=Slider…

### Execute Clustering

In [15]:
# Clustering arguments, e.g. min_cluster_size=10, cluster_selection_epsilon=0.1,
# min_samples=10, cluster_selection_epsilon=0.2, cluster_selection_method="leaf".
# See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html for full list
clusters = corpus.cluster(
    min_cluster_size=min_cluster_size_widget.value,
    cluster_selection_epsilon=0.1,
)
print(f"Found {len(clusters)} clusters in the corpus. (min cluster size is {min_cluster_size_widget.value})")

# Stop early when the clustering is too fine-grained to handle comfortably.
too_many_clusters = len(clusters) > 100
if too_many_clusters:
    raise ValueError("Seems like you have too many clusters! Try with a bigger value for min_cluster_size to avoid memory issues")

# One line per cluster: number of passages followed by the cluster's repr.
for c in clusters:
    print(len(c.passages), c)

Found 46 clusters in the corpus. (min cluster size is 10)
14 Subcorpus(5, [0, 54, 98, 99, 105, 106, 110, 111, 355, 356])
12 Subcorpus(10, [1, 53, 100, 101, 103, 104, 112, 357, 363, 364])
10 Subcorpus(4, [2, 51, 102, 107, 108, 109, 353, 354, 367, 372])
14 Subcorpus(0, [3, 52, 113, 114, 117, 120, 121, 122, 350, 351])
10 Subcorpus(32, [4, 50, 115, 116, 118, 119, 348, 349, 360, 361])
10 Subcorpus(8, [5, 49, 123, 124, 136, 137, 343, 344, 406, 407])
12 Subcorpus(17, [6, 58, 128, 129, 130, 131, 132, 388, 389, 390])
16 Subcorpus(20, [7, 61, 125, 126, 127, 133, 134, 135, 317, 345])
14 Subcorpus(29, [8, 57, 138, 139, 142, 145, 146, 147, 383, 384])
16 Subcorpus(41, [9, 60, 140, 141, 143, 144, 152, 162, 312, 385])
14 Subcorpus(39, [10, 62, 148, 149, 160, 161, 225, 227, 395, 396])
10 Subcorpus(36, [11, 56, 153, 154, 155, 157, 378, 379, 380, 382])
10 Subcorpus(44, [12, 59, 150, 151, 158, 159, 393, 394, 410, 411])
12 Subcorpus(28, [13, 55, 163, 164, 165, 166, 167, 373, 374, 375])
12 Subcorpus(38, [14

### Load Stopwords to avoid including them in the Cluster Labels

In [16]:
stopwords_file = Path("stopwords-filter-nl.txt")

with open(stopwords_file.absolute(), "rt") as f:
    stopwords = set(f.read().splitlines())

stopwords.update(
    {
        "wij",
        "we",
        "moeten",
        "heer",
        "mevrouw",
        "minister",
        "voorzitter",
        "gaat",
        "wel",
        "den",
    }
)

%autoreload now

### Save Cluster Passages for inspection

In [17]:
import os

# Create the output directory if needed; exist_ok replaces the separate
# exists() check and is safe when the directory is already there.
os.makedirs("clusters", exist_ok=True)

selected_metadata = ["year"]

# Words excluded from labels and top words: the stopwords plus the search terms.
# Built once, because it is the same for every cluster.
exclude_words = frozenset(stopwords | set(FILTER_TERMS))

all_clusters_records, df_cluster_labels, df_cluster_meta = [], [], []
for cluster in clusters:
    cluster.set_topic_label(exclude_words=exclude_words, n=5)
    df = cluster.to_dataframe()
    centroid = cluster.centroid()
    label = cluster.label
    cluster_size = len(cluster.passages)
    # Compute Cluster Stats as a Subcorpus
    top_words = " ".join(cluster.top_words(exclude_words=exclude_words, n=10))
    # One tab-separated summary line per cluster: label, size, centroid, top words.
    all_clusters_records.append((f"{label}\t{cluster_size}\t{centroid}\t{top_words}\n"))
    df_cluster_labels.append(cluster.label)
    df_cluster_meta.append(df[selected_metadata])
    # Save the Cluster Passages in a File
    file_prefix = f"cluster_{year_from}_{year_to}_{cluster.label.replace('; ', '_')}"
    df.to_csv(f"clusters/{file_prefix}.tsv", sep="\t", index=False) 
    print(cluster)

Subcorpus('antillen; iiecht; mogelijk; samenwerkingsverband; ydit', [0, 54, 98, 99, 105, 106, 110, 111, 355, 356])
Subcorpus('antillen; antilliaanse; gezegd; jaàtgt; ldit', [1, 53, 100, 101, 103, 104, 112, 357, 363, 364])
Subcorpus('1730; dumv; hecht; krijgen; nederlandse', [2, 51, 102, 107, 108, 109, 353, 354, 367, 372])
Subcorpus('dalende; durft; regets; weinig; zeggen', [3, 52, 113, 114, 117, 120, 121, 122, 350, 351])
Subcorpus('anp; dalende; kansen; odder; olieprijs', [4, 50, 115, 116, 118, 119, 348, 349, 360, 361])
Subcorpus('afhankelijk; laag; rente; vereist; wereldeconomi', [5, 49, 123, 124, 136, 137, 343, 344, 406, 407])
Subcorpus('belgrado; financiële; gepleit; lubbers; premier', [6, 58, 128, 129, 130, 131, 132, 388, 389, 390])
Subcorpus('hand; opleven; stabiel; termijn; wisselkoersen', [7, 61, 125, 126, 127, 133, 134, 135, 317, 345])
Subcorpus('bank; belemmering; grootste; hoge; verenigde', [8, 57, 138, 139, 142, 145, 146, 147, 383, 384])
Subcorpus('banken; betalingsregelinge



In [18]:
# Write the per-cluster summary lines collected earlier.
# NOTE: the content is tab-separated although the file name ends in ".csv"; the
# name is kept unchanged so existing paths keep working.
# UTF-8 is set explicitly because the cluster labels can contain non-ASCII
# characters and the platform default encoding may not be able to encode them.
with open(f"clusters/clusters_all_{year_from}_{year_to}.csv", "wt", encoding="utf-8") as f:
    f.write("Label\tSize\tCentroid\tTopWords\n")
    for rec in all_clusters_records:
        f.write(rec)

### Plot Cluster Content Distributions

In [19]:
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from ipywidgets import interact

def plot_cluster_distribution(cluster_index, column_name):
    """Show a histogram of one metadata column for one cluster.

    Args:
        cluster_index: position of the cluster in ``df_cluster_meta`` /
            ``df_cluster_labels``.
        column_name: column of the cluster's metadata frame to plot.
    """
    cluster_frame = df_cluster_meta[cluster_index]
    cluster_name = df_cluster_labels[cluster_index]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(cluster_frame[column_name], edgecolor='black')
    ax.set_xlabel(f'{column_name}', fontsize=14)
    ax.set_ylabel('Frequency', fontsize=14)
    # The cluster label doubles as the figure title.
    ax.set_title(f'{cluster_name}', fontsize=16)
    plt.show()


# Dropdown of cluster labels; each option's value is the cluster's list index.
cluster_selector = widgets.Dropdown(
    description='Choose a Cluster:',
    options=[(label, index) for index, label in enumerate(df_cluster_labels)],
    style={'description_width': 'initial'},
    disabled=False,
)

# Dropdown of the metadata columns that were kept for each cluster.
variable_selector = widgets.Dropdown(
    description='Choose a Column to Plot:',
    options=selected_metadata,
    style={'description_width': 'initial'},
    disabled=False,
)

# Redraw the histogram whenever one of the two dropdowns changes.
interact(
    plot_cluster_distribution,
    cluster_index=cluster_selector,
    column_name=variable_selector,
)

  from ipykernel.pylab.backend_inline import flush_figures


interactive(children=(Dropdown(description='Choose a Cluster:', options=(('antillen; iiecht; mogelijk; samenwe…

<function __main__.plot_cluster_distribution(cluster_index, column_name)>

ERROR:tornado.application:Uncaught exception GET /autoload.js?bokeh-autoload-element=p1038&bokeh-absolute-url=http://localhost:60918&resources=none (::1)
HTTPServerRequest(protocol='http', host='localhost:60918', method='GET', uri='/autoload.js?bokeh-autoload-element=p1038&bokeh-absolute-url=http://localhost:60918&resources=none', version='HTTP/1.1', remote_ip='::1')
Traceback (most recent call last):
  File "/Users/jose/Repos/tempo-embeddings/.venv/lib/python3.9/site-packages/tornado/web.py", line 1790, in _execute
    result = await result
  File "/Users/jose/Repos/tempo-embeddings/.venv/lib/python3.9/site-packages/bokeh/server/views/autoload_js_handler.py", line 62, in get
    session = await self.get_session()
  File "/Users/jose/Repos/tempo-embeddings/.venv/lib/python3.9/site-packages/bokeh/server/views/session_handler.py", line 145, in get_session
    session = await self.application_context.create_session_if_needed(session_id, self.request, token)
  File "/Users/jose/Repos/tempo

## Visualize Embeddings (All Clusters)

In [20]:
import os
from bokeh.io import output_notebook
from bokeh.plotting import show
from tempo_embeddings.visualization.bokeh import BokehInteractiveVisualizer

output_notebook()

# Metadata fields handed to the visualizer.
# The former `meta_fields = corpus.metadata_fields()` assignment was dropped: its
# result was overwritten by this list on the very next line and never used
# (presumably the call has no side effects — confirm).
meta_fields = ["year", "date", "issue"]

visualizer = BokehInteractiveVisualizer(
    *clusters, metadata_fields=meta_fields, width=2000, height=1000
)

# NOTE(review): "*" lets the Bokeh server accept websocket connections from any
# origin. Convenient for local use; restrict it when the notebook is reachable
# by others.
os.environ["BOKEH_ALLOW_WS_ORIGIN"] = "*"

show(visualizer.create_document)

01:41:25 INFO:Starting Bokeh server version 3.3.3 (running on Tornado 6.4)
01:41:25 INFO:User authentication hooks NOT provided (default user enabled)


01:42:09 INFO:WebSocket connection opened
01:42:09 INFO:ServerConnection created
01:52:58 INFO:WebSocket connection closed: code=1001, reason=None
