In [None]:
import logging
from importlib import reload

# Re-execute the `logging` module before configuring it — presumably so that
# `basicConfig` takes effect even if the kernel already installed handlers.
reload(logging)

# INFO-level root logger with a "<time> <level>:<message>" line layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%I:%M:%S",
    format="%(asctime)s %(levelname)s:%(message)s",
)

In [None]:
# %pip install "vegafusion[embed]>=1.5.0"
# NOTE: `logging` is deliberately NOT reloaded in this cell. Re-executing the
# module with `reload(logging)` creates a fresh root logger, which would discard
# the `logging.basicConfig(...)` settings applied in the logging-setup cell.

In [None]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# NOTE(review): `now` appears to request a single, immediate reload; if modules
# should be reloaded before every cell, `%autoreload 2` is the usual mode — confirm intent.
%autoreload now

In [None]:
from pathlib import Path

import altair as alt
from ipywidgets import widgets  # type: ignore

from tempo_embeddings.text.corpus import Corpus

# Switch Altair to the "vegafusion" data transformer (requires the vegafusion
# package to be installed in the kernel's environment).
alt.data_transformers.enable("vegafusion")

In [None]:
import weaviate
from tempo_embeddings.embeddings.model import SentenceTransformerModelWrapper
from tempo_embeddings.embeddings.weaviate_database import WeaviateDatabaseManager
from tempo_embeddings.settings import DEFAULT_LANGUAGE_MODEL

import os

# Connection settings for the Weaviate instance. The previously hard-coded
# address stays the default; set the WEAVIATE_HOST / WEAVIATE_PORT environment
# variables to point the notebook at a different server without editing code.
WEAVIATE_HOST = os.environ.get("WEAVIATE_HOST", "145.38.192.173")
WEAVIATE_PORT = int(os.environ.get("WEAVIATE_PORT", "8087"))

# Database manager combining the Weaviate client with the default language model.
db = WeaviateDatabaseManager(
    client=weaviate.connect_to_local(host=WEAVIATE_HOST, port=WEAVIATE_PORT),
    model=SentenceTransformerModelWrapper.from_pretrained(DEFAULT_LANGUAGE_MODEL),
)

## Choose from the available Collections in the Database

In [None]:
# Collections currently available in the database.
existing_colls = list(db.get_available_collections())

# Pre-select the usual collections, but only those that actually exist: the
# selection widget rejects a `value` that is not among its `options`.
DEFAULT_COLLECTIONS = ["ANP", "StatenGeneraal"]
collection_selector = widgets.SelectMultiple(
    options=existing_colls,
    value=[name for name in DEFAULT_COLLECTIONS if name in existing_colls],
    description="Choose a Collection:",
    disabled=False,
    style={"description_width": "initial"},
)

print("\nCollection Sizes")
print("----------------")
# `default=0` keeps this cell working when the database has no collections yet.
max_len = max((len(collection) for collection in existing_colls), default=0) + 1
for collection in existing_colls:
    print(f"{collection:{max_len}}\t{db.get_collection_count(collection)}")

# Last expression of the cell: renders the selector.
collection_selector

## Create Sub-Corpus

To make processing and visualization easier, we create a new `Corpus` comprising only a subset of the original Collection. This corpus contains only the records of interest, which are obtained by querying the database with keyword and metadata constraints. In this example, the search can be constrained by:

- **Filter Terms:** retrieve only passages that contain exactly the given keywords.
- **Year Range:** retrieve only the records dated within the selected range of years.
- **Neighbors:** This indicates how much to *expand* the search into more datapoints. The idea is to retrieve the *top_k* neighbors of the initially retrieved passages. Ideally this will give related passages that did not mention any of the keywords explicitly.

In [None]:
# Two-handled slider selecting the first and last year of the search window.
widget_year_range = widgets.IntRangeSlider(
    min=1800,
    max=2020,
    step=1,
    value=(1950, 2000),
    description="Year Range: ",
    layout=widgets.Layout(width="400px"),
    style=dict(description_width="initial"),
)

# Free-text box holding the comma-separated filter terms.
widget_terms = widgets.Text(
    value="duurzaam",
    description="Filter Terms (comma separated)",
    layout=widgets.Layout(width="600px"),
    style=dict(description_width="initial"),
)

# Slider (0 to 10) for the neighborhood-expansion size.
widget_neighbors = widgets.IntSlider(
    min=0,
    max=10,
    value=5,
    description="Expand Neighborhood Size: ",
    layout=widgets.Layout(width="400px"),
    style=dict(description_width="initial"),
)

### Display the Widgets to choose the parameters

In [None]:
# Render the filter-terms box first, then the year-range slider.
display(widget_terms, widget_year_range)

### Execute the Search

There is no need to edit the code here: all parameters are read from the widget values.

In [None]:
# Unpack values from the widgets
year_from, year_to = widget_year_range.value
# Split the comma-separated terms and drop blanks, so that a trailing comma or
# an empty text box does not produce an empty-string filter term.
FILTER_TERMS = [
    term for term in (raw.strip() for raw in widget_terms.value.split(",")) if term
]
# Execute Database Query
where_range = {"year_from": year_from, "year_to": year_to}
print(f"Searching terms {FILTER_TERMS} between year {year_from} and {year_to}")
# One sub-corpus per selected collection, merged into a single Corpus by `sum`
# (starting from an empty Corpus).
corpus = sum(
    (
        db.get_corpus(
            collection,
            filter_words=FILTER_TERMS,
            where_obj=where_range,
            include_embeddings=True,
            limit=10000,
        )
        for collection in collection_selector.value
    ),
    start=Corpus([]),
)
print(f"Found {len(corpus)} items that match!")
print(f"Found {len(corpus)} items that match!")

## Display the Datapoints

Here we simply display the retrieved passages, after using UMAP to compress their embeddings to two dimensions. The point colors are assigned by decade, which is derived from the year metadata.

In [None]:
# Compute the compressed embeddings, then report the shape of the original
# embeddings followed by the shape of the 2-D version (one per line).
corpus.compress_embeddings()
print(corpus.embeddings.shape, corpus.embeddings_2d.shape, sep="\n")

In [None]:
import pandas as pd

# Tabular view of the corpus with an integer `year` column and a derived
# `decade` column (the year rounded down to a multiple of 10).
corpus_df = corpus.to_dataframe().assign(
    year=lambda frame: frame["year"].astype(int)
)
corpus_df = corpus_df.assign(decade=corpus_df["year"].floordiv(10).mul(10))
corpus_df

In [None]:
# Distinct decades present in the data, in ascending order (missing values excluded).
decades = corpus_df["decade"].dropna().sort_values().unique()

# Legend-bound point selection on `decade`, initialised with the earliest decade.
colorSelector = alt.selection_point(
    name="Select", fields=["decade"], value=decades[0], bind="legend"
)

# Pieces of the encoding, named separately to keep the chart expression short.
decade_legend = alt.Legend(
    title="Year", labelLimit=0, columns=1, labelFontSize=16, titleFontSize=18
)
decade_color = {
    "field": "decade",
    "scale": {"scheme": "category20b"},
    "legend": decade_legend,
}
# Selected points are nearly opaque, everything else almost invisible.
selection_opacity = alt.condition(colorSelector, alt.value(0.95), alt.value(0.01))

decade_scatter = (
    alt.Chart(corpus_df)
    .mark_circle()
    .add_params(colorSelector)
    .encode(
        x="x",
        y="y",
        tooltip=["text", "date", "provenance"],
        color=decade_color,
        opacity=selection_opacity,
    )
    .properties(width=1200, height=800)
    .interactive()
)
decade_scatter

## Cluster the Corpus

### CASE 1: Clustering Independently Per DECADE

In [22]:
import os
from collections import Counter

from bokeh.io import output_notebook
from bokeh.plotting import show

# from tempo_embeddings.settings import DATA_DIR
from tempo_embeddings.text.passage import Highlighting, Passage
from tempo_embeddings.visualization.bokeh import BokehInteractiveVisualizer

# Configure Bokeh to render its output inside the notebook.
output_notebook()

# Value passed as `min_cluster_size` when each time bucket is clustered further down in this cell.
MIN_CLUSTER_SIZE = 50


def get_stopwords(stopwords_file=None, encoding="utf-8"):
    """Load the stopword list from a text file and add notebook-specific extras.

    Args:
        stopwords_file: Path to a text file with one stopword per line. When
            omitted, ``../../tempo_embeddings/data/stopwords-filter-nl.txt`` is
            used; that default is relative, so it is resolved against the
            current working directory of the kernel.
        encoding: Text encoding used to read the file. Passed explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        A set of stopwords: the non-blank, whitespace-stripped lines of the file
        plus a fixed list of additional Dutch words.

    Raises:
        FileNotFoundError: If the stopwords file does not exist.
    """
    if stopwords_file is None:
        stopwords_file = Path("../../tempo_embeddings/data/stopwords-filter-nl.txt")
        # stopwords_file = Path(f"{DATA_DIR}/stopwords-filter-nl.txt")
    stopwords_path = Path(stopwords_file).absolute()

    with open(stopwords_path, "rt", encoding=encoding) as f:
        stopwords = {line.strip() for line in f.read().splitlines()}
    # Blank lines in the file must not end up as an empty-string stopword.
    stopwords.discard("")

    stopwords.update(
        {
            "wij",
            "we",
            "moeten",
            "heer",
            "mevrouw",
            "minister",
            "voorzitter",
            "gaat",
            "wel",
            "den",
        }
    )
    return stopwords


def get_passages_from_df(df, skip_columns=()):
    """Build one ``Passage`` per row of ``df``.

    Args:
        df: DataFrame with at least a ``text`` column; every other column is
            treated as passage metadata.
        skip_columns: Column names to leave out of the metadata. The default is
            an immutable empty tuple rather than a shared mutable list.

    Returns:
        A list of ``Passage`` objects in row order. Each one is created with the
        row's ``text``, the remaining columns as metadata and a fixed
        ``Highlighting(1, 10)``.
    """
    return [
        Passage(
            row["text"],
            metadata={
                key: value for key, value in row.items() if key not in skip_columns
            },
            highlighting=Highlighting(1, 10),
        )
        for row in df.to_dict("records")
    ]


def cluster_selected_corpus(year_start, year_end, min_cluster_size):
    """Cluster the passages of the global ``corpus_df`` within a range of years.

    Reads the notebook-level names ``corpus_df``, ``stopwords`` and
    ``FILTER_TERMS``.

    Args:
        year_start: First year to include (inclusive).
        year_end: Last year to include (inclusive).
        min_cluster_size: Forwarded to ``Corpus.cluster``.

    Returns:
        The clusters produced by ``Corpus.cluster``, each with its topic label set.

    Raises:
        ValueError: If more than 100 clusters are found.
    """
    corpus_df_filtered = corpus_df[
        (corpus_df["year"] >= year_start) & (corpus_df["year"] <= year_end)
    ]
    corpus_to_cluster = Corpus(
        get_passages_from_df(corpus_df_filtered, skip_columns=["text"])
    )
    # The (x, y) columns are used as the embeddings, so clustering operates on
    # the 2-D plot coordinates (presumably the compressed embeddings — confirm).
    corpus_to_cluster.embeddings = list(
        zip(corpus_df_filtered["x"], corpus_df_filtered["y"])
    )
    print(
        f"Clustering only {len(corpus_to_cluster)} datapoints between {year_start} and {year_end}..."
    )
    clusters = corpus_to_cluster.cluster(
        min_cluster_size=min_cluster_size, cluster_selection_epsilon=0.1
    )
    print(
        f"Found {len(clusters)} clusters in the corpus. (min cluster size is {min_cluster_size})"
    )
    if len(clusters) > 100:
        raise ValueError(
            "Seems like you have too many clusters! Try with a bigger value for min_cluster_size to avoid memory issues"
        )

    # Build the exclusion set once; it is the same for every cluster.
    excluded_words = frozenset(stopwords | set(FILTER_TERMS))
    # Plain loop: the calls are made for their side effect only.
    for cl in clusters:
        cl.set_topic_label(exclude_words=excluded_words, n=5)
    return clusters


def bokeh_plot_clustered_corpus(clusters):
    """Show an interactive Bokeh visualisation of ``clusters`` in the notebook.

    Args:
        clusters: Iterable of clusters, unpacked as positional arguments of
            ``BokehInteractiveVisualizer``.

    Side effects:
        Sets the ``BOKEH_ALLOW_WS_ORIGIN`` environment variable to ``"*"`` and
        displays the plot; nothing is returned.
    """
    # Metadata fields handed to the visualizer. (An earlier, immediately
    # overwritten lookup of the global corpus' metadata fields was removed.)
    meta_fields = ["year", "date", "issue", "provenance"]

    visualizer = BokehInteractiveVisualizer(
        *clusters, metadata_fields=meta_fields, width=1000, height=500
    )

    # NOTE(review): "*" presumably allows websocket connections from any origin;
    # convenient for local notebooks, but confirm before using on a shared host.
    os.environ["BOKEH_ALLOW_WS_ORIGIN"] = "*"

    show(visualizer.create_document)


def altair_plot_clustered_corpus(clusters):
    """Build an Altair scatter plot of the clusters, coloured by cluster label.

    Args:
        clusters: Non-empty sequence of clusters. Each must provide
            ``to_dataframe()`` (with a ``datapoint`` column, assumed to hold
            2-element coordinates — confirm) and a ``label`` attribute.

    Returns:
        An interactive Altair chart in which clicking a legend entry highlights
        that cluster.

    Raises:
        IndexError: If ``clusters`` is empty.
    """
    if len(clusters) == 0:
        # Same exception type that indexing the empty sequence used to raise.
        raise IndexError("altair_plot_clustered_corpus needs at least one cluster")

    # Prepare Data to Plot: one frame per cluster, tagged with the cluster label,
    # concatenated once instead of growing a DataFrame inside the loop.
    cl_labels = [cl.label for cl in clusters]
    frames = []
    for cl in clusters:
        df = cl.to_dataframe()
        df["cluster_label"] = cl.label
        frames.append(df)
    plot_data = pd.concat(frames, ignore_index=True)

    # Split `datapoint` into x/y for every input. Previously this only happened
    # when there was more than one cluster.
    plot_data[["x", "y"]] = pd.DataFrame(
        plot_data["datapoint"].tolist(), index=plot_data.index
    )
    plot_data = plot_data.drop(columns=["datapoint"])

    # Altair Chart
    colorSelector = alt.selection_point(
        name="Select", fields=["cluster_label"], value=cl_labels[0], bind="legend"
    )
    return (
        alt.Chart(plot_data)
        .mark_circle()
        .add_params(colorSelector)
        .encode(
            x="x",
            y="y",
            tooltip=["text", "date", "provenance"],
            color={
                "field": "cluster_label",
                # "scale": {"scheme": "category20b"},
                "legend": alt.Legend(
                    title="Cluster Label",
                    labelLimit=0,
                    columns=1,
                    labelFontSize=16,
                    titleFontSize=18,
                ),
            },
            # Selected cluster nearly opaque, all others almost invisible.
            opacity=alt.condition(colorSelector, alt.value(0.95), alt.value(0.01)),
        )
        .properties(width=1200, height=800)
        .interactive()
    )


# Make sure the output directory exists; `exist_ok=True` replaces the separate
# existence check and avoids the check-then-create race.
os.makedirs("clusters", exist_ok=True)
stopwords = get_stopwords()

# (first_year, last_year) buckets, both inclusive, that are clustered independently.
TIME_BUCKETS = [(1950, 1959), (1960, 1969), (1970, 1979), (1980, 1989), (1990, 1999)]

# Maps each (first_year, last_year) bucket to the clusters found in it.
all_clusters_dict = {}
for decade in TIME_BUCKETS:
    clusters = cluster_selected_corpus(*decade, MIN_CLUSTER_SIZE)
    bokeh_plot_clustered_corpus(clusters)
    # display(altair_plot_clustered_corpus(clusters))
    all_clusters_dict[decade] = clusters

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jose/Repos/tempo-embeddings/.venv/lib/python3.9/site-packages/tempo_embeddings/data/stopwords-filter-nl.txt'

In [None]:
def get_word_count(corpus, exclude_words=None):
    """Count word occurrences over all passages of ``corpus``.

    Args:
        corpus: Object exposing ``passages``; each passage must provide a
            ``words()`` method returning an iterable of words.
        exclude_words: Container of words to leave out of the count. Defaults to
            the notebook-level ``stopwords``.

    Returns:
        A ``Counter`` mapping each remaining word to its number of occurrences.
    """
    excluded = stopwords if exclude_words is None else exclude_words
    word_count = Counter()
    # Plain loop instead of a list comprehension used only for its side effect.
    for passage in corpus.passages:
        word_count.update(word for word in passage.words() if word not in excluded)
    return word_count


def analyze_clusters(clusters):
    """Print a summary of every cluster and return the combined word counts.

    Clusters are processed from largest to smallest (by number of passages).
    For each one the topic label is (re)computed, and the label, size, top words
    and ten most common words are printed. Stopwords and the current
    ``FILTER_TERMS`` are excluded from labels and top words.

    Returns:
        A ``Counter`` with the word counts summed over all clusters.
    """
    global_top_words = Counter()
    largest_first = sorted(clusters, key=lambda c: -len(c.passages))
    for cluster in largest_first:
        excluded = frozenset(stopwords | set(FILTER_TERMS))
        cluster.set_topic_label(exclude_words=excluded, n=5)
        top_words = cluster.top_words(exclude_words=excluded, n=20)
        print(
            f"\n----- Cluster {cluster.label} || Size = {len(cluster.passages)} -----\n\tTop Words: {', '.join(sorted(top_words))}"
        )
        word_counts = get_word_count(cluster)
        print(word_counts.most_common(10))
        global_top_words += word_counts
    return global_top_words


# Report, one time bucket after another, the clusters found and the most
# frequent words across that bucket's clusters.
for decade in all_clusters_dict:
    clusters = all_clusters_dict[decade]
    print(f"\n\n\n################### DECADE {decade} ###################\n")
    print(f"Total Clusters = {len(clusters)}")
    global_top_words = analyze_clusters(clusters)
    print(f"Decade Most Common Words: {global_top_words.most_common(20)}")

### CASE 2: Cluster Everything

In [None]:
# Cluster the whole year range currently selected in the slider and show the
# result with Bokeh (altair_plot_clustered_corpus(clusters) is the alternative).
selected_years = widget_year_range.value
clusters = cluster_selected_corpus(
    selected_years[0], selected_years[1], min_cluster_size=100
)
bokeh_plot_clustered_corpus(clusters)