In [105]:
import logging
from importlib import reload

# Configure root logging for the notebook. `force=True` (Python >= 3.8) removes
# any handlers already attached to the root logger before applying this
# configuration, so the settings take effect without reloading the `logging`
# module (reloading re-creates the root logger, so loggers created earlier by
# already-imported libraries would not pick up this configuration).
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
    datefmt="%I:%M:%S",
    force=True,
)

In [160]:
%pip install "vegafusion[embed]>=1.5.0"
%pip install pyvis
reload(logging)

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


<module 'logging' from '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/logging/__init__.py'>

In [None]:
# Load IPython's autoreload extension so edits to imported project modules can
# be picked up without restarting the kernel.
# NOTE(review): `%autoreload now` presumably reloads modules once, immediately;
# confirm against the installed IPython version (older releases only accept 0/1/2).
%load_ext autoreload
%autoreload now

In [107]:
from pathlib import Path

import altair as alt
from ipywidgets import widgets  # type: ignore

from tempo_embeddings.text.corpus import Corpus

# Switch Altair to the "vegafusion" data transformer (installed via
# `vegafusion[embed]`). NOTE(review): presumably needed because the charts below
# plot several thousand rows — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [108]:
import weaviate
from tempo_embeddings.embeddings.model import SentenceTransformerModelWrapper
from tempo_embeddings.embeddings.weaviate_database import WeaviateDatabaseManager
from tempo_embeddings.settings import DEFAULT_LANGUAGE_MODEL

import os

# Connection settings for the Weaviate instance. The defaults reproduce the
# server this notebook was originally run against; set WEAVIATE_HOST /
# WEAVIATE_PORT in the environment to point the notebook at another instance
# without editing the code.
WEAVIATE_HOST = os.environ.get("WEAVIATE_HOST", "145.38.192.173")
WEAVIATE_PORT = int(os.environ.get("WEAVIATE_PORT", "8087"))

# Database manager used by every query below; it is given the Weaviate client
# and the sentence-transformer model loaded from DEFAULT_LANGUAGE_MODEL.
db = WeaviateDatabaseManager(
    client=weaviate.connect_to_local(host=WEAVIATE_HOST, port=WEAVIATE_PORT),
    model=SentenceTransformerModelWrapper.from_pretrained(DEFAULT_LANGUAGE_MODEL),
)

## Choose from the available Collections in the Database

In [109]:
existing_colls = list(db.get_available_collections())

# Collections pre-selected in the widget. Only those actually present in the
# database are used, because SelectMultiple rejects values that are not among
# its options.
DEFAULT_COLLECTIONS = ("ANP", "StatenGeneraal")
collection_selector = widgets.SelectMultiple(
    options=existing_colls,
    value=[name for name in DEFAULT_COLLECTIONS if name in existing_colls],
    description="Choose a Collection:",
    disabled=False,
    style={"description_width": "initial"},
)

print("\nCollection Sizes")
print("----------------")
# Pad names to a common width; `default=0` keeps the cell working when the
# database contains no collections at all.
max_len = max((len(collection) for collection in existing_colls), default=0) + 1
for collection in existing_colls:
    print(f"{collection:{max_len}}\t{db.get_collection_count(collection)}")

# Last expression of the cell: renders the selection widget.
collection_selector


Collection Sizes
----------------
Telegraaf       	631072
StatenGeneraal  	820215
Volkskrant      	781978
NRC             	884143
Trouw           	568898
AlgemeenDagblad 	1005229
ANP             	126466


SelectMultiple(description='Choose a Collection:', index=(6, 1), options=('Telegraaf', 'StatenGeneraal', 'Volk…

## Create Sub-Corpus

To make the processing and visualization easier, we will create a new `Corpus` comprising only a subset of the original Collection. This corpus will contain only the records of interest. This is done by querying the database with keyword and metadata constraints. In this example, the following search parameters are available:

- **Filter Terms:** retrieve only passages that contain exactly the given keywords.
- **Year Range:** retrieve only the records which are inside the provided years
- **Neighbors:** This indicates how much to *expand* the search into more datapoints. The idea is to retrieve the *top_k* neighbors of the initially retrieved passages. Ideally this will give related passages that did not mention any of the keywords explicitly.

In [110]:
# Slider for the range of years to search (selectable between 1800 and 2020).
widget_year_range = widgets.IntRangeSlider(
    value=(1950, 2000),
    min=1800,
    max=2020,
    step=1,
    description="Year Range: ",
    layout=widgets.Layout(width="400px"),
    style={"description_width": "initial"},
)

# Free-text field holding the comma-separated keywords to filter on.
widget_terms = widgets.Text(
    value="duurzaam",
    description="Filter Terms (comma separated)",
    layout=widgets.Layout(width="600px"),
    style={"description_width": "initial"},
)

# Slider for the neighbourhood expansion size (0 to 10).
widget_neighbors = widgets.IntSlider(
    value=5,
    min=0,
    max=10,
    description="Expand Neighborhood Size: ",
    layout=widgets.Layout(width="400px"),
    style={"description_width": "initial"},
)

### Display the Widgets to choose the parameters

In [111]:
# Render the keyword field and the year slider, in that order.
# NOTE(review): widget_neighbors is defined above but is neither displayed here
# nor read by the search below — confirm whether that is intentional.
display(widget_terms, widget_year_range)

Text(value='duurzaam', description='Filter Terms (comma separated)', layout=Layout(width='600px'), style=TextS…

IntRangeSlider(value=(1950, 2000), description='Year Range: ', layout=Layout(width='400px'), max=2020, min=180…

### Execute the Search

There is no need to edit the code below: all parameters are read from the widget values.

In [112]:
# Unpack values from the widgets
year_from, year_to = widget_year_range.value
# Split the comma-separated input; empty entries (blank input, trailing or
# doubled commas) are dropped so that "" is never sent as a filter term.
FILTER_TERMS = [
    term.strip() for term in widget_terms.value.split(",") if term.strip()
]
# Execute Database Query: one query per selected collection; the resulting
# corpora are added together, starting from an empty Corpus.
print(f"Searching terms {FILTER_TERMS} between year {year_from} and {year_to}")
corpus = sum(
    (
        db.get_corpus(
            collection,
            filter_words=FILTER_TERMS,
            year_from=year_from,
            year_to=year_to,
            include_embeddings=True,
            limit=10000,
        )
        for collection in collection_selector.value
    ),
    start=Corpus([]),
)
print(f"Found {len(corpus)} items that match!")

Searching terms ['duurzaam'] between year 1950 and 2000
Found 9485 items that match!


## Display the Datapoints

Here we simply display the retrieved datapoints (after using UMAP to compress the embeddings to two dimensions). The colors are assigned based on the decade derived from the year metadata.

In [113]:
# Compute the compressed embeddings, then report the shape of the original
# embeddings and of their 2D counterpart, one per line.
corpus.compress_embeddings()
print(corpus.embeddings.shape, corpus.embeddings_2d.shape, sep="\n")

(9485, 768)
(9485, 2)


In [114]:
import pandas as pd

# Tabular view of the corpus with an integer "year" column and a derived
# "decade" column (the year rounded down to a multiple of ten).
corpus_df = corpus.to_dataframe().assign(
    year=lambda frame: frame["year"].astype(int),
    decade=lambda frame: (frame["year"] // 10) * 10,
)
corpus_df

Unnamed: 0,sentence_index,year,date,day,provenance,month,filename,issue,ID_DB,text,...,type,description,recId,pages,chamber,speakers,leg_period,title,ocr_link,decade
0,8.0,1950,08-04-1950,8,ANP_1950.csv.gz,4,anp_1950_04_08_33_ocr.xml,33,ff2831f0-9ae7-5977-9bc4-28cfa6d26b0a,"Ook zöu ^ ^ cpaconomisch gebied, een duurzaam ...",...,,,,,,,,,,1950
1,0.0,1950,06-20-1950,20,ANP_1950.csv.gz,6,anp_1950_06_20_51_ocr.xml,51,97bbae5d-5401-59e8-a242-37768a8954a1,conferentie ^chuman 2 En minstens even belangr...,...,,,,,,,,,,1950
2,3.0,1951,06-17-1951,17,ANP_1951.csv.gz,6,anp_1951_06_17_21_ocr.xml,21,12c46dad-cc84-5888-8c27-bf482089e14e,In het begin van zijn memorie had de minister ...,...,,,,,,,,,,1950
3,5.0,1951,06-13-1951,13,ANP_1951.csv.gz,6,anp_1951_06_13_13_ocr.xml,13,04a70dbb-f8b7-57dc-91ae-e9e91c448084,"De moderne bouwwijze, waarbij de huizen van te...",...,,,,,,,,,,1950
4,2.0,1951,07-18-1951,18,ANP_1951.csv.gz,7,anp_1951_07_18_18_ocr.xml,18,c95a8bb2-2a32-5291-8eba-25e97043787c,15. / a- : t/ terwijl de andere partij eerst d...,...,,,,,,,,,,1950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9480,532.0,2000,2000-12-12,,StatenGeneraal_2000.csv.gz,,,,e02158af-7a20-5488-adfa-1110f529705d,De herstructurering van de land- en tuinbouwse...,...,,,h-ek-20002001-540-557,,EersteKamer,no speakers,,handelingen,,2000
9481,371.0,2000,2000-12-18,,StatenGeneraal_2000.csv.gz,,,,2863abad-8bd1-5926-b879-9090a1282349,"In een groen elektriciteitsscenario, waarin wi...",...,,,h-ek-20002001-613-625,,EersteKamer,De heer Hofstede CDA|Minister Jorritsma-Lebbin...,,handelingen,,2000
9482,661.0,2000,2000-12-18,,StatenGeneraal_2000.csv.gz,,,,1db054cc-fed2-5293-8257-c270850a86b4,Het past in de gedachte dat als twee mensen vo...,...,,,h-ek-20002001-597-613,,EersteKamer,Mevrouw Timmerman-Buck CDA|Mevrouw Le Poole Pv...,,handelingen,,2000
9483,17.0,2000,2000-12-19,,StatenGeneraal_2000.csv.gz,,,,44ff8aa2-968b-50f6-8628-63973de23a35,Deze leden onderkennen dat het kennelijke bela...,...,,,h-ek-20002001-687-688,,EersteKamer,Mevrouw Timmerman-Buck CDA|De heer Rensema VVD,,handelingen,,2000


In [115]:
# Distinct decades present in the corpus, sorted in place in ascending order.
decades = corpus_df["decade"].dropna().unique()
decades.sort()
# Point selection on the "decade" field, bound to the chart legend; the
# earliest decade is passed as the initial value.
colorSelector = alt.selection_point(
    name="Select", fields=["decade"], value=decades[0], bind="legend"
)

# Scatter plot of the "x" and "y" columns of corpus_df, coloured by decade.
# Being the last expression of the cell, the chart is rendered as its output.
alt.Chart(corpus_df).mark_circle().add_params(colorSelector).encode(
    x="x",
    y="y",
    tooltip=["text", "date", "provenance"],
    color={
        "field": "decade",
        "scale": {"scheme": "category20b"},
        "legend": alt.Legend(
            title="Year", labelLimit=0, columns=1, labelFontSize=16, titleFontSize=18
        ),
    },
    # Points matching the selection are drawn at 0.95 opacity, all others at 0.01.
    opacity=alt.condition(colorSelector, alt.value(0.95), alt.value(0.01)),
).properties(width=1200, height=800).interactive()

## Cluster the Corpus

### CASE 1: Clustering Independently Per DECADE

In [116]:
import os
from collections import Counter

from bokeh.io import output_notebook
from bokeh.plotting import show

# from tempo_embeddings.settings import DATA_DIR
from tempo_embeddings.text.passage import Highlighting, Passage
from tempo_embeddings.visualization.bokeh import BokehInteractiveVisualizer

# Set up Bokeh so that plots shown below are rendered inside the notebook.
output_notebook()

# Minimum cluster size passed to the per-decade clustering further down.
MIN_CLUSTER_SIZE = 50


def get_stopwords(stopwords_file=None):
    """Load the Dutch stop-word list and extend it with corpus-specific terms.

    Args:
        stopwords_file: Path (``str`` or ``Path``) to a text file with one
            stop word per line. Defaults to the list shipped with the project,
            resolved relative to the current working directory.

    Returns:
        A ``set`` with the words from the file plus a fixed set of extra words
        that are added on top.

    Raises:
        OSError: if the file cannot be read.
    """
    if stopwords_file is None:
        stopwords_file = Path("../../tempo_embeddings/data/stopwords-filter-nl.txt")

    # Read with an explicit encoding instead of the platform default, so the
    # result does not depend on the machine's locale.
    # NOTE(review): assumes the file is UTF-8 encoded — confirm.
    lines = Path(stopwords_file).read_text(encoding="utf-8").splitlines()
    # Surrounding whitespace is stripped and blank lines are skipped so that ""
    # never ends up in the stop-word set.
    stopwords = {line.strip() for line in lines if line.strip()}
    stopwords.update(
        {
            "wij",
            "we",
            "moeten",
            "heer",
            "mevrouw",
            "minister",
            "voorzitter",
            "gaat",
            "wel",
            "den",
        }
    )
    return stopwords


def get_passages_from_df(df, skip_columns=()):
    """Build one ``Passage`` per row of ``df``.

    Args:
        df: DataFrame with at least a ``"text"`` column; every other column is
            copied into the passage metadata unless listed in ``skip_columns``.
        skip_columns: Iterable of column names to leave out of the metadata.
            Defaults to an immutable empty tuple (a mutable ``[]`` default
            would be shared between calls).

    Returns:
        A list of ``Passage`` objects in row order.
    """
    skipped = frozenset(skip_columns)
    return [
        Passage(
            row["text"],
            metadata={key: value for key, value in row.items() if key not in skipped},
            # NOTE(review): the highlighting starts at index 1 rather than 0 —
            # confirm this offset is intended.
            highlighting=Highlighting(1, len(row["text"])),
        )
        for row in df.to_dict("records")
    ]


def cluster_selected_corpus(year_start, year_end, min_cluster_size):
    """Cluster the passages of ``corpus_df`` that fall within a range of years.

    Reads the notebook-level ``corpus_df``, ``stopwords`` and ``FILTER_TERMS``.

    Args:
        year_start: First year to include (inclusive).
        year_end: Last year to include (inclusive).
        min_cluster_size: Minimum cluster size passed on to ``Corpus.cluster``.

    Returns:
        The clusters produced by ``Corpus.cluster``, each with a topic label set.

    Raises:
        ValueError: if more than 100 clusters are found.
    """
    in_range = (corpus_df["year"] >= year_start) & (corpus_df["year"] <= year_end)
    corpus_df_filtered = corpus_df[in_range]
    corpus_to_cluster = Corpus(
        get_passages_from_df(corpus_df_filtered, skip_columns=["text"])
    )
    # Clustering runs on the (x, y) coordinates stored in the dataframe, which
    # are assigned as the embeddings of the new corpus.
    corpus_to_cluster.embeddings = list(
        zip(corpus_df_filtered["x"], corpus_df_filtered["y"])
    )
    print(
        f"Clustering only {len(corpus_to_cluster)} datapoints between {year_start} and {year_end}..."
    )
    clusters = corpus_to_cluster.cluster(
        min_cluster_size=min_cluster_size, cluster_selection_epsilon=0.1
    )
    print(
        f"Found {len(clusters)} clusters in the corpus. (min cluster size is {min_cluster_size})"
    )
    if len(clusters) > 100:
        raise ValueError(
            "Seems like you have too many clusters! Try with a bigger value for min_cluster_size to avoid memory issues"
        )
    # The exclusion set is the same for every cluster, so it is built once.
    excluded_words = frozenset(stopwords | set(FILTER_TERMS))
    for cluster in clusters:
        cluster.set_topic_label(exclude_words=excluded_words, n=5)
    return clusters


def bokeh_plot_clustered_corpus(clusters):
    """Show the given clusters in an interactive Bokeh visualisation.

    Args:
        clusters: Iterable of clusters, unpacked into
            ``BokehInteractiveVisualizer``.

    Side effects:
        Sets the ``BOKEH_ALLOW_WS_ORIGIN`` environment variable to ``"*"`` and
        displays the plot via ``bokeh.plotting.show``.
    """
    # Metadata fields handed to the visualiser.
    meta_fields = ["year", "date", "issue", "provenance"]

    visualizer = BokehInteractiveVisualizer(
        *clusters, metadata_fields=meta_fields, width=1000, height=500
    )

    # NOTE(review): presumably lets the embedded Bokeh app accept websocket
    # connections from any origin — confirm this is acceptable where it runs.
    os.environ["BOKEH_ALLOW_WS_ORIGIN"] = "*"

    show(visualizer.create_document)


def altair_plot_clustered_corpus(clusters):
    """Build an interactive Altair scatter plot of the given clusters.

    Args:
        clusters: Non-empty sequence of clusters. Each must provide
            ``to_dataframe()`` (with a ``"datapoint"`` column holding the point
            coordinates) and a ``label`` attribute.

    Returns:
        An Altair chart with one point per passage, coloured by cluster label
        and filterable through the legend.

    Raises:
        ValueError: if ``clusters`` is empty.
    """
    if not clusters:
        raise ValueError("At least one cluster is required to build the plot")

    # Prepare Data to Plot: tag each cluster's frame with its label and
    # concatenate them once, rather than re-concatenating inside the loop.
    frames = []
    for cluster in clusters:
        frame = cluster.to_dataframe()
        frame["cluster_label"] = cluster.label
        frames.append(frame)
    cl_labels = [cluster.label for cluster in clusters]
    plot_data = pd.concat(frames, ignore_index=True)

    # Split the "datapoint" pairs into x/y columns. This is done for any number
    # of clusters, so a single-cluster input is handled like the general case.
    plot_data[["x", "y"]] = pd.DataFrame(
        plot_data["datapoint"].tolist(), index=plot_data.index
    )
    plot_data = plot_data.drop(columns=["datapoint"])

    # Altair Chart: the legend-bound selection starts on the first cluster label.
    colorSelector = alt.selection_point(
        name="Select", fields=["cluster_label"], value=cl_labels[0], bind="legend"
    )
    return (
        alt.Chart(plot_data)
        .mark_circle()
        .add_params(colorSelector)
        .encode(
            x="x",
            y="y",
            tooltip=["text", "date", "provenance"],
            color={
                "field": "cluster_label",
                # "scale": {"scheme": "category20b"},
                "legend": alt.Legend(
                    title="Cluster Label",
                    labelLimit=0,
                    columns=1,
                    labelFontSize=16,
                    titleFontSize=18,
                ),
            },
            # Selected points are nearly opaque, all others almost invisible.
            opacity=alt.condition(colorSelector, alt.value(0.95), alt.value(0.01)),
        )
        .properties(width=1200, height=800)
        .interactive()
    )


# Make sure the "clusters" output directory exists. `exist_ok=True` makes the
# call idempotent and avoids the check-then-create race of testing for the
# directory first.
os.makedirs("clusters", exist_ok=True)
stopwords = get_stopwords()

# (first_year, last_year) pairs, both inclusive, clustered independently.
# NOTE(review): the year 2000 is inside the default search range but is not
# covered by any bucket — confirm this is intended.
TIME_BUCKETS = [(1950, 1959), (1960, 1969), (1970, 1979), (1980, 1989), (1990, 1999)]

# Maps each (first_year, last_year) bucket to the clusters found for it.
all_clusters_dict = {}
for decade in TIME_BUCKETS:
    clusters = cluster_selected_corpus(*decade, MIN_CLUSTER_SIZE)
    bokeh_plot_clustered_corpus(clusters)
    # display(altair_plot_clustered_corpus(clusters))
    all_clusters_dict[decade] = clusters

Clustering only 447 datapoints between 1950 and 1959...
Found 4 clusters in the corpus. (min cluster size is 50)




Clustering only 588 datapoints between 1960 and 1969...
Found 4 clusters in the corpus. (min cluster size is 50)




Clustering only 881 datapoints between 1970 and 1979...
Found 4 clusters in the corpus. (min cluster size is 50)




Clustering only 2466 datapoints between 1980 and 1989...
Found 15 clusters in the corpus. (min cluster size is 50)




Clustering only 4813 datapoints between 1990 and 1999...
Found 27 clusters in the corpus. (min cluster size is 50)






In [121]:
def get_word_count(corpus, exclude_words=None):
    """Count the words of all passages in ``corpus``, skipping excluded words.

    Args:
        corpus: Object with a ``passages`` iterable; each passage must provide
            a ``words()`` method returning its words.
        exclude_words: Container of lower-cased words to ignore. Defaults to the
            notebook-level ``stopwords`` set.

    Returns:
        A ``Counter`` mapping each remaining word (in its original casing) to
        its number of occurrences.
    """
    if exclude_words is None:
        exclude_words = stopwords
    word_count = Counter()
    for passage in corpus.passages:
        # The exclusion check is case-insensitive, but words are counted as written.
        word_count.update(
            word for word in passage.words() if word.lower() not in exclude_words
        )
    return word_count


def analyze_clusters(clusters):
    """Print a summary of every cluster and collect their top words.

    Clusters are processed from largest to smallest. For each one the topic
    label is (re)computed, and its size, top words and ten most common words
    are printed. Reads the notebook-level ``stopwords`` and ``FILTER_TERMS``.

    Args:
        clusters: Iterable of clusters to analyse.

    Returns:
        The set of all top words found across the given clusters.
    """
    # The same words are excluded from labels and top words of every cluster.
    excluded_words = frozenset(stopwords | set(FILTER_TERMS))
    global_label_terms = set()
    for cluster in sorted(clusters, key=lambda c: len(c.passages), reverse=True):
        cluster.set_topic_label(exclude_words=excluded_words, n=5)
        top_words = cluster.top_words(exclude_words=excluded_words, n=10)
        global_label_terms.update(top_words)
        print(
            f"\n----- Cluster {cluster.label} || Size = {len(cluster.passages)} -----\n\tTop Words: {', '.join(sorted(top_words))}"
        )
        word_counts = get_word_count(cluster)
        print(word_counts.most_common(10))
    return global_label_terms


# Union of the top words of every cluster in every decade.
all_important_terms = set()
for decade, clusters in all_clusters_dict.items():
    print(f"\n\n\n################### DECADE {decade} ###################\n")
    print(f"Total Clusters = {len(clusters)}")
    decade_label_terms = analyze_clusters(clusters)
    print(f"Decade Most Important Words: {decade_label_terms}")
    # Merge this decade's terms into the overall set.
    all_important_terms.update(decade_label_terms)




################### DECADE (1950, 1959) ###################

Total Clusters = 4

----- Cluster Outliers || Size = 204 -----
	Top Words: Outliers
[('duurzaam', 207), ('jaar', 47), ('bezit', 39), ('lid', 31), ('artikel', 29), ('eigen', 28), ('kinderen', 28), ('gescheiden', 26), ('eerste', 23), ('Nederland', 22)]

----- Cluster bezit; brede; persoonlijk; regering; vorming || Size = 117 -----
	Top Words: beleid, bezit, brede, geachte, nota, persoonlijk, regering, verbreiding, vorming, willen
[('duurzaam', 116), ('bezit', 47), ('Regering', 37), ('geachte', 22), ('artikel', 21), ('afgevaardigde', 20), ('enige', 18), ('belang', 16), ('brede', 15), ('vorming', 15)]

----- Cluster bezit; karakter; vorming; wet; wetsontwerp || Size = 76 -----
	Top Words: aanneming, bepaalde, bezit, blijven, gaan, karakter, vorming, wet, wetsontwerp, zouden
[('duurzaam', 79), ('bezit', 33), ('II', 15), ('vorming', 14), ('wetsontwerp', 13), ('hoofdstuk', 13), ('bevordering', 11), ('bepaalde', 10), ('wet', 10), 

### CASE 2: Cluster Everything

In [119]:
# Cluster the whole year range currently selected in the year slider in one go,
# with a larger minimum cluster size (100) than the per-decade runs.
# Note that this rebinds the notebook-level name `clusters`.
clusters = cluster_selected_corpus(*widget_year_range.value, min_cluster_size=100)
# altair_plot_clustered_corpus(clusters)
bokeh_plot_clustered_corpus(clusters)

Clustering only 9485 datapoints between 1950 and 2000...
Found 20 clusters in the corpus. (min cluster size is 100)




## Co-Occurrence Network


In [163]:
import itertools
import re

from sklearn.feature_extraction.text import CountVectorizer


def tokenize(doc):
    """Lazily yield the lower-cased runs of word characters found in ``doc``."""
    word_runs = re.findall(r"\w+", doc)
    return map(str.lower, word_runs)


def get_documents_features(
    docs: list[str],
    target_terms: list[str],
    stopwords: list[str],
    min_word_count: int = 10,
):
    """Extract frequent n-grams and per-document frequent terms from ``docs``.

    To keep things simple, two separate things are computed: the frequencies of
    n-grams that contain any target term, and the list of frequent terms per
    document (the input for the co-occurrence analysis).

    Args:
        docs: Iterable of document texts. It is iterated more than once.
        target_terms: Terms of interest; a multi-word n-gram is kept in
            ``freq_ngrams`` when any of these occurs in it as a substring.
        stopwords: Stop words passed to ``CountVectorizer``.
        min_word_count: Other vocabulary entries are kept in ``freq_words``
            only when they occur more than this many times.

    Returns:
        A dict with three entries:
        ``"relevant_doc_terms"`` — for each document with at least one frequent
        token, its tokens that are in ``freq_words``;
        ``"freq_ngrams"`` — n-gram -> count, in descending count order;
        ``"freq_words"`` — vocabulary entry -> count, in descending count order.
    """
    # Local import: in this notebook numpy is only imported in a later cell.
    import numpy as np

    vectorizer = CountVectorizer(
        tokenizer=tokenize, stop_words=stopwords, ngram_range=(1, 5)
    )
    X = vectorizer.fit_transform(docs)
    # Sum the columns on the sparse matrix directly; converting it to a dense
    # array first would allocate documents x n-grams cells.
    count_values = np.asarray(X.sum(axis=0)).ravel()

    # (count, n-gram) pairs sorted from most to least frequent.
    ranked = sorted(
        ((count_values[i], ngram) for ngram, i in vectorizer.vocabulary_.items()),
        reverse=True,
    )
    freq_ngrams, freq_words = {}, {}
    for ng_count, ng_text in ranked:
        if " " in ng_text and any(term in ng_text for term in target_terms):
            freq_ngrams[ng_text] = ng_count
        elif ng_count > min_word_count:
            freq_words[ng_text] = ng_count

    # Keep, per document, only the tokens that are frequent overall; documents
    # without any such token are dropped.
    relevant_doc_terms = []
    for text in docs:
        tokens = [tok for tok in tokenize(text) if tok in freq_words]
        if tokens:
            relevant_doc_terms.append(tokens)

    return {
        "relevant_doc_terms": relevant_doc_terms,
        "freq_ngrams": freq_ngrams,
        "freq_words": freq_words,
    }


# N-gram matching runs on lower-cased tokens, so the terms the user searched
# for are lower-cased here. Using FILTER_TERMS instead of a hard-coded keyword
# keeps this analysis in sync with the search performed above.
target_terms = [term.lower() for term in FILTER_TERMS]

decade_docs = []
decade_features = {}
for y_start, y_end in TIME_BUCKETS:
    # Rows are selected through the "decade" column, which holds the first year
    # of the decade, i.e. the start of each bucket.
    texts_dec = corpus_df[corpus_df["decade"] == y_start]["text"]
    decade_features[(y_start, y_end)] = get_documents_features(
        texts_dec, target_terms=target_terms, stopwords=list(stopwords)
    )
    # All texts of the decade joined into one document; its length in
    # characters is printed.
    doc_decade = " ".join(texts_dec)
    print(len(doc_decade))
    decade_docs.append(doc_decade)


# print(len(decade_docs))
# vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=list(stopwords), min_df=2, ngram_range=(1,5))
# vectorizer.fit_transform(decade_docs)

# min_freq_keywords = 10
# max_amount_keywords = 50



192389
199737
244029
885911
1848899


In [168]:
# Print the ten most frequent target n-grams of every decade as
# (n-gram, count) tuples.
for decade, feats_dict in decade_features.items():
    print(f"\n\n\n################### DECADE {decade} ###################\n")
    for ngram in list(feats_dict["freq_ngrams"].items())[:10]:
        print(ngram)




################### DECADE (1950, 1959) ###################

('duurzaam bezit', 89)
('vorming duurzaam', 31)
('duurzaam persoonlijk bezit', 22)
('duurzaam persoonlijk', 22)
('duurzaam gescheiden', 21)
('vorming duurzaam bezit', 19)
('duurzaam karakter', 16)
('duurzaam persoonlijk bezit brede', 13)
('vormen duurzaam bezit', 12)
('vormen duurzaam', 12)



################### DECADE (1960, 1969) ###################

('duurzaam bezit', 169)
('duurzaam gescheiden', 78)
('vorming duurzaam', 70)
('duurzaam persoonlijk', 58)
('duurzaam persoonlijk bezit', 55)
('vorming duurzaam bezit', 38)
('vorming duurzaam persoonlijk', 26)
('vorming duurzaam persoonlijk bezit', 24)
('duurzaam persoonlijk bezit brede', 22)
('bevordering duurzaam', 22)



################### DECADE (1970, 1979) ###################

('duurzaam gescheiden', 134)
('duurzaam ontwricht', 67)
('huwelijk duurzaam', 53)
('duurzaam gescheiden leven', 43)
('huwelijk duurzaam ontwricht', 42)
('duurzaam bezit', 41)
('duurzaam gebruik'

In [170]:
import networkx as nx
import numpy as np
from pyvis.network import Network


def create_ccr_network(
    label: str,
    data: list[list[str]],
    relevant_terms,
    word_frequencies,
    max_pairs: int = 200,
    min_count: int = 10,
):
    """Build a term co-occurrence network and write it to ``nx_<label>.html``.

    Args:
        label: Suffix used in the name of the generated HTML file.
        data: One list of terms per document; every pair of terms within the
            same document counts as one co-occurrence.
        relevant_terms: Container of terms of interest; an edge is kept only
            if at least one of its two terms is in it.
        word_frequencies: Mapping term -> frequency, used to size the nodes
            (``log1p(freq) * 10``; terms missing from the mapping get size 1).
        max_pairs: Only this many of the most frequent pairs are considered.
        min_count: A pair must co-occur more than this many times to be drawn.
    """
    freq_log_words = {
        word: np.log1p(freq) * 10 for word, freq in word_frequencies.items()
    }
    # Count co-occurrences. The terms are sorted so that (a, b) and (b, a) are
    # counted as the same pair.
    # NOTE(review): a term repeated within a document produces (a, a) pairs,
    # i.e. self-loops — confirm whether terms should be de-duplicated first.
    pair_counts = Counter()
    for items in data:
        pair_counts.update(itertools.combinations(sorted(items), 2))
    co_occurrence_counts = pair_counts.most_common(max_pairs)
    # Create Network
    G = nx.Graph()
    # Add edges with weights
    for (item1, item2), count in co_occurrence_counts:
        # Both conditions are required. Without the explicit grouping, `and`
        # binds tighter than `or`, so the count threshold would only apply
        # when the second term was the relevant one.
        is_relevant = item1 in relevant_terms or item2 in relevant_terms
        if is_relevant and count > min_count:
            G.add_node(item1, size=freq_log_words.get(item1, 1))
            G.add_node(item2, size=freq_log_words.get(item2, 1))
            G.add_edge(item1, item2, weight=count / 10)
    # Output Network in HTML
    nt = Network("1200px", "1600px", notebook=True, select_menu=True)
    nt.from_nx(G)
    nt.show_buttons(filter_=["physics"])
    nt.show(f"nx_{label}.html")


# Build one co-occurrence network per decade. Each is written to
# nx_<first_year>_<last_year>.html, keeping only edges that involve at least
# one of the important terms collected during the cluster analysis.
for decade, feats_dict in decade_features.items():
    create_ccr_network(
        label=f"{decade[0]}_{decade[1]}",
        data=feats_dict["relevant_doc_terms"],
        relevant_terms=all_important_terms,
        word_frequencies=feats_dict["freq_words"],
    )

nx_1950_1959.html
nx_1960_1969.html
nx_1970_1979.html
nx_1980_1989.html
nx_1990_1999.html
