In [1]:
## For now we are using the convenient word2vec models that are available here: https://zenodo.org/records/3237380
## There were trained on dutch news corpora (6 newspaper sources) from 1950-1989 and are separated by decade: 24 models
## We use all the models to extract the Top-K similar terms to a given list of seed terms.

# %pip install gensim==4.3.2
# %pip install seaborn==0.13.1
# %pip install qgridnext

In [9]:
from pathlib import Path  # noqa: I001

from gensim.models import KeyedVectors  # type: ignore

ROOT_DIR = Path.home() / "SEED_DATA/SemanticsSustainability/historic_w2v_models"

model = KeyedVectors.load_word2vec_format(
    f"{ROOT_DIR}/ah_nrc_1950_1959.w2v", binary=True
)
model

<gensim.models.keyedvectors.KeyedVectors at 0x14f9bdfa0>

In [3]:
import re
from glob import glob

import pandas as pd  # type: ignore


def get_term_from_w2v(term, model, topn):
    try:
        similar = [x for (x, y) in model.most_similar(term, topn=topn)]
    except KeyError:
        similar = []
    return similar


def get_top_terms(source, start_y, seed_terms, model, topn):
    related_terms = []
    for term in seed_terms:
        similar = get_term_from_w2v(term, model, topn)
        related_terms.append(
            {
                "seed_term": term,
                "related_terms": similar,
                "source": source,
                "decade": start_y,
            }
        )
    return related_terms


def extract_related_terms(output_name):
    all_related_terms = []
    seed_terms = open("sustainability-filter-words-all.txt").read().split("\n")
    for filepath in glob(f"{ROOT_DIR}/*.w2v"):
        if re.match(r".*\d{4}.w2v", filepath):
            named_info = re.search(
                r".*/(?P<source>\w+)_(?P<start>\d{4})_(?P<end>\d{4}).w2v", filepath
            )
            model = KeyedVectors.load_word2vec_format(filepath, binary=True)
            all_related_terms += get_top_terms(
                named_info.group("source"),
                int(named_info.group("start")),
                seed_terms,
                model,
                topn=10,
            )
    df = pd.DataFrame(all_related_terms).explode("related_terms")
    df.reset_index(drop=True, inplace=True)
    df.to_csv(f"{output_name}.csv")
    pd.DataFrame(df["related_terms"]).dropna().to_csv(
        f"{output_name}.txt", index=False, header=False
    )
    return df


expanded_filename = "sustainability-filter-w2v-expanded"
if Path(f"{expanded_filename}.csv").exists():
    extracted_terms = pd.read_csv(f"{expanded_filename}.csv", index_col=0)
else:
    extracted_terms = extract_related_terms(expanded_filename)

In [4]:
extracted_terms

Unnamed: 0,seed_term,related_terms,source,decade
0,duurzaam,belastingstelsel,parool,1960
1,duurzaam,rechtvaardig,parool,1960
2,duurzaam,zedelijk,parool,1960
3,duurzaam,zwakken,parool,1960
4,duurzaam,relativeert,parool,1960
...,...,...,...,...
8611,broeikas-effect,,telegraaf,1960
8612,broeikasgassen,,telegraaf,1960
8613,Energiebesparingsmogelijkheden,,telegraaf,1960
8614,CO2-uitstoot,,telegraaf,1960


In [5]:
# Count the occurrences of each hobby for each name
related_term_counts = (
    extracted_terms.groupby("decade")["related_terms"]
    .value_counts()
    .unstack(fill_value=0)
)
related_term_counts = related_term_counts.transpose()
related_term_counts["Total"] = related_term_counts.sum(axis=1)
related_term_counts = related_term_counts.sort_values(by="Total", ascending=False)

In [7]:
from qgridnext import show_grid  # noqa: I001

show_grid(related_term_counts)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [8]:
terms_per_decade = related_term_counts.sum(axis=0)
show_grid(terms_per_decade)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…