# Contextualized token embeddings for semantic change detection

In [None]:
from collections import defaultdict
import matplotlib.pylab as plot
import numpy as np
from tqdm import tqdm
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine, cdist
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from helpers import collate, ContextsDataset
from gensim.models.word2vec import LineSentence
import torch
import pandas as pd
from torch.utils.data import DataLoader, SequentialSampler
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Loading the test set

In [None]:
# Gold-standard test set: a tab-separated file without a header row, read into
# two columns named `word` and `truth`.
graded = pd.read_table(
    "targets/english/graded_nopos.txt",
    header=None,
    names=["word", "truth"],
)

In [None]:
graded

In [None]:
# Target words, in the same order as the rows of `graded`.
targets = graded["word"]

In [None]:
f"Target lemmas: {len(targets)}."

Again, corpus 1 is 19th-century English and corpus 2 is 20th-century English.

# Embedding part

We assume the token embeddings are already extracted using the language model of our choice (BERT, XLM-R, etc).
If you are curious, look at the `extract.py` script.
Embeddings are stored as Numpy matrices (compressed). They are about 200 MBytes each, so we publish them separately.

In [None]:
# Compressed NumPy archive with the token embeddings extracted from corpus 1.
data_path1 = "embeddings/token_embeddings_corpus1_xlmr.npz"

In [None]:
# Open the corpus 1 archive. The cells below index the loaded object by target
# word (`array1[word]`) to obtain that word's matrix of token embeddings.
array1 = np.load(data_path1)

In [None]:
f"Loaded an array of {len(array1)} entries from {data_path1}"

In [None]:
# Compressed NumPy archive with the token embeddings extracted from corpus 2.
data_path2 = "embeddings/token_embeddings_corpus2_xlmr.npz"

In [None]:
# Open the corpus 2 archive. The cells below index the loaded object by target
# word (`array2[word]`) to obtain that word's matrix of token embeddings.
array2 = np.load(data_path2)

In [None]:
f"Loaded an array of {len(array2)} entries from {data_path2}"

# Visualizing token embeddings

In [None]:
# Word whose token embeddings are visualised in the following cells.
word = "plane"

In [None]:
# All token embeddings stored for `word` in corpus 2.
# The first axis counts usages (it is used as the word frequency further down);
# presumably the second axis is the embedding size — check the printed shape.
array = array2[word]
array.shape

In [None]:
# Fit a two-component PCA on this word's usages and project them onto it,
# giving one 2-D point per usage for plotting.
embedding = PCA(n_components=2)
y = embedding.fit_transform(array)

In [None]:
# Split the projection into its two coordinates for plotting.
xpositions, ypositions = y[:, 0], y[:, 1]

In [None]:
# Scatter plot of the projected usages of `word` in corpus 2.
plot.clf()
plot.scatter(xpositions, ypositions, s=5, marker='*', color='green')
# Hide ticks and tick labels on both axes.
plot.tick_params(
    axis='x', which='both',
    bottom=False, top=False, labelbottom=False,
)
plot.tick_params(
    axis='y', which='both',
    left=False, right=False, labelleft=False,
)
plot.title(f"{word} in {data_path2}")
plot.show()

Every dot is a 768-dimensional token embedding projected into 2 dimensions.
Can you show both time periods side by side?

We can show two time periods on one plot as well:

In [None]:
# Usage matrices of `word` for both time bins, keyed by bin name.
embeddings = dict(bin1=array1[word], bin2=array2[word])

In [None]:
embeddings["bin2"].shape

In [None]:
# Stack the usage matrices of all time bins row-wise into one matrix.
# Bins are taken in sorted key order, the same order in which the class labels
# are generated below, so row i of `x` corresponds to label i.
x = np.concatenate([embeddings[el] for el in sorted(embeddings)], axis=0)

In [None]:
x.shape

We want to show usages from different time bins with different colors, thus, we need class labels

In [None]:
# One label per usage: the bin name repeated once for every row of that bin's
# matrix, with bins visited in sorted key order.
class_labels = []
for bin_name in sorted(embeddings):
    bin_size = len(embeddings[bin_name])
    class_labels.extend([bin_name] * bin_size)

In [None]:
len(class_labels)

We are projecting all embeddings into 2 dimensions with PCA:

In [None]:
# Standardise the stacked embeddings with StandardScaler, then project them
# onto two principal components for plotting.
# Note that `x` is overwritten with its standardised version here.
x = preprocessing.StandardScaler().fit_transform(x)
x_2d = PCA(n_components=2).fit_transform(x)

In [None]:
# Distinct time-bin labels in sorted order, and one Dark2 colour for each.
class_set = sorted(set(class_labels))
colors = plot.cm.Dark2(np.linspace(1, 0, len(class_set)))

In [None]:
# Scatter plot of the usages of `word` from all time bins, one colour per bin.
# A new figure is created here, so there is nothing to clear beforehand.
plot.figure(figsize=(15, 15))
plot.xticks([])
plot.yticks([])
plot.title(f"{word} in all time bins\n", fontsize=20)
# Labels as an array so that each bin's points can be selected with a boolean mask.
labels = np.asarray(class_labels)
for color, year in zip(colors, class_set):
    points = x_2d[labels == year]
    plot.scatter(points[:, 0], points[:, 1], color=color, marker='*', s=40, label=year)
plot.legend(prop={'size': 15}, loc="best")
plot.show()

What will we need in order to inspect the actual usages? How can we annotate the dots with identifiers pointing at the real sentences?

Anyway, now we would like to use the token embeddings to quantitatively estimate the degree of semantic change. And here comes the...

# Aggregating and assessment part

There are many usages and many token embeddings. We need to somehow *aggregate* them for each time period, before *assessing* the change.

The simplest is the PRT method (comparison of averaged *prototypical* embeddings):

## PRT

In [None]:
# Collects one PRT change score per target word; filled by the loop below.
prt_predictions = []

In [None]:
# PRT: compare the averaged ("prototypical") embeddings of the two time bins.
# Targets are visited in the row order of `graded` (not alphabetically), so the
# i-th prediction belongs to the i-th row when the list is attached as a column.
for word in targets:
    prototypes = []
    for usages in (array1[word], array2[word]):
        # Aggregation: average all token embeddings of the word in this bin.
        prototype = np.average(usages, axis=0)
        # L2-normalise so that the dot product below is the cosine similarity.
        unit = preprocessing.normalize(prototype.reshape(1, -1), norm='l2')
        prototypes.append(unit.reshape(-1))
    # Assessment: cosine distance between the two prototypes.
    shift = 1 - np.dot(prototypes[0], prototypes[1])
    prt_predictions.append(shift)

In [None]:
# The list is attached positionally: its i-th entry lands in the i-th row,
# so the order of the predictions must match the row order of `graded`.
graded["prt_predictions"] = prt_predictions

In [None]:
graded

In [None]:
# Spearman rank correlation between the gold scores and the PRT predictions.
spearmanr(graded["truth"], graded["prt_predictions"])

## APD

Average Pairwise Distance (APD) is a more sophisticated aggregation method. It computes pairwise distances between *all* usages from two time bins and averages these distances.

In [None]:
def _usage_matrix(word_usages):
    """
    Extracts the usage matrix from one `mean_pairwise_distance` argument.

    :param word_usages: either a usage matrix, or a three-place tuple whose first
    element is the usage matrix
    :return: the usage matrix
    """
    if not isinstance(word_usages, tuple):
        return word_usages
    matrix, _, _ = word_usages
    return matrix


def mean_pairwise_distance(word_usages1, word_usages2, metric):
    """
    Averages the distances between every usage of the first set and every usage
    of the second set.

    :param word_usages1: a usage matrix (one row per usage), or a three-place tuple
    including, in this order, a usage matrix, a list of snippets, and a list of
    integers indicating the lemma's position in the snippet
    :param word_usages2: same as `word_usages1`, for the second set of usages
    :param metric: a distance metric compatible with `scipy.spatial.distance.cdist`
    (e.g. 'cosine', 'euclidean')
    :return: the mean of all pairwise distances between the two usage matrices
    :raises ValueError: if either usage matrix has no rows
    """
    first, second = (_usage_matrix(usages) for usages in (word_usages1, word_usages2))

    # An empty matrix would make the mean undefined, so fail loudly instead.
    if any(matrix.shape[0] == 0 for matrix in (first, second)):
        raise ValueError('Zero-dimensional usage matrix.')

    distances = cdist(first, second, metric=metric)
    return np.mean(distances)

Computational complexity naturally grows quadratically with the number of usages, so we randomly sample at most 10,000 usages from each time bin.

In [None]:
# Upper bound on the number of usages kept per word and time bin for APD.
max_samples = 10000

In [None]:
# Collects one APD change score per target word; filled by the loop below.
apd_predictions = []

In [None]:
# APD: mean cosine distance between all cross-period pairs of usages.
# Targets are visited in the row order of `graded` (not alphabetically), so the
# i-th prediction belongs to the i-th row when the list is attached as a column.
for word in targets:
    # Look the word up once per archive and reuse the matrices below.
    vectors1 = array1[word]
    vectors2 = array2[word]
    frequency = vectors1.shape[0] + vectors2.shape[0]
    print(f"Processing {word} with the frequency {frequency}...")
    sampled = []
    for period, vectors in (("T0", vectors1), ("T1", vectors2)):
        prev = vectors.shape[0]
        # Cap the bin at `max_samples` usages, drawn without replacement,
        # to keep the quadratic pairwise computation tractable.
        if prev > max_samples:
            rand_indices = np.random.choice(prev, max_samples, replace=False)
            vectors = vectors[rand_indices]
            print(f"Choosing {max_samples} random usages from {prev} for {word} in {period}")
        sampled.append(vectors)
    shift = mean_pairwise_distance(sampled[0], sampled[1], "cosine")
    apd_predictions.append(shift)

In [None]:
# The list is attached positionally: its i-th entry lands in the i-th row,
# so the order of the predictions must match the row order of `graded`.
graded["apd_predictions"] = apd_predictions

In [None]:
graded

In [None]:
# Spearman rank correlation between the gold scores and the APD predictions.
spearmanr(graded["truth"], graded["apd_predictions"])

## PRT/APD

One can also take the geometric mean of PRT and APD:

In [None]:
# Geometric mean of the two scores, computed column-wise for all rows at once.
graded["prt_apd_predictions"] = np.sqrt(
    graded["prt_predictions"] * graded["apd_predictions"]
)

In [None]:
graded

In [None]:
# Spearman rank correlation between the gold scores and the combined PRT/APD predictions.
spearmanr(graded["truth"], graded["prt_apd_predictions"])

You can try other languages!
Annotated datasets are available here: https://www.ims.uni-stuttgart.de/en/research/resources/experiment-data/wugs/