In [None]:
%pip install numpy scikit-learn pandas
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_08_01_bert_embeddings-0.1-py3-none-any.whl
import nats25_08_01_bert_embeddings

# Deep Neural Embeddings

In this assignment, we will work with deep neural embeddings (but we will not train such an embedding ourselves, as that is far too resource-intensive).

In [None]:
import numpy as np, pandas as pd
# `import urllib` alone does not load the `urllib.request` submodule, so
# `urllib.request.urlretrieve` would only work if some other module happened to
# import it first. Import the submodule explicitly.
import json, gzip, urllib.request
# Load dataset: download the gzip-compressed JSON file to a temporary location
# and parse it as UTF-8 text.
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
with gzip.open(file_path, "rt", encoding="utf-8") as file:
    raw = json.load(file)
# Split the records into parallel lists, one entry per article.
titles = [x["title"] for x in raw]
texts = [x["text"] for x in raw]
classes = [x["heuristic"] for x in raw]
mclasses = [x["transitive"] for x in raw]
# Free memory
del raw
# Load BERT mean vectors
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-bge-m3.npy")
vectors = np.load(file_path)

In [None]:
import sklearn.preprocessing
# Scale every row of `vectors` to unit Euclidean length. norm="l2" and axis=1
# are normalize()'s defaults, spelled out here for the reader; copy=False asks
# scikit-learn to avoid allocating a second array where it can.
vectors = sklearn.preprocessing.normalize(
    vectors,
    norm="l2",
    axis=1,
    copy=False,
)

## Cluster deep vectors with k-means

Find the "best" result when clustering with k-means for k=2..10 by ARI.

For reproducibility, use the fixed random seed 0, only 1 restart, and no tolerance.

Log for yourself the time needed to cluster.

Note that on real data, we usually cannot use ARI, because it requires ground-truth labels to compare against.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
# Record of the best clustering found. Tuple layout: (ARI, k, assignment,
# centers). The initial value is a placeholder (ARI 0, k -1, no assignment,
# no centers) that the solution below is expected to overwrite.
best = (0, -1, None, None) # ARI, k, assignment, centers
# Exercise placeholder: per the task text, try k=2..10 with random seed 0,
# 1 restart and no tolerance, and keep the result with the highest ARI in `best`.
pass # Your solution here

In [None]:
# Check the `best` tuple with the hidden tests shipped in the
# nats25_08_01_bert_embeddings package.
nats25_08_01_bert_embeddings.hidden_tests_5_0(best)

# Explore the clustering

Explore the clustering: print each cluster's size and the 5 documents most central to each cluster.

In [None]:
def explain(ari, k, assignment, centers):
    """Print a summary of one clustering result.

    Currently only the ARI is printed; the rest is left as an exercise. Per the
    task description, the finished version should also print each cluster's
    size and the 5 most central documents of each cluster.

    The parameters mirror the layout of the `best` tuple (ARI, k, assignment,
    centers); the function is called as ``explain(*best)``.
    NOTE(review): presumably `assignment` holds one cluster label per document
    and `centers` one center vector per cluster -- confirm against the solution.
    """
    print("ARI:", ari)
    pass # Your solution here

In [None]:
# Unpack the `best` tuple (ARI, k, assignment, centers) into explain()'s parameters.
explain(*best)

In [None]:
# Hand the explain() function itself, the `best` tuple and the article titles
# to the package's hidden tests.
nats25_08_01_bert_embeddings.hidden_tests_9_0(explain, best, titles)

## Improve the cluster explanation with TF-IDF

Interestingly, TF-IDF is still useful here - what are the important words, now that we only work with 768-dimensional mean vectors? These averaged vectors are not very similar to word vectors anymore (all close together, and close to stop words).

First, get back our old tf-idf data

In [None]:
# Prepare Tfidf vectors!
tfidf = None # sparse tf-idf matrix
vocabulary = None # vocabulary
idf = None # IDF values
pass # Your solution here

In [None]:
# Check the TF-IDF setup with the hidden tests; note that `idf` is not passed here.
nats25_08_01_bert_embeddings.hidden_tests_12_0(vectors, vocabulary, tfidf)

Now write an explain2 function that also prints the most important words for each cluster.

Also use the multi-classification information in `mclasses` to explain the cluster contents in terms of labels.

In [None]:
def explain2(ari, k, assignment, centers, tfidf, idf, vocabulary):
    """Print a summary of one clustering result, extended with TF-IDF information.

    Currently only the ARI is printed; the rest is left as an exercise. Per the
    task description, the finished version should also print the most important
    words for each cluster and use the label information in the module-level
    `mclasses` list to describe the cluster contents.

    The first four parameters mirror the layout of the `best` tuple (ARI, k,
    assignment, centers); `tfidf`, `idf` and `vocabulary` are the objects
    prepared in the TF-IDF cell (sparse tf-idf matrix, IDF values, vocabulary).
    NOTE(review): the exact types of `tfidf`, `idf` and `vocabulary` depend on
    the solution of that cell -- confirm before relying on them.
    """
    print("ARI:", ari)
    pass # Your solution here

In [None]:
# Unpack `best` into the first four parameters, then pass the TF-IDF objects.
explain2(*best, tfidf, idf, vocabulary)

In [None]:
# Check explain2() with the hidden tests, which also receive the TF-IDF objects,
# the `mclasses` labels and the `best` tuple.
nats25_08_01_bert_embeddings.hidden_tests_16_0(vocabulary, mclasses, tfidf, best, idf, explain2)