In [None]:
%pip install numpy scikit-learn pandas
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_04_01_evaluation-0.1-py3-none-any.whl
import nats25_04_01_evaluation

# Evaluation

In this (shorter) assignment, we want to compare the quality of different clustering approaches.

In [None]:
import gzip
import json
# `import urllib` alone does not guarantee that the `request` submodule is loaded;
# import it explicitly so this cell also works on a fresh kernel.
import urllib.request

import numpy as np

# Load the input data: download the gzip-compressed JSON article dump to a
# temporary file and decode it as UTF-8 text.
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
# Use a context manager so the file handle is closed once the JSON is parsed.
with gzip.open(file_path, "rt", encoding="utf-8") as fin:
    raw = json.load(fin)
# Parallel lists: entry i of each list belongs to the same article.
titles = [x["title"] for x in raw]
texts = [x["text"] for x in raw]
classes = [x["heuristic"] for x in raw]

This is a minimal example implementation of spherical k-means, which we will use in the following.

In [None]:
# Turn the article texts into TF-IDF vectors as input for k-means (minimalistic)
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(
    min_df=5,
    smooth_idf=False,
    stop_words="english",
    sublinear_tf=True,
)
vect.fit(texts)
# Shift the fitted idf weights down by one before transforming, so that the
# shifted weights are the ones applied to the documents.
vect.idf_ = vect.idf_ - 1
tfidf = vect.transform(texts)
idf = vect.idf_
vocabulary = vect.get_feature_names_out()

In [None]:
## Insert your spherical-k-means implementation from the previous assignment here!

def initial_centers(tfidf, k, seed):
    """Choose the initial cluster centers for spherical k-means (to be implemented).

    Placeholder for the solution from the previous assignment; as written it
    does nothing and returns None.

    tfidf -- presumably the TF-IDF document matrix built above (TODO confirm)
    k     -- presumably the number of centers to choose (TODO confirm)
    seed  -- presumably the random seed that makes the choice reproducible
             (TODO confirm)
    """
    pass # Your solution here

def sphericalkmeans(tfidf, centers, max_iter=100):
    """Run spherical k-means (to be implemented; insert the previous assignment's solution).

    tfidf    -- presumably the TF-IDF document matrix built above (TODO confirm)
    centers  -- initial cluster centers, presumably from `initial_centers`
                (TODO confirm)
    max_iter -- presumably the maximum number of iterations (default 100)

    Returns the 3-tuple `(centers, assignment, iter)`.
    """
    pass # Your solution here
    # NOTE(review): the solution must define `assignment` and a local `iter`
    # before this line. In the unmodified stub `assignment` is not defined in
    # this function (NameError unless a global of that name exists), and `iter`
    # would refer to the Python builtin.
    return centers, assignment, iter

## Implement a function to compute a cross-tabulation matrix

Compute the cross-tabulation matrix that compares every class to every cluster. Append an additional row and column for the cluster sizes / class totals and the dataset size. Make sure to accept clusters that are, e.g., labeled using text labels and *not* just as integers 0..k.

Write your own code, do not use `pandas.crosstab`.

You do not need to vectorize this, but try to use numpy operations where easily possible - in particular if you end up waiting a lot for results below!

In [None]:
def cross_tabulation(clu, cla):
    """Compute the cross-tabulation matrix to compare assignments `clu` and `cla`.

    To be implemented (without `pandas.crosstab`). Per the assignment text, the
    matrix compares every class to every cluster and carries one additional row
    and one additional column holding the cluster sizes / class totals and the
    dataset size. Labels may be arbitrary values such as text, not only the
    integers 0..k.

    clu -- cluster label of each object
    cla -- class label of each object (presumably the same length as `clu`)

    NOTE(review): the row/column orientation and the return type are not
    specified in this notebook -- confirm against the course material.
    """
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `sphericalkmeans` and `cross_tabulation`, so presumably both must
# be implemented before running this cell.
nats25_04_01_evaluation.hidden_tests_7_0(sphericalkmeans, classes, cross_tabulation, tfidf)

## Implement a function to compute the pair counts from the cross-tabulation matrix

In [None]:
def pair_count(crosstab):
    """Compute the pair count matrix from the cross-tabulation matrix.

    To be implemented.

    crosstab -- presumably the matrix returned by `cross_tabulation`, including
                its extra totals row and column (TODO confirm)

    NOTE(review): the layout of the returned pair count matrix is not specified
    in this notebook -- confirm against the course material.
    """
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `sphericalkmeans`, `cross_tabulation` and `pair_count`.
nats25_04_01_evaluation.hidden_tests_10_0(sphericalkmeans, cross_tabulation, pair_count, tfidf)

## Compute the Rand Index

First compute the Rand Index of two assignments. You must use the functions defined above.

In [None]:
def rand_index(clu, cla):
    """Compute the Rand Index of the two assignments `clu` and `cla` (to be implemented).

    Per the assignment text, the implementation must build on the functions
    defined earlier in this notebook (`cross_tabulation`, `pair_count`).

    clu -- cluster label of each object
    cla -- class label of each object
    """
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `rand_index` together with the helper functions it builds on.
nats25_04_01_evaluation.hidden_tests_13_0(tfidf, sphericalkmeans, pair_count, rand_index, classes, cross_tabulation)

## Compute the Adjusted Rand Index

Write a function to compute the adjusted Rand index of two assignments. You must use above `pair_count` and `cross_tabulation` functions.

Beware of integer overflows when using the equation from the slides. To resolve the integer overflow, transform the equation such that it has the standard form $ARI = \frac{RI-E[RI]}{M-E[RI]}$, where $RI$ is the Rand index, $E[RI]$ is the expected value of the Rand index (you need to derive this from the ARI equation given on the slides; do *not* attempt to figure out this equation directly; this assignment only needs standard high school math), and $M$ is the maximum possible value of the Rand index (a constant).

In [None]:
def adjusted_rand_index(clu, cla):
    """Compute the adjusted Rand index of the two assignments `clu` and `cla` (to be implemented).

    Per the assignment text, the implementation must use `pair_count` and
    `cross_tabulation`, and should evaluate the form
    ARI = (RI - E[RI]) / (M - E[RI]) to avoid integer overflow, where RI is the
    Rand index, E[RI] its expected value, and M the maximum possible value of
    the Rand index.

    clu -- cluster label of each object
    cla -- class label of each object
    """
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `adjusted_rand_index` together with the helper functions it builds on.
nats25_04_01_evaluation.hidden_tests_16_0(tfidf, sphericalkmeans, pair_count, adjusted_rand_index, classes, cross_tabulation)

## Compute the Normalized Mutual Information

Write a function to compute the Normalized Mutual Information (with arithmetic averaging) of two assignments.
You must use above `pair_count` and `cross_tabulation` functions.

In [None]:
def normalized_mutual_information(clu, cla):
    """Compute the Normalized Mutual Information of `clu` and `cla` (to be implemented).

    Per the assignment text, the normalization uses arithmetic averaging, and
    the implementation must use the `pair_count` and `cross_tabulation`
    functions defined above.

    clu -- cluster label of each object
    cla -- class label of each object
    """
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `sphericalkmeans` and `normalized_mutual_information`.
nats25_04_01_evaluation.hidden_tests_19_0(sphericalkmeans, classes, normalized_mutual_information, tfidf)

## Finding the best clustering

For $k=1..15$ and a fixed random seed of 0, find the best spherical k-means clustering by NMI compared to the classes stored in `classes` above (note that this will not generally be possible, as our data usually will not be labeled).

In [None]:
# Result variables of the search.
bestk = None # Store best k here
bestnmi = None # Store the best NMI here
bestassignment = None # Store the best assignment here
# To be implemented: for each k in 1..15 with random seed 0, run spherical
# k-means, score the clustering against `classes` by NMI, and keep the k, NMI
# and assignment of the best-scoring run in the three variables above.
pass # Your solution here
print("The best k is", bestk, "scoring", bestnmi)
# Hint: it will *not* score very well. The classes are not clusters.

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives the search results `bestk`, `bestnmi` and `bestassignment`.
nats25_04_01_evaluation.hidden_tests_22_0(tfidf, bestassignment, bestnmi, classes, bestk, initial_centers)

Is that value for $k$ reasonable? What does it tell you about the data?

## Explore the result

Explore the clustering result, by comparing it to the original classes.

For each cluster, return the cluster label, the three top classes, and the percentages of the clusters.

In [None]:
def top_classes(clu, cla):
    """For each cluster, give the top three classes and their share of the data each.

    To be implemented as a generator: for every cluster it yields one 7-tuple
    consisting of the cluster label, the three top classes, and the three
    corresponding shares.

    clu -- cluster label of each object
    cla -- class label of each object

    NOTE(review): how the shares are normalized and what to yield for a cluster
    with fewer than three classes are not specified here -- confirm against the
    course material.
    """
    # For each cluster, call yield label, *top3, *shares to return a 7-tuple.
    pass # Your solution here

In [None]:
# Course-provided hidden test (its checks are not visible in this notebook).
# It receives `top_classes` and the best clustering found above.
nats25_04_01_evaluation.hidden_tests_26_0(top_classes, bestk, bestassignment, classes)

In [None]:
# Explore your clusterings!
# For example, iterate over top_classes(bestassignment, classes) and print each
# yielded tuple to see which classes dominate which cluster.
pass # Your solution here