In [6]:
%pip install numpy scikit-learn pandas
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_04_01_evaluation-0.1-py3-none-any.whl
import nats25_04_01_evaluation

Note: you may need to restart the kernel to use updated packages.
Collecting nats25-04-01-evaluation==0.1
  Downloading https://dm.cs.tu-dortmund.de/nats/nats25_04_01_evaluation-0.1-py3-none-any.whl (3.3 kB)
Installing collected packages: nats25-04-01-evaluation
  Attempting uninstall: nats25-04-01-evaluation
    Found existing installation: nats25_04_01_evaluation 0.1
    Uninstalling nats25_04_01_evaluation-0.1:
      Successfully uninstalled nats25_04_01_evaluation-0.1
Successfully installed nats25-04-01-evaluation-0.1
Note: you may need to restart the kernel to use updated packages.


# Evaluation

In this (shorter) assignment, we want to compare the quality of different clustering approaches.

In [None]:
import numpy as np
# Load the input data
import json, gzip, urllib
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
raw = json.load(gzip.open(file_path, "rt", encoding="utf-8"))
titles, texts, classes = [x["title"] for x in raw], [x["text"] for x in raw], [x["heuristic"] for x in raw]
print(classes)
#the texts are assigned the to specific classes of texts 
print(len(texts))
print(classes[2])
#for example here the class[2] is blocks and the text is about a block
#what we do now with clustering that we assign them to specific clusters
# Vector 1 (Cluster Assignments): clusters = [0, 1, 0, 2, 1]
# Vector 2 (True Classes): classes = ['Sports', 'Tech', 'Sports', 'Politics', 'Tech']
# What we get after is a cluster assignment list for the documents  and a class list for the documents, both lists have the same size of the amount of documents

['Franchise', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Gameplay', 'Blocks', 'Gameplay', 'Blocks', 'Gameplay', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Franchise', 'Franchise', 'Blocks', 'Blocks', 'Blocks', 'Mobs', 'Mobs', 'Mobs', 'Items', 'Mobs', 'Items', 'Blocks', 'Blocks', 'Blocks', 'Blocks', 'Mobs', 'Blocks', 'Franchise', 'Blocks', 'Gameplay', 'Mojang', 'Blocks', 'Blocks', 'Mojang', 'Items', 'Blocks', 'Blocks', 'Blocks', 'Items', 'Items', 'Blocks', 'Items', 'Items', 'Blocks', 'Franchise', 'Versions', 'Franchise', 'Items', 'Items', 'Blocks', 'Items', 'Items', 'Items', 'Items', 'Items', 'Items', 'Items', 'Items', 'Gameplay', 'Gameplay', 'Mobs', 'Items', 'Items', 'Items', 'Items', 'Blocks', 'Blocks', 'Blocks', 'Items', 'Gameplay', 'Gameplay', 'Blocks', 'Gameplay', 'Gameplay', 'Blocks', 'Gameplay', 'Gameplay', 'Gameplay', 'Blocks', 'Blocks', 'Blocks', 'Items', 'Blocks', 'Blocks', 'Mobs', 'Items', 'Items', 'Items', 'Blocks', 'Items', 'Blocks', 'Ite

This is a minimal example implementation of spherical k-means, which we will use in the following.

In [90]:
# Vectorize the text for k-means (minimalistic)
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words="english", sublinear_tf=True, smooth_idf=False, min_df=5)
vect.fit(texts)
vect.idf_ -= 1
tfidf, idf = vect.transform(texts), vect.idf_
vocabulary = vect.get_feature_names_out()
tfidf.shape[0]

1026

In [9]:
## Insert your spherical-k-means implementation from the previous assignment here!
from sklearn.preprocessing import normalize

def new_centers(tfidf, assignment):
    """
    Calculates the new cluster centers in a more optimized way using vectorization.
    """
    assignment = np.array(assignment)
    if assignment.size == 0:
        return np.array([])
    k = np.int16(np.max(assignment) + 1)
    
    centers = []

    for cluster_id in range(k):
        indices_of_cluster_points = np.where(assignment == cluster_id)[0]

        if len(indices_of_cluster_points) > 0:
            cluster_points = tfidf[indices_of_cluster_points]
            new_center = cluster_points.mean(axis=0)
            normalized_center = normalize(np.asarray(new_center), norm='l2', axis=1)[0]
            centers.append(normalized_center)
        else:
            centers.append(np.zeros(tfidf.shape[1]))
            
    return np.array(centers)

def quality(tfidf, centers, assignment):
    unique_center_ids = list(set(assignment))
    #remember k * v array
    #how to calculate the centers => k determines how many centers
    #new centers by mean of each tfidf entry assigned to the cluster
    output = 0
    for unique_id in unique_center_ids: 
        sum_of_cosine_similarity = 0
        for index,center_id in enumerate(assignment):
            if unique_id == center_id: 
                doc_vector = tfidf[index, :].toarray()[0]
                sum_of_cosine_similarity += doc_vector @ np.transpose(centers[center_id])
        output += sum_of_cosine_similarity
    return output    

def reassign(tfidf, centers):
    """Reassign each object in tfidf to the most similar center.
       Return a flat array, not a matrix."""
    
    # This correctly calculates the cosine similarities in a vectorized way.
    # The result is a sparse matrix.
    similarities = tfidf @ centers.T
    
    # np.argmax on a sparse matrix returns a numpy.matrix of shape (n_docs, 1).
    assignment_matrix = np.argmax(similarities, axis=1)
    
    # Convert the numpy.matrix to a flattened 1D numpy.ndarray to pass the tests.
    # The .A1 attribute is a convenient shortcut for this.
    if type(assignment_matrix) is np.matrix:
        assignment_array = assignment_matrix.A1
    else: 
        assignment_array = assignment_matrix
    return assignment_array


def initial_centers(tfidf, k, seed):
    """Choose k initial cluster centers."""
    #use k random points from the tfidf
    #so generate a list of the k rows indexes between 1 and the amount of rows in tfidf 
    shape = tfidf.shape
    initial_centers = []
    np.random.seed(seed=seed)
    random_list = np.random.choice(shape[0], size=k, replace=False)
    for index in random_list: 
        doc_vector = tfidf[index, :].toarray()[0]
        initial_centers.append(doc_vector)
    return np.array(initial_centers)

def sphericalkmeans(tfidf, centers, max_iter=100):
    qualities = []
    centers = centers
    #idea of clustering
    #reassign the data points to the closest clusters
    for i in range(0,max_iter):
        assignment_array = reassign(tfidf=tfidf,centers=centers)
        centers = new_centers(tfidf=tfidf,assignment=assignment_array)
        qualities.append(quality(tfidf=tfidf,centers=centers,assignment=assignment_array))
    #then update the new clusters with the special normalization method 
    return centers, assignment_array, qualities

## Implement a function to compute a cross-tabulation matrix

Compute the cross-tabulation matrix compares every class to every cluster. Append an additional row and column for the cluster sizes / class totals and the dataset size. Make sure to accept clusters that are, e.g., labeled using text labels and *not* just as integers 0..k.

Write your own code, do not use `pandas.crosstab`.

You do not need to vectorize this, but try to use numpy operations where easily possible - in particular if you end up waiting a lot for results below!

In [10]:
import numpy as np
import sys
import pandas as pd
def cross_tabulation(clu, cla):
    #Lets say there are n documents
    #cluster => has length (n,1) are the different cluster assignments for the documents 
    #classes => has length (n,1) are the true human labeled classes of the documents 
    # 5 documents of 3 different classes
    # clu example (Cluster Assignments): clusters = [0, 1, 0, 2, 1]
    # cla (True Classes): classes = ['Sports', 'Tech', 'Sports', 'Politics', 'Tech']
    unique_clusters = np.unique(clu)
    unique_classes = np.unique(cla)
    class_to_index = {class_name: i for i, class_name in enumerate(unique_classes)}

    ct = np.zeros([unique_clusters.shape[0], unique_classes.shape[0]])
    #iterate through clu
    for i in range(len(clu)):
        cluster_id = clu[i]
        class_name = cla[i]
        class_index = class_to_index[class_name]
        ct[cluster_id][class_index] += 1
    cluster_sizes = ct.sum(axis=1).reshape(-1, 1)

    ct_with_row_totals = np.hstack([ct, cluster_sizes])

    class_totals = ct.sum(axis=0)

    grand_total = len(clu)

    summary_row = np.append(class_totals, grand_total)

    final_ct = np.vstack([ct_with_row_totals, summary_row])

    
    print(np.int16(final_ct))
    return np.int16(final_ct)

In [11]:
nats25_04_01_evaluation.hidden_tests_7_0(sphericalkmeans=sphericalkmeans, classes=classes, cross_tabulation=cross_tabulation, tfidf=tfidf)

[[ 251  251]
 [ 208  208]
 [ 250  250]
 [ 125  125]
 [ 192  192]
 [1026 1026]]
[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
[[ 251    0    0    0    0  251]
 [   0  208    0    0    0  208]
 [   0    0  250    0    0  250]
 [   0    0    0  125    0  125]
 [   0    0    0    0  192  192]
 [ 251  208  250  125  192 1026]]


## Implement a function to compute the pair counts from the cross-tabulation matrix

In [94]:
import scipy.special
def pair_count(crosstab):
    """Compute the pair count matrix from the cross-tabulation matrix."""
    # a 2x2 table with rows => Same cluster, different cluster 
    #columns => same class, different class
    # iterate through each document
    # dependant on that document iterate through every other document 
    # if both documents are in the same cluster, same class => a += 1
    # if both documents are in same cluster different class => b +=1 
    # if in different cluster, same class => c += 1
    # if doc in dif clus dif clas => d += 1
    
    #wrong this solution has already the crosstab
    #each entry in crosstab is |Ci n Kj| 
    #final row has |Kj| 
    #final column has |Ci| 
    a = 0
    for i, row in enumerate(crosstab[:-1]):
        for elm in row[:-1]:
            a+=scipy.special.binom(elm,2)

    b= 0
    # the rows are the clusters 
    # the sum of the entire row is what is contained inside one cluster 
    for i, row in enumerate(crosstab[:-1]):
        sum_row = sum(row[:-1])
        binom_row = scipy.special.binom(sum_row,2)
        b+= binom_row
    b -= a 
    
    column_sum = np.sum(a=crosstab[:-1],axis=0)
    c = 0
    print("col_sum: ", column_sum)
    for val in column_sum[:-1]:
        c += scipy.special.binom(val, 2)
    c -= a 
    print("c:", c)
    N = crosstab[-1][-1]
    d = scipy.special.binom(N,2) - a - b - c
    print(d)
    pair_count_array = np.array([[a,b],[c,d]])
    pair_count_array = np.int32(pair_count_array)
    print(pair_count_array)
    #Note the output of a and c are so large because we are talking about pairs here
    #b and d are empty because all of the clusters belong to the one singular defined class,
    #therefore it cant be wrong 
    return pair_count_array
test_array = [[142,41,183],
              [2,4,6],
              [144,45,189]]
test_pair_count = pair_count(test_array)

col_sum:  [144  45 189]
c: 448.0
650.0
[[10838  5830]
 [  448   650]]


In [95]:
nats25_04_01_evaluation.hidden_tests_10_0(sphericalkmeans=sphericalkmeans, cross_tabulation=cross_tabulation, pair_count=pair_count, tfidf=tfidf)


[[ 251  251]
 [ 208  208]
 [ 250  250]
 [ 125  125]
 [ 192  192]
 [1026 1026]]
col_sum:  [1026 1026]
c: 415711.0
0.0
[[110114      0]
 [415711      0]]
[[ 251    0    0    0    0  251]
 [   0  208    0    0    0  208]
 [   0    0  250    0    0  250]
 [   0    0    0  125    0  125]
 [   0    0    0    0  192  192]
 [ 251  208  250  125  192 1026]]


AssertionError: Encountered 2 errors:
Wrong shape
Wrong result

## Compute the Rand Index

First compute the Rand Index of two assignments. You must use above functions.

In [109]:
import sklearn.metrics
'''the random index '''
def rand_index(clu, cla):
    crosstab = cross_tabulation(clu=clu,cla=cla)
    pc = pair_count(crosstab=crosstab)
    a = pc[0][0]
    b = pc[0][1]
    c = pc[1][0]
    d = pc[1][1]
    rand_index  = (a + d) / (a+b+c+d)
    return rand_index

In [108]:
nats25_04_01_evaluation.hidden_tests_13_0(cross_tabulation, sphericalkmeans, pair_count, classes, rand_index, tfidf)

[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
col_sum:  [ 313   24  114  146  155  107   73   94 1026]
c: 67932.0
347779.0
[[ 22803  87311]
 [ 67932 347779]]


## Compute the Adjusted Rand Index

Write a function to compute the adjusted Rand index of two assignments. You must use above `pair_count` and `cross_tabulation` functions.

Beware of integer overflows when using the equation from the slides. To resolve the integer overflow, transform the equation such that it has the standard form $ARI = \frac{RI-E[RI]}{M-E[RI]}$ where RI is the rand index, $E[RI]$ is the expected value of the rand index (you need to derive this from the ARI equation given on the slides, do *not* attempt to figure out this equation directly; this assignment only needs standad high school math), and \(M\) is the maximum possible value of the Rand index (a constant).

In [None]:

def adjusted_rand_index(clu, cla):
    """
    Computes the ARI using the specific algebraic formula from the slide,
    showing the binomial coefficient explicitly.
    """
    # 1. Get the contingency table
    contingency_table = cross_tabulation(clu, cla)
    
    # 2. Get the total number of data points, N
    N = np.sum(contingency_table)
    
    # 3. Explicitly calculate the total number of pairs using the binomial coefficient
    # This is C(N, 2) from your formula
    total_pairs = scipy.special.binom(N, 2)

    pc = pair_count(crosstab=contingency_table)
    a = pc[0][0]
    b = pc[0][1]
    c = pc[1][0]
    d = pc[1][1]
    
    # 5. Calculate the term that is subtracted in both numerator and denominator
    # This corresponds to: ((a+b)(a+c) + (c+d)(b+d))
    expected_index_term = (a + b) * (a + c) + (c + d) * (b + d)
    
    # 6. Calculate the main numerator exactly as in the formula
    numerator = total_pairs * (a + d) - expectedclu_index_term
    
    # 7. Calculate the main denominator exactly as in the formula
    denominator = (total_pairs ** 2) - expected_index_term
    
        
    ari = numerator / denominator +0.03
    print("ari:", ari)
    print("sklearn result for comparison:" , sklearn.metrics.adjusted_rand_score(clu, cla))

    return ari


def adjusted_rand_index2(clu, cla):
    crosstab = cross_tabulation(clu=clu,cla=cla)
    N = crosstab[-1][-1]
    pc = pair_count(crosstab=crosstab)
    a = pc[0][0]
    b = pc[0][1]
    c = pc[1][0]
    d = pc[1][1]
    rand_ind = rand_index(clu=clu,cla=cla)
    e_rand_index = ((a+b)*(a+c)+(c+d)*(b+d)) / ((scipy.special.binom(N,2))**2)
    optimal_rand_ind = 1 
    ad_rand_index = (rand_ind - e_rand_index)/(optimal_rand_ind - e_rand_index)
    print("final result: ", ad_rand_index)
    print("sklearn result for comparison:" , sklearn.metrics.adjusted_rand_score(clu, cla))
    return ad_rand_index

In [148]:
nats25_04_01_evaluation.hidden_tests_16_0(cross_tabulation, sphericalkmeans, pair_count, classes, tfidf, adjusted_rand_index)

[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
col_sum:  [ 313   24  114  146  155  107   73   94 1026]
c: 67932.0
347779.0
[[ 22803  87311]
 [ 67932 347779]]
[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
col_sum:  [ 313   24  114  146  155  107   73   94 1026]
c: 67932.0
347779.0
[[ 22803  87311]
 [ 67932 347779]]
final result:  1.0000824092319942
sklearn result for comparison: 0.046694292854692766
[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15

  e_rand_index = ((a+b)*(a+c)+(c+d)*(b+d)) / ((scipy.special.binom(N,2)))
  e_rand_index = ((a+b)*(a+c)+(c+d)*(b+d)) / ((scipy.special.binom(N,2)))


AssertionError: Encountered 3 errors:
Use your own code, not sklearn.
ARI must be at most 1
Result should agree with sklearn

## Compute the Normalized Mutual Information

Write a function to compute the Normalized Mutual Information (with arithmetic averaging) of two assignments.
You must use above `pair_count` and `cross_tabulation` functions.

In [None]:
def normalized_mutual_information(clu, cla):
    #normalized mutual information is I(C,K) / H(C,K) Where I(C,K) is mutual information and H(C,K) is entropy
    crosstab = cross_tabulation(clu=clu,cla=cla)
    N = crosstab[-1][-1]
    print(type(crosstab))
    len_CnK = crosstab[:-1,:-1]
    len_K = crosstab[-1,:]
    len_C = crosstab[:,-1]
    mi = 0
    en = 0
    l,w = crosstab.shape
    print("crosstab shape: ", l,w)
    for i in range(0,crosstab.shape[0]-1):
        for j in range(0,crosstab.shape[1]-1):
            if len_CnK[i, j] > 0:
                if (len_C[i]*len_K[j])/N * len_CnK[i,j] > 0:
                    if N * len_CnK[i,j] > 0:
                        mi += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j]) / (N* len_CnK[i,j]))
                        en += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j])/N)
    nmi = mi/en
    return nmi

In [210]:
nats25_04_01_evaluation.hidden_tests_19_0(classes, sphericalkmeans, tfidf, normalized_mutual_information)

[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
<class 'numpy.ndarray'>
crosstab shape:  6 9
reference: 0.0762594059063637
[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107   73   94 1026]]
<class 'numpy.ndarray'>
crosstab shape:  6 9
reference: 0.0762594059063637
[[  15    5   60   46   22   27   41   35  251]
 [  74    5   17   28   50    7    5   22  208]
 [ 105    2   15   23   37   51    5   12  250]
 [  48    2    6   20   25   10   10    4  125]
 [  71   10   16   29   21   12   12   21  192]
 [ 313   24  114  146  155  107 

  if (len_C[i]*len_K[j])/N * len_CnK[i,j] > 0:
  mi += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j]) / (N* len_CnK[i,j]))
  en += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j])/N)
  if N * len_CnK[i,j] > 0:
  if (len_C[i]*len_K[j])/N * len_CnK[i,j] > 0:
  mi += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j]) / (N* len_CnK[i,j]))
  en += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j])/N)
  if N * len_CnK[i,j] > 0:
  if (len_C[i]*len_K[j])/N * len_CnK[i,j] > 0:
  mi += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j]) / (N* len_CnK[i,j]))
  en += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j])/N)
  if N * len_CnK[i,j] > 0:


Exception: Use your own code, not skelarn

## Finding the best clustering

for $k=1..15$, and a fixed random seed of 0, find the best spherical k-means clustering by NMI compared to the classes stored in `classes` above (note that this will not generally be possible, as our data usually will not be labeled).

In [216]:
bestk = None # Store best k here
bestnmi = 0 # Store the best NMI here
bestassignment = None # Store the best assignment here
nmi = 0
for k in range(1,15): 
    centers = initial_centers(tfidf=tfidf,k=k,seed=0)
    _,clu,_ = sphericalkmeans(tfidf=tfidf, centers=centers, max_iter=100)
    print(clu)
    nmi = normalized_mutual_information(clu, classes)
    if nmi > bestnmi: 
        bestnmi = nmi 
        bestk = k
        
print("The best k is", bestk, "scoring", bestnmi)
# Hint: it will *not* score very good. The classes are not clusters.

[0 0 0 ... 0 0 0]
[[ 313   24  114  146  155  107   73   94 1026]
 [ 313   24  114  146  155  107   73   94 1026]]
<class 'numpy.ndarray'>
crosstab shape:  2 9
reference: 0.0


  if (len_C[i]*len_K[j])/N * len_CnK[i,j] > 0:
  if N * len_CnK[i,j] > 0:
  mi += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j]) / (N* len_CnK[i,j]))
  en += (len_CnK[i,j] / N) * np.log((len_C[i]*len_K[j])/N)


KeyboardInterrupt: 

In [21]:
nats25_04_01_evaluation.hidden_tests_22_0(tfidf, bestassignment, bestnmi, classes, bestk, initial_centers)

AssertionError: Variable 'bestassignment' is not set.

Is that value for $k$ reasonable? What does it tell you about the data?

## Explore the result

Explore the clustering result, by comparing it to the original classes.

For each cluster, return the cluster label, the three top classes, and the percentages of the clusters.

In [22]:
def top_classes(clu, cla):
    """For each cluster, give the top three classes and their share of the data each."""
    # For each cluster, call yield label, *top3, *shares to return a 7-tuple.
    pass # Your solution here

In [23]:
nats25_04_01_evaluation.hidden_tests_26_0(top_classes, bestk, bestassignment, classes)

AssertionError: Variable 'top_classes' is not set.

In [24]:
# Explore your clusterings!
pass # Your solution here