In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
%pip install numpy scipy scikit-learn pandas
# Install the course wheel that provides the hidden_tests_* checks called below.
# --no-cache-dir / --force-reinstall make pip download and reinstall it on every run.
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_02_01_tfidf-0.1-py3-none-any.whl
import nats25_02_01_tfidf

# TF-IDF
## Setup our working context and load the data

In this assignment, we will work with a database of inaugural speeches of US presidents.

In [None]:
import numpy as np, pandas as pd, scipy
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded (it only works if another package imported it).
import gzip, json, urllib.request

# Download the gzip-compressed JSON corpus; urlretrieve stores it in a temporary
# file and returns that file's path.
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/inaugural.json.gz")
# Decode as UTF-8 (the JSON interchange encoding) instead of the platform locale,
# and close the file handle deterministically once parsing is done.
with gzip.open(file_path, "rt", encoding="utf-8") as corpus_file:
    inaugural = json.load(corpus_file)
# Each record is indexed positionally: element 0 is the speech label, element 1 the speech.
labels = [t[0] for t in inaugural]
speeches = [t[1] for t in inaugural]

## Build a Sparse Document-Term Matrix

Build a document-term matrix for the inaugural speeches.

Use sparse data structures and a minimum document frequency of 5, and remove English stopwords.

In [None]:
# Exercise cell: build the vocabulary and the sparse document-term matrix for `speeches`.
# Later cells read `vocab` (used as column labels) and `dtm` (the matrix itself).
from sklearn.feature_extraction.text import CountVectorizer # Please use this
vocab = None # Your vocabulary
dtm = None # Your sparse document term matrix
pass # Your solution here
# NOTE: while `dtm` is still the None placeholder, this line raises AttributeError.
print("Document term matrix has shape", dtm.shape)

In [None]:
# Run the packaged hidden test for the document-term matrix step.
# What it checks is not visible in this notebook; it receives the vocabulary, the matrix and the raw speeches.
nats25_02_01_tfidf.hidden_tests_4_0(vocab, dtm, speeches)

In [None]:
# Pretty display the data with pandas:
# wrap the sparse matrix in a sparse DataFrame (rows labelled by `labels`,
# columns by `vocab`) and show only the first rows.
pd.DataFrame.sparse.from_spmatrix(dtm,index=labels,columns=vocab).head()

## Most Frequent Words for Each Speech

Compute the most frequent word (except for the stopwords already removed) for each speech.

In [None]:
# Build a dictionary speech label to most frequent word
# Exercise cell: `most_frequent` starts empty and is meant to be filled by the solution below.
most_frequent = dict()
pass # Your solution here

# Print one "label<TAB>word" line per entry, ordered by speech label.
# With the empty placeholder dictionary this prints nothing.
for sp, w in sorted(most_frequent.items()): print(sp, w, sep="\t")

In [None]:
# Run the packaged hidden test for the most-frequent-word step.
# What it checks is not visible in this notebook; it receives the labels and the dictionary.
nats25_02_01_tfidf.hidden_tests_8_0(labels, most_frequent)

## TF-IDF

From the document-term matrix, compute the TF-IDF matrix. Implement the standard version of TF-IDF (`ltc`).

Be careful with 0 values, ensure that your matrix remains *sparse*. Do *not* rely on Wikipedia, it has errors.

Perform the transformation in three steps, named `tf`, `idf`, `tfidf`. First implement term frequency.

In [None]:
def tf(dtm):
    """Compute the "l" step of standard TF-IDF.

    Exercise stub: the body is not implemented yet, so the function currently
    returns None.

    Args:
        dtm: The document-term matrix built earlier in this notebook
            (presumably a scipy sparse matrix of term counts -- confirm).

    Returns:
        Currently None. The solution is expected to return a matrix that
        supports ``.sum()`` and can be passed to
        ``pd.DataFrame.sparse.from_spmatrix``, i.e. a sparse floating-point
        matrix.
    """
    # HINT: use dtm.astype(np.float32) to get a *sparse floating point copy* of the dtm matrix.
    pass # Your solution here
# Sanity check: compares the total of the raw matrix with the total after the transformation.
# NOTE: raises AttributeError while tf() still returns None.
print("Old sum:", dtm.sum(), "new sum:", tf(dtm).sum(), "(must be less and float)")

In [None]:
# Inspect your matrix
# tf(dtm) is recomputed here for display only; the first rows are shown with
# speech labels as the index and vocabulary terms as the columns.
pd.DataFrame.sparse.from_spmatrix(tf(dtm),index=labels,columns=vocab).head()

In [None]:
# Run the packaged hidden test for the term-frequency step.
# The function object `tf` itself is passed (not a result), so the test can call it; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_12_0(dtm, tf)

Implement the `idf` function.

In [None]:
def idf(dtm):
    """Compute the "t" step inverse document frequency.

    Exercise stub: the body is not implemented yet, so the function currently
    returns None.

    Args:
        dtm: The document-term matrix built earlier in this notebook
            (presumably a scipy sparse matrix of term counts -- confirm).

    Returns:
        Currently None. Presumably the solution returns one weight per
        vocabulary term (one per column of ``dtm``) -- confirm against the
        hidden test.
    """
    pass # Your solution here

In [None]:
# Multiplying a vector of ones (one entry per document) with dtm yields the column sums of dtm.
# NOTE(review): if dtm holds raw counts, `b` is the total number of occurrences of each term,
# not the number of documents containing it -- confirm this comparison value is intended.
b=(np.ones((dtm.shape[0],)) @ dtm)
# Reference value: log(number of documents / column sum), one entry per term.
print(np.log(dtm.shape[0] / b))
# Output of the idf() implementation for side-by-side comparison (prints None while idf() is a stub).
print(idf(dtm))

In [None]:
# Run the packaged hidden test for the inverse-document-frequency step.
# The function object `idf` itself is passed so the test can call it; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_16_0(dtm, idf)

Now implement the full `tfidf` function, using the above implementations of `tf` and `idf`.

Hint: you may find `scipy.sparse.spdiags` useful to keep the computations *sparse*.

You are **not allowed** to use sklearn's `TfidfVectorizer`!

In [None]:
def tfidf(dtm):
    """Finish the computation of standard TF-IDF with the "c" step.

    Exercise stub: the two intermediate results are computed, but nothing is
    combined or returned yet, so the function currently returns None.

    Args:
        dtm: The document-term matrix built earlier in this notebook
            (presumably a scipy sparse matrix of term counts -- confirm).

    Returns:
        Currently None. The solution is expected to return a matrix that can
        be passed to ``pd.DataFrame.sparse.from_spmatrix``, i.e. a sparse
        matrix.
    """
    # Intermediate results of the two earlier steps; the solution has to build on these.
    _tf, _idf = tf(dtm), idf(dtm) # Must use above functions.
    pass # Your solution here

In [None]:
# Inspect your matrix
# tfidf(dtm) is recomputed here for display only; the first rows are shown with
# speech labels as the index and vocabulary terms as the columns.
pd.DataFrame.sparse.from_spmatrix(tfidf(dtm),index=labels,columns=vocab).head()

In [None]:
# Run the packaged hidden test for the full TF-IDF step.
# The function object `tfidf` itself is passed so the test can call it; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_20_0(dtm, tfidf)

## Compare to sklearn

Now you are allowed to use `TfidfVectorizer`!

Use sklearn's `TfidfVectorizer` (make sure to choose parameters appropriately). Compare the results.

In [None]:
# Exercise cell: compute TF-IDF with scikit-learn for comparison with the hand-written version.
# Later cells read `sktfidf` (the matrix) and `skvocab` (used as column labels).
from sklearn.feature_extraction.text import TfidfVectorizer
# Placeholder: constructed with default arguments until the parameters are chosen.
tvect = TfidfVectorizer() # set appropriate parameters!
sktfidf = None # Store the TF-IDF result obtained via sklearn
skvocab = None # The vocabulary
pass # Your solution here

In [None]:
# Pretty display the data with pandas:
# first rows of the scikit-learn result, with speech labels as the index and
# the scikit-learn vocabulary as the columns.
pd.DataFrame.sparse.from_spmatrix(sktfidf,index=labels,columns=skvocab).head()

In [None]:
# Run the packaged hidden test for the scikit-learn comparison step.
# It receives both vocabularies, the count matrix and the scikit-learn result; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_24_0(vocab, skvocab, dtm, sktfidf)

## Understand the difference

By visual inspection of the two matrices, you will notice that they do *not* agree.

Check the [bug reports of scikit-learn](https://github.com/scikit-learn/scikit-learn/issues?q=is%3Aissue+tf-idf+is%3Aopen) for related bug reports, and check the scikit-learn documentation *carefully* to figure out the difference.

Is it better or worse? We don't know. But scikit-learn does not implement the standard approach!

But: we can easily "hack" sklearn to produce the desired result.

Hint: Use `fit`, adjust the vectorizer, and `transform` separately.

In [None]:
# Work around this issue in scikit-learn
# Exercise cell: a second, separate vectorizer so the earlier `tvect` / `sktfidf` results stay available.
# Later cells read `sktfidf2` (the matrix) and `skvocab2` (used as column labels).
# Placeholder: constructed with default arguments until the parameters are chosen.
tvect2 = TfidfVectorizer() # set appropriate parameters!
sktfidf2 = None # Store the TF-IDF result obtained via sklearn
skvocab2 = None # The vocabulary
# Use fit(), adjust as necessary, transform() to get the desired result!
pass # Your solution here

In [None]:
# Pretty display the data with pandas:
# first rows of the adjusted scikit-learn result, with speech labels as the
# index and the second vocabulary as the columns.
pd.DataFrame.sparse.from_spmatrix(sktfidf2,index=labels,columns=skvocab2).head()

In [None]:
# Run the packaged hidden test for the workaround step.
# It receives the first scikit-learn result, the adjusted result with its vocabulary, the original
# vocabulary and count matrix, and the `tfidf` function itself; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_28_0(sktfidf, skvocab2, vocab, dtm, sktfidf2, tfidf)

## Compute the Cosine Similarity Matrix

Compute the cosine similarity matrix of the speeches above.

You are not allowed to use sklearn for this.

In [None]:
# Exercise cell: compute the pairwise cosine similarities of the speeches into `sim`.
X = tfidf(dtm) # use your own tfidf results
sim = None # Compute cosine similarities
pass # Your solution here
del X # free memory again.
# The format string unpacks exactly two dimensions, so `sim` must be a 2-D array/matrix.
# NOTE: raises AttributeError while `sim` is still the None placeholder.
print("Matrix of shape %d x %d" % sim.shape)

In [None]:
# Run the packaged hidden test for the cosine similarity step.
# It receives the similarity matrix and the count matrix; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_31_0(sim, dtm)

## Find the two most similar speeches

Given the similarity matrix, find the two most similar (different) speeches.

In [None]:
# Exercise cell: `most_similar` is a 3-tuple laid out as (label, label, similarity).
most_similar = (None, None, None) # Store a pair of document *labels* and their similarity
pass # Your solution here
# Prints the two labels and the similarity separated by tabs.
# NOTE: "%g" needs a number, so this raises TypeError while the third element is still None.
print("%s\t%s\t%g" % most_similar)

In [None]:
# Run the packaged hidden test for the most-similar-pair step.
# It receives the similarity matrix, the labels and the reported tuple; its checks are not visible here.
nats25_02_01_tfidf.hidden_tests_34_0(sim, labels, most_similar)