In [None]:
import sys
import scipy
import ast
import pandas as pd
import string
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used by the preprocessing cell
# (stop-word list, tokenizer models, WordNet data).
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',  # may be redundant next to 'punkt' -- TODO confirm
    "wordnet",
    "omw-1.4",
):
    nltk.download(resource)

import sklearn

# Show which scikit-learn version this notebook is running against
sklearn.__version__

# TODO:
    # [ ] Preprocessing
    # [ ] Count unique words
    # [ ] TF*IDF
    # [ ] Clustering
    # [ ] Inspect the topic of each cluster
    # [ ] Visualise evolution over time


In [None]:
# Read the input file and create a table.
# Every non-blank line is parsed as one Python literal (one record per line).
records = []
# Decode explicitly as UTF-8 so the titles are read the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the data file is UTF-8 encoded -- confirm.
with open("data/data_mining_publications.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            # literal_eval only accepts Python literals; it does not execute code
            records.append(ast.literal_eval(line))


table = pd.DataFrame(records)

# Get all titles and the matching publication years (same row order)
titles = table['title'].to_list()
timestamps = table['year'].to_list()

In [None]:
# Initialize stemmer (not used by the lemmatizing pipeline below; kept because
# other cells may still reference it)
ps: PorterStemmer = PorterStemmer()

wnl = WordNetLemmatizer()

# Domain-specific words that are very common in these titles. They are compared
# against lemmatized tokens below, so they are listed in lemma form.
domain_stops = {'approach', 'data', 'learn', 'use'}
# English stop words plus the domain-specific ones
stop_words = set(stopwords.words('english')) | domain_stops
# Translation table that deletes every ASCII punctuation character
punct = str.maketrans(dict.fromkeys(string.punctuation))

filtered_titles: list[str] = []

for title in titles:
    # Lowercase, strip punctuation, then split into tokens
    title_words = word_tokenize(title.lower().translate(punct))

    # Lemmatize every token exactly once (as a verb), then drop the lemmas
    # that are stop words. The stop-word check is on the lemma, not the raw token.
    lemmas = (wnl.lemmatize(word, pos="v") for word in title_words)
    filtered_title = [lemma for lemma in lemmas if lemma not in stop_words]
    filtered_titles.append(" ".join(filtered_title))



In [None]:
# Build the TF*IDF document-term matrix from the preprocessed titles

tfidf_options = {
    "lowercase": False,  # tokens were already lowercased during preprocessing
    "token_pattern": r"(?u)\b\w+\b",
    "use_idf": True,
    "norm": 'l2',
    "stop_words": 'english',
    # Ignore terms found in fewer than 2 titles
    # (original note: reduces vocab by ~7k words!)
    "min_df": 2,
    # "max_df": 0.04,
}
vec = TfidfVectorizer(**tfidf_options)

tfidf_matrix = vec.fit_transform(filtered_titles)
vocab = vec.get_feature_names_out()
print("n docs, vocab size:", tfidf_matrix.shape)

# Fraction of matrix cells that hold a stored (non-zero) value
density = tfidf_matrix.getnnz() / np.prod(tfidf_matrix.shape)
print(f"{density}")

In [None]:
# Sanity check of the TF*IDF matrix: print the highest-weighted terms of one title
i = 0
top_n = 10

row = tfidf_matrix.getrow(i)
indices, data = row.indices, row.data

# Positions of the stored weights, largest first, truncated to top_n
order = np.argsort(data)[::-1][:top_n]
for term, weight in zip(vocab[indices[order]], data[order]):
    print(term, weight)

In [None]:
from sklearn.cluster import KMeans

# Cluster the titles in TF*IDF space
k = 10
kmeans = KMeans(
    n_clusters=k,
    # Fixed seed so cluster ids and top terms are reproducible across runs
    # (matches the random_state=0 used by the other KMeans fits in this notebook)
    random_state=0,
    n_init=5)
labels = kmeans.fit_predict(tfidf_matrix)

# Top terms per cluster: sort each centroid's term weights in descending order
# (cluster_centers_ is dense)
order = kmeans.cluster_centers_.argsort()[:, ::-1]
for ci in range(k):
    print("cluster", ci, [vocab[i] for i in order[ci, :10]])

In [None]:
import matplotlib.pyplot as plt

def elbow_plot(k_range=None):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Fits one ``KMeans(n_clusters=k, random_state=0)`` model per candidate ``k``
    on the notebook-level ``tfidf_matrix`` and draws the inertias as a line plot.

    Args:
        k_range: Iterable of cluster counts to try. When omitted, the sweep is
            ``range(100, 2000, 100)``, the range this function used before the
            parameter existed.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Materialise the sweep once so the fits and the plot use the same values,
    # even if the caller passes a one-shot iterator.
    ks = list(range(100, 2000, 100) if k_range is None else k_range)

    inertias = []
    for k in ks:
        model = KMeans(n_clusters=k, random_state=0)
        model.fit(tfidf_matrix)
        inertias.append(model.inertia_)

    plt.figure(figsize=(10, 5))
    plt.plot(ks, inertias, 'bo-')
    plt.xlabel('k')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.show()


In [None]:
from sklearn.metrics import silhouette_score
def silhouette_test(k_range=None):
    """Compare KMeans cluster counts by silhouette score and plot the result.

    For every candidate ``k`` a ``KMeans(n_clusters=k, random_state=0,
    n_init=10)`` model is fitted on the notebook-level ``tfidf_matrix`` and its
    labelling is scored with ``silhouette_score``. Each score is printed,
    followed by the ``k`` with the highest score (the first one on ties).

    Args:
        k_range: Iterable of cluster counts to try. When omitted, the sweep is
            ``range(2, 15)``, the range this function used before the
            parameter existed.

    Returns:
        None. Scores are printed and the figure is shown with ``plt.show()``.

    Raises:
        ValueError: If ``k_range`` contains no values.
    """
    # Materialise the sweep once so it can be indexed and plotted.
    ks = list(range(2, 15) if k_range is None else k_range)
    if not ks:
        raise ValueError("k_range must contain at least one cluster count")

    silhouette_scores = []
    for k in ks:
        model = KMeans(n_clusters=k, random_state=0, n_init=10)
        cluster_labels = model.fit_predict(tfidf_matrix)
        score = silhouette_score(tfidf_matrix, cluster_labels)
        silhouette_scores.append(score)
        print(f"k={k}: silhouette={score:.3f}")

    # list.index returns the first maximum, so ties resolve to the smallest index
    best_k = ks[silhouette_scores.index(max(silhouette_scores))]
    print(f"Best k: {best_k}")

    plt.plot(ks, silhouette_scores, 'go-')
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis')
    plt.show()

# Run the silhouette sweep defined above: prints a score per k and the best k,
# then shows the score-vs-k plot.
silhouette_test()