In [101]:
import sys
import scipy
import ast
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # Maybe not needed?

# TODO:
    # [ ] Preprocessing
    # [ ] Unieke woorden tellen
    # [ ] TF*IDF
    # [ ] Clusteren
    # [ ] Bekijk topic per cluster
    # [ ] Visualiseer evolutie  over tijd


[nltk_data] Downloading package stopwords to /home/diana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/diana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/diana/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [102]:
# Read input file and create table
records = []
with open("data/data_mining_publications.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(ast.literal_eval(line))


table = pd.DataFrame(records)

# Get all titles
titles = table['title'].to_list()
table

Unnamed: 0,publication_type,booktitle,journal,year,title
0,inproceedings,SDM,,2019,Feature selection as Monte-Carlo Search in Gro...
1,inproceedings,SDM,,2002,Mining Relationship between Triggering and Con...
2,inproceedings,SDM,,2007,Segmentations with Rearrangements.
3,inproceedings,SDM,,2021,MT-STNets: Multi-Task Spatial-Temporal Network...
4,inproceedings,SDM,,2006,A Semantic Approach for Mining Hidden Links fr...
...,...,...,...,...,...
14480,inproceedings,KDD,,1996,Using a Hybrid Neural/Expert System for Data B...
14481,inproceedings,KDD,,2021,Table2Charts: Recommending Charts by Learning ...
14482,inproceedings,KDD,,2021,Physical Equation Discovery Using Physics-Cons...
14483,inproceedings,KDD,,2023,Accelerating Personalized PageRank Vector Comp...


In [103]:
# Initialize stemmer
ps: PorterStemmer = PorterStemmer()

# Other option (below): lemmatization
# This returns valid words, but can only be done on one type (eg. nouns or verbs), 
# so others will still create duplicates

# from nltk.stem import WordNetLemmatizer
# nltk.download("wordnet")
# nltk.download("omw-1.4")
# wnl = WordNetLemmatizer()

# Get English stop words and punctuation dict
stop_words = set(stopwords.words('english'))
punct = str.maketrans(dict.fromkeys(string.punctuation))

filtered_titles: list[list[str]] = []
unique_word_counts: defaultdict[str, int] = defaultdict(int)
total_words: int = 0

for title in titles:
    # Set string to lowercase and remove punctuation
    title_words = word_tokenize(title.lower().translate(punct))

    # Remove stopwords and stem each word
    filtered_title = [ps.stem(word) for word in title_words if word not in stop_words]
    # filtered_title = [wnl.lemmatize(word, pos="v") for word in title_words if word not in stop_words]
    # filtered_title = [word for word in title_words if word not in stop_words]
    filtered_titles.append(filtered_title)
    total_words += len(filtered_title)
    
    for word in filtered_title:
        unique_word_counts[word] += 1


print(f"Total words: {total_words}, unique words: {len(unique_word_counts)}")

Total words: 97786, unique words: 9669


In [None]:
# This part creates a TF*IDF matrix (can be used for clustering later on!)
tfidf_titles = [" ".join(tokens) for tokens in filtered_titles]

vec = TfidfVectorizer(lowercase=False,  # we already lowercased/stemmed tokens
                      token_pattern=r"(?u)\b\w+\b",
                      smooth_idf=True,  # add 1 to numerator/denominator (stability)
                      use_idf=True)

tfidf_matrix = vec.fit_transform(tfidf_titles)
vocab = vec.get_feature_names_out()
# print("n docs, vocab size:", tfidf_matrix.shape)

n docs, vocab size: (14485, 9667)


In [None]:
# Test TF*IDF matrix
i = 0
row = tfidf_matrix.getrow(i)
indices = row.indices
data = row.data
top_n = 10
order = data.argsort()[::-1][:top_n]
for pos in order:
    print(vocab[indices[pos]], data[pos])

leaf 0.3827972514282151
acycl 0.3671000211130385
montecarlo 0.35596265896147644
grow 0.3204892572648419
root 0.31033205211090426
singl 0.2882439023061207
best 0.2834974596441657
direct 0.24406442093090747
identif 0.23718564811915677
search 0.18717256659593434
