In [1]:
import re
from functools import lru_cache

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [2]:
@lru_cache(maxsize=1)
def _dutch_nlp_resources():
    """Build the Dutch stop-word set and stemmer once and reuse them on every call."""
    return frozenset(stopwords.words("dutch")), SnowballStemmer("dutch")


def preprocess_text(
    text: str, index: int = 0, print_progress: bool = True, print_freq: int = 100
) -> str:
    """Normalise one Dutch document into a space-separated string of stemmed tokens.

    Steps: strip punctuation, collapse whitespace, tokenize, drop Dutch stop
    words, and stem what remains.

    Args:
        text: Raw document text. Any non-string value (e.g. ``None`` or NaN)
            yields an empty string.
        index: Position of the document in the corpus; only used for progress
            reporting. ``0`` (the default) never prints.
        print_progress: Whether to report progress at all.
        print_freq: Print a progress line every ``print_freq`` documents.
            ``0`` disables printing instead of raising ``ZeroDivisionError``.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces, or ``""``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    if print_progress and index and print_freq and index % print_freq == 0:
        print(f"Processing document {index}", flush=True)

    # Cached: loading the stop-word list for every row is needlessly slow
    # when this function is applied to a whole DataFrame column.
    stop_words, stemmer = _dutch_nlp_resources()

    # Remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # Remove unnecessary whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words and stem. The stop-word list is lowercase, so compare
    # case-insensitively; otherwise capitalised stop words ("De", "Het") slip
    # through and are then lowercased into the output by the stemmer.
    return " ".join(
        stemmer.stem(word) for word in tokens if word.lower() not in stop_words
    )

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
woo_data = pd.read_csv("./docs/12_dossiers_no_requests/woo_merged.csv.gz")

# Keep only pages that have text and were not published by the Ministry of
# Defense. Filtering happens *before* preprocessing so no time is spent on rows
# that are discarded anyway, and the index is reset *after* all filtering so it
# is a contiguous 0..n-1 range that lines up with the TF-IDF rows below.
clean_woo_data = (
    woo_data.dropna(subset=["bodyText"])
    .loc[lambda df: df["publisher"].str.lower() != "ministerie van defensie"]
    .reset_index(drop=True)
)
clean_woo_data["processedText"] = clean_woo_data["bodyText"].apply(preprocess_text)

# Generate TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clean_woo_data["processedText"])

# Convert the TF-IDF matrix to a list of lists, where each list is a document's vector
tfidf_lists = tfidf_matrix.toarray().tolist()

# Create a new DataFrame to hold the vectors. It shares clean_woo_data's index
# explicitly because pd.concat(axis=1) aligns rows by index label, not position.
vectors_df = pd.DataFrame({"vectors": tfidf_lists}, index=clean_woo_data.index)

# Merge the original cleaned data with the TF-IDF DataFrame
merged = pd.concat([clean_woo_data, vectors_df], axis=1)

# A misaligned concat would silently add NaN-padded rows; fail loudly instead.
assert len(merged) == len(clean_woo_data), "TF-IDF vectors are misaligned with documents"

# Now, 'merged' contains both the original text and the corresponding TF-IDF vectors
print(merged.head())

                                  page_id                    document_id  \
0  nl.mnre1153.2i.2023.674.doc.2.pagina.1  nl.mnre1153.2i.2023.674.doc.2   
1  nl.mnre1153.2i.2023.674.doc.2.pagina.2  nl.mnre1153.2i.2023.674.doc.2   
2  nl.mnre1153.2i.2023.674.doc.2.pagina.3  nl.mnre1153.2i.2023.674.doc.2   
3  nl.mnre1153.2i.2023.674.doc.2.pagina.4  nl.mnre1153.2i.2023.674.doc.2   
4  nl.mnre1153.2i.2023.674.doc.2.pagina.5  nl.mnre1153.2i.2023.674.doc.2   

                dossier_id                                           bodyText  \
0  nl.mnre1153.2i.2023.674      Intern gebruik    Conclusie: Afwijzen, voo...   
1  nl.mnre1153.2i.2023.674      Intern gebruik    - 170 meter heg    Op bl...   
2  nl.mnre1153.2i.2023.674      Intern gebruik    5.1.2.e worden gerealise...   
3  nl.mnre1153.2i.2023.674      Intern gebruik    - Ze vragen 35 producten...   
4  nl.mnre1153.2i.2023.674      Intern gebruik    - Bij de kosten van 5.1....   

      type                                          publ

In [4]:
from renumics import spotlight

# Open the merged frame (page metadata plus TF-IDF vectors) in Renumics
# Spotlight. Judging by the recorded cell output, this starts a viewer served
# on a local address (127.0.0.1).
spotlight.show(merged)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:62491/'), HBox(children=(Button(description=…