# Notebook: Explore Addressed Topics

This notebook is used to explore the topics discussed in reviews previously discussed. 

## Packages

In [1]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
import pandas as pd
import spacy
import nltk
import re

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## Constants

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
DATASET_PATH = "../datasets/reviews_sentences.csv"
STOPWORDS = set(stopwords.words('german'))
N_TOPICS = 11

## Code

In [4]:
#spacy.cli.download("de_core_news_sm")

In [5]:
nlp = spacy.load('de_core_news_sm')

In [6]:
dataset = pd.read_csv(DATASET_PATH)

In [7]:
def clean_text(text):
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    text = re.sub(r'@\w+', '', text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)
    return text.strip()

def lemmatize_remove_stopwords_text(text):
    doc = nlp(text)
    lemmatized_text = ' '.join([token.lemma_.lower() for token in doc if token.text.lower() not in STOPWORDS])
    return lemmatized_text

dataset["text"] = dataset["text"].apply(clean_text)
dataset["text"] = dataset["text"].apply(lemmatize_remove_stopwords_text)

In [8]:
dataset = dataset.text.to_list()

In [9]:
# Use default UMAP but add random state
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=43)
model = BERTopic(language="german", nr_topics=N_TOPICS, umap_model=umap_model, top_n_words=20, embedding_model="paraphrase-multilingual-MiniLM-L12-v2")

In [10]:
topics, probabilities = model.fit_transform(dataset)
topics = model.reduce_outliers(dataset, topics, strategy="distributions")
model.update_topics(dataset, topics, top_n_words=20)

Downloading (…)0fe39/.gitattributes: 100%|███████████████████████████████████| 968/968 [00:00<00:00, 1.83MB/s]
Downloading (…)_Pooling/config.json: 100%|████████████████████████████████████| 190/190 [00:00<00:00, 975kB/s]
Downloading (…)83e900fe39/README.md: 100%|███████████████████████████████| 3.79k/3.79k [00:00<00:00, 7.62MB/s]
Downloading (…)e900fe39/config.json: 100%|███████████████████████████████████| 645/645 [00:00<00:00, 3.19MB/s]
Downloading (…)ce_transformers.json: 100%|████████████████████████████████████| 122/122 [00:00<00:00, 705kB/s]
Downloading pytorch_model.bin: 100%|███████████████████████████████████████| 471M/471M [00:38<00:00, 12.2MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████████████████████████████| 53.0/53.0 [00:00<00:00, 193kB/s]
Downloading (…)tencepiece.bpe.model: 100%|███████████████████████████████| 5.07M/5.07M [00:00<00:00, 12.5MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████| 239/239 [00:00<00:00, 328kB/s]
D

In [11]:
topic_statistics = model.get_document_info(dataset)["Topic"].value_counts().reset_index().rename(columns={"index": "Topic", "Topic": "Frequency"}).sort_values("Topic").reset_index(drop=True)
topic_statistics = topic_statistics[topic_statistics["Topic"] != -1]
topic_statistics["Topic"] = topic_statistics["Topic"] + 1
topic_statistics

KeyError: 'Topic'

In [None]:
top_words = []
for topic in topic_statistics["Topic"]:
    words = model.get_topic(topic - 1)[:5]
    top_words.append(', '.join([word[0] for word in words]))
    
topic_statistics["Top Words"] = top_words

def format_frequency(frequency):
    return "{:,}".format(frequency)

# Anwendung der Funktion auf die Spalte "Frequency"
topic_statistics["Frequency"] = topic_statistics["Frequency"].apply(format_frequency)

topic_statistics