# BERTopic: 
- Preprocess (normalize text, filter for "yes" label)

- Embedding (convert text to number representation)

- Top Modeling (find different cluster setups)

In [35]:
#imports / installs

import pandas as pd
import re
import glob
import os
import itertools
import hdbscan


## Step 1: Preprocessing

In [36]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove non-letter characters (keep punctuation if needed)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [37]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings


In [38]:
INPUT_PATH = "../../data/climate_classified/" #Currently uses the 14 jsons * 100,000 posts 

json_pattern = os.path.join(INPUT_PATH,'*.json')
combined_paths = glob.glob(json_pattern)

dfs = []


for path in combined_paths:
    try:
        df = pd.read_json(path)
        dfs.append(df)
    except ValueError as e:
        print(f"Failed to read {path}: {e}")


In [39]:
if dfs:
    temp = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(temp)} posts from {len(dfs)} files.")
else:
    print("No data loaded.")

Loaded 1311270 posts from 14 files.


### NOTE: Currently filters for 60+ characters and score >= 0.99 !

In [40]:
# Step 1: Filter for climate-related posts only
climate_df = temp[temp["label"] == "yes"].copy()

# Step 2: Light preprocessing
climate_df["clean_text"] = climate_df["text"].astype(str).apply(preprocess_text)

# Step 3: Filter on character length and score
climate_df = climate_df[
    
    (climate_df["clean_text"].str.len() >= 60) &
    (climate_df["score"] >= 0.99)
].copy()

print(f"Remaining posts after full filtering: {len(climate_df)}")

Remaining posts after full filtering: 11467


In [41]:
# Save filtered climate_df

output_path = "../../data/filtered/above60chars_above99score.json"
climate_df.to_json(output_path)

print(f"Filtered dataset saved to: {output_path}")

Filtered dataset saved to: ../../data/filtered/above60chars_above99score.json


In [42]:
climate_df.columns

Index(['repo', 'seq', 'text', 'timestamp', 'cid', 'uri', 'label', 'score',
       'clean_text'],
      dtype='object')

In [43]:
climate_df.head()

Unnamed: 0,repo,seq,text,timestamp,cid,uri,label,score,clean_text
25,did:plc:uli2rqyfqasvuawksu2z5jkc,7778280581,Trump's executive order trying to block state ...,2025-04-09 21:10:45.855,bafyreihbjn7mnkbiytl4wc2jjhukux7xfncg772auwhhu...,at://did:plc:uli2rqyfqasvuawksu2z5jkc/app.bsky...,yes,0.997684,trumps executive order trying to block state c...
204,did:plc:4zh2idecxr5zudhn3oniodhw,7778286641,Spain and Canada signed agreements on renewabl...,2025-04-09 21:10:53.664,bafyreibvwoj6qzbffnpz4rkgxena26ejvpfqoznkbed7n...,at://did:plc:4zh2idecxr5zudhn3oniodhw/app.bsky...,yes,0.993054,spain and canada signed agreements on renewabl...
411,did:plc:m6ntt433rso3lp7dxaja3mue,7778293323,When did you bitch about what Republicans were...,2025-04-09 21:11:03.083,bafyreic4vipeqxz6mlm36qa7uw3qqjbxwo3qtvucl2fek...,at://did:plc:m6ntt433rso3lp7dxaja3mue/app.bsky...,yes,0.996543,when did you bitch about what republicans were...
441,did:plc:cm4nhw2xk43bczonk7mbfvrb,7778294643,"Hydrogen, as you know, is useful for decarboni...",2025-04-09 21:11:05.110,bafyreidsl6kwy2dx6rtvqkhjyafcffv6cwdud7aefrubf...,at://did:plc:cm4nhw2xk43bczonk7mbfvrb/app.bsky...,yes,0.997304,hydrogen as you know is useful for decarbonisi...
448,did:plc:ci5fsjcdjgoct5k3yllky4ud,7778294887,Either we end the Fossil Fuel Era or the Fossi...,2025-04-09 21:11:05.321,bafyreifegyoen4hku664cni3qqh3v6xbnoptueyb76dqm...,at://did:plc:ci5fsjcdjgoct5k3yllky4ud/app.bsky...,yes,0.99579,either we end the fossil fuel era or the fossi...


## Step 2: Embedding Generation

pip install -U sentence-transforme

In [1]:
pip install -U sentence-transformers -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from sentence_transformers import SentenceTransformer

In [25]:
len(texts_to_embed)

11467

In [17]:
model = SentenceTransformer('all-miniLM-L6-v2')
texts_to_embed = climate_df["clean_text"].tolist()
embeddings = model.encode(texts_to_embed, show_progress_bar=True) 

Batches: 100%|██████████| 359/359 [00:57<00:00,  6.22it/s]


In [64]:
pip install bertopic -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[K     |████████████████████████████████| 150 kB 261 kB/s eta 0:00:01
[?25hCollecting plotly>=4.7.0
  Downloading plotly-6.0.1-py3-none-any.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 167 kB/s eta 0:00:01
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.40.tar.gz (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 60 kB/s eta 0:00:016
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  user = False
  home = None
  root = None
  prefix = '/private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-b

In [65]:
from bertopic import BERTopic

In [66]:
topic_model = BERTopic(language="english", verbose=True)

In [26]:
topics, probs = topic_model.fit_transform(texts_to_embed, embeddings)

topic_model.get_topic_info().head(20)

NameError: name 'embeddings' is not defined

## Tuning

To improve the clustering of topics, we can improve the model in several ways:

- Change UDBSCAN settings (min_cluster_size, min_samples, metrics = euclidean, manhattan, cosine)
- Manual merging of topics (Two related fine-grained topics could be merged into a broader, more general topic)
- Change Sentence Transformer to a different model (ie. "all-mpnet-base-v2")



## CURRENT!:

- steps of 2 for hyperparameters
- uses cosine through normalization as well - in case it doesnt work it goes back to euclidean
- logs and saves results in a csv

- 

In [44]:
import os
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize
import itertools
import pandas as pd
import time

#data
texts_to_embed = climate_df["clean_text"].tolist()

# Hyperparameter ranges
embedding_models = ["all-MiniLM-L6-v2", "all-mpnet-base-v2"]
min_cluster_sizes = list(range(10, 51, 5))     # 10 to 50, step 5
min_samples_vals = list(range(5, 11, 2))       # 5, 7, 9
nr_topics_vals = [10, 15,20]
distance_metrics = ["euclidean", "manhattan", "cosine"]

# Logging setup
log_path = "../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv"
log_columns = [
    "embedding_model", "metric", "min_cluster_size", "min_samples",
    "nr_topics", "n_topics", "outliers", "outlier_pct", "time_sec"
]

# Load or initialize log
if os.path.exists(log_path):
    log_df = pd.read_csv(log_path)
else:
    log_df = pd.DataFrame(columns=log_columns)
    log_df.to_csv(log_path, index=False)

# Main loop
for embed_model in embedding_models:
    print(f"\nEmbedding model: {embed_model}")
    model = SentenceTransformer(embed_model)
    embeddings_local = model.encode(texts_to_embed, show_progress_bar=True)

    for min_cluster_size, min_samples, metric, nr_topics in itertools.product(
        min_cluster_sizes, min_samples_vals, distance_metrics, nr_topics_vals
    ):
        # Check if already done
        run_key = {
            "embedding_model": embed_model,
            "metric": metric,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "nr_topics": nr_topics if nr_topics else "None"
        }

        existing = log_df[
            (log_df.embedding_model == run_key["embedding_model"]) &
            (log_df.metric == run_key["metric"]) &
            (log_df.min_cluster_size == run_key["min_cluster_size"]) &
            (log_df.min_samples == run_key["min_samples"]) &
            (log_df.nr_topics == run_key["nr_topics"])
        ]

        if not existing.empty:
            print(f"Skipping already completed: {run_key}")
            continue

        print(f"\nRunning: {run_key}")
        start = time.time()

        # Normalize for cosine
        if metric == "cosine":
            embeddings_used = normalize(embeddings_local, norm="l2")
            hdbscan_metric = "euclidean"
        else:
            embeddings_used = embeddings_local
            hdbscan_metric = metric

        # HDBSCAN setup
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=hdbscan_metric,
            cluster_selection_method="eom"
        )

        # BERTopic setup
        topic_model = BERTopic(
            hdbscan_model=hdbscan_model,
            language="english",
            calculate_probabilities=False,
            verbose=False,
            low_memory=True                                  # OBS: Remove if we want to train on new docs/entries and need .transform()!!!
        )

        try:
            topics, _ = topic_model.fit_transform(texts_to_embed, embeddings_used)

            if nr_topics:
                topic_model.reduce_topics(texts_to_embed, nr_topics=nr_topics)
                topics = topic_model.topics_

            topic_info = topic_model.get_topic_info()
            n_topics = len(topic_info[topic_info.Topic != -1])
            n_outliers = topic_info[topic_info.Topic == -1].Count.values[0] if -1 in topic_info.Topic.values else 0
            n_total = sum(topic_info.Count)
            duration = round(time.time() - start, 2)

            log_entry = {
                **run_key,
                "n_topics": n_topics,
                "outliers": n_outliers,
                "outlier_pct": round(n_outliers / n_total * 100, 2),
                "time_sec": duration
            }

            log_df = pd.concat([log_df, pd.DataFrame([log_entry])], ignore_index=True)
            log_df.to_csv(log_path, index=False)
            
            model_name = f"{embed_model}_{metric}_c{min_cluster_size}_s{min_samples}_nt{nr_topics or 'none'}"
            save_path = f"../../data/BERTopic_Hyperparameters/_{model_name}"



            # Save model
            topic_model.save(save_path)


            print(f"Done | Topics: {n_topics}, Outliers: {n_outliers} ({log_entry['outlier_pct']}%) | Time: {duration}s")

        except Exception as e:
            print(f"Failed for config: {run_key} — {e}")



Embedding model: all-MiniLM-L6-v2


Batches: 100%|██████████| 359/359 [00:32<00:00, 11.13it/s]



Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Done | Topics: 9, Outliers: 4590 (40.03%) | Time: 13.08s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4488 (39.14%) | Time: 3.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4584 (39.98%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4467 (38.96%) | Time: 3.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4717 (41.14%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4484 (39.1%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4699 (40.98%) | Time: 3.28s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4701 (41.0%) | Time: 3.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4736 (41.3%) | Time: 3.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4762 (41.53%) | Time: 3.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5312 (46.32%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4825 (42.08%) | Time: 3.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4793 (41.8%) | Time: 3.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5256 (45.84%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4944 (43.12%) | Time: 3.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5223 (45.55%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5120 (44.65%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5076 (44.27%) | Time: 3.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5032 (43.88%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5123 (44.68%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4975 (43.39%) | Time: 3.76s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4849 (42.29%) | Time: 3.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5215 (45.48%) | Time: 3.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5234 (45.64%) | Time: 3.53s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4988 (43.5%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5232 (45.63%) | Time: 3.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4926 (42.96%) | Time: 3.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4936 (43.05%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4845 (42.25%) | Time: 3.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4817 (42.01%) | Time: 4.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5127 (44.71%) | Time: 3.76s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4905 (42.77%) | Time: 3.49s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4790 (41.77%) | Time: 4.14s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5111 (44.57%) | Time: 3.52s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5062 (44.14%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4766 (41.56%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5135 (44.78%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4909 (42.81%) | Time: 3.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5087 (44.36%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5237 (45.67%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5200 (45.35%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5183 (45.2%) | Time: 3.75s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4928 (42.98%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5128 (44.72%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4648 (40.53%) | Time: 3.99s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5014 (43.73%) | Time: 4.14s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5354 (46.69%) | Time: 4.06s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5304 (46.25%) | Time: 4.08s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5221 (45.53%) | Time: 3.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5201 (45.36%) | Time: 3.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5294 (46.17%) | Time: 3.57s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5272 (45.98%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5351 (46.66%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5270 (45.96%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5180 (45.17%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5198 (45.33%) | Time: 3.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4984 (43.46%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5178 (45.16%) | Time: 4.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5048 (44.02%) | Time: 3.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4951 (43.18%) | Time: 3.87s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4502 (39.26%) | Time: 4.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5220 (45.52%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4779 (41.68%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4941 (43.09%) | Time: 3.31s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4689 (40.89%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4921 (42.91%) | Time: 4.06s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5142 (44.84%) | Time: 3.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4830 (42.12%) | Time: 3.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5035 (43.91%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5290 (46.13%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5146 (44.88%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4936 (43.05%) | Time: 3.49s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5060 (44.13%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5342 (46.59%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4560 (39.77%) | Time: 3.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5329 (46.47%) | Time: 4.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5209 (45.43%) | Time: 3.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5357 (46.72%) | Time: 3.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4850 (42.3%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5327 (46.46%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5112 (44.58%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4846 (42.26%) | Time: 4.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4897 (42.71%) | Time: 3.69s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4917 (42.88%) | Time: 3.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4689 (40.89%) | Time: 3.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4753 (41.45%) | Time: 3.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5278 (46.03%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4846 (42.26%) | Time: 3.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5012 (43.71%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4849 (42.29%) | Time: 3.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5099 (44.47%) | Time: 3.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4410 (38.46%) | Time: 3.19s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4709 (41.07%) | Time: 3.24s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5314 (46.34%) | Time: 3.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5361 (46.75%) | Time: 3.32s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4900 (42.73%) | Time: 3.53s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4541 (39.6%) | Time: 3.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5006 (43.66%) | Time: 3.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5080 (44.3%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4818 (42.02%) | Time: 3.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5150 (44.91%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5004 (43.64%) | Time: 3.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5004 (43.64%) | Time: 3.24s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5159 (44.99%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5514 (48.09%) | Time: 3.31s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5022 (43.8%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4827 (42.09%) | Time: 3.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5570 (48.57%) | Time: 3.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4687 (40.87%) | Time: 3.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4648 (40.53%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4521 (39.43%) | Time: 4.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4738 (41.32%) | Time: 3.52s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4439 (38.71%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4804 (41.89%) | Time: 3.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4672 (40.74%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4525 (39.46%) | Time: 3.62s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4686 (40.87%) | Time: 4.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4805 (41.9%) | Time: 3.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5258 (45.85%) | Time: 4.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4656 (40.6%) | Time: 3.66s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4486 (39.12%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5170 (45.09%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5089 (44.38%) | Time: 3.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5252 (45.8%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4808 (41.93%) | Time: 3.39s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4997 (43.58%) | Time: 3.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4984 (43.46%) | Time: 3.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4897 (42.71%) | Time: 3.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5383 (46.94%) | Time: 3.68s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4624 (40.32%) | Time: 4.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4793 (41.8%) | Time: 4.17s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4518 (39.4%) | Time: 3.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5349 (46.65%) | Time: 3.76s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4828 (42.1%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 30, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4673 (40.75%) | Time: 3.92s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5342 (46.59%) | Time: 3.75s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5041 (43.96%) | Time: 3.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4401 (38.38%) | Time: 3.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4767 (41.57%) | Time: 3.32s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4881 (42.57%) | Time: 3.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5245 (45.74%) | Time: 3.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4549 (39.67%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4896 (42.7%) | Time: 4.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5050 (44.04%) | Time: 3.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4892 (42.66%) | Time: 3.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4671 (40.73%) | Time: 3.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5260 (45.87%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5476 (47.75%) | Time: 3.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4839 (42.2%) | Time: 3.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4671 (40.73%) | Time: 3.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5154 (44.95%) | Time: 10.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4815 (41.99%) | Time: 5.55s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5052 (44.06%) | Time: 4.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4970 (43.34%) | Time: 3.58s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4571 (39.86%) | Time: 3.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4971 (43.35%) | Time: 3.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4817 (42.01%) | Time: 4.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 8, Outliers: 212 (1.85%) | Time: 4.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5046 (44.0%) | Time: 4.49s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4967 (43.32%) | Time: 4.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5190 (45.26%) | Time: 3.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4547 (39.65%) | Time: 3.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 8, Outliers: 315 (2.75%) | Time: 3.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4346 (37.9%) | Time: 3.63s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4897 (42.71%) | Time: 4.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5036 (43.92%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 8, Outliers: 326 (2.84%) | Time: 3.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4935 (43.04%) | Time: 4.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5119 (44.64%) | Time: 4.04s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 8, Outliers: 324 (2.83%) | Time: 3.58s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4655 (40.59%) | Time: 3.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 8, Outliers: 377 (3.29%) | Time: 3.51s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4771 (41.61%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4629 (40.37%) | Time: 3.82s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4888 (42.63%) | Time: 3.79s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4655 (40.59%) | Time: 3.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4578 (39.92%) | Time: 4.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4948 (43.15%) | Time: 3.87s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5110 (44.56%) | Time: 3.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 9, Outliers: 351 (3.06%) | Time: 3.57s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4906 (42.78%) | Time: 4.16s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4968 (43.32%) | Time: 3.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 7, Outliers: 150 (1.31%) | Time: 3.97s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 8, Outliers: 279 (2.43%) | Time: 3.59s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5322 (46.41%) | Time: 4.08s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4711 (41.08%) | Time: 4.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 8, Outliers: 312 (2.72%) | Time: 3.54s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5101 (44.48%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 7, Outliers: 199 (1.74%) | Time: 3.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 8, Outliers: 291 (2.54%) | Time: 3.66s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 8, Outliers: 293 (2.56%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 8, Outliers: 300 (2.62%) | Time: 3.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5035 (43.91%) | Time: 3.72s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4693 (40.93%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 8, Outliers: 337 (2.94%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 8, Outliers: 324 (2.83%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 8, Outliers: 296 (2.58%) | Time: 3.49s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4284 (37.36%) | Time: 3.69s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 8, Outliers: 337 (2.94%) | Time: 3.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4769 (41.59%) | Time: 3.55s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 8, Outliers: 340 (2.97%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 7, Outliers: 387 (3.37%) | Time: 3.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4896 (42.7%) | Time: 3.7s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4773 (41.62%) | Time: 3.96s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4677 (40.79%) | Time: 3.55s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 8, Outliers: 258 (2.25%) | Time: 3.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 7, Outliers: 386 (3.37%) | Time: 3.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5083 (44.33%) | Time: 3.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4899 (42.72%) | Time: 3.75s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 8, Outliers: 319 (2.78%) | Time: 3.55s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 8, Outliers: 271 (2.36%) | Time: 3.63s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 7, Outliers: 159 (1.39%) | Time: 3.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 8, Outliers: 336 (2.93%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 6, Outliers: 205 (1.79%) | Time: 3.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 8, Outliers: 305 (2.66%) | Time: 3.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5310 (46.31%) | Time: 3.79s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 8, Outliers: 303 (2.64%) | Time: 3.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4719 (41.15%) | Time: 3.68s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 7, Outliers: 345 (3.01%) | Time: 3.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5074 (44.25%) | Time: 3.82s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 7, Outliers: 420 (3.66%) | Time: 3.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4500 (39.24%) | Time: 3.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5079 (44.29%) | Time: 3.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 8, Outliers: 382 (3.33%) | Time: 3.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 7, Outliers: 413 (3.6%) | Time: 3.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4609 (40.19%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 7, Outliers: 248 (2.16%) | Time: 3.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 8, Outliers: 319 (2.78%) | Time: 3.66s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 7, Outliers: 398 (3.47%) | Time: 3.87s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 8, Outliers: 334 (2.91%) | Time: 3.92s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 8, Outliers: 293 (2.56%) | Time: 4.09s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 8, Outliers: 388 (3.38%) | Time: 3.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4892 (42.66%) | Time: 3.87s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 7, Outliers: 397 (3.46%) | Time: 3.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 8, Outliers: 429 (3.74%) | Time: 3.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 7, Outliers: 320 (2.79%) | Time: 3.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 6, Outliers: 78 (0.68%) | Time: 3.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 7, Outliers: 251 (2.19%) | Time: 3.66s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 7, Outliers: 360 (3.14%) | Time: 3.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 5, Outliers: 243 (2.12%) | Time: 3.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 8, Outliers: 354 (3.09%) | Time: 3.62s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 8, Outliers: 368 (3.21%) | Time: 3.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 8, Outliers: 263 (2.29%) | Time: 3.8s

Embedding model: all-mpnet-base-v2


Batches: 100%|██████████| 359/359 [02:08<00:00,  2.79it/s]



Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4699 (40.98%) | Time: 5.44s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4210 (36.71%) | Time: 4.65s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4677 (40.79%) | Time: 4.8s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4402 (38.39%) | Time: 4.93s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4745 (41.38%) | Time: 4.41s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4740 (41.34%) | Time: 4.63s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4489 (39.15%) | Time: 5.98s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4571 (39.86%) | Time: 4.76s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4454 (38.84%) | Time: 4.35s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4743 (41.36%) | Time: 4.5s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4795 (41.82%) | Time: 4.68s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4780 (41.68%) | Time: 4.42s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4722 (41.18%) | Time: 4.57s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4647 (40.52%) | Time: 4.15s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4949 (43.16%) | Time: 4.42s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4818 (42.02%) | Time: 4.51s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4847 (42.27%) | Time: 4.13s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4442 (38.74%) | Time: 4.24s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4874 (42.5%) | Time: 4.07s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4839 (42.2%) | Time: 4.02s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4989 (43.51%) | Time: 4.25s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4744 (41.37%) | Time: 4.56s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 5012 (43.71%) | Time: 4.65s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 5242 (45.71%) | Time: 4.86s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4822 (42.05%) | Time: 4.02s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 15, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 10}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 15}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min



Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Done | Topics: 14, Outliers: 4866 (42.43%) | Time: 4.7s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}




Done | Topics: 19, Outliers: 4800 (41.86%) | Time: 4.79s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 25, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 10}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 30, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_clust



Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 35, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', '



Done | Topics: 14, Outliers: 4770 (41.6%) | Time: 4.51s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', '



Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 40, 'min_samples': 7, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 15} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 40, 'min_samples': 9, 'nr_topics': 20} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min



Done | Topics: 7, Outliers: 166 (1.45%) | Time: 4.45s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 6, Outliers: 243 (2.12%) | Time: 4.82s

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'euclidean', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20}




Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'manhattan', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 20} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10}
Failed for config: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 10} — [Errno 28] No space left on device: '../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv'

Running: {'embedding_model': 'all-mpnet-base-v2', 'metric': 'cosine', 'min_cluster_size': 45, 'min_samples': 5, 'nr_topics': 15}


KeyboardInterrupt: 

## View saved topic_model (CURRENT):

In [None]:
from bertopic import BERTopic
topic_model = BERTopic.load("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c10_s5_ntnone")

# Inspect
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4566,-1_the_and_to_of,"[the, and, to, of, in, for, is, that, we, it]",[understandable i know a lot of people essenti...
1,0,334,0_coal_clean_plants_trump,"[coal, clean, plants, trump, mine, beautiful, ...",[trump tried to bring back coal in his first t...
2,1,269,1_solar_electricity_panels_power,"[solar, electricity, panels, power, generation...",[renewables met of the growth in electricity d...
3,2,266,2_depth_snow_apr_precip,"[depth, snow, apr, precip, low, high, iembot, ...",[birmingham apr climate report high low precip...
4,3,246,3_apr_missing_iembot_additional,"[apr, missing, iembot, additional, details, vi...",[key west apr climate report high low precip s...
...,...,...,...,...,...
193,192,10,192_bags_ecofriendly_plastic_tree,"[bags, ecofriendly, plastic, tree, canadianwil...",[every item ordered plants a tree and removes ...
194,193,10,193_water_facilities_carries_desalination,"[water, facilities, carries, desalination, sma...",[we support latvias largest water utility rgas...
195,194,10,194_dinner_functioningcore_alreadydirty_dishes...,"[dinner, functioningcore, alreadydirty, dishes...",[clean space clean mind why tidying up is good...
196,195,10,195_indoors_cooking_propane_stove,"[indoors, cooking, propane, stove, gas, showin...",[natural gas is linked with cancer not wind po...


In [15]:
from bertopic import BERTopic

# Load saved model
topic_model = BERTopic.load("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_manhattan_c22_s9_nt10")

# Get original texts
texts = climate_df["clean_text"].tolist()  

# Reduce to 10 topics
topic_model.reduce_topics(docs=texts, nr_topics=10)

# View updated topic summary
reduced_topic_info = topic_model.get_topic_info()
print(reduced_topic_info.head(11))


   Topic  Count                          Name  \
0     -1   4955              -1_the_to_and_of   
1      0   4852               0_the_to_and_of   
2      1   1052         1_apr_snow_precip_low   
3      2    199            2_green_the_and_of   
4      3    133  3_plastic_plastics_waste_the   
5      4    105          4_air_the_autism_and   
6      5     65                 5_de_en_la_es   
7      6     47     6_carbon_carbonated_it_my   
8      7     31         7_of_emission_the_and   
9      8     28         8_reg_promo_code_deal   

                                      Representation  \
0  [the, to, and, of, in, is, for, that, on, clim...   
1  [the, to, and, of, in, is, for, that, climate,...   
2  [apr, snow, precip, low, high, iembot, additio...   
3  [green, the, and, of, to, like, for, my, game,...   
4  [plastic, plastics, waste, the, and, microplas...   
5  [air, the, autism, and, of, environmental, to,...   
6        [de, en, la, es, les, que, des, et, le, el]   
7  [carbon, 

In [17]:
## Filter for lowest outlier percentage

import pandas as pd
import os

# Load the log
log_path = "../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv"
log_df = pd.read_csv(log_path)

# Sort by outlier percentage (ascending = best first)
log_df_sorted = log_df.sort_values(by="outlier_pct")

# Show top 10
print(log_df_sorted.head(10))


      embedding_model     metric  min_cluster_size  min_samples  nr_topics  \
9    all-MiniLM-L6-v2     cosine                10            5         10   
7    all-MiniLM-L6-v2  manhattan                10            5         14   
10   all-MiniLM-L6-v2     cosine                10            5         12   
50   all-MiniLM-L6-v2  euclidean                12            7         12   
155  all-MiniLM-L6-v2     cosine                18            5         14   
38   all-MiniLM-L6-v2  euclidean                12            5         12   
39   all-MiniLM-L6-v2  euclidean                12            5         14   
146  all-MiniLM-L6-v2  euclidean                18            5         12   
46   all-MiniLM-L6-v2     cosine                12            5         12   
111  all-MiniLM-L6-v2  euclidean                16            5         14   

     n_topics  outliers  outlier_pct  time_sec  
9           9      4372        38.13      8.30  
7          13      4402        38.39      8

In [52]:
top_n = 10
base_model_dir = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters"

top_models = []

for row in log_df_sorted.head(top_n).itertuples():
    model_name = f"{row.embedding_model}_{row.metric}_c{row.min_cluster_size}_s{row.min_samples}_nt{row.nr_topics}"
    model_path = os.path.join(base_model_dir, f"_{model_name}")
    top_models.append((model_name, model_path))

# Print top model paths
for name, path in top_models:
    print(f"{name}: {path}")


all-MiniLM-L6-v2_cosine_c10_s5_nt10: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c10_s5_nt10
all-MiniLM-L6-v2_manhattan_c10_s5_nt14: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_manhattan_c10_s5_nt14
all-MiniLM-L6-v2_cosine_c10_s5_nt12: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c10_s5_nt12
all-MiniLM-L6-v2_euclidean_c12_s7_nt12: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c12_s7_nt12
all-MiniLM-L6-v2_cosine_c18_s5_nt14: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c18_s5_nt14
all-MiniLM-L6-v2_euclidean_c12_s5_nt12: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidea

In [59]:
from bertopic import BERTopic

# Load saved model
topic_model = BERTopic.load("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c15_s9_nt20")

# Get original texts
texts = climate_df["clean_text"].tolist()  

# Reduce to 10 topics

pd.set_option('display.max_colwidth', None)
# View updated topic summary
reduced_topic_info = topic_model.get_topic_info()
print(reduced_topic_info.head(15))


    Topic  Count  \
0      -1   5270   
1       0   2708   
2       1    863   
3       2    603   
4       3    595   
5       4    357   
6       5    225   
7       6    195   
8       7    140   
9       8    114   
10      9     74   
11     10     71   
12     11     65   
13     12     39   
14     13     28   

                                                                   Name  \
0                                                      -1_the_to_and_of   
1                                                       0_the_to_and_of   
2                                                 1_apr_snow_precip_low   
3                                                 2_ai_energy_the_solar   
4                                                  3_the_and_climate_of   
5                                                4_water_plastic_the_to   
6                                                  5_green_flag_red_the   
7                                                    6_cars_tesla_to_ev   
8   

In [72]:
topic_id = 6
#
topic_model.get_representative_docs(topic_id)
df_asd = pd.read_json("../../data/filtered/above60chars_above99score.json")
print(df_asd)
df_asd = pd.DataFrame(df_asd,topics,reset_index=True)


                                     repo         seq  \
25       did:plc:uli2rqyfqasvuawksu2z5jkc  7778280581   
204      did:plc:4zh2idecxr5zudhn3oniodhw  7778286641   
411      did:plc:m6ntt433rso3lp7dxaja3mue  7778293323   
441      did:plc:cm4nhw2xk43bczonk7mbfvrb  7778294643   
448      did:plc:ci5fsjcdjgoct5k3yllky4ud  7778294887   
...                                   ...         ...   
1310384  did:plc:33jio4po2g6jowei4lxedlwb  7606048401   
1310614  did:plc:nkatt34qgc76mncywdyafngb  7606056509   
1310970  did:plc:k46un34mwzrvyyifeuyu6uie  7606071491   
1310979  did:plc:23a3ahyixkin56ejpbsnbhec  7606072018   
1311000  did:plc:fvmllofyyrsll5skmykmbug3  7606072608   

                                                                                                                                                                                                                                                                                                             text  \
25    

TypeError: __init__() got an unexpected keyword argument 'reset_index'

In [58]:
topics_cleaned = topic_info[topic_info["Topic"] != -1]

def is_high_quality_topic(keywords):
    stopwords = {"the", "to", "and", "of", "in", "is", "for", "that", "it"}
    return len([w for w in keywords if w not in stopwords]) >= 5

topics_cleaned["high_quality"] = topics_cleaned["Representation"].apply(is_high_quality_topic)

good_topics = topics_cleaned[topics_cleaned["high_quality"] == True]

for _, row in good_topics.iterrows():
    print(f"\nTopic {row['Topic']} | Count: {row['Count']}")
    print("Top words:", ", ".join(row["Representation"][:30]))
    print("Example post:", row["Representative_Docs"][0])




Topic 1 | Count: 324
Top words: snow, apr, depth, precip, low, high, iembot, additional, details, via
Example post: lansing apr climate report high low precip snow snow depth at wed apr via iembot additional details here

Topic 2 | Count: 256
Top words: airport, apr, snow, precip, low, high, missing, climate, link, iembot
Example post: oceanside airport ca apr climate report high low precip snow missing at mon apr via iembot additional details here

Topic 3 | Count: 229
Top words: apr, missing, iembot, additional, details, via, report, precip, low, here
Example post: jackson apr climate report high low precip snow missing at wed apr via iembot additional details here

Topic 4 | Count: 62
Top words: missing, link, precip, low, high, snow, apr, climate, trace, austin
Example post: peachtree city apr climate high low precip trace snow missing link

Topic 5 | Count: 48
Top words: issues, spc, risk, elevated, day, link, fire, weather, apr, at
Example post: spc issues day elevated fire weat

In [33]:
import pandas as pd

df = pd.read_csv("../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv")

# Group and average by hyperparameter config
agg = df.groupby(["min_cluster_size", "min_samples", "nr_topics"]).agg({
    "n_topics": "mean",
    "outlier_pct": "mean",
    "time_sec": "mean"
}).reset_index()

agg.sort_values("outlier_pct").head()


Unnamed: 0,min_cluster_size,min_samples,nr_topics,n_topics,outlier_pct,time_sec
3,10,5,14,13.0,39.736667,8.266667
14,12,5,12,11.0,40.183333,7.163333
1,10,5,10,9.0,40.25,6.996667
15,12,5,14,13.0,40.46,7.38
2,10,5,12,11.0,40.64,6.973333
