# BERTopic: 
- Preprocess (normalize text, filter for "yes" label)

- Embedding (convert text to number representation)

- Topic Modeling (find different cluster setups)

In [6]:
#imports / installs

import pandas as pd
import re
import glob
import os
import itertools
import hdbscan


## Step 1: Preprocessing

In [7]:
def preprocess_text(text):
    """Normalize a raw post into lowercase, letters-only text.

    Steps, in order: lowercase; drop URLs; drop @mentions and #hashtags; drop
    every character that is not an ASCII letter or whitespace (digits and
    punctuation included); collapse whitespace runs into a single space and
    trim both ends.

    Args:
        text: the raw post as a ``str``.

    Returns:
        The cleaned string (may be empty).
    """
    # Lowercase
    text = text.lower()
    # Remove URLs. Requiring "://" after http(s) and a word-initial "www."
    # keeps ordinary words such as "awwww" or "https" from being eaten.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)
    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove everything except ASCII letters and whitespace
    # (this also strips digits and punctuation)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [8]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """Encode ``texts`` with a SentenceTransformer model and return the result.

    Args:
        texts: the documents handed to ``SentenceTransformer.encode``.
        model_name: name of the model to load; defaults to "all-MiniLM-L6-v2".

    Returns:
        Whatever ``encode`` returns for ``texts``.

    A new model instance is created on every call and a progress bar is shown
    while encoding.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(texts, show_progress_bar=True)


In [9]:
INPUT_PATH = "../../data/climate_classified/" #Currently uses the 14 jsons * 100,000 posts 

# Collect every JSON export in INPUT_PATH. glob returns paths in an arbitrary,
# filesystem-dependent order, so sort them to get the same row order on every run.
json_pattern = os.path.join(INPUT_PATH,'*.json')
combined_paths = sorted(glob.glob(json_pattern))

dfs = []

for path in combined_paths:
    try:
        df = pd.read_json(path)
    except ValueError as e:
        # Unparseable file: report it and keep loading the remaining files.
        print(f"Failed to read {path}: {e}")
    else:
        dfs.append(df)


In [10]:
# Merge the per-file frames into one table with a fresh 0..n-1 index and
# report how many posts were loaded; `temp` is only created when data exists.
if not dfs:
    print("No data loaded.")
else:
    temp = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(temp)} posts from {len(dfs)} files.")

Loaded 1311270 posts from 14 files.


### NOTE: Currently filters for 60+ characters and score >= 0.99 !

In [11]:
# Filtering thresholds, kept in one place so they are easy to tune.
MIN_CLEAN_TEXT_CHARS = 60   # minimum length of the cleaned text, in characters
MIN_SCORE = 0.99            # minimum value of the "score" column

# Step 1: Keep only posts labelled "yes" whose score passes the threshold.
# The score test does not depend on the cleaned text, so applying it first
# means preprocess_text only runs on rows that can survive the filter.
climate_df = temp[(temp["label"] == "yes") & (temp["score"] >= MIN_SCORE)].copy()

# Step 2: Light preprocessing
climate_df["clean_text"] = climate_df["text"].astype(str).apply(preprocess_text)

# Step 3: Drop posts whose cleaned text is too short
climate_df = climate_df[climate_df["clean_text"].str.len() >= MIN_CLEAN_TEXT_CHARS].copy()

print(f"Remaining posts after full filtering: {len(climate_df)}")

Remaining posts after full filtering: 11467


In [9]:
# Save filtered climate_df

output_path = "../../data/filtered/above60chars_above99score.json"
# Create the target folder first: to_json does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
climate_df.to_json(output_path)

print(f"Filtered dataset saved to: {output_path}")

Filtered dataset saved to: ../../data/filtered/above60chars_above99score.json


In [13]:
# Inspect which columns the filtered frame carries (including the added clean_text).
climate_df.columns

Index(['repo', 'seq', 'text', 'timestamp', 'cid', 'uri', 'label', 'score',
       'clean_text'],
      dtype='object')

In [14]:
# Preview the first rows of the filtered posts.
climate_df.head()

Unnamed: 0,repo,seq,text,timestamp,cid,uri,label,score,clean_text
25,did:plc:uli2rqyfqasvuawksu2z5jkc,7778280581,Trump's executive order trying to block state ...,2025-04-09 21:10:45.855,bafyreihbjn7mnkbiytl4wc2jjhukux7xfncg772auwhhu...,at://did:plc:uli2rqyfqasvuawksu2z5jkc/app.bsky...,yes,0.997684,trumps executive order trying to block state c...
204,did:plc:4zh2idecxr5zudhn3oniodhw,7778286641,Spain and Canada signed agreements on renewabl...,2025-04-09 21:10:53.664,bafyreibvwoj6qzbffnpz4rkgxena26ejvpfqoznkbed7n...,at://did:plc:4zh2idecxr5zudhn3oniodhw/app.bsky...,yes,0.993054,spain and canada signed agreements on renewabl...
411,did:plc:m6ntt433rso3lp7dxaja3mue,7778293323,When did you bitch about what Republicans were...,2025-04-09 21:11:03.083,bafyreic4vipeqxz6mlm36qa7uw3qqjbxwo3qtvucl2fek...,at://did:plc:m6ntt433rso3lp7dxaja3mue/app.bsky...,yes,0.996543,when did you bitch about what republicans were...
441,did:plc:cm4nhw2xk43bczonk7mbfvrb,7778294643,"Hydrogen, as you know, is useful for decarboni...",2025-04-09 21:11:05.110,bafyreidsl6kwy2dx6rtvqkhjyafcffv6cwdud7aefrubf...,at://did:plc:cm4nhw2xk43bczonk7mbfvrb/app.bsky...,yes,0.997304,hydrogen as you know is useful for decarbonisi...
448,did:plc:ci5fsjcdjgoct5k3yllky4ud,7778294887,Either we end the Fossil Fuel Era or the Fossi...,2025-04-09 21:11:05.321,bafyreifegyoen4hku664cni3qqh3v6xbnoptueyb76dqm...,at://did:plc:ci5fsjcdjgoct5k3yllky4ud/app.bsky...,yes,0.99579,either we end the fossil fuel era or the fossi...


## Step 2: Embedding Generation

pip install -U sentence-transformers

In [15]:
pip install -U sentence-transformers -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from sentence_transformers import SentenceTransformer

In [25]:
# Number of documents to embed. Counted from climate_df directly because
# texts_to_embed is only created in a later cell, so this cell would fail on a
# fresh top-to-bottom run otherwise.
len(climate_df["clean_text"])

11467

In [17]:
# Embed every cleaned post. The model id is spelled exactly like in the rest of
# the notebook ("all-MiniLM-L6-v2") so all cells refer to the same model name.
model = SentenceTransformer("all-MiniLM-L6-v2")
texts_to_embed = climate_df["clean_text"].tolist()
embeddings = model.encode(texts_to_embed, show_progress_bar=True)

Batches: 100%|██████████| 359/359 [00:57<00:00,  6.22it/s]


In [64]:
pip install bertopic -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[K     |████████████████████████████████| 150 kB 261 kB/s eta 0:00:01
[?25hCollecting plotly>=4.7.0
  Downloading plotly-6.0.1-py3-none-any.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 167 kB/s eta 0:00:01
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.40.tar.gz (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 60 kB/s eta 0:00:016
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  user = False
  home = None
  root = None
  prefix = '/private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-b

In [65]:
from bertopic import BERTopic

In [66]:
# Baseline topic model: only the language and verbosity are set here; every
# other BERTopic setting is left at whatever the library defaults to.
topic_model = BERTopic(language="english", verbose=True)

In [26]:
# Fit the baseline model on the cleaned posts, reusing the precomputed
# embeddings, then display the first 20 rows of the topic overview table.
topics, probs = topic_model.fit_transform(
    documents=texts_to_embed,
    embeddings=embeddings,
)

topic_overview = topic_model.get_topic_info()
topic_overview.head(20)

NameError: name 'embeddings' is not defined

## Tuning

To improve the clustering of topics, we can improve the model in several ways:

- Change HDBSCAN settings (min_cluster_size, min_samples, metric = euclidean, manhattan, cosine)
- Manual merging of topics (Two related fine-grained topics could be merged into a broader, more general topic)
- Change Sentence Transformer to a different model (ie. "all-mpnet-base-v2")



## CURRENT!:

- steps of 2 for hyperparameters
- "cosine" is emulated by L2-normalizing the embeddings and clustering with the euclidean metric; euclidean and manhattan are passed to HDBSCAN unchanged
- logs and saves results in a csv

- 

In [29]:
import os
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize
import itertools
import pandas as pd
import time

# Grid search over embedding model x HDBSCAN settings x topic reduction.
# Every finished configuration is appended to a CSV log and its fitted model is
# saved, so an interrupted run can be resumed: logged configurations are skipped.

# Data: the cleaned text of every filtered post, in DataFrame order
texts_to_embed = climate_df["clean_text"].tolist()

# Hyperparameter ranges
embedding_models = ["all-MiniLM-L6-v2", "all-mpnet-base-v2"]
min_cluster_sizes = list(range(10, 51, 5))     # 10, 15, ..., 50 (step 5)
min_samples_vals = list(range(5, 11, 2))       # 5, 7, 9
nr_topics_vals = [8, 10, 12, 14]
distance_metrics = ["euclidean", "manhattan", "cosine"]

# Logging setup
log_path = "../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv"
log_columns = [
    "embedding_model", "metric", "min_cluster_size", "min_samples",
    "nr_topics", "n_topics", "outliers", "outlier_pct", "time_sec"
]

# The log and the saved models live in the same folder; create it up front so
# neither the first log write nor the first model save fails when it is missing.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Load or initialize log
if os.path.exists(log_path):
    log_df = pd.read_csv(log_path)
else:
    log_df = pd.DataFrame(columns=log_columns)
    log_df.to_csv(log_path, index=False)

# NOTE(review): BERTopic by default reduces the embeddings with UMAP before
# HDBSCAN runs, so `metric` below presumably acts on the reduced vectors rather
# than on the sentence embeddings, and no random seed is fixed in this cell.
# Confirm that differences between configurations are not just run-to-run noise.

# Main loop
for embed_model in embedding_models:
    print(f"\nEmbedding model: {embed_model}")
    model = SentenceTransformer(embed_model)
    embeddings_local = model.encode(texts_to_embed, show_progress_bar=True)
    # L2-normalized copy used by the "cosine" runs; it only depends on the
    # embedding model, so compute it once here instead of once per configuration.
    embeddings_normalized = normalize(embeddings_local, norm="l2")

    for min_cluster_size, min_samples, metric, nr_topics in itertools.product(
        min_cluster_sizes, min_samples_vals, distance_metrics, nr_topics_vals
    ):
        # Key that identifies this configuration in the log
        run_key = {
            "embedding_model": embed_model,
            "metric": metric,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "nr_topics": nr_topics if nr_topics else "None"
        }

        # Check if already done: any log row matching all five key fields
        existing = log_df[
            (log_df.embedding_model == run_key["embedding_model"]) &
            (log_df.metric == run_key["metric"]) &
            (log_df.min_cluster_size == run_key["min_cluster_size"]) &
            (log_df.min_samples == run_key["min_samples"]) &
            (log_df.nr_topics == run_key["nr_topics"])
        ]

        if not existing.empty:
            print(f"Skipping already completed: {run_key}")
            continue

        print(f"\nRunning: {run_key}")
        start = time.time()

        # "cosine" is emulated: unit-length vectors + euclidean metric.
        # The other metrics are handed to HDBSCAN as-is on the raw embeddings.
        if metric == "cosine":
            embeddings_used = embeddings_normalized
            hdbscan_metric = "euclidean"
        else:
            embeddings_used = embeddings_local
            hdbscan_metric = metric

        # HDBSCAN setup
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=hdbscan_metric,
            cluster_selection_method="eom"
        )

        # BERTopic setup
        topic_model = BERTopic(
            hdbscan_model=hdbscan_model,
            language="english",
            calculate_probabilities=False,
            verbose=False,
            low_memory=True                                  # OBS: Remove if we want to train on new docs/entries and need .transform()!!!
        )

        try:
            topics, _ = topic_model.fit_transform(texts_to_embed, embeddings_used)

            if nr_topics:
                topic_model.reduce_topics(texts_to_embed, nr_topics=nr_topics)
                topics = topic_model.topics_

            # Summary statistics; topic -1 is counted as the outlier bucket
            topic_info = topic_model.get_topic_info()
            n_topics = len(topic_info[topic_info.Topic != -1])
            n_outliers = topic_info[topic_info.Topic == -1].Count.values[0] if -1 in topic_info.Topic.values else 0
            n_total = sum(topic_info.Count)
            duration = round(time.time() - start, 2)

            log_entry = {
                **run_key,
                "n_topics": n_topics,
                "outliers": n_outliers,
                "outlier_pct": round(n_outliers / n_total * 100, 2),
                "time_sec": duration
            }

            model_name = f"{embed_model}_{metric}_c{min_cluster_size}_s{min_samples}_nt{nr_topics or 'none'}"
            save_path = f"../../data/BERTopic_Hyperparameters/_{model_name}"

            # Save the model BEFORE recording the run. If saving raises, the
            # configuration stays out of the log and is retried on the next run
            # instead of being skipped with no model on disk.
            topic_model.save(save_path)

            # Record the completed run and persist the log immediately so an
            # interruption loses at most the configuration in progress.
            log_df = pd.concat([log_df, pd.DataFrame([log_entry])], ignore_index=True)
            log_df.to_csv(log_path, index=False)

            print(f"Done | Topics: {n_topics}, Outliers: {n_outliers} ({log_entry['outlier_pct']}%) | Time: {duration}s")

        except Exception as e:
            # Best effort: report the failing configuration and move on to the next one.
            print(f"Failed for config: {run_key} — {e}")



Embedding model: all-MiniLM-L6-v2


Batches: 100%|██████████| 359/359 [01:12<00:00,  4.93it/s]



Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4856 (42.35%) | Time: 9.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4789 (41.76%) | Time: 6.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4830 (42.12%) | Time: 6.23s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4694 (40.93%) | Time: 6.84s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4581 (39.95%) | Time: 7.24s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4685 (40.86%) | Time: 6.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4730 (41.25%) | Time: 7.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4402 (38.39%) | Time: 8.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4637 (40.44%) | Time: 9.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4372 (38.13%) | Time: 8.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4421 (38.55%) | Time: 6.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4574 (39.89%) | Time: 9.62s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4758 (41.49%) | Time: 7.07s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4895 (42.69%) | Time: 5.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4857 (42.36%) | Time: 7.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5114 (44.6%) | Time: 9.79s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4887 (42.62%) | Time: 8.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5109 (44.55%) | Time: 8.63s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5021 (43.79%) | Time: 6.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4996 (43.57%) | Time: 6.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4784 (41.72%) | Time: 7.28s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5156 (44.96%) | Time: 11.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4748 (41.41%) | Time: 6.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5023 (43.8%) | Time: 5.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5379 (46.91%) | Time: 6.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5420 (47.27%) | Time: 7.69s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4849 (42.29%) | Time: 9.32s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4882 (42.57%) | Time: 6.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5026 (43.83%) | Time: 6.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5378 (46.9%) | Time: 6.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5015 (43.73%) | Time: 6.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5353 (46.68%) | Time: 7.12s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5022 (43.8%) | Time: 6.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4890 (42.64%) | Time: 7.14s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5164 (45.03%) | Time: 7.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 10, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5223 (45.55%) | Time: 6.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4869 (42.46%) | Time: 6.02s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4772 (41.62%) | Time: 6.82s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4463 (38.92%) | Time: 6.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4463 (38.92%) | Time: 7.83s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4855 (42.34%) | Time: 6.46s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4876 (42.52%) | Time: 7.15s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4857 (42.36%) | Time: 7.6s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4691 (40.91%) | Time: 6.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4746 (41.39%) | Time: 9.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4692 (40.92%) | Time: 9.46s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4503 (39.27%) | Time: 7.24s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4764 (41.55%) | Time: 7.81s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4886 (42.61%) | Time: 6.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5174 (45.12%) | Time: 7.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4430 (38.63%) | Time: 6.14s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5131 (44.75%) | Time: 6.04s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4945 (43.12%) | Time: 6.21s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4924 (42.94%) | Time: 6.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4843 (42.23%) | Time: 5.07s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5033 (43.89%) | Time: 7.75s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4840 (42.21%) | Time: 6.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4712 (41.09%) | Time: 6.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4904 (42.77%) | Time: 6.32s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4719 (41.15%) | Time: 7.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4890 (42.64%) | Time: 5.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5246 (45.75%) | Time: 5.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5056 (44.09%) | Time: 7.02s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4925 (42.95%) | Time: 8.69s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5172 (45.1%) | Time: 9.09s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5171 (45.09%) | Time: 8.24s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5093 (44.41%) | Time: 7.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5222 (45.54%) | Time: 7.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5265 (45.91%) | Time: 6.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5127 (44.71%) | Time: 6.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4874 (42.5%) | Time: 5.59s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 12, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5150 (44.91%) | Time: 8.09s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4608 (40.18%) | Time: 7.97s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4584 (39.98%) | Time: 5.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4905 (42.77%) | Time: 5.95s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4849 (42.29%) | Time: 9.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4753 (41.45%) | Time: 12.06s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4684 (40.85%) | Time: 9.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4884 (42.59%) | Time: 9.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4885 (42.6%) | Time: 9.68s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4831 (42.13%) | Time: 12.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4831 (42.13%) | Time: 8.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4565 (39.81%) | Time: 8.06s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4852 (42.31%) | Time: 7.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4643 (40.49%) | Time: 7.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4738 (41.32%) | Time: 15.59s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5025 (43.82%) | Time: 10.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5210 (45.43%) | Time: 11.97s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4733 (41.27%) | Time: 10.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5216 (45.49%) | Time: 7.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5235 (45.65%) | Time: 10.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4941 (43.09%) | Time: 10.02s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4935 (43.04%) | Time: 8.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4717 (41.14%) | Time: 9.15s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5058 (44.11%) | Time: 16.07s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4889 (42.64%) | Time: 11.19s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5186 (45.23%) | Time: 10.75s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5019 (43.77%) | Time: 9.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5232 (45.63%) | Time: 9.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4872 (42.49%) | Time: 11.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5281 (46.05%) | Time: 12.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5075 (44.26%) | Time: 11.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5311 (46.32%) | Time: 14.03s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5235 (45.65%) | Time: 9.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5060 (44.13%) | Time: 10.15s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5122 (44.67%) | Time: 8.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5321 (46.4%) | Time: 10.84s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 14, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4921 (42.91%) | Time: 23.58s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4814 (41.98%) | Time: 29.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4522 (39.43%) | Time: 28.51s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4923 (42.93%) | Time: 24.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4505 (39.29%) | Time: 23.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4561 (39.78%) | Time: 28.16s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4795 (41.82%) | Time: 30.46s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4778 (41.67%) | Time: 19.59s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5033 (43.89%) | Time: 20.1s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4849 (42.29%) | Time: 17.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4995 (43.56%) | Time: 21.84s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4980 (43.43%) | Time: 30.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4796 (41.82%) | Time: 26.03s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4729 (41.24%) | Time: 28.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5011 (43.7%) | Time: 25.96s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4967 (43.32%) | Time: 33.11s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5189 (45.25%) | Time: 13.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5051 (44.05%) | Time: 8.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5145 (44.87%) | Time: 10.11s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5211 (45.44%) | Time: 9.22s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4993 (43.54%) | Time: 8.72s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5002 (43.62%) | Time: 12.15s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4772 (41.62%) | Time: 9.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4861 (42.39%) | Time: 9.45s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4827 (42.09%) | Time: 11.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5351 (46.66%) | Time: 9.84s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4894 (42.68%) | Time: 8.12s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5401 (47.1%) | Time: 9.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5209 (45.43%) | Time: 7.5s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5388 (46.99%) | Time: 8.62s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4954 (43.2%) | Time: 7.63s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5417 (47.24%) | Time: 7.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4850 (42.3%) | Time: 11.29s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5392 (47.02%) | Time: 10.22s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5007 (43.66%) | Time: 9.57s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5045 (44.0%) | Time: 9.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 16, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5484 (47.82%) | Time: 10.8s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4783 (41.71%) | Time: 10.45s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4914 (42.85%) | Time: 16.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4491 (39.16%) | Time: 12.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4923 (42.93%) | Time: 10.0s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4995 (43.56%) | Time: 7.87s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4815 (41.99%) | Time: 8.28s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4689 (40.89%) | Time: 9.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4795 (41.82%) | Time: 9.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4889 (42.64%) | Time: 8.86s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4716 (41.13%) | Time: 8.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5013 (43.72%) | Time: 9.61s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4461 (38.9%) | Time: 12.34s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5224 (45.56%) | Time: 10.01s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4840 (42.21%) | Time: 9.54s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5011 (43.7%) | Time: 10.05s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5304 (46.25%) | Time: 6.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4770 (41.6%) | Time: 7.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4812 (41.96%) | Time: 8.64s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5065 (44.17%) | Time: 8.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4743 (41.36%) | Time: 6.63s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4918 (42.89%) | Time: 7.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4771 (41.61%) | Time: 7.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4775 (41.64%) | Time: 8.11s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5068 (44.2%) | Time: 7.12s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5042 (43.97%) | Time: 6.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4935 (43.04%) | Time: 6.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5122 (44.67%) | Time: 6.72s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5092 (44.41%) | Time: 7.47s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5161 (45.01%) | Time: 8.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5071 (44.22%) | Time: 6.23s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5136 (44.79%) | Time: 10.35s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4867 (42.44%) | Time: 7.44s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5143 (44.85%) | Time: 13.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5117 (44.62%) | Time: 7.89s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5112 (44.58%) | Time: 7.51s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 18, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5082 (44.32%) | Time: 6.01s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4974 (43.38%) | Time: 6.42s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4725 (41.21%) | Time: 6.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4702 (41.0%) | Time: 7.82s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4517 (39.39%) | Time: 6.62s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4987 (43.49%) | Time: 8.29s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4999 (43.59%) | Time: 8.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4612 (40.22%) | Time: 7.97s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4802 (41.88%) | Time: 6.7s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4759 (41.5%) | Time: 6.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4572 (39.87%) | Time: 7.19s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4815 (41.99%) | Time: 6.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4789 (41.76%) | Time: 6.69s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4784 (41.72%) | Time: 6.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5079 (44.29%) | Time: 6.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4761 (41.52%) | Time: 7.72s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5499 (47.96%) | Time: 6.07s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5083 (44.33%) | Time: 8.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5225 (45.57%) | Time: 10.29s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5087 (44.36%) | Time: 11.21s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5024 (43.81%) | Time: 6.78s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5259 (45.86%) | Time: 5.92s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4765 (41.55%) | Time: 6.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5069 (44.21%) | Time: 5.59s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5322 (46.41%) | Time: 6.0s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4970 (43.34%) | Time: 5.54s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5215 (45.48%) | Time: 5.91s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5106 (44.53%) | Time: 5.37s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4807 (41.92%) | Time: 6.3s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5441 (47.45%) | Time: 11.58s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5053 (44.07%) | Time: 8.01s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5225 (45.57%) | Time: 7.74s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5639 (49.18%) | Time: 7.89s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5059 (44.12%) | Time: 8.17s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5378 (46.9%) | Time: 8.2s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5415 (47.22%) | Time: 7.08s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 20, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5150 (44.91%) | Time: 6.04s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4950 (43.17%) | Time: 6.02s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4691 (40.91%) | Time: 6.38s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5018 (43.76%) | Time: 6.51s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4794 (41.81%) | Time: 6.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5043 (43.98%) | Time: 7.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4807 (41.92%) | Time: 7.29s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5172 (45.1%) | Time: 7.65s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4988 (43.5%) | Time: 6.11s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5052 (44.06%) | Time: 7.94s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5087 (44.36%) | Time: 6.83s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 12}




Done | Topics: 11, Outliers: 4732 (41.27%) | Time: 6.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 5, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4671 (40.73%) | Time: 6.4s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5062 (44.14%) | Time: 6.41s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4931 (43.0%) | Time: 7.77s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5225 (45.57%) | Time: 9.25s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5277 (46.02%) | Time: 5.9s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5091 (44.4%) | Time: 6.13s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5328 (46.46%) | Time: 7.95s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5293 (46.16%) | Time: 6.85s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5069 (44.21%) | Time: 6.58s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4912 (42.84%) | Time: 6.68s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4815 (41.99%) | Time: 6.43s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5288 (46.11%) | Time: 9.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 22, 'min_samples': 7, 'nr_topics': 14}




Done | Topics: 13, Outliers: 4837 (42.18%) | Time: 8.19s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 4954 (43.2%) | Time: 7.36s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 5291 (46.14%) | Time: 6.98s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 12}




Done | Topics: 11, Outliers: 5040 (43.95%) | Time: 6.29s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 14}




Done | Topics: 13, Outliers: 5584 (48.7%) | Time: 5.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 8}




Done | Topics: 7, Outliers: 5681 (49.54%) | Time: 8.46s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 10}




Done | Topics: 9, Outliers: 4955 (43.21%) | Time: 8.33s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 12}




Failed for config: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 12} — [Errno 28] No space left on device

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 22, 'min_samples': 9, 'nr_topics': 14}


KeyboardInterrupt: 

## View saved topic_model (CURRENT):

In [None]:
from bertopic import BERTopic

# Load the saved model through the same project-relative directory the other
# cells use, instead of a user-specific absolute path, so the cell is not tied
# to one machine.
# NOTE(review): assumes ../../data resolves to the project's data/ folder from
# the notebook's working directory — confirm.
topic_model = BERTopic.load("../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c10_s5_ntnone")

# Inspect the per-topic summary table (displayed as the cell's output)
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4566,-1_the_and_to_of,"[the, and, to, of, in, for, is, that, we, it]",[understandable i know a lot of people essenti...
1,0,334,0_coal_clean_plants_trump,"[coal, clean, plants, trump, mine, beautiful, ...",[trump tried to bring back coal in his first t...
2,1,269,1_solar_electricity_panels_power,"[solar, electricity, panels, power, generation...",[renewables met of the growth in electricity d...
3,2,266,2_depth_snow_apr_precip,"[depth, snow, apr, precip, low, high, iembot, ...",[birmingham apr climate report high low precip...
4,3,246,3_apr_missing_iembot_additional,"[apr, missing, iembot, additional, details, vi...",[key west apr climate report high low precip s...
...,...,...,...,...,...
193,192,10,192_bags_ecofriendly_plastic_tree,"[bags, ecofriendly, plastic, tree, canadianwil...",[every item ordered plants a tree and removes ...
194,193,10,193_water_facilities_carries_desalination,"[water, facilities, carries, desalination, sma...",[we support latvias largest water utility rgas...
195,194,10,194_dinner_functioningcore_alreadydirty_dishes...,"[dinner, functioningcore, alreadydirty, dishes...",[clean space clean mind why tidying up is good...
196,195,10,195_indoors_cooking_propane_stove,"[indoors, cooking, propane, stove, gas, showin...",[natural gas is linked with cancer not wind po...


In [15]:
from bertopic import BERTopic

# Load the saved model through the same project-relative directory the other
# cells use, instead of a user-specific absolute path, so the cell is not tied
# to one machine.
# NOTE(review): assumes ../../data resolves to the project's data/ folder from
# the notebook's working directory — confirm.
topic_model = BERTopic.load("../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_manhattan_c22_s9_nt10")

# Cleaned post texts (the clean_text column of climate_df), passed as the
# documents for the topic reduction below
texts = climate_df["clean_text"].tolist()

# Reduce to 10 topics
topic_model.reduce_topics(docs=texts, nr_topics=10)

# Print the first 11 rows of the topic summary after the reduction
reduced_topic_info = topic_model.get_topic_info()
print(reduced_topic_info.head(11))


   Topic  Count                          Name  \
0     -1   4955              -1_the_to_and_of   
1      0   4852               0_the_to_and_of   
2      1   1052         1_apr_snow_precip_low   
3      2    199            2_green_the_and_of   
4      3    133  3_plastic_plastics_waste_the   
5      4    105          4_air_the_autism_and   
6      5     65                 5_de_en_la_es   
7      6     47     6_carbon_carbonated_it_my   
8      7     31         7_of_emission_the_and   
9      8     28         8_reg_promo_code_deal   

                                      Representation  \
0  [the, to, and, of, in, is, for, that, on, clim...   
1  [the, to, and, of, in, is, for, that, climate,...   
2  [apr, snow, precip, low, high, iembot, additio...   
3  [green, the, and, of, to, like, for, my, game,...   
4  [plastic, plastics, waste, the, and, microplas...   
5  [air, the, autism, and, of, environmental, to,...   
6        [de, en, la, es, les, que, des, et, le, el]   
7  [carbon, 

In [17]:
## Rank the logged grid-search runs by outlier percentage (lowest first)

import os
import pandas as pd

# CSV log of the grid-search runs
log_path = "../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv"

# Read the log, then order it so the runs with the smallest outlier_pct come first
log_df = pd.read_csv(log_path)
log_df_sorted = log_df.sort_values("outlier_pct", ascending=True)

# Print the ten best-ranked runs
print(log_df_sorted.head(n=10))


      embedding_model     metric  min_cluster_size  min_samples  nr_topics  \
9    all-MiniLM-L6-v2     cosine                10            5         10   
7    all-MiniLM-L6-v2  manhattan                10            5         14   
10   all-MiniLM-L6-v2     cosine                10            5         12   
50   all-MiniLM-L6-v2  euclidean                12            7         12   
155  all-MiniLM-L6-v2     cosine                18            5         14   
38   all-MiniLM-L6-v2  euclidean                12            5         12   
39   all-MiniLM-L6-v2  euclidean                12            5         14   
146  all-MiniLM-L6-v2  euclidean                18            5         12   
46   all-MiniLM-L6-v2     cosine                12            5         12   
111  all-MiniLM-L6-v2  euclidean                16            5         14   

     n_topics  outliers  outlier_pct  time_sec  
9           9      4372        38.13      8.30  
7          13      4402        38.39      8

In [18]:
# Number of best-ranked runs to resolve, and the directory holding the saved models
top_n = 10
base_model_dir = "../../data/BERTopic_Hyperparameters"

# Collect (model_name, model_path) pairs for the first top_n rows of the sorted log
top_models = []
for row in log_df_sorted.iloc[:top_n].itertuples():
    # The name is assembled from the run's hyperparameter columns
    model_name = "_".join([
        f"{row.embedding_model}",
        f"{row.metric}",
        f"c{row.min_cluster_size}",
        f"s{row.min_samples}",
        f"nt{row.nr_topics}",
    ])
    # The path component is the model name prefixed with an underscore
    model_path = os.path.join(base_model_dir, "_" + model_name)
    top_models.append((model_name, model_path))

# List every collected model name next to its path
for name, path in top_models:
    print(name + ": " + path)


all-MiniLM-L6-v2_cosine_c10_s5_nt10: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c10_s5_nt10
all-MiniLM-L6-v2_manhattan_c10_s5_nt14: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_manhattan_c10_s5_nt14
all-MiniLM-L6-v2_cosine_c10_s5_nt12: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c10_s5_nt12
all-MiniLM-L6-v2_euclidean_c12_s7_nt12: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c12_s7_nt12
all-MiniLM-L6-v2_cosine_c18_s5_nt14: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_cosine_c18_s5_nt14
all-MiniLM-L6-v2_euclidean_c12_s5_nt12: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c12_s5_nt12
all-MiniLM-L6-v2_euclidean_c12_s5_nt14: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c12_s5_nt14
all-MiniLM-L6-v2_euclidean_c18_s5_nt12: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c18_s5_nt12
all-MiniLM-L6-v2_cosine_c12_s5_nt12: ../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2

In [29]:
from bertopic import BERTopic

# Load the BERTopic model that was saved for the
# all-MiniLM-L6-v2 / euclidean / c16 / s5 / nt14 run.
saved_model_path = "../../data/BERTopic_Hyperparameters/_all-MiniLM-L6-v2_euclidean_c16_s5_nt14"
topic_model = BERTopic.load(saved_model_path)

# Preprocessed post texts from climate_df (not used further in this cell).
texts = climate_df["clean_text"].to_list()

# NOTE(review): no topic-reduction call is made in this cell, so despite the
# variable name below this is the summary of the model exactly as loaded.
# The name is kept unchanged in case other cells reference it.
reduced_topic_info = topic_model.get_topic_info()

# Show the first 15 rows of the topic summary.
print(reduced_topic_info.iloc[:15])


    Topic  Count                                               Name  \
0      -1   4505                                   -1_the_to_and_of   
1       0   4110                                    0_the_to_and_of   
2       1   1446                              1_apr_snow_precip_low   
3       2    545                                    2_the_to_and_of   
4       3    332                               3_the_plastic_and_to   
5       4    144                                  4_the_and_of_game   
6       5    108                               5_green_flag_red_the   
7       6     66                                      6_de_la_en_es   
8       7     43                                  7_she_her_shes_to   
9       8     41                            8_noise_the_monoxide_it   
10      9     37                      9_autism_rfk_environmental_jr   
11     10     33  10_thegreenplanet_davidattenborough_stopgreenw...   
12     11     31                             11_of_emission_the_and   
13    

In [28]:
# Keep every topic except id -1 (the outlier topic in BERTopic's topic table).
# Take an explicit copy: a "high_quality" column is assigned onto this frame
# further down, and writing a column onto the result of a boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
topics_cleaned = topic_info[topic_info["Topic"] != -1].copy()

# Default filler words ignored when judging a topic's keyword list.
# Built once instead of on every call.
_DEFAULT_TOPIC_STOPWORDS = frozenset(
    {"the", "to", "and", "of", "in", "is", "for", "that", "it"}
)


def is_high_quality_topic(keywords, stopwords=None, min_informative=5):
    """Return True when a topic has enough non-stopword keywords.

    Parameters
    ----------
    keywords : iterable of str
        The topic's keywords (one entry per keyword).
    stopwords : collection of str, optional
        Words that do not count as informative. Defaults to a small built-in
        set of English function words.
    min_informative : int, optional
        Minimum number of keywords outside ``stopwords`` required for the
        topic to be considered high quality. Defaults to 5.

    Returns
    -------
    bool
        True if at least ``min_informative`` keywords are not stopwords.
        Matching is exact and case-sensitive; repeated keywords are each counted.
    """
    if stopwords is None:
        stopwords = _DEFAULT_TOPIC_STOPWORDS
    informative = sum(1 for word in keywords if word not in stopwords)
    return informative >= min_informative

# Flag each topic by whether its keyword list passes the quality heuristic.
quality_flags = topics_cleaned["Representation"].apply(is_high_quality_topic)
topics_cleaned["high_quality"] = quality_flags

# Keep only the topics whose flag is True.
good_topics = topics_cleaned[topics_cleaned["high_quality"].eq(True)]

# Per retained topic, print its id and size, up to 30 keywords, and the first
# representative document.
for _, row in good_topics.iterrows():
    print(f"\nTopic {row['Topic']} | Count: {row['Count']}")
    top_words = ", ".join(row["Representation"][:30])
    print("Top words:", top_words)
    example_post = row["Representative_Docs"][0]
    print("Example post:", example_post)




Topic 1 | Count: 1052
Top words: apr, snow, precip, low, high, iembot, additional, details, via, missing
Example post: monroe apr climate report high low precip snow snow depth at wed apr via iembot additional details here

Topic 5 | Count: 65
Top words: de, en, la, es, les, que, des, et, le, el
Example post: reporte del clima en este momento la temperatura en sartenejas es de c la condicin es nublado ten un da productivo y feliz

Topic 7 | Count: 31
Top words: of, emission, the, and, in, transition, we, quasiperiodic, tropomi, with
Example post: quasiperiodic pulsations in ionospheric tec synchronized with solar flare euv emission aisling n ohare et al

Topic 8 | Count: 28
Top words: reg, promo, code, deal, use, amazon, price, buy, off, lights
Example post: tesla model y allweather tpe floor mats set of off deal price reg use promo code liklpx buy on amazon
