In [8]:
pip install gensim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install --upgrade bertopic


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [19]:
import json
import time
import itertools
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from umap import UMAP
from sklearn.preprocessing import normalize
import os
import csv
import torch
import gc

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Functions
def compute_coherence_score(topic_model, documents, top_n_words=10):
    topics = topic_model.get_topics()
    topic_words = [
        [word for word, _ in topics[topic_id][:top_n_words]]
        for topic_id in topics.keys() if topic_id != -1
    ]
    tokenized_docs = [doc.split() for doc in documents]
    dictionary = Dictionary(tokenized_docs)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()

def compute_topic_diversity(topic_model, top_n_words=10):
    topics = topic_model.get_topics()
    topic_words = [
        [word for word, _ in topics[topic_id][:top_n_words]]
        for topic_id in topics.keys() if topic_id != -1
    ]
    all_words = [word for words in topic_words for word in words]
    unique_words = len(set(all_words))
    total_words = len(all_words)
    return unique_words / total_words if total_words > 0 else 0

# --- DATA ---
df_whole = pd.DataFrame()

files_to_load = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json"
]

for filepath in files_to_load:
    df = pd.read_json(filepath)
    df = df[df["label"] == "yes"]
    df = df[df["score"] >= .99]
    df_whole = pd.concat([df_whole, df], ignore_index=True)
    print(f"Loaded {filepath}, current shape: {df_whole.shape}", flush=True)

# --- HYPERPARAMETERS ---
embedding_models = ["all-MiniLM-L6-v2"]

min_cluster_sizes = [50, 100]
min_samples_vals = [5, 10]
distance_metrics = ["cosine"]

umap_neighbors = [15]
umap_components = [5, 7]
umap_min_dist = [0.0]

nr_topics = 15  # FIXED value outside the loop

log_path = os.path.expanduser("logs/bertopic_grid_log_local.csv")
os.makedirs(os.path.dirname(log_path), exist_ok=True)

log_columns = [
    "embedding_model", "metric", "min_cluster_size", "min_samples",
    "nr_topics", "umap_neighbors", "umap_components", "umap_min_dist",
    "n_topics", "outliers", "outlier_pct", "time_sec", "coherence", "diversity"
]

if os.path.exists(log_path):
    log_df = pd.read_csv(log_path)
else:
    log_df = pd.DataFrame(columns=log_columns)
    log_df.to_csv(log_path, index=False)

counter_combinations = len(log_df)
max_combinations = 8
proportion = counter_combinations / max_combinations

# --- MAIN LOOP ---
texts_to_embed = df_whole["text"].tolist()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}", flush=True)

batch_size = 32

saved_models = []

for embed_model in embedding_models:
    print(f"\nStarting embedding with model: {embed_model}", flush=True)
    model = SentenceTransformer(embed_model, device=device)

    start_embed = time.time()
    embeddings_local = model.encode(
        texts_to_embed,
        show_progress_bar=True,
        batch_size=batch_size,
        convert_to_numpy=True
    )
    embed_time = round(time.time() - start_embed, 2)
    print(f"Embedding done for model '{embed_model}' in {embed_time}s", flush=True)

    for (min_cluster_size, min_samples, metric,
         n_neighbors, n_components, min_dist) in itertools.product(
        min_cluster_sizes, min_samples_vals, distance_metrics,
        umap_neighbors, umap_components, umap_min_dist
    ):

        run_key = {
            "embedding_model": embed_model,
            "metric": metric,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "nr_topics": nr_topics,  # fixed
            "umap_neighbors": n_neighbors,
            "umap_components": n_components,
            "umap_min_dist": min_dist
        }

        existing = log_df[
            (log_df.embedding_model == run_key["embedding_model"]) &
            (log_df.metric == run_key["metric"]) &
            (log_df.min_cluster_size == run_key["min_cluster_size"]) &
            (log_df.min_samples == run_key["min_samples"]) &
            (log_df.nr_topics == run_key["nr_topics"]) &
            (log_df.umap_neighbors == run_key["umap_neighbors"]) &
            (log_df.umap_components == run_key["umap_components"]) &
            (log_df.umap_min_dist == run_key["umap_min_dist"])
        ]

        if not existing.empty:
            print(f"Skipping already completed: {run_key}", flush=True)
            continue

        print(f"\nRunning: {run_key}", flush=True)
        start = time.time()

        if metric == "cosine":
            embeddings_used = normalize(embeddings_local, norm="l2")
            hdbscan_metric = "euclidean"
        else:
            embeddings_used = embeddings_local
            hdbscan_metric = metric

        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric,
            random_state=42
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=hdbscan_metric,
            cluster_selection_method="eom"
        )

        topic_model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            language="english",
            calculate_probabilities=False,
            verbose=False,
            low_memory=True
        )

        try:
            topics, _ = topic_model.fit_transform(texts_to_embed, embeddings_used)
            df_topics = pd.DataFrame({
                "seq": df_whole["seq"].tolist(),
                "text": texts_to_embed,
                "topic": topics
            })

            topic_model.reduce_topics(texts_to_embed, nr_topics=nr_topics)

            topic_info = topic_model.get_topic_info()
            n_topics = len(topic_info[topic_info.Topic != -1])
            n_outliers = topic_info[topic_info.Topic == -1].Count.values[0] if -1 in topic_info.Topic.values else 0
            n_total = sum(topic_info.Count)
            duration = round(time.time() - start, 2)

            coherence = compute_coherence_score(topic_model, texts_to_embed)
            diversity = compute_topic_diversity(topic_model)

            log_entry = {
                **run_key,
                "n_topics": n_topics,
                "outliers": n_outliers,
                "outlier_pct": round(n_outliers / n_total * 100, 2),
                "time_sec": duration,
                "coherence": round(coherence, 4),
                "diversity": round(diversity, 4)
            }

            log_df = pd.concat([log_df, pd.DataFrame([log_entry])], ignore_index=True)
            log_df.to_csv(log_path, index=False)

            print(f"Completed {len(log_df)} models so far.", flush=True)

            saved_models.append((coherence + diversity, topic_model, run_key))
            saved_models = sorted(saved_models, key=lambda x: x[0], reverse=True)[:1]

            counter_combinations += 1
            proportion = counter_combinations / max_combinations

            if max_combinations >= 10 and counter_combinations % (max_combinations // 10) == 0:
                print(f"Progress: {counter_combinations}/{max_combinations} models completed ({int(proportion * 100)}%)", flush=True)

            print(f"Done | Topics: {n_topics}, Outliers: {n_outliers} ({log_entry['outlier_pct']}%) | Coherence: {coherence:.4f} | Diversity: {diversity:.4f} | Time: {duration}s", flush=True)

        except Exception as e:
            print(f"Failed for config: {run_key} — {e}", flush=True)

        finally:
            del topic_model
            gc.collect()

# --- SAVE BEST MODEL ---
for idx, (score, model, params) in enumerate(saved_models):
    save_dir = f"logs/top_models/model_{idx+1}"
    os.makedirs(save_dir, exist_ok=True)
    model.save(os.path.join(save_dir, "model"))
    
    with open(os.path.join(save_dir, "params.json"), "w") as f:
        json.dump(params, f, indent=2)
    
    df_topics = pd.DataFrame({
        "seq": df_whole["seq"].tolist(),
        "text": texts_to_embed,
        "topic": model.topics_
    })
    df_topics.to_json(os.path.join(save_dir, "topics_with_seq.json"), orient="records", lines=True)


Loaded /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json, current shape: (541, 8)
Loaded /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json, current shape: (1168, 8)
Loaded /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json, current shape: (1991, 8)
Loaded /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json, current shape: (3021, 8)
Using device: cpu

Starting embedding with model: all-MiniLM-L6-v2


Batches: 100%|██████████| 95/95 [00:13<00:00,  7.30it/s]

Embedding done for model 'all-MiniLM-L6-v2' in 13.1s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 5, 'umap_min_dist': 0.0}



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- A

Completed 1 models so far.
Done | Topics: 5, Outliers: 11 (0.36%) | Coherence: 0.4396 | Diversity: 0.6000 | Time: 9.26s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 5, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 7, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 2 models so far.
Done | Topics: 5, Outliers: 7 (0.23%) | Coherence: 0.4396 | Diversity: 0.6000 | Time: 9.1s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 10, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 5, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 3 models so far.
Done | Topics: 5, Outliers: 15 (0.5%) | Coherence: 0.4619 | Diversity: 0.6000 | Time: 8.67s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 50, 'min_samples': 10, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 7, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 4 models so far.
Done | Topics: 5, Outliers: 5 (0.17%) | Coherence: 0.4396 | Diversity: 0.6000 | Time: 8.73s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 100, 'min_samples': 5, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 5, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 5 models so far.
Done | Topics: 3, Outliers: 10 (0.33%) | Coherence: 0.4835 | Diversity: 0.7000 | Time: 8.56s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 100, 'min_samples': 5, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 7, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 6 models so far.
Done | Topics: 3, Outliers: 5 (0.17%) | Coherence: 0.5185 | Diversity: 0.7000 | Time: 8.92s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 100, 'min_samples': 10, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 5, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 7 models so far.
Done | Topics: 3, Outliers: 10 (0.33%) | Coherence: 0.4835 | Diversity: 0.7000 | Time: 8.71s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'cosine', 'min_cluster_size': 100, 'min_samples': 10, 'nr_topics': 15, 'umap_neighbors': 15, 'umap_components': 7, 'umap_min_dist': 0.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed 8 models so far.
Done | Topics: 3, Outliers: 5 (0.17%) | Coherence: 0.5185 | Diversity: 0.7000 | Time: 9.22s




In [20]:
from bertopic import BERTopic

model = BERTopic.load("logs/top_models/model_1/model")


In [22]:
# See basic topic information
model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5,-1_local_weather_pi_raspberry,"[local, weather, pi, raspberry, kampot, cambod...","[The local weather in Kampot, Cambodia from a ..."
1,0,2669,0_the_to_and_of,"[the, to, and, of, is, in, for, that, on, it]",[Now is not the time to vote Green either in c...
2,1,238,1_apr_snow_low_precip,"[apr, snow, low, precip, high, 00, missing, cl...",[BURNS OR Apr 6 Climate Report High 69 Low 26 ...
3,2,109,2_apr_airport_snow_precip,"[apr, airport, snow, precip, 00, low, high, mi...",[MADERA CA AIRPORT Apr 6 Climate Report High 7...
