## Re-assign every post to a topic
- We get the "topic" from the cluster ID assigned by BERTopic in BERTopic_final_result.json
- We get the `assigned_label` for each topic from the LLM's cluster-to-label assignment in `LLM_Clusters_Topic_Assignment.csv`


In [14]:
import json
import pandas as pd
from pathlib import Path

# --- Paths ---
# NOTE(review): these are absolute, machine-specific locations; adjust them to
# your own checkout before running the notebook.
BASE_PATH = Path("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/llm_subtopic")
CSV_PATH = BASE_PATH / "datasets" / "LLM_Clusters_Topic_Assignment.csv"
FINAL_JSON_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopicResult/BERTopic_final_result.json"
OUTPUT_SINGLE = BASE_PATH / "datasets" / "BERTopic_posts_with_labels_single_label.json"
OUTPUT_MULTI = BASE_PATH / "datasets" / "BERTopic_posts_with_labels_multi_label.json"

# --- Load cluster-to-label assignment ---
# process_mode reads the "mode", "topic" and "assigned_label" columns of this table.
df_map = pd.read_csv(CSV_PATH)

# --- Load all original posts from BERTopic ---
# The file is parsed as JSON Lines: one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines (e.g. a trailing newline) are skipped instead of
# making json.loads raise.
with open(FINAL_JSON_PATH, "r", encoding="utf-8") as f:
    all_posts = [json.loads(line) for line in f if line.strip()]

# --- Function to process one mode ---
def process_mode(mode, output_path):
    """Attach the label assigned for `mode` to every post and save the result as JSON.

    Reads the notebook-level `df_map` (cluster-to-label table) and `all_posts`
    (list of post dicts). For each post, the value under its "topic" key is
    looked up among the `df_map` rows whose "mode" column equals `mode`; posts
    with a label are collected and written to `output_path`.

    Parameters
    ----------
    mode : str
        Value of the "mode" column in `df_map` selecting which label set to use
        (the notebook calls this with "single_label" and "multi_label").
    output_path : str or pathlib.Path
        Destination of the JSON file (written with orient="records", indent=2).

    Returns
    -------
    pandas.DataFrame
        The labeled posts that were written to `output_path`, one row per post,
        with an added "assigned_label" column.
    """
    # Filter the mapping table once instead of once per zipped column.
    mode_rows = df_map[df_map["mode"] == mode]
    topic_to_label = dict(zip(mode_rows["topic"], mode_rows["assigned_label"]))

    enriched = []
    missing = set()

    for post in all_posts:
        topic_id = post.get("topic")
        label = topic_to_label.get(topic_id)
        # Falsy labels (no mapping, or an empty string) count as unmapped.
        # NOTE(review): a NaN label (blank CSV cell) is truthy and is kept as-is —
        # confirm the mapping CSV has no blank labels.
        if label:
            # Build a new dict rather than writing into `post`: the records in
            # `all_posts` are shared between calls, and mutating them would let
            # one mode's label leak into (and be overwritten by) the next mode.
            enriched.append({**post, "assigned_label": label})
        else:
            missing.add(topic_id)

    df_labeled = pd.DataFrame(enriched)
    df_labeled.to_json(output_path, orient="records", indent=2)
    print(f"[{mode}] Saved {len(df_labeled)} labeled posts to {output_path}")

    if missing:
        print(f"[{mode}] Warning: {len(missing)} topic IDs had no label mapping.")
        print("Missing topic IDs (first 10):", list(missing)[:10])

    # Hand the frame back so later cells can inspect it without relying on
    # leftover kernel state.
    return df_labeled

# Produce one labeled-posts file per labelling mode.
for mode_name, target_path in (
    ("single_label", OUTPUT_SINGLE),
    ("multi_label", OUTPUT_MULTI),
):
    process_mode(mode_name, target_path)


[single_label] Saved 90213 labeled posts to /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/llm_subtopic/datasets/BERTopic_posts_with_labels_single_label.json
[multi_label] Saved 90213 labeled posts to /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/llm_subtopic/datasets/BERTopic_posts_with_labels_multi_label.json


In [15]:
# Count labeled posts that have no CID.
# `df_labeled` is only a local variable inside process_mode, so on a fresh
# kernel it does not exist at notebook scope. Reload the labeled posts from the
# file that process_mode saved so this cell runs under Restart & Run All.
# NOTE(review): the single-label file is used here; the recorded run saved the
# same number of posts for both modes — confirm either file is acceptable.
with open(OUTPUT_SINGLE, "r", encoding="utf-8") as f:
    df_labeled = pd.DataFrame(json.load(f))

df_missing_cid = df_labeled[df_labeled["cid"].isnull()]
print(len(df_missing_cid), "entries are missing CID.")


368 entries are missing CID.
