### Build cluster-level annotation sets for BERTopic runs 1–4 (for downstream evaluation)

In [2]:
import pandas as pd
from pathlib import Path

# === Configuration ===
BASE_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH"
RUN_FILES = {
    "run1": f"{BASE_PATH}/data/BERTopicResult/Cluster_Runs/Run1/run1_result.json",
    "run2": f"{BASE_PATH}/data/BERTopicResult/Cluster_Runs/Run2/run2_result.json",
    "run3": f"{BASE_PATH}/data/BERTopicResult/Cluster_Runs/Run3/run3_result.json",
    "run4": f"{BASE_PATH}/data/BERTopicResult/Cluster_Runs/Run4/run4_result.json",
}

# Derived from BASE_PATH so the project root is defined in exactly one place.
OUTPUT_DIR = Path(BASE_PATH) / "data" / "processed" / "annotated" / "Sampled_BERT_Runs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SAMPLES_PER_CLUSTER = 10
RANDOM_SEED = 42  # fixed seed so every re-run draws the same sample per cluster

# The annotation files are filled in by hand after this cell has run. Re-running
# the cell would replace them with blank labels, so existing files are kept
# unless this flag is switched on deliberately.
OVERWRITE_EXISTING = False


def sample_cluster_texts(posts, topic_col, run_name,
                         samples_per_cluster=SAMPLES_PER_CLUSTER,
                         seed=RANDOM_SEED):
    """Build one annotation record per cluster found in ``posts``.

    Parameters
    ----------
    posts : pd.DataFrame
        Frame holding at least ``topic_col`` and a ``"text"`` column.
    topic_col : str
        Name of the column that identifies the cluster of each row.
    run_name : str
        Identifier stored in every record under ``"source_run"``.
    samples_per_cluster : int
        Upper bound on texts drawn per cluster; smaller clusters contribute
        all of their rows.
    seed : int
        ``random_state`` passed to ``DataFrame.sample``.

    Returns
    -------
    list[dict]
        Records with keys ``topic``, ``sample_texts``, ``annotated_label``
        (empty string, to be filled in manually) and ``source_run``.
    """
    records = []
    for topic, group in posts.groupby(topic_col):
        sample_size = min(samples_per_cluster, len(group))
        sampled_texts = group.sample(n=sample_size, random_state=seed)["text"].tolist()
        records.append({
            "topic": topic,
            "sample_texts": sampled_texts,
            "annotated_label": "",  # Leave empty for annotation
            "source_run": run_name
        })
    return records


for run_name, file_path in RUN_FILES.items():
    output_path = OUTPUT_DIR / f"cluster_level_annotation_{run_name}.json"

    # Checked before the (large) result file is read: nothing to do if the
    # annotation file is already there and must be preserved.
    if output_path.exists() and not OVERWRITE_EXISTING:
        print(f"Skipping {run_name}: {output_path} already exists "
              f"(set OVERWRITE_EXISTING = True to regenerate)")
        continue

    print(f" Processing {run_name} from: {file_path}")

    # Result files are read as JSON Lines (one record per line).
    df = pd.read_json(file_path, lines=True)

    # The cluster id column is accepted under either name; "topic" wins if both exist.
    topic_col = next((col for col in ("topic", "cluster") if col in df.columns), None)
    if topic_col is None or "text" not in df.columns:
        raise ValueError(
            f"Missing required columns in {file_path}. "
            f"Expected: 'topic' or 'cluster', and 'text'. Found: {list(df.columns)}"
        )

    # Cluster ids are stored as strings in the annotation file.
    df[topic_col] = df[topic_col].astype(str)

    # Sample up to SAMPLES_PER_CLUSTER posts per cluster
    cluster_samples = sample_cluster_texts(df, topic_col, run_name)

    pd.DataFrame(cluster_samples).to_json(output_path, indent=2, orient="records", force_ascii=False)

    print(f"Saved {len(cluster_samples)} annotated clusters to: {output_path}")



 Processing run1 from: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopicResult/Cluster_Runs/Run1/run1_result.json
Saved 4 annotated clusters to: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Sampled_BERT_Runs/cluster_level_annotation_run1.json
 Processing run2 from: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopicResult/Cluster_Runs/Run2/run2_result.json
Saved 114 annotated clusters to: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Sampled_BERT_Runs/cluster_level_annotation_run2.json
 Processing run3 from: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopicResult/Cluster_Runs/Run3/run3_result.json
Saved 50 annotated clusters to: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Sampled_BERT_Runs/cluster_level_annotation_run3.json
 Processing run4 from: /Users/tobiasmichelsen/Bachelor_Project/D

In [3]:
import pandas as pd
import json
from pathlib import Path

# === Configuration ===
BASE_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH"
ANNOTATION_DIR = Path(f"{BASE_PATH}/data/processed/annotated/Sampled_BERT_Runs")
RESULT_DIR = Path(f"{BASE_PATH}/data/BERTopicResult/Cluster_Runs")
SUMMARY_CSV = Path(f"{BASE_PATH}/Visualizations/bert_entry_filtering_summary.csv")

RUN_NAMES = ["run1", "run2", "run3", "run4"]
KEEP_LABEL = "yes"   # annotation value that marks a cluster as kept
NOISE_TOPIC = "-1"   # cluster id counted as the noise cluster


def load_kept_clusters(annotation_path):
    """Return the cluster ids (as strings) whose annotation label is "yes".

    The label comparison ignores case and surrounding whitespace, and labels
    that are not strings (e.g. null) are treated as "not kept" instead of
    raising. An annotation file containing no records yields an empty list.

    Raises
    ------
    KeyError
        If the records lack the ``annotated_label`` or ``topic`` field.
    """
    with open(annotation_path, "r", encoding="utf-8") as f:
        annotations = pd.DataFrame(json.load(f))

    if annotations.empty:
        return []

    labels = annotations["annotated_label"].astype(str).str.strip().str.lower()
    return annotations.loc[labels == KEEP_LABEL, "topic"].astype(str).tolist()


summary = []
initial_cids = set()   # unique cids of the baseline run
baseline_run = None    # name of the first run that was processed successfully

for run in RUN_NAMES:
    # Locate annotation file
    annotation_path = ANNOTATION_DIR / f"cluster_level_annotation_{run}.json"
    if not annotation_path.exists():
        print(f"❌ Missing annotation file: {annotation_path}")
        continue

    result_path = RESULT_DIR / run.capitalize() / f"{run}_result.json"
    try:
        kept_clusters = load_kept_clusters(annotation_path)

        # Load result file (JSON Lines)
        df = pd.read_json(result_path, lines=True)
        topic_col = "topic" if "topic" in df.columns else "cluster"
        df[topic_col] = df[topic_col].astype(str)

        # Filter to only kept clusters
        if kept_clusters:
            df_filtered = df[df[topic_col].isin(kept_clusters)]
        else:
            print(f"⚠️ No 'yes'-labeled clusters found for {run}. Using all clusters.")
            df_filtered = df.copy()

        # Track CIDs (or unique post IDs). This is a set of unique values, so
        # "Removed Entries" counts distinct cids while "Total Entries" below
        # counts rows; the two differ whenever a cid occurs more than once.
        current_cids = set(df_filtered["cid"])
        if baseline_run is None:
            # The first run that loads successfully becomes the baseline, so a
            # missing or broken run1 no longer invalidates every later run.
            baseline_run = run
            initial_cids = current_cids
            removed_entries = 0
            removed_pct = 0.0
        else:
            removed_entries = len(initial_cids - current_cids)
            # Guard against an empty baseline to avoid dividing by zero.
            removed_pct = (
                round(removed_entries / len(initial_cids) * 100, 2)
                if initial_cids else 0.0
            )

        # Add stats
        total_entries = len(df_filtered)
        num_clusters = df_filtered[topic_col].nunique()
        # Noise size is taken from the unfiltered frame on purpose: it reports
        # how many rows the run assigned to the noise cluster.
        noise_size = (df[topic_col] == NOISE_TOPIC).sum()

    except Exception as e:
        # Best effort: report the failure and keep a placeholder row so the
        # remaining runs are still summarised.
        print(f"⚠️ Error processing {run}: {e}")
        total_entries = num_clusters = noise_size = removed_entries = removed_pct = "N/A"

    summary.append({
        "Run": run.capitalize(),
        "Total Entries": total_entries,
        "Total Clusters": num_clusters,
        "Noise Cluster Size": noise_size,
        "Removed Entries": removed_entries,
        "% Removed": removed_pct
    })

# Display the summary table
summary_df = pd.DataFrame(summary)
print("\n=== BERTopic Entry-Level Filtering Summary ===")
print(summary_df)

# Optional: Save to CSV (create the target folder first so a fresh checkout works)
SUMMARY_CSV.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(SUMMARY_CSV, index=False)


⚠️ No 'yes'-labeled clusters found for run1. Using all clusters.
⚠️ No 'yes'-labeled clusters found for run2. Using all clusters.
⚠️ No 'yes'-labeled clusters found for run3. Using all clusters.
⚠️ No 'yes'-labeled clusters found for run4. Using all clusters.

=== BERTopic Entry-Level Filtering Summary ===
    Run  Total Entries  Total Clusters  Noise Cluster Size  Removed Entries  \
0  Run1         228275               4               24264                0   
1  Run2         182145             114               95358            36664   
2  Run3         140161              50               69703            48435   
3  Run4          69703              50               49948           118772   

   % Removed  
0       0.00  
1      19.56  
2      25.84  
3      63.37  
