### Script to stratify sampling of filtered data to be used for Manual Labeling
### Goal: Evaluate the classifier's performance

In [22]:
import pandas as pd
import os
import json
from datetime import datetime

# Parameters
INPUT_JSON = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_21.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_40.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_46.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_60.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_67.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_77.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_94.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_113.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_129.json",
]

Iteration_2_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
OUTPUT_DIR = Iteration_2_PATH
os.makedirs(OUTPUT_DIR, exist_ok=True)

SUBSETS = {
    "fullsample": {"filter": None, "sample_size": 200},
    "score95": {"filter": lambda df: df[df["score"] >= 0.95], "sample_size": 100},
    "score99": {"filter": lambda df: df[df["score"] >= 0.99], "sample_size": 100},
}

# Load and combine JSON files
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))
df = pd.DataFrame(all_posts)

# Helpers for stratification
def length_bucket(text):
    l = len(text)
    return "short" if l < 100 else "medium" if l < 300 else "long"

def extract_hour(ts):
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).hour
    except:
        return None

def stratified_sample(df, sample_size):
    df = df.copy()
    df["hour"] = df["timestamp"].apply(extract_hour)
    df["length_bucket"] = df["text"].apply(length_bucket)
    df = df.dropna(subset=["hour", "length_bucket"])
    df["hour"] = df["hour"].astype(int)

    strata = df.groupby(["hour", "length_bucket"])
    proportions = strata.size() / len(df)
    samples_per_stratum = (proportions * sample_size).round().astype(int)

    sampled_rows = []
    for (key, group), (_, n) in zip(strata, samples_per_stratum.items()):
        n_safe = min(n, len(group))
        if n_safe > 0:
            sampled_rows.append(group.sample(n=n_safe, random_state=43))

    df_sampled = pd.concat(sampled_rows)

    if len(df_sampled) > sample_size:
        df_sampled = df_sampled.sample(n=sample_size, random_state=43)

    if len(df_sampled) < sample_size:
        print(f"Only got {len(df_sampled)} rows after stratified sampling. Sampling remaining randomly...")
        remaining = sample_size - len(df_sampled)
        rest = df.drop(df_sampled.index, errors="ignore")
        df_sampled = pd.concat([
            df_sampled,
            rest.sample(n=min(remaining, len(rest)), random_state=43)
        ])

    return df_sampled.sample(frac=1, random_state=43).reset_index(drop=True)

# Split 50/50 for score95/fullsample
def split_and_save(df, name):
    df["annotation"] = ""
    half = len(df) // 2
    df_tobias = df.iloc[:half].reset_index(drop=True)
    df_abel = df.iloc[half:].reset_index(drop=True)

    annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
    full_fields = annotation_fields + ["label", "score"]

    df_tobias[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_tobias.json"), indent=2, orient="records", force_ascii=False)
    df_abel[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_abel.json"), indent=2, orient="records", force_ascii=False)
    df[full_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_full.json"), indent=2, orient="records", force_ascii=False)

    print(f"Saved 3 files for '{name}': {len(df)} total posts")

# Main loop
for subset_name, config in SUBSETS.items():
    try:
        print(f"Processing subset: {subset_name}")

        df_filtered = config["filter"](df) if config["filter"] else df
        df_yes = df_filtered[df_filtered["label"] == "yes"]
        df_no = df_filtered[df_filtered["label"] == "no"]

        if subset_name == "score99":
            # Normal sample for 50 Tobias + 50 Abel
            sample_size = config["sample_size"] // 2
            if len(df_yes) < sample_size or len(df_no) < sample_size:
                print("Not enough posts for score99 — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, sample_size)
            df_no_sampled = stratified_sample(df_no, sample_size)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            split_and_save(df_sampled, subset_name)

            # Extra shared set for inter-annotator agreement
            print("Creating shared duplicate subset for inter-annotator agreement...")
            df_yes_remaining = df_yes.drop(df_yes_sampled.index, errors="ignore")
            df_no_remaining = df_no.drop(df_no_sampled.index, errors="ignore")

            if len(df_yes_remaining) < 25 or len(df_no_remaining) < 25:
                print("Not enough data for duplicate set — skipping")
                continue

            df_yes_dupe = stratified_sample(df_yes_remaining, 25)
            df_no_dupe = stratified_sample(df_no_remaining, 25)
            df_dupe = pd.concat([df_yes_dupe, df_no_dupe]).sample(frac=1, random_state=42).reset_index(drop=True)
            df_dupe["annotation"] = ""

            annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
            df_dupe[annotation_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_INTERAGREEMENT.json"),
                orient="records", indent=2, force_ascii=False
            )
            print("Saved duplicate shared set for score99")

        else:
            sample_size = config["sample_size"] // 2
            if len(df_yes) < sample_size or len(df_no) < sample_size:
                print(f"Not enough data for a 50/50 label split in '{subset_name}' — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, sample_size)
            df_no_sampled = stratified_sample(df_no, sample_size)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            split_and_save(df_sampled, subset_name)

    except Exception as e:
        print(f"Error in subset '{subset_name}': {e}")


Processing subset: fullsample
Saved 3 files for 'fullsample': 200 total posts
Processing subset: score95
Only got 46 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Saved 3 files for 'score95': 100 total posts
Processing subset: score99
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Saved 3 files for 'score99': 100 total posts
Creating shared duplicate subset for inter-annotator agreement...
Only got 22 rows after stratified sampling. Sampling remaining randomly...
Only got 21 rows after stratified sampling. Sampling remaining randomly...
Saved duplicate shared set for score99


In [30]:
import pandas as pd
import os

# Set path to output directory
output_dir = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"

# Files to check
files = {
    "score99_tobias": "manual_labels_score99_tobias.json",
    "score99_abel": "manual_labels_score99_abel.json",
    "score99_duplicate": "manual_labels_score99_duplicate.json",
    "score99_full": "manual_labels_score99_full.json",
}

# Check each file
results = []

for name, filename in files.items():
    path = os.path.join(output_dir, filename)
    if os.path.exists(path):
        df = pd.read_json(path)
        if "label" in df.columns:
            label_counts = df["label"].value_counts().to_dict()
        else:
            label_counts = {"label": "missing"}
        results.append({
            "file": name,
            "total_posts": len(df),
            "yes_count": label_counts.get("yes", 0),
            "no_count": label_counts.get("no", 0),
            "label_column": "label" in df.columns
        })
    else:
        results.append({
            "file": name,
            "error": "File not found"
        })




In [31]:
results

[{'file': 'score99_tobias',
  'total_posts': 50,
  'yes_count': 0,
  'no_count': 0,
  'label_column': False},
 {'file': 'score99_abel',
  'total_posts': 50,
  'yes_count': 0,
  'no_count': 0,
  'label_column': False},
 {'file': 'score99_duplicate', 'error': 'File not found'},
 {'file': 'score99_full',
  'total_posts': 100,
  'yes_count': 50,
  'no_count': 50,
  'label_column': True}]

In [3]:
import pandas as pd
import os

# Set paths
output_dir = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
interagreement_file = os.path.join(output_dir, "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04/Inter-agreement/manual_labels_score99_INTERAGREEMENT_TOBIAS.json")
full_labeled_file = os.path.join(output_dir, "manual_labels_score99_full.json")
output_file = os.path.join(output_dir, "manual_labels_score99_INTERAGREEMENT_with_labels.json")

# Load both files
df_agree = pd.read_json(interagreement_file)
df_full = pd.read_json(full_labeled_file)

# Merge label and score using `seq` as the key
df_merged = pd.merge(df_agree, df_full[["seq", "label", "score"]], on="seq", how="left")

# Save the enriched version
df_merged.to_json(output_file, orient="records", indent=2, force_ascii=False)

print("Saved:", output_file)


Saved: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04/manual_labels_score99_INTERAGREEMENT_with_labels.json


In [4]:
import pandas as pd
import os
import json
from datetime import datetime

# Parameters
INPUT_JSON = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_21.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_40.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_46.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_60.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_67.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_77.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_94.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_113.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_129.json",
]

OUTPUT_DIR = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SUBSETS = {
    "fullsample": {"filter": None, "sample_size": 200},
    "score95": {"filter": lambda df: df[df["score"] >= 0.95], "sample_size": 100},
    "score99": {"filter": lambda df: df[df["score"] >= 0.99], "sample_size": 100},
}

# Load and combine JSON files
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))
df = pd.DataFrame(all_posts)

# Helpers for stratification
def length_bucket(text):
    l = len(text)
    return "short" if l < 100 else "medium" if l < 300 else "long"

def extract_hour(ts):
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).hour
    except:
        return None

def stratified_sample(df, sample_size):
    df = df.copy()
    df["hour"] = df["timestamp"].apply(extract_hour)
    df["length_bucket"] = df["text"].apply(length_bucket)
    df = df.dropna(subset=["hour", "length_bucket"])
    df["hour"] = df["hour"].astype(int)

    strata = df.groupby(["hour", "length_bucket"])
    proportions = strata.size() / len(df)
    samples_per_stratum = (proportions * sample_size).round().astype(int)

    sampled_rows = []
    for (key, group), (_, n) in zip(strata, samples_per_stratum.items()):
        n_safe = min(n, len(group))
        if n_safe > 0:
            sampled_rows.append(group.sample(n=n_safe, random_state=43))

    df_sampled = pd.concat(sampled_rows)

    if len(df_sampled) > sample_size:
        df_sampled = df_sampled.sample(n=sample_size, random_state=43)

    if len(df_sampled) < sample_size:
        print(f"Only got {len(df_sampled)} rows after stratified sampling. Sampling remaining randomly...")
        remaining = sample_size - len(df_sampled)
        rest = df.drop(df_sampled.index, errors="ignore")
        df_sampled = pd.concat([
            df_sampled,
            rest.sample(n=min(remaining, len(rest)), random_state=43)
        ])

    return df_sampled.sample(frac=1, random_state=43).reset_index(drop=True)

# Split 50/50 for score95/fullsample
def split_and_save(df, name):
    df["annotation"] = ""
    half = len(df) // 2
    df_tobias = df.iloc[:half].reset_index(drop=True)
    df_abel = df.iloc[half:].reset_index(drop=True)

    annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
    full_fields = annotation_fields + ["label", "score"]

    df_tobias[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_tobias.json"), indent=2, orient="records", force_ascii=False)
    df_abel[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_abel.json"), indent=2, orient="records", force_ascii=False)
    df[full_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_full.json"), indent=2, orient="records", force_ascii=False)

    print(f"Saved 3 files for '{name}': {len(df)} total posts")

# Main loop
for subset_name, config in SUBSETS.items():
    try:
        print(f"Processing subset: {subset_name}")

        df_filtered = config["filter"](df) if config["filter"] else df
        df_yes = df_filtered[df_filtered["label"] == "yes"]
        df_no = df_filtered[df_filtered["label"] == "no"]

        if subset_name == "score99":
            # Normal sample for 50 Tobias + 50 Abel
            sample_size = config["sample_size"] // 2
            if len(df_yes) < sample_size or len(df_no) < sample_size:
                print("Not enough posts for score99 — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, sample_size)
            df_no_sampled = stratified_sample(df_no, sample_size)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            split_and_save(df_sampled, subset_name)

            # Extra shared set for inter-annotator agreement
            print("Creating shared duplicate subset for inter-annotator agreement...")
            df_yes_remaining = df_yes.drop(df_yes_sampled.index, errors="ignore")
            df_no_remaining = df_no.drop(df_no_sampled.index, errors="ignore")

            if len(df_yes_remaining) < 25 or len(df_no_remaining) < 25:
                print("Not enough data for duplicate set — skipping")
                continue

            df_yes_dupe = stratified_sample(df_yes_remaining, 25)
            df_no_dupe = stratified_sample(df_no_remaining, 25)
            df_dupe = pd.concat([df_yes_dupe, df_no_dupe]).sample(frac=1, random_state=42).reset_index(drop=True)
            df_dupe["annotation"] = ""

            interagreement_path = os.path.join(OUTPUT_DIR, "manual_labels_score99_INTERAGREEMENT.json")
            full_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation", "label", "score"]

            if not os.path.exists(interagreement_path):
                df_dupe[full_fields].to_json(
                    interagreement_path,
                    orient="records", indent=2, force_ascii=False
                )
                print("✅ Saved interagreement file with labels:", interagreement_path)
            else:
                print("⚠️ File already exists — skipping:", interagreement_path)

        else:
            sample_size = config["sample_size"] // 2
            if len(df_yes) < sample_size or len(df_no) < sample_size:
                print(f"Not enough data for a 50/50 label split in '{subset_name}' — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, sample_size)
            df_no_sampled = stratified_sample(df_no, sample_size)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            split_and_save(df_sampled, subset_name)

    except Exception as e:
        print(f"Error in subset '{subset_name}': {e}")


Processing subset: fullsample
Saved 3 files for 'fullsample': 200 total posts
Processing subset: score95
Only got 46 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Saved 3 files for 'score95': 100 total posts
Processing subset: score99
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Saved 3 files for 'score99': 100 total posts
Creating shared duplicate subset for inter-annotator agreement...
Only got 22 rows after stratified sampling. Sampling remaining randomly...
Only got 21 rows after stratified sampling. Sampling remaining randomly...
✅ Saved interagreement file with labels: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04/manual_labels_score99_INTERAGREEMENT.json
