### Script to stratify sampling of filtered data to be used for Manual Labeling
### Goal: Evaluate the classifier's performance

In [1]:
import pandas as pd
import os
import json
from datetime import datetime

In [2]:
# Parameters



INPUT_JSON = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_21.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_40.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_46.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_60.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_67.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_77.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_94.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_113.json",
"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_129.json",
]

Iteration_2_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
OUTPUT_DIR = f"/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated{Iteration_2_PATH}"
os.makedirs(OUTPUT_DIR, exist_ok=True)



SUBSETS = {
    "fullsample": {"filter": None, "sample_size": 200},
    "score95": {"filter": lambda df: df[df["score"] >= 0.95], "sample_size": 100},
    "score99": {"filter": lambda df: df[df["score"] >= 0.99], "sample_size": 100},
}

In [3]:
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))

df = pd.DataFrame(all_posts)


In [4]:
def length_bucket(text):
    l = len(text)
    if l < 100:
        return "short"
    elif l < 300:
        return "medium"
    else:
        return "long"

def extract_hour(ts):
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).hour
    except:
        return None

In [5]:
def stratified_sample(df, sample_size):
    df = df.copy()  # Avoid chained assignment warnings

    df["hour"] = df["timestamp"].apply(extract_hour)
    df["length_bucket"] = df["text"].apply(length_bucket)
    df = df.dropna(subset=["hour", "length_bucket"])
    df["hour"] = df["hour"].astype(int)

    strata = df.groupby(["hour", "length_bucket"])
    proportions = strata.size() / len(df)
    samples_per_stratum = (proportions * sample_size).round().astype(int)

    sampled_rows = []
    for (key, group), (_, n) in zip(strata, samples_per_stratum.items()):
        n_safe = min(n, len(group))  # never try to oversample a small group
        if n_safe > 0:
            sampled_rows.append(group.sample(n=n_safe, random_state=42))

    df_sampled = pd.concat(sampled_rows)


    if len(df_sampled) > sample_size:
        df_sampled = df_sampled.sample(n=sample_size, random_state=43) #iteration 1 uses random_state 42, iteration 2 uses random_state 43


    if len(df_sampled) < sample_size:
        print(f"Only got {len(df_sampled)} rows after stratified sampling. Sampling remaining randomly...")
        remaining = sample_size - len(df_sampled)
        rest = df.drop(df_sampled.index, errors="ignore")
        df_sampled = pd.concat([
            df_sampled,
            rest.sample(n=min(remaining, len(rest)), random_state=42)
        ])

    return df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)


In [6]:
def split_and_save(df, name):
    df["annotation"] = ""
    half = len(df) // 2
    df_tobias = df.iloc[:half].reset_index(drop=True)
    df_abel = df.iloc[half:].reset_index(drop=True)

    annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
    full_fields = annotation_fields + ["label", "score"]

    # Save anonymized sets
    df_tobias[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_tobias.json"),orient="records", indent=2, force_ascii=False)
    df_abel[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_abel.json"),orient="records", indent=2, force_ascii=False)

    # Save full version with label + score
    df[full_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_full.json"),orient="records", indent=2, force_ascii=False)

    print(f" Saved 3 files for subset '{name}': {len(df)} total posts")

In [7]:
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))

df = pd.DataFrame(all_posts)


In [8]:
for subset_name, config in SUBSETS.items():
    try:
        print(f"Processing subset: {subset_name}")

        df_filtered = config["filter"](df) if config["filter"] else df

        print(f"📊 Filtered posts available: {len(df_filtered)}")

        sample_size = config["sample_size"] // 2

        df_yes = df_filtered[df_filtered["label"] == "yes"]
        df_no = df_filtered[df_filtered["label"] == "no"]

        if len(df_yes) < sample_size or len(df_no) < sample_size:
            print(f"⚠️ Not enough data for a 50/50 label split in '{subset_name}' — skipping")
            continue

        df_yes_sampled = stratified_sample(df_yes, sample_size)
        df_no_sampled = stratified_sample(df_no, sample_size)

        df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

        split_and_save(df_sampled, subset_name)

    except Exception as e:
        print(f"❌ Failed to process '{subset_name}': {e}")
        continue


Processing subset: fullsample
📊 Filtered posts available: 1300000
 Saved 3 files for subset 'fullsample': 200 total posts
Processing subset: score95
📊 Filtered posts available: 743520
Only got 46 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
 Saved 3 files for subset 'score95': 100 total posts
Processing subset: score99
📊 Filtered posts available: 104237
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
 Saved 3 files for subset 'score99': 100 total posts


In [9]:
print(f"Total posts loaded: {len(df)}")
print(f"Posts with score ≥ 0.99: {len(df[df['score'] >= 0.99])}")

Total posts loaded: 1300000
Posts with score ≥ 0.99: 104237


In [13]:
# check distribution

import pandas as pd
import os

# Set your annotated output folder
OUTPUT_DIR = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
# List of full files to check
full_files = [

    "manual_labels_fullsample_full.json",
    "manual_labels_score95_full.json",
    "manual_labels_score99_full.json"
]

# Check and print label distributions
for filename in full_files:
    filepath = os.path.join(OUTPUT_DIR, filename)
    df = pd.read_json(filepath)
    label_counts = df["label"].value_counts()

    print(f"\n {filename}")
    print(label_counts)
    print(f"Total: {label_counts.sum()} rows")


KeyError: 'label'

In [14]:
import pandas as pd
import os
import json
from datetime import datetime

# Parameters
INPUT_JSON = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_8.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_20.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_21.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_40.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_46.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_60.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_67.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_77.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_94.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_113.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_129.json",
]

Iteration_2_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/processed/annotated/Labeled_Posts_23_04"
OUTPUT_DIR = f"{Iteration_2_PATH}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SUBSETS = {
    "fullsample": {"filter": None, "sample_size": 200},
    "score95": {"filter": lambda df: df[df["score"] >= 0.95], "sample_size": 100},
    "score99": {"filter": lambda df: df[df["score"] >= 0.99], "sample_size": 200},  # full 200 for 4x50 split
}

# Load and combine JSONs
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))

df = pd.DataFrame(all_posts)

# Stratified sampling helpers
def length_bucket(text):
    l = len(text)
    return "short" if l < 100 else "medium" if l < 300 else "long"

def extract_hour(ts):
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).hour
    except:
        return None

def stratified_sample(df, sample_size):
    df = df.copy()
    df["hour"] = df["timestamp"].apply(extract_hour)
    df["length_bucket"] = df["text"].apply(length_bucket)
    df = df.dropna(subset=["hour", "length_bucket"])
    df["hour"] = df["hour"].astype(int)

    strata = df.groupby(["hour", "length_bucket"])
    proportions = strata.size() / len(df)
    samples_per_stratum = (proportions * sample_size).round().astype(int)

    sampled_rows = []
    for (key, group), (_, n) in zip(strata, samples_per_stratum.items()):
        n_safe = min(n, len(group))
        if n_safe > 0:
            sampled_rows.append(group.sample(n=n_safe, random_state=43))  # Iteration 2 seed

    df_sampled = pd.concat(sampled_rows)

    if len(df_sampled) > sample_size:
        df_sampled = df_sampled.sample(n=sample_size, random_state=43)

    if len(df_sampled) < sample_size:
        print(f"Only got {len(df_sampled)} rows after stratified sampling. Sampling remaining randomly...")
        remaining = sample_size - len(df_sampled)
        rest = df.drop(df_sampled.index, errors="ignore")
        df_sampled = pd.concat([
            df_sampled,
            rest.sample(n=min(remaining, len(rest)), random_state=42)
        ])

    return df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Generic split-and-save for fullsample and score95
def split_and_save(df, name):
    df["annotation"] = ""
    half = len(df) // 2
    df_tobias = df.iloc[:half].reset_index(drop=True)
    df_abel = df.iloc[half:].reset_index(drop=True)

    annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
    full_fields = annotation_fields + ["label", "score"]

    df_tobias[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_tobias.json"), indent=2, orient="records", force_ascii=False)
    df_abel[annotation_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_abel.json"), indent=2, orient="records", force_ascii=False)
    df[full_fields].to_json(os.path.join(OUTPUT_DIR, f"manual_labels_{name}_full.json"), indent=2, orient="records", force_ascii=False)

    print(f"Saved 3 files for '{name}': {len(df)} total posts")

# Main subset loop
for subset_name, config in SUBSETS.items():
    try:
        print(f"\nProcessing subset: {subset_name}")

        df_filtered = config["filter"](df) if config["filter"] else df
        df_yes = df_filtered[df_filtered["label"] == "yes"]
        df_no = df_filtered[df_filtered["label"] == "no"]

        if subset_name == "score99":
            if len(df_yes) < 100 or len(df_no) < 100:
                print("Not enough posts for score99 — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, 100)
            df_no_sampled = stratified_sample(df_no, 100)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            # Create 4 sets of 50 each
            sets = [df_sampled.iloc[i*50:(i+1)*50].reset_index(drop=True) for i in range(4)]
            set_a, set_b, set_c, set_d = sets

            for df_subset in [set_a, set_b, set_c, set_d]:
                df_subset["annotation"] = ""

            annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
            full_fields = annotation_fields + ["label", "score"]

            # Save all labeled posts
            df_sampled[full_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_full.json"),
                orient="records", indent=2, force_ascii=False
            )

            # Save shared sets
            set_a[annotation_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_set_a.json"),
                orient="records", indent=2, force_ascii=False
            )
            set_b[annotation_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_set_b.json"),
                orient="records", indent=2, force_ascii=False
            )

            # Save annotator-specific sets
            set_c[annotation_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_set_c_tobias.json"),
                orient="records", indent=2, force_ascii=False
            )
            set_d[annotation_fields].to_json(
                os.path.join(OUTPUT_DIR, "manual_labels_score99_set_d_abel.json"),
                orient="records", indent=2, force_ascii=False
            )

            print("Saved 4 clean sets for score99")

        else:
            sample_size = config["sample_size"] // 2
            if len(df_yes) < sample_size or len(df_no) < sample_size:
                print(f"Not enough data for a 50/50 split in '{subset_name}' — skipping")
                continue

            df_yes_sampled = stratified_sample(df_yes, sample_size)
            df_no_sampled = stratified_sample(df_no, sample_size)
            df_sampled = pd.concat([df_yes_sampled, df_no_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

            split_and_save(df_sampled, subset_name)

    except Exception as e:
        print(f"Error in subset '{subset_name}': {e}")



📦 Processing subset: fullsample
Saved 3 files for 'fullsample': 200 total posts

📦 Processing subset: score95
Only got 46 rows after stratified sampling. Sampling remaining randomly...
Only got 47 rows after stratified sampling. Sampling remaining randomly...
Saved 3 files for 'score95': 100 total posts

📦 Processing subset: score99
Only got 98 rows after stratified sampling. Sampling remaining randomly...
Only got 99 rows after stratified sampling. Sampling remaining randomly...
Created 4 separate sets for score99 — 150 posts each for Tobias and Abel
