### Script to stratify sampling of filtered data to be used for Manual Labeling
### Goal: Evaluate the classifier's performance

In [31]:
import pandas as pd
import os
import json
from datetime import datetime

In [66]:
# Parameters



INPUT_JSON = [
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_2.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_3.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_67.json",
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/climate_classified/climate_classified_posts_129.json",
]


OUTPUT_DIR = "../data/processed/annotated"

SAMPLE_SIZE = 500
SAMPLE_PER_LABEL = 250 

In [60]:
all_posts = []
for file in INPUT_JSON:
    with open(file, "r", encoding="utf-8") as f:
        all_posts.extend(json.load(f))

df = pd.DataFrame(all_posts)


In [61]:
def length_bucket(text):
    l = len(text)
    if l < 100:
        return "short"
    elif l < 300:
        return "medium"
    else:
        return "long"

def extract_hour(ts):
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).hour
    except:
        return None

In [62]:
df["text_length"] = df["text"].apply(len)
df["length_bucket"] = df["text"].apply(length_bucket)
df["hour"] = df["timestamp"].apply(extract_hour)
df = df.dropna(subset=["length_bucket", "hour"])
df["hour"] = df["hour"].astype(int)


In [39]:
sampled_rows = []
for (group_key, group_df), n_samples in zip(strata, samples_per_stratum.items()):
    n = min(n_samples[1], len(group_df))
    if n > 0:
        sampled_rows.append(group_df.sample(n=n, random_state=42))

sample_df = pd.concat(sampled_rows, ignore_index=True)
sample_df = sample_df.sample(n=SAMPLE_SIZE, random_state=42)

In [64]:
def stratified_sample(df_label, n):
    strata = df_label.groupby(["hour", "length_bucket"])
    total_count = len(df_label)
    stratum_sizes = strata.size()
    proportions = stratum_sizes / total_count
    samples_per_stratum = (proportions * n).round().astype(int)

    sampled_rows = []
    for (group_key, group_df), n_samples in zip(strata, samples_per_stratum.items()):
        count = min(n_samples[1], len(group_df))
        if count > 0:
            sampled_rows.append(group_df.sample(n=count, random_state=42))

    return pd.concat(sampled_rows, ignore_index=True)

In [67]:
yes_df = df[df["label"] == "yes"]
no_df = df[df["label"] == "no"]

sampled_yes = stratified_sample(yes_df, SAMPLE_PER_LABEL)
sampled_no = stratified_sample(no_df, SAMPLE_PER_LABEL)

sample_df = pd.concat([sampled_yes, sampled_no], ignore_index=True)
sample_df = sample_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [68]:
#Write our label / annotation in extra field (Could be changed if we get biased by score but shouldnt matter)
sample_df["annotation"] = ""

In [70]:
half = len(sample_df) // 2

df_tobias = sample_df.iloc[:half].reset_index(drop=True)
df_abel = sample_df.iloc[half:].reset_index(drop=True)

In [71]:
#Interested metadata for annotated file:
annotation_fields = ["repo", "seq", "text", "timestamp", "cid", "uri", "annotation"]
#Complete file
full_fields = annotation_fields + ["label", "score"]

In [36]:
with open(os.path.join(OUTPUT_DIR, "manual_labels_climate_tobias.json"), "w", encoding="utf-8") as f:
    json.dump(df_tobias[annotation_fields].to_dict(orient="records"), f, indent=2, ensure_ascii=False)

with open(os.path.join(OUTPUT_DIR, "manual_labels_climate_friend.json"), "w", encoding="utf-8") as f:
    json.dump(df_friend[annotation_fields].to_dict(orient="records"), f, indent=2, ensure_ascii=False)

with open(os.path.join(OUTPUT_DIR, "manual_labels_climate_full.json"), "w", encoding="utf-8") as f:
    json.dump(sample_df[full_fields].to_dict(orient="records"), f, indent=2, ensure_ascii=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/annotated/manual_labels_climate_Tobias.json'