In [None]:
import gzip
from pathlib import Path

from datasets import Dataset, load_dataset

from luminar.utils.data import get_matched_ids

# Hugging Face access token, read from `~/.hf_token` with surrounding whitespace removed.
HF_TOKEN = Path.home().joinpath(".hf_token").read_text().strip()

In [None]:
# Text producers to process; each name is used as a dataset split name when loading.
agents = {"human", "gpt_4o_mini", "gemma2_9b"}

# Corpus domains to process; "-fulltext" is appended to build the dataset config name.
# Currently disabled: "bundestag", "spiegel_articles", "gutenberg_de".
domains = [
    "blog_authorship_corpus",
    "student_essays",
    "cnn_news",
    "euro_court_cases",
    "house_of_commons",
    "arxiv_papers",
    "gutenberg_en",
]

In [None]:
# Export every (domain, agent) split of the PrismAI full-text dataset as a
# gzip-compressed JSON file at `<path>/<domain>-fulltext/<agent>.jsonl.gz`.
path = Path("../data/prismai/share")

# Kept so that `stats` exists for the summary cell even if the statistics cell is not run.
stats = []
for domain in domains:
    dataset_config_name = f"{domain}-fulltext"
    folder = path / dataset_config_name
    folder.mkdir(parents=True, exist_ok=True)

    # `agents` is a set; sorting gives the same export order on every run.
    for agent in sorted(agents):
        dataset: Dataset = load_dataset(
            "liberi-luminaris/PrismAI",
            dataset_config_name,
            split=agent,
            token=HF_TOKEN,
        )  # type: ignore

        # Build the file name explicitly: `Path.with_suffix` would replace
        # everything after a dot if an agent name ever contained one.
        out_path = folder / f"{agent}.jsonl.gz"
        with gzip.open(out_path, "wb") as fout:
            dataset.to_json(fout)

In [None]:
# Collect per-domain size statistics for the GPT-2-encoded PrismAI dataset:
# the number of samples with non-empty features and the number of ids
# returned by `get_matched_ids`.

# Combined split expression, identical for every domain. "human" comes first
# exactly once (`agents` may itself contain "human"); the remaining agents are
# sorted so the expression does not depend on set iteration order.
dataset_split_name = "+".join(["human", *sorted(set(agents) - {"human"})])

stats = []
for domain in domains:
    dataset_config_name = f"{domain}-fulltext"
    dataset: Dataset = (
        load_dataset(
            "liberi-luminaris/PrismAI-encoded-gpt2",
            dataset_config_name,
            split=dataset_split_name,
            token=HF_TOKEN,
        )  # type: ignore
        # Replace the "features" column with its length in a new "len" column.
        .map(
            lambda features: {"len": len(features)},
            input_columns=["features"],
            remove_columns=["features"],
            num_proc=8,
        )
        # Keep only samples whose features are non-empty.
        .filter(lambda length: length > 0, input_columns=["len"], num_proc=8)
    )
    # NOTE(review): `agents` is passed unchanged (including "human" when present);
    # confirm that this is what `get_matched_ids` expects.
    matched_ids = get_matched_ids(dataset, agents)
    stats.append(
        {
            "domain": domain,
            "dataset_size": len(dataset),
            "matched_dataset_size": len(matched_ids),
            "splits": dataset_split_name,
        }
    )

In [None]:
import pandas


# Render the per-domain statistics as a table, followed by an "all" row that
# holds the column totals (that row has no "splits" entry).
pandas.DataFrame(
    [
        *stats,
        {
            "domain": "all",
            **{
                column: sum(row[column] for row in stats)
                for column in ("dataset_size", "matched_dataset_size")
            },
        },
    ]
)