In [1]:
from pathlib import Path

from datasets import load_dataset, Dataset

from luminar.utils.data import get_matched_ids

# Access token passed to `load_dataset(..., token=HF_TOKEN)`; read from
# ~/.hf_token with surrounding whitespace (e.g. a trailing newline) removed.
HF_TOKEN = Path.home().joinpath(".hf_token").read_text().strip()

In [2]:
# Domain names to summarise. Each one is turned into a dataset config name of
# the form "<domain>-fulltext" by the cell that loads the data, and the list
# order is the order in which rows are appended to the stats table.
domains = [
    "blog_authorship_corpus",
    "student_essays",
    "cnn_news",
    "euro_court_cases",
    "house_of_commons",
    "arxiv_papers",
    "gutenberg_en",
    "bundestag",
    "spiegel_articles",
    "gutenberg_de",
]
# Agent names that are joined with "human" to build the split expression and
# that are passed to `get_matched_ids`.
# NOTE: this is a set, so its iteration order is not guaranteed to be the
# same from one interpreter run to the next.
agents = {"gpt_4o_mini", "gemma2_9b"}

In [3]:
# Collect one summary row per (domain, split): first the combined
# "human + every agent" split, then one row per individual agent.

NUM_PROC = 8  # worker processes used for the map/filter passes below

# `agents` is a set of strings, and string hashing is randomised per
# interpreter run, so iterating it directly gives a different order after a
# kernel restart. Sort once so the split expression, the "split" labels and the
# row order are reproducible.
agent_names = sorted(agents)
dataset_split_name = "+".join(["human", *agent_names])

stats = []
for domain in domains:
    dataset_config_name = f"{domain}-fulltext"
    dataset: Dataset = (
        load_dataset(
            "liberi-luminaris/PrismAI-encoded-gpt2",
            dataset_config_name,
            split=dataset_split_name,
            token=HF_TOKEN,
        )  # type: ignore
        # Swap the "features" column for its length ("len") ...
        .map(
            lambda features: {"len": len(features)},
            input_columns=["features"],
            remove_columns=["features"],
            num_proc=NUM_PROC,
        )
        # ... and keep only the rows whose features were non-empty.
        .filter(
            lambda length: length > 0,
            input_columns=["len"],
            num_proc=NUM_PROC,
        )
    )

    dataset_size = len(dataset)

    # Row for the combined split (human + all agents).
    stats.append(
        {
            "domain": domain,
            "dataset_size": dataset_size,
            "matched_dataset_size": len(get_matched_ids(dataset, agents)),
            "split": dataset_split_name,
        }
    )

    # One row per single agent.
    # NOTE(review): these rows reuse the size of the combined dataset as
    # "dataset_size"; it is not the size of "human+<agent>" alone. Confirm
    # whether that is intended before comparing the columns per split.
    for agent in agent_names:
        stats.append(
            {
                "domain": domain,
                "dataset_size": dataset_size,
                "matched_dataset_size": len(get_matched_ids(dataset, {agent})),
                "split": f"human+{agent}",
            }
        )

Filter:   0%|          | 0/93727 [00:00<?, ? examples/s]

Filter:   0%|          | 0/93727 [00:00<?, ? examples/s]

Filter:   0%|          | 0/44386 [00:00<?, ? examples/s]

Filter:   0%|          | 0/44386 [00:00<?, ? examples/s]

Filter:   0%|          | 0/21818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/21818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33064 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33064 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23401 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23401 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14861 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14861 [00:00<?, ? examples/s]

Filter:   0%|          | 0/41783 [00:00<?, ? examples/s]

Filter:   0%|          | 0/41783 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43023 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43023 [00:00<?, ? examples/s]

Filter:   0%|          | 0/810 [00:00<?, ? examples/s]

Filter:   0%|          | 0/810 [00:00<?, ? examples/s]

In [4]:
import pandas


# Append one "all" row per split to the per-domain rows.
#
# Totals are kept per split on purpose: every domain contributes several rows
# (one per split) that all carry the same "dataset_size", so a single sum over
# every row would count each domain once per split and would also add matched
# counts that belong to different splits.
split_totals = {}
for row in stats:
    total = split_totals.setdefault(
        row["split"],
        {
            "domain": "all",
            "dataset_size": 0,
            "matched_dataset_size": 0,
            "split": row["split"],
        },
    )
    total["dataset_size"] += row["dataset_size"]
    total["matched_dataset_size"] += row["matched_dataset_size"]

# Last expression of the cell: rendered as the result table.
pandas.DataFrame(stats + list(split_totals.values()))

Unnamed: 0,domain,dataset_size,matched_dataset_size,split
0,blog_authorship_corpus,37864,14818,human+gemma2_9b+gpt_4o_mini
1,blog_authorship_corpus,37864,14674,human+gemma2_9b
2,blog_authorship_corpus,37864,4576,human+gpt_4o_mini
3,student_essays,93727,41389,human+gemma2_9b+gpt_4o_mini
4,student_essays,93727,15177,human+gemma2_9b
5,student_essays,93727,36239,human+gpt_4o_mini
6,cnn_news,44386,17756,human+gemma2_9b+gpt_4o_mini
7,cnn_news,44386,13799,human+gemma2_9b
8,cnn_news,44386,11814,human+gpt_4o_mini
9,euro_court_cases,21818,9106,human+gemma2_9b+gpt_4o_mini
