# PrismAI

## Data

### Upload Data to HuggingFace Hub

In [None]:
# Read the HuggingFace access token from a local file; the context manager
# closes the handle right away instead of leaving that to garbage collection.
with open("/home/mastoeck/.hf_token", encoding="utf-8") as _token_file:
    HF_TOKEN = _token_file.read().strip()

In [None]:
import bz2
import json
from collections import defaultdict
from pathlib import Path

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

In [None]:
def flatten_samples(batch: dict):
    """Turn a batch of nested sample lists into flat per-key columns.

    ``batch["samples"]`` holds one list of sample dicts per input row. Every
    sample dict contributes one value to the output column of each of its
    keys, so the output usually has more rows than the input batch.

    NOTE(review): columns only stay aligned if all samples share the same
    keys -- presumably guaranteed by the data; confirm.

    Returns:
        A ``defaultdict(list)`` mapping each sample key to its collected values.
    """
    columns = defaultdict(list)
    # Walk every sample of every row in the original order.
    records = (record for group in batch["samples"] for record in group)
    for record in records:
        for field in record:
            columns[field].append(record[field])
    return columns


In [None]:
from unicodedata import normalize


def normalize_text(batch: dict[str, list[str]]):
    """Normalize the ``"text"`` column of a batch.

    Each text is converted to Unicode NFC form, and every run of whitespace
    (including leading and trailing whitespace) is collapsed so that words
    are separated by exactly one space.

    Returns:
        A dict with the single key ``"text"`` holding the cleaned strings.
    """
    cleaned = []
    for raw in batch["text"]:
        composed = normalize("NFC", raw)
        # split() without arguments already discards leading/trailing whitespace.
        cleaned.append(" ".join(composed.split()))
    return {"text": cleaned}

In [None]:
# Upload every "prismai-*" dump found in ../data as its own config of the
# Hub repository, with one split per agent.
for path in sorted(Path("../data").iterdir()):
    if not (path.is_file() and path.stem.startswith("prismai-")):
        continue

    # Strip the last suffix, take the stem of what remains, then drop the
    # "prismai-" prefix to obtain the config name.
    config_name = path.with_suffix("").stem.removeprefix("prismai-")
    print(config_name)

    # Read the bz2-compressed JSON lines inside a context manager so the file
    # handle is closed deterministically. JSON text is UTF-8, so the encoding
    # is stated explicitly; blank lines are skipped because json.loads("")
    # would raise.
    with bz2.open(path, "rt", encoding="utf-8") as lines:
        records = [json.loads(line) for line in lines if line.strip()]

    raw_dataset = (
        Dataset.from_list(records)
        .map(
            flatten_samples,
            batched=True,
            remove_columns=["_id", "samples"],
        )
        .map(normalize_text, batched=True)
        # Rewrite "-" and ":" in agent names to "_" before they are used as
        # split names below.
        .map(lambda sample: {"agent": sample["agent"].replace("-", "_").replace(":", "_")})
    )
    # One split per agent; sorting makes the split order reproducible instead
    # of depending on set iteration order.
    dataset = DatasetDict(
        {
            agent: raw_dataset.filter(lambda x: x["agent"] == agent)
            for agent in sorted(set(raw_dataset["agent"]))
        }
    )
    dataset.push_to_hub(
        "liberi-luminaris/PrismAI",
        config_name,
        private=True,
        token=HF_TOKEN,
    )

In [None]:
# Merge the German full-text configs into a single "de-fulltext" config with
# one split per agent. Every row is tagged with the domain it came from.
# NOTE: an earlier variant (previously kept here as commented-out code) loaded
# and pushed one config per agent, e.g. f"de-fulltext-{agent}", for the agents
# "gpt_4o_mini" and "gemma2_9b".
datasets: list[DatasetDict] = [
    load_dataset("liberi-luminaris/PrismAI", f"{domain}-fulltext").map(
        lambda _: {"domain": domain}
    )
    for domain in ["bundestag", "spiegel_articles", "gutenberg_de"]
]  # type: ignore
# Union of all agent (split) names that occur in any domain.
agents = set(agent for domain_dataset in datasets for agent in domain_dataset)
# Sorted iteration keeps the split order reproducible across runs; domains
# that lack an agent are simply left out of that agent's concatenation.
dataset = DatasetDict(
    {
        agent: concatenate_datasets(
            [
                domain_dataset[agent]
                for domain_dataset in datasets
                if agent in domain_dataset
            ]
        )
        for agent in sorted(agents)
    }
)
dataset.push_to_hub(
    "liberi-luminaris/PrismAI",
    "de-fulltext",
    private=True,
    token=HF_TOKEN,
)

In [32]:
# Merge the English full-text configs into a single "en-fulltext" config with
# one split per agent. Every row is tagged with the domain it came from.
datasets: list[DatasetDict] = [
    load_dataset("liberi-luminaris/PrismAI", f"{domain}-fulltext").map(
        lambda _: {"domain": domain}
    )
    for domain in [
        "blog_authorship_corpus",
        "student_essays",
        "cnn_news",
        "euro_court_cases",
        "house_of_commons",
        "arxiv_papers",
        "gutenberg_en",
    ]
]  # type: ignore
# Union of all agent (split) names that occur in any domain.
agents = set(agent for domain_dataset in datasets for agent in domain_dataset)
# Sorted iteration keeps the split order reproducible across runs; domains
# that lack an agent are simply left out of that agent's concatenation.
dataset = DatasetDict(
    {
        agent: concatenate_datasets(
            [
                domain_dataset[agent]
                for domain_dataset in datasets
                if agent in domain_dataset
            ]
        )
        for agent in sorted(agents)
    }
)
dataset.push_to_hub(
    "liberi-luminaris/PrismAI",
    "en-fulltext",
    private=True,
    token=HF_TOKEN,
)

Map:   0%|          | 0/195 [00:00<?, ? examples/s]

Map:   0%|          | 0/42311 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/15177 [00:00<?, ? examples/s]

Map:   0%|          | 0/36239 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/1167 [00:00<?, ? examples/s]

o3_mini-00000-of-00001.parquet:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

human-00000-of-00001.parquet:   0%|          | 0.00/47.6M [00:00<?, ?B/s]

nemotron-00000-of-00001.parquet:   0%|          | 0.00/189k [00:00<?, ?B/s]

gpt_4_turbo-00000-of-00001.parquet:   0%|          | 0.00/186k [00:00<?, ?B/s]

gemma2_9b-00000-of-00001.parquet:   0%|          | 0.00/28.7M [00:00<?, ?B/s]

gpt_4o_mini-00000-of-00001.parquet:   0%|          | 0.00/29.4M [00:00<?, ?B/s]

deepseek_r1_1.5b-00000-of-00001.parquet:   0%|          | 0.00/384k [00:00<?, ?B/s]

deepseek_r1_32b-00000-of-00001.parquet:   0%|          | 0.00/513k [00:00<?, ?B/s]

phi3_3.8b-00000-of-00001.parquet:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Generating o3_mini split:   0%|          | 0/7 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/18773 [00:00<?, ? examples/s]

Generating nemotron split:   0%|          | 0/79 [00:00<?, ? examples/s]

Generating gpt_4_turbo split:   0%|          | 0/74 [00:00<?, ? examples/s]

Generating gemma2_9b split:   0%|          | 0/13799 [00:00<?, ? examples/s]

Generating gpt_4o_mini split:   0%|          | 0/11814 [00:00<?, ? examples/s]

Generating deepseek_r1_1.5b split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating deepseek_r1_32b split:   0%|          | 0/217 [00:00<?, ? examples/s]

Generating phi3_3.8b split:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/18773 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Map:   0%|          | 0/13799 [00:00<?, ? examples/s]

Map:   0%|          | 0/11814 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

o3_mini-00000-of-00001.parquet:   0%|          | 0.00/610k [00:00<?, ?B/s]

human-00000-of-00001.parquet:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

nemotron-00000-of-00001.parquet:   0%|          | 0.00/993k [00:00<?, ?B/s]

gpt_4_turbo-00000-of-00001.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

gemma2_9b-00000-of-00001.parquet:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

gpt_4o_mini-00000-of-00001.parquet:   0%|          | 0.00/9.05M [00:00<?, ?B/s]

deepseek_r1_1.5b-00000-of-00001.parquet:   0%|          | 0.00/387k [00:00<?, ?B/s]

deepseek_r1_32b-00000-of-00001.parquet:   0%|          | 0.00/552k [00:00<?, ?B/s]

phi3_3.8b-00000-of-00001.parquet:   0%|          | 0.00/394k [00:00<?, ?B/s]

Generating o3_mini split:   0%|          | 0/109 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/9148 [00:00<?, ? examples/s]

Generating nemotron split:   0%|          | 0/474 [00:00<?, ? examples/s]

Generating gpt_4_turbo split:   0%|          | 0/59 [00:00<?, ? examples/s]

Generating gemma2_9b split:   0%|          | 0/8909 [00:00<?, ? examples/s]

Generating gpt_4o_mini split:   0%|          | 0/3761 [00:00<?, ? examples/s]

Generating deepseek_r1_1.5b split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating deepseek_r1_32b split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating phi3_3.8b split:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/9148 [00:00<?, ? examples/s]

Map:   0%|          | 0/474 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/8909 [00:00<?, ? examples/s]

Map:   0%|          | 0/3761 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

o3_mini-00000-of-00001.parquet:   0%|          | 0.00/709k [00:00<?, ?B/s]

human-00000-of-00001.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

nemotron-00000-of-00001.parquet:   0%|          | 0.00/175k [00:00<?, ?B/s]

gpt_4_turbo-00000-of-00001.parquet:   0%|          | 0.00/154k [00:00<?, ?B/s]

gemma2_9b-00000-of-00001.parquet:   0%|          | 0.00/25.8M [00:00<?, ?B/s]

gpt_4o_mini-00000-of-00001.parquet:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

deepseek_r1_1.5b-00000-of-00001.parquet:   0%|          | 0.00/367k [00:00<?, ?B/s]

deepseek_r1_32b-00000-of-00001.parquet:   0%|          | 0.00/430k [00:00<?, ?B/s]

phi3_3.8b-00000-of-00001.parquet:   0%|          | 0.00/399k [00:00<?, ?B/s]

Generating o3_mini split:   0%|          | 0/171 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/13760 [00:00<?, ? examples/s]

Generating nemotron split:   0%|          | 0/81 [00:00<?, ? examples/s]

Generating gpt_4_turbo split:   0%|          | 0/74 [00:00<?, ? examples/s]

Generating gemma2_9b split:   0%|          | 0/13343 [00:00<?, ? examples/s]

Generating gpt_4o_mini split:   0%|          | 0/5961 [00:00<?, ? examples/s]

Generating deepseek_r1_1.5b split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating deepseek_r1_32b split:   0%|          | 0/218 [00:00<?, ? examples/s]

Generating phi3_3.8b split:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/13760 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Map:   0%|          | 0/13343 [00:00<?, ? examples/s]

Map:   0%|          | 0/5961 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/218 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

o3_mini-00000-of-00001.parquet:   0%|          | 0.00/949k [00:00<?, ?B/s]

human-00000-of-00001.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

gpt_4_turbo-00000-of-00001.parquet:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

gemma2_9b-00000-of-00001.parquet:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

gpt_4o_mini-00000-of-00001.parquet:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

deepseek_r1_1.5b-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

deepseek_r1_32b-00000-of-00001.parquet:   0%|          | 0.00/476k [00:00<?, ?B/s]

phi3_3.8b-00000-of-00001.parquet:   0%|          | 0.00/645k [00:00<?, ?B/s]

Generating o3_mini split:   0%|          | 0/116 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/8448 [00:00<?, ? examples/s]

Generating gpt_4_turbo split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating gemma2_9b split:   0%|          | 0/7780 [00:00<?, ? examples/s]

Generating gpt_4o_mini split:   0%|          | 0/7173 [00:00<?, ? examples/s]

Generating deepseek_r1_1.5b split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating deepseek_r1_32b split:   0%|          | 0/218 [00:00<?, ? examples/s]

Generating phi3_3.8b split:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Map:   0%|          | 0/8448 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/7780 [00:00<?, ? examples/s]

Map:   0%|          | 0/7173 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/218 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

o3_mini-00000-of-00001.parquet:   0%|          | 0.00/545k [00:00<?, ?B/s]

human-00000-of-00003.parquet:   0%|          | 0.00/454M [00:00<?, ?B/s]

human-00001-of-00003.parquet:   0%|          | 0.00/440M [00:00<?, ?B/s]

human-00002-of-00003.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

nemotron-00000-of-00001.parquet:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

gpt_4_turbo-00000-of-00001.parquet:   0%|          | 0.00/40.4k [00:00<?, ?B/s]

gemma2_9b-00000-of-00003.parquet:   0%|          | 0.00/6.08M [00:00<?, ?B/s]

gemma2_9b-00001-of-00003.parquet:   0%|          | 0.00/6.02M [00:00<?, ?B/s]

gemma2_9b-00002-of-00003.parquet:   0%|          | 0.00/5.99M [00:00<?, ?B/s]

gpt_4o_mini-00000-of-00001.parquet:   0%|          | 0.00/5.92M [00:00<?, ?B/s]

deepseek_r1_1.5b-00000-of-00001.parquet:   0%|          | 0.00/207k [00:00<?, ?B/s]

deepseek_r1_32b-00000-of-00001.parquet:   0%|          | 0.00/159k [00:00<?, ?B/s]

phi3_3.8b-00000-of-00001.parquet:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

Generating o3_mini split:   0%|          | 0/65 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/6801 [00:00<?, ? examples/s]

Generating nemotron split:   0%|          | 0/381 [00:00<?, ? examples/s]

Generating gpt_4_turbo split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating gemma2_9b split:   0%|          | 0/6575 [00:00<?, ? examples/s]

Generating gpt_4o_mini split:   0%|          | 0/1485 [00:00<?, ? examples/s]

Generating deepseek_r1_1.5b split:   0%|          | 0/113 [00:00<?, ? examples/s]

Generating deepseek_r1_32b split:   0%|          | 0/49 [00:00<?, ? examples/s]

Generating phi3_3.8b split:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/6801 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/6575 [00:00<?, ? examples/s]

Map:   0%|          | 0/1485 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/81 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/liberi-luminaris/PrismAI/commit/d903bd37cbb12e2201d4511fe60fb3d80231bd2c', commit_message='Upload dataset', commit_description='', oid='d903bd37cbb12e2201d4511fe60fb3d80231bd2c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/liberi-luminaris/PrismAI', endpoint='https://huggingface.co', repo_type='dataset', repo_id='liberi-luminaris/PrismAI'), pr_revision=None, pr_num=None)