In [None]:
import bz2

from bson import json_util
from pymongo import MongoClient
from tqdm import tqdm

import os

# Connection string for the PrismAI MongoDB instance. It can be overridden via
# the PRISMAI_MONGODB_URI environment variable so that credentials do not have
# to be edited into (and committed with) the notebook.
# NOTE(review): the fallback literal still embeds a username/password; once the
# environment variable is set wherever this notebook runs, drop the literal.
MONGODB_URI = os.environ.get(
    "PRISMAI_MONGODB_URI",
    "mongodb://prismai:prismai@isengart.hucompute.org:27123/?retryWrites=true&loadBalanced=false&serverSelectionTimeoutMS=5000&connectTimeoutMS=10000&authSource=admin&authMechanism=SCRAM-SHA-256",
)

# For each domain, write two bz2-compressed JSONL files under /tmp:
#   * prismai_<domain>_metrics.jsonl.bz2 - one line per matching document of
#     `features_prismai`, reduced to `_id`, `agent`, `type` and `metrics`;
#   * prismai_<domain>.jsonl.bz2 - the `collected_items` documents of that
#     domain whose `_id` appeared in the metrics export.
# The client is used as a context manager so its connections are closed when
# the cell finishes, including when an export step raises.
with MongoClient(MONGODB_URI) as client:
    db = client["prismai"]
    collected_items = db["collected_items"]
    features_prismai = db["features_prismai"]

    for domain in tqdm(("spiegel_articles", "cnn_news")):
        # `_id` values seen in the metrics export (projected from
        # `document._id.$id`); they select the collected items written below.
        ids = set()

        with bz2.open(
            f"/tmp/prismai_{domain}_metrics.jsonl.bz2", "wt", encoding="utf-8"
        ) as fp:
            for doc in features_prismai.find(
                {
                    "document.domain": domain,
                    # Sub-document equality: matches this exact model /
                    # pre-processor description only.
                    "model": {
                        "name": "gpt2",
                        "provider": "transformers",
                        "variant": "default",
                    },
                    "pre_processor": {"type": "truncate", "max_length": 1024},
                    "document.agent": {"$in": [None, "gpt-4o-mini"]},
                    "document.type": {"$in": ["source", "fulltext"]},
                },
                projection={
                    "_id": "$document._id.$id",
                    "agent": "$document.agent",
                    "type": "$document.type",
                    "metrics": {"$first": "$metrics"},
                },
            ):
                ids.add(doc["_id"])
                fp.write(json_util.dumps(doc).strip() + "\n")

        with bz2.open(
            f"/tmp/prismai_{domain}.jsonl.bz2", "wt", encoding="utf-8"
        ) as fp:
            # With nothing to look up, skip the collection scan entirely; the
            # (empty) output file is still created by the `open` above.
            if ids:
                for doc in collected_items.find({"domain": domain}):
                    if doc["_id"] in ids:
                        ids.remove(doc["_id"])

                        fp.write(json_util.dumps(doc).strip() + "\n")

                        # Every requested item has been written: stop early
                        # instead of draining the rest of the cursor.
                        if not ids:
                            break

100%|██████████| 2/2 [00:10<00:00,  5.04s/it]


In [28]:
import bz2

from bson import json_util
from pymongo import MongoClient
from tqdm import tqdm

import os

# Connection string for the PrismAI MongoDB instance. It can be overridden via
# the PRISMAI_MONGODB_URI environment variable so that credentials do not have
# to be edited into (and committed with) the notebook.
# NOTE(review): the fallback literal still embeds a username/password; once the
# environment variable is set wherever this notebook runs, drop the literal.
MONGODB_URI = os.environ.get(
    "PRISMAI_MONGODB_URI",
    "mongodb://prismai:prismai@isengart.hucompute.org:27123/?retryWrites=true&loadBalanced=false&serverSelectionTimeoutMS=5000&connectTimeoutMS=10000&authSource=admin&authMechanism=SCRAM-SHA-256",
)

# For every domain spec, write one bz2-compressed JSONL file to
# ../data/prismai-<name>-fulltext-gpt_4o_mini.jsonl.bz2. Each line is one group
# produced by the aggregation below: an `_id` plus the list of `samples`
# (text, agent, label_str, label) collected under that id.
# The client is used as a context manager so its connections are closed when
# the cell finishes, including when an export step raises.
with MongoClient(MONGODB_URI) as client:
    db = client["prismai"]
    dataset_PrismAI = db["dataset_PrismAI"]

    for domain_kwargs in tqdm(
        (
            {"domain": "blog_authorship_corpus"},
            {"domain": "student_essays"},
            {"domain": "cnn_news"},
            {"domain": "euro_court_cases"},
            {"domain": "house_of_commons"},
            {"domain": "arxiv_papers"},
            {"domain": "gutenberg", "lang": "en-EN", "name": "gutenberg_en"},
            {"domain": "gutenberg", "lang": "de-DE", "name": "gutenberg_de"},
            {"domain": "bundestag"},
            {"domain": "spiegel_articles"},
        )
    ):
        # "name" only selects the output file name (defaulting to the domain);
        # every other key is an extra `$match` criterion. The spec itself is
        # left unmodified rather than popped from.
        domain_name = domain_kwargs.get("name", domain_kwargs["domain"])
        match_filter = {
            key: value for key, value in domain_kwargs.items() if key != "name"
        }

        pipeline = [
            {
                "$match": {
                    "type": "fulltext",
                    "agent": {"$in": ["human", "gpt-4o-mini"]},
                    **match_filter,
                }
            },
            {
                "$project": {
                    "text": 1,
                    "agent": 1,
                    # Keep the stored label under `label_str`; `label` becomes
                    # numeric: 0 for agent "human", 1 for anything else.
                    "label_str": "$label",
                    "label": {
                        "$cond": {
                            "if": {"$eq": ["$agent", "human"]},
                            "then": 0,
                            "else": 1,
                        }
                    },
                    # Grouping key: a human document uses its own `id`, any
                    # other document uses its `source` - presumably the id of
                    # the human text it was derived from (TODO confirm).
                    "id": {
                        "$cond": {
                            "if": {"$eq": ["$agent", "human"]},
                            "then": "$id",
                            "else": "$source",
                        }
                    },
                },
            },
            {
                "$group": {
                    "_id": "$id",
                    "samples": {
                        "$push": {
                            "text": "$text",
                            "agent": "$agent",
                            "label_str": "$label_str",
                            "label": "$label",
                        }
                    },
                }
            },
            {
                # Keep only groups holding at least two samples (index 1 exists).
                "$match": {
                    "samples.1": {"$exists": True},
                }
            },
        ]

        with bz2.open(
            f"../data/prismai-{domain_name}-fulltext-gpt_4o_mini.jsonl.bz2",
            "wt",
            encoding="utf-8",
        ) as fp:
            for doc in tqdm(
                dataset_PrismAI.aggregate(pipeline),
                position=1,
                leave=False,
            ):
                fp.write(json_util.dumps(doc).strip() + "\n")


100%|██████████| 10/10 [04:40<00:00, 28.00s/it]


In [None]:
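# NOTE: this whole cell is a disabled draft of a `$lookup`-based variant of the
# export in the previous cell. As written it prints the first aggregated
# document and then `break`s, so the trailing `fp.write` would never run if the
# cell were re-enabled unchanged.
# import bz2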

# from bson import json_util
# from pymongo import MongoClient
# from tqdm import tqdm

# client = MongoClient(
#     "mongodb://prismai:prismai@isengart.hucompute.org:27123/?retryWrites=true&loadBalanced=false&serverSelectionTimeoutMS=5000&connectTimeoutMS=10000&authSource=admin&authMechanism=SCRAM-SHA-256"
# )
# db = client["prismai"]
# dataset_PrismAI = db["dataset_PrismAI"]

# for domain_kwargs in tqdm(
#     (
#         {"domain": "blog_authorship_corpus"},
#         {"domain": "student_essays"},
#         {"domain": "cnn_news"},
#         {"domain": "euro_court_cases"},
#         {"domain": "house_of_commons"},
#         {"domain": "arxiv_papers"},
#         {"domain": "gutenberg", "lang": "en-EN", "name": "gutenberg_en"},
#         {"domain": "gutenberg", "lang": "de-DE", "name": "gutenberg_de"},
#         {"domain": "bundestag"},
#         {"domain": "spiegel_articles"},
#     )
# ):
#     domain_name = domain_kwargs.pop("name", domain_kwargs["domain"])
#     with bz2.open(
#         f"../data/prismai-{domain_name}-fulltext-gpt_4o_mini.jsonl.bz2",
#         "wt",
#         encoding="utf-8",
#     ) as fp:
#         for doc in tqdm(
#             dataset_PrismAI.aggregate(
#                 [
#                     {
#                         "$match": {
#                             "agent": "human",
#                             **domain_kwargs,
#                         }
#                     },
#                     {
#                         "$project": {
#                             "_id": 0,
#                             "id": 1,
#                             "human.text": "$text",
#                         },
#                     },
#                     {
#                         "$lookup": {
#                             "from": "dataset_PrismAI",
#                             "localField": "id",
#                             "foreignField": "source",
#                             "as": "ai",
#                             "pipeline": [
#                                 {
#                                     "$match": {
#                                         "type": "fulltext",
#                                         "agent": "gpt-4o-mini",
#                                     }
#                                 },
#                             #     # {
#                             #     #     "$project": {
#                             #     #         "text": 1,
#                             #     #         "agent": 1,
#                             #     #         "label_str": "$label",
#                             #     #         "label": {
#                             #     #             "$cond": {
#                             #     #                 "if": {"$eq": ["$label", "ai"]},
#                             #     #                 "then": 1,
#                             #     #                 "else": 0,
#                             #     #             }
#                             #     #         },
#                             #     #     }
#                             #     # },
#                             ],
#                         }
#                     },
#                     {
#                         "$match": {
#                             "ai": {"$ne": []},
#                         }
#                     }
#                     # {
#                     #     "$group": {
#                     #         "_id": "$source",
#                     #         "samples": {
#                     #             "$push": {
#                     #                 "text": "$text",
#                     #                 "agent": "$agent",
#                     #                 "label_str": "$label_str",
#                     #                 "label": "$label",
#                     #             }
#                     #         },
#                     #     }
#                     # },
#                 ]
#             ),
#             position=1,
#             leave=False,
#         ):
#             print(json_util.dumps(doc, indent=2))
#             break
#             fp.write(json_util.dumps(doc).strip() + "\n")
