In [None]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import pymongo
from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm
from transformers import AutoTokenizer

# Tokenizers whose token counts are compared in the cells below.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
llama_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B", use_fast=True
)

# Load environment variables from the .env file one directory above.
load_dotenv("../.env")

# Fail fast when the connection string is missing or empty: MongoClient(None)
# would silently connect to the default localhost server instead of the
# intended database, and the problem would only surface much later.
mongo_uri = os.environ.get("MONGO_DB_CONNECTION")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_DB_CONNECTION is not set; define it in ../.env or the environment."
    )
client = MongoClient(mongo_uri)
db = client.get_database("prismai")

In [None]:
# Token-length accumulators for the PrismAI corpus, one per tokenizer.
# Each maps a domain name to the list of per-document token counts that the
# collection loop appends to it.
prismai_gpt2, prismai_llama = defaultdict(list), defaultdict(list)

In [None]:
# Collect, per domain, the token length of every PrismAI source text under
# both tokenizers.
#
# The accumulators are created in an earlier cell; empty them first so that
# re-running this cell replaces the previous results instead of appending a
# second copy of every length.
prismai_gpt2.clear()
prismai_llama.clear()

for domain in tqdm(db.get_collection("collected_items").distinct("domain"), position=0):
    # Join each feature document of this domain with the collected item it
    # references and keep only that item's text.
    text_pipeline = [
        {"$match": {"document.domain": domain}},
        {"$project": {"_id": 1, "document._id.$id": 1}},
        {
            "$lookup": {
                "from": "collected_items",
                "localField": "document._id.$id",
                "foreignField": "_id",
                "as": "text",
                "pipeline": [
                    {"$project": {"text": 1}},
                ],
            }
        },
        # Flatten the joined `text` array into one record per matched item.
        {"$unwind": "$text"},
        {"$project": {"text": "$text.text"}},
    ]
    for item in tqdm(
        db.get_collection("features_prismai").aggregate(text_pipeline),
        desc=domain,
        position=1,
    ):
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(item.get("text").strip().split())
        for tokenizer, lengths_by_domain in (
            (gpt_tokenizer, prismai_gpt2),
            (llama_tokenizer, prismai_llama),
        ):
            # A single text is encoded, so "length" holds exactly one entry;
            # the tuple unpacking fails loudly if that ever stops being true.
            (length,) = tokenizer(text, return_length=True)["length"]
            lengths_by_domain[domain].append(length)

In [None]:
# Summarise the PrismAI token lengths: one row per domain, with the 5th
# percentile, median and 95th percentile of the lengths under each tokenizer.
df = pd.DataFrame(
    [
        {
            f"{tokenizer_name}_{stat_name}": stat(lengths)
            for tokenizer_name, lengths in (
                ("gpt2", prismai_gpt2[domain]),
                ("llama", prismai_llama[domain]),
            )
            for stat_name, stat in (
                ("05p", lambda values: np.percentile(values, 5)),
                ("median", np.median),
                ("95p", lambda values: np.percentile(values, 95)),
            )
        }
        for domain in prismai_llama
    ],
    columns=[
        "gpt2_05p",
        "gpt2_median",
        "gpt2_95p",
        "llama_05p",
        "llama_median",
        "llama_95p",
    ],
    index=list(prismai_llama),
)
# LaTeX export (currently disabled):
# print(df.to_latex(float_format="\\np{%d}"))
df

\begin{tabular}{lrrrrrr}
\toprule
 & gpt2_05p & gpt2_median & gpt2_95p & llama_05p & llama_median & llama_95p \\
\midrule
arxiv_papers & \np{1009} & \np{11338} & \np{34941} & \np{966} & \np{11158} & \np{34433} \\
blog_authorship_corpus & \np{15} & \np{65} & \np{600} & \np{16} & \np{66} & \np{593} \\
bundestag & \np{234} & \np{1342} & \np{2483} & \np{170} & \np{946} & \np{1747} \\
cnn_news & \np{309} & \np{749} & \np{1597} & \np{309} & \np{748} & \np{1588} \\
euro_court_cases & \np{258} & \np{984} & \np{5046} & \np{280} & \np{1019} & \np{5140} \\
gutenberg & \np{784} & \np{39006} & \np{222774} & \np{778} & \np{37531} & \np{202753} \\
house_of_commons & \np{89} & \np{818} & \np{18497} & \np{91} & \np{822} & \np{18700} \\
spiegel_articles & \np{334} & \np{912} & \np{2603} & \np{250} & \np{682} & \np{1934} \\
student_essays & \np{213} & \np{439} & \np{890} & \np{212} & \np{436} & \np{884} \\
\bottomrule
\end{tabular}



Unnamed: 0,gpt2_05p,gpt2_median,gpt2_95p,llama_05p,llama_median,llama_95p
arxiv_papers,1009.0,11338.0,34941.5,966.5,11158.0,34433.0
blog_authorship_corpus,15.0,65.0,600.0,16.0,66.0,593.0
bundestag,234.0,1342.0,2483.9,170.0,946.0,1747.0
cnn_news,309.0,749.0,1597.0,309.55,748.0,1588.0
euro_court_cases,258.25,984.0,5046.0,280.25,1019.0,5140.0
gutenberg,784.9,39006.0,222774.0,778.9,37531.0,202753.0
house_of_commons,89.0,818.0,18497.4,91.0,822.0,18700.55
spiegel_articles,334.0,912.0,2603.0,250.0,682.0,1934.0
student_essays,213.0,439.0,890.0,212.0,436.0,884.0


In [None]:
# Token lengths of every document in the external benchmark datasets,
# keyed by dataset name, with one accumulator per tokenizer.
values_gpt2, values_llama = defaultdict(list), defaultdict(list)

for dataset_name in (
    "CHEAT",
    "Ghostbuster",
    "HC3-Plus",
    "MAGE",
    "OpenLLMText",
    "SeqXGPT",
):
    documents = db.get_collection(f"dataset_{dataset_name}").find()
    for document in tqdm(documents, position=0, desc=dataset_name):
        # Collapse every run of whitespace into a single space before tokenizing.
        normalized = " ".join(document.get("text").strip().split())
        for tokenizer, lengths_by_dataset in (
            (gpt_tokenizer, values_gpt2),
            (llama_tokenizer, values_llama),
        ):
            # A single text is encoded, so "length" holds exactly one entry.
            (token_count,) = tokenizer(normalized, return_length=True)["length"]
            lengths_by_dataset[dataset_name].append(token_count)

In [None]:
# Summarise the benchmark-dataset token lengths: one row per dataset, with the
# 5th percentile, median and 95th percentile of the lengths under each tokenizer.
df = pd.DataFrame(
    [
        {
            f"{tokenizer_name}_{stat_name}": stat(lengths)
            for tokenizer_name, lengths in (
                ("gpt2", values_gpt2[dataset]),
                ("llama", values_llama[dataset]),
            )
            for stat_name, stat in (
                ("05p", lambda values: np.percentile(values, 5)),
                ("median", np.median),
                ("95p", lambda values: np.percentile(values, 95)),
            )
        }
        for dataset in values_llama
    ],
    columns=[
        "gpt2_05p",
        "gpt2_median",
        "gpt2_95p",
        "llama_05p",
        "llama_median",
        "llama_95p",
    ],
    index=list(values_llama),
)
# Print the table as LaTeX with every number wrapped in \np{...}.
# "%.0f" rounds interpolated percentiles to the nearest integer, whereas "%d"
# would truncate them (e.g. 2483.9 -> 2483) and raise ValueError on NaN.
print(df.to_latex(float_format="\\np{%.0f}"))
df

\begin{tabular}{lrrrrrr}
\toprule
 & gpt2_05p & gpt2_median & gpt2_95p & llama_05p & llama_median & llama_95p \\
\midrule
CHEAT & \np{106} & \np{176} & \np{298} & \np{105} & \np{173} & \np{291} \\
Ghostbuster & \np{280} & \np{632} & \np{997} & \np{281} & \np{631} & \np{998} \\
HC3-Plus & \np{12} & \np{52} & \np{383} & \np{11} & \np{41} & \np{257} \\
MAGE & \np{36} & \np{141} & \np{951} & \np{37} & \np{142} & \np{952} \\
OpenLLMText & \np{120} & \np{392} & \np{1024} & \np{120} & \np{390} & \np{1031} \\
SeqXGPT & \np{72} & \np{270} & \np{504} & \np{73} & \np{270} & \np{499} \\
\bottomrule
\end{tabular}



Unnamed: 0,gpt2_05p,gpt2_median,gpt2_95p,llama_05p,llama_median,llama_95p
CHEAT,106.0,176.0,298.0,105.0,173.0,291.0
Ghostbuster,280.0,632.0,997.0,281.0,631.0,998.05
HC3-Plus,12.0,52.0,383.0,11.0,41.0,257.0
MAGE,36.0,141.0,951.0,37.0,142.0,952.0
OpenLLMText,120.0,392.0,1024.0,120.0,390.0,1031.0
SeqXGPT,72.0,270.0,504.0,73.0,270.0,499.0
