In [2]:
from transformers import AutoTokenizer

# Name of the pretrained tokenizer checkpoint; the analysis cells below only
# use it to measure how many tokens a text occupies.
TOKENIZER_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [1]:
import os

import pymongo
from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm

# Load connection settings from the env file one directory above the notebook.
load_dotenv("../env")

# Fail fast when the connection string is missing: MongoClient(None) would not
# raise here but fall back to its default host, so the analysis below would
# silently run against the wrong server.
mongo_uri = os.environ.get("MONGO_DB_CONNECTION")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_DB_CONNECTION is not set; check the env file passed to load_dotenv()"
    )

client = MongoClient(mongo_uri)
db = client.get_database("prismai")
# Collections read by the length-distribution cells further down.
collected_items = db.get_collection("collected_items")
synthesized_texts = db.get_collection("synthesized_texts")

In [3]:
from collections import Counter
from pprint import pprint

# Maximum number of documents sampled per domain. Shared by the query limit
# and the progress-bar total so the two cannot drift apart.
SAMPLE_LIMIT = 1500

# For every domain, tally how long the sampled texts are in tokens and in
# whitespace-separated words, then print the ten largest lengths of each.
for domain in (
    "arxiv_papers",
    "blog_authorship_corpus",
    "bundestag",
    "cnn_news",
    "euro_court_cases",
    "gutenberg",
    "house_of_commons",
    "spiegel_articles",
    "student_essays",
):
    domain_filter = {"domain": domain}
    # Ask the server how many documents the capped query will actually yield,
    # so the progress bar is also correct for domains with fewer documents
    # than SAMPLE_LIMIT.
    total = collected_items.count_documents(domain_filter, limit=SAMPLE_LIMIT)

    token_dist = Counter()  # token length -> number of documents
    word_dist = Counter()  # word count   -> number of documents
    for doc in tqdm(
        collected_items.find(domain_filter, projection=["text"], limit=SAMPLE_LIMIT),
        total=total,
    ):
        text = doc["text"]
        # Only the reported length is used and nothing is fed to a model, so
        # the tokenizer's "longer than the specified maximum sequence length"
        # warning seen in the output is harmless here.
        token_dist.update(tokenizer(text, return_length=True)["length"])
        word_dist[len(text.split())] += 1

    print(domain)
    print("token_dist")
    # sorted() already returns a list; descending order puts the longest first.
    pprint(sorted(token_dist.items(), reverse=True)[:10])
    print()
    print("word_dist")
    pprint(sorted(word_dist.items(), reverse=True)[:10])


  0%|          | 0/1500 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10096 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 1500/1500 [00:40<00:00, 37.25it/s]


arxiv_papers
token_dist
[(221856, 1),
 (217126, 1),
 (178281, 1),
 (166378, 1),
 (146908, 1),
 (122233, 1),
 (101665, 1),
 (98652, 1),
 (82644, 1),
 (72937, 1)]

word_dist
[(139978, 1),
 (113636, 1),
 (93948, 1),
 (86011, 1),
 (67400, 1),
 (45572, 1),
 (43331, 1),
 (38512, 1),
 (38150, 1),
 (36054, 1)]


100%|██████████| 1500/1500 [00:01<00:00, 1451.66it/s]


blog_authorship_corpus
token_dist
[(1944, 1),
 (1811, 1),
 (1567, 1),
 (1479, 1),
 (1290, 1),
 (1272, 1),
 (1231, 1),
 (1135, 1),
 (1108, 1),
 (1072, 1)]

word_dist
[(1431, 1),
 (1212, 1),
 (1143, 1),
 (1111, 1),
 (964, 1),
 (948, 1),
 (853, 1),
 (825, 1),
 (819, 1),
 (801, 2)]


100%|██████████| 1500/1500 [00:02<00:00, 514.99it/s]


bundestag
token_dist
[(18823, 1),
 (6776, 1),
 (6149, 1),
 (5368, 1),
 (4939, 1),
 (4811, 1),
 (4588, 1),
 (4574, 1),
 (4501, 1),
 (4403, 1)]

word_dist
[(7072, 1),
 (2594, 1),
 (2365, 1),
 (2004, 1),
 (1951, 1),
 (1852, 1),
 (1825, 1),
 (1714, 1),
 (1686, 1),
 (1654, 1)]


100%|██████████| 1500/1500 [00:02<00:00, 535.76it/s]


cnn_news
token_dist
[(2504, 1),
 (2438, 1),
 (2296, 1),
 (2288, 1),
 (2262, 1),
 (2221, 1),
 (2211, 1),
 (2203, 1),
 (2167, 1),
 (2162, 1)]

word_dist
[(1835, 1),
 (1830, 1),
 (1800, 1),
 (1773, 1),
 (1750, 1),
 (1745, 1),
 (1735, 1),
 (1734, 1),
 (1720, 1),
 (1701, 1)]


100%|██████████| 1500/1500 [00:06<00:00, 239.61it/s]


euro_court_cases
token_dist
[(28976, 1),
 (23653, 1),
 (18201, 1),
 (16626, 1),
 (14964, 1),
 (14490, 1),
 (13953, 1),
 (13652, 1),
 (13478, 1),
 (13307, 1)]

word_dist
[(21799, 1),
 (17595, 1),
 (14366, 1),
 (12950, 1),
 (11439, 1),
 (10703, 1),
 (10334, 1),
 (10328, 1),
 (10254, 1),
 (10158, 1)]


100%|██████████| 1500/1500 [04:14<00:00,  5.90it/s]


gutenberg
token_dist
[(1285971, 1),
 (898834, 1),
 (715979, 1),
 (646163, 1),
 (542038, 1),
 (510828, 1),
 (508578, 1),
 (484058, 1),
 (475326, 1),
 (471663, 1)]

word_dist
[(571984, 1),
 (498393, 1),
 (489095, 1),
 (459223, 1),
 (414031, 1),
 (345361, 1),
 (343792, 1),
 (341143, 1),
 (325343, 1),
 (309169, 1)]


100%|██████████| 1500/1500 [00:14<00:00, 105.29it/s]


house_of_commons
token_dist
[(73579, 1),
 (64571, 1),
 (64116, 1),
 (63235, 1),
 (62268, 1),
 (60721, 1),
 (60648, 1),
 (60239, 1),
 (59698, 1),
 (59472, 1)]

word_dist
[(62527, 1),
 (55014, 1),
 (54194, 1),
 (53478, 1),
 (53337, 1),
 (51941, 1),
 (51394, 1),
 (51321, 1),
 (50566, 1),
 (50554, 1)]


100%|██████████| 1500/1500 [00:02<00:00, 539.69it/s]


spiegel_articles
token_dist
[(8785, 1),
 (6542, 1),
 (6425, 1),
 (5513, 1),
 (4994, 1),
 (4727, 1),
 (4725, 1),
 (4702, 1),
 (4649, 1),
 (4550, 1)]

word_dist
[(3726, 1),
 (3552, 1),
 (3441, 1),
 (3315, 1),
 (3167, 1),
 (3122, 1),
 (2997, 1),
 (2980, 1),
 (2422, 1),
 (2300, 1)]


100%|██████████| 1500/1500 [00:01<00:00, 987.63it/s] 

student_essays
token_dist
[(14883, 1),
 (4922, 1),
 (3989, 1),
 (2454, 1),
 (1510, 1),
 (1327, 1),
 (1308, 1),
 (1238, 1),
 (1228, 2),
 (1222, 1)]

word_dist
[(1145, 1),
 (1101, 1),
 (1100, 1),
 (1093, 1),
 (1073, 1),
 (1032, 2),
 (1023, 1),
 (1022, 1),
 (1015, 1),
 (1008, 1)]





In [3]:
from collections import Counter
from pprint import pprint

def _words_up_to(chunks: list[str], stop) -> str:
    """Return the words of ``chunks[:stop]`` joined by single spaces."""
    # Joining the chunks with a space and re-splitting on whitespace yields
    # the same word sequence as splitting every chunk on its own.
    return " ".join(" ".join(chunks[:stop]).split())


# For every domain, measure the token length of (a) the text before the
# synthesized span and (b) the text up to the end of that span, then print the
# ten largest values of each.
for domain in (
    "arxiv_papers",
    "blog_authorship_corpus",
    "bundestag",
    "cnn_news",
    "euro_court_cases",
    "gutenberg",
    "house_of_commons",
    "spiegel_articles",
    "student_essays",
):
    # One filter shared by the count and the actual query.
    chunk_query = {"domain": domain, "agent": "gpt-4o-mini", "type": "chunk"}

    prefix_lenghts = Counter()  # tokens in chunks[:start_idx] -> documents
    infill_lenghts = Counter()  # tokens in chunks[:end_idx]   -> documents
    total = synthesized_texts.count_documents(chunk_query)
    cursor = synthesized_texts.find(
        chunk_query,
        projection=["chunks", "synth_metadata.start_idx", "synth_metadata.end_idx"],
    )
    for doc in tqdm(cursor, total=total):
        span = doc["synth_metadata"]
        prefix_text = _words_up_to(doc["chunks"], span["start_idx"])
        # Everything up to end_idx, i.e. the prefix plus the span itself.
        with_infill_text = _words_up_to(doc["chunks"], span["end_idx"])

        # Tokenize both strings in one call; "length" has one entry per input.
        n_prefix, n_with_infill = tokenizer(
            [prefix_text, with_infill_text], return_length=True
        )["length"]
        prefix_lenghts[n_prefix] += 1
        infill_lenghts[n_with_infill] += 1

    print(domain)
    print("prefix_lenghts")
    pprint(sorted(prefix_lenghts.items(), reverse=True)[:10])
    print()
    print("with_infill_lenghts")
    pprint(sorted(infill_lenghts.items(), reverse=True)[:10])


  0%|          | 0/1483 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (6498 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 1483/1483 [00:36<00:00, 40.80it/s]


arxiv_papers
prefix_lenghts
[(216076, 1),
 (174081, 1),
 (70398, 1),
 (57500, 1),
 (57412, 1),
 (48843, 1),
 (41330, 1),
 (40092, 1),
 (37898, 1),
 (37590, 1)]

with_infill_lenghts
[(217050, 1),
 (182961, 1),
 (119821, 1),
 (72393, 1),
 (62982, 1),
 (59614, 1),
 (49506, 1),
 (49355, 1),
 (46118, 1),
 (42938, 1)]


100%|██████████| 640/640 [00:00<00:00, 1683.84it/s]


blog_authorship_corpus
prefix_lenghts
[(1404, 1),
 (1202, 1),
 (726, 1),
 (701, 1),
 (683, 1),
 (632, 1),
 (610, 1),
 (594, 1),
 (541, 1),
 (535, 1)]

with_infill_lenghts
[(1425, 1),
 (1405, 1),
 (1032, 1),
 (958, 1),
 (815, 1),
 (746, 1),
 (714, 1),
 (704, 1),
 (688, 1),
 (682, 1)]


100%|██████████| 1268/1268 [00:03<00:00, 360.12it/s]


bundestag
prefix_lenghts
[(12216, 1),
 (4211, 1),
 (3648, 1),
 (2936, 1),
 (2659, 1),
 (2594, 1),
 (2211, 1),
 (2201, 1),
 (2112, 1),
 (2074, 1)]

with_infill_lenghts
[(15752, 1),
 (6339, 1),
 (4811, 1),
 (4612, 1),
 (4427, 1),
 (4351, 1),
 (4097, 1),
 (4075, 1),
 (4054, 1),
 (3797, 1)]


100%|██████████| 957/957 [00:01<00:00, 490.26it/s]


cnn_news
prefix_lenghts
[(1728, 1),
 (1699, 1),
 (1664, 1),
 (1553, 1),
 (1543, 1),
 (1491, 1),
 (1452, 1),
 (1450, 1),
 (1425, 1),
 (1395, 1)]

with_infill_lenghts
[(2106, 1),
 (2071, 1),
 (2069, 1),
 (1973, 1),
 (1898, 1),
 (1827, 1),
 (1796, 1),
 (1775, 1),
 (1770, 1),
 (1666, 1)]


100%|██████████| 1495/1495 [00:05<00:00, 272.01it/s]


euro_court_cases
prefix_lenghts
[(15145, 1),
 (8605, 1),
 (8440, 1),
 (7861, 1),
 (7798, 1),
 (7550, 1),
 (6452, 1),
 (6441, 1),
 (6050, 1),
 (5999, 1)]

with_infill_lenghts
[(16479, 1),
 (11453, 1),
 (10374, 1),
 (9604, 1),
 (9347, 1),
 (8952, 1),
 (8863, 1),
 (8312, 1),
 (8279, 1),
 (8080, 1)]


100%|██████████| 1494/1494 [04:12<00:00,  5.92it/s]


gutenberg
prefix_lenghts
[(759578, 1),
 (389642, 1),
 (353165, 1),
 (324779, 1),
 (314759, 1),
 (292916, 1),
 (287022, 1),
 (280718, 1),
 (274372, 1),
 (268861, 1)]

with_infill_lenghts
[(826597, 1),
 (452365, 1),
 (433536, 1),
 (431394, 1),
 (403759, 1),
 (370773, 1),
 (366399, 1),
 (356778, 1),
 (326965, 1),
 (318779, 1)]


100%|██████████| 1245/1245 [00:11<00:00, 104.66it/s]


house_of_commons
prefix_lenghts
[(44610, 1),
 (27324, 1),
 (25439, 1),
 (24324, 1),
 (23968, 1),
 (22229, 1),
 (22171, 1),
 (22058, 1),
 (21526, 1),
 (19752, 1)]

with_infill_lenghts
[(50358, 1),
 (48151, 1),
 (47912, 1),
 (46727, 1),
 (35395, 1),
 (33670, 1),
 (32966, 1),
 (32949, 1),
 (31994, 1),
 (30505, 1)]


100%|██████████| 1426/1426 [00:02<00:00, 522.09it/s]


spiegel_articles
prefix_lenghts
[(3623, 1),
 (3542, 1),
 (3407, 1),
 (2922, 1),
 (2820, 1),
 (2783, 1),
 (2751, 1),
 (2737, 1),
 (2581, 1),
 (2512, 1)]

with_infill_lenghts
[(4603, 1),
 (4260, 1),
 (3727, 1),
 (3638, 1),
 (3408, 1),
 (3367, 1),
 (3138, 1),
 (3087, 1),
 (3024, 1),
 (2982, 1)]


100%|██████████| 1379/1379 [00:01<00:00, 905.06it/s]

student_essays
prefix_lenghts
[(1241, 1),
 (1055, 1),
 (983, 1),
 (838, 1),
 (833, 1),
 (787, 1),
 (773, 1),
 (769, 1),
 (752, 1),
 (748, 1)]

with_infill_lenghts
[(1324, 1),
 (1181, 1),
 (1153, 1),
 (1022, 1),
 (995, 1),
 (972, 1),
 (940, 1),
 (929, 1),
 (924, 1),
 (918, 1)]



