In [1]:
import os

import pymongo
from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm

# Read settings from the dotenv file, then open the MongoDB connection.
# NOTE(review): the path is "../env" (no leading dot) — confirm this is the
# intended file name.
load_dotenv("../env")

# Fail fast when the connection string is missing: MongoClient(None) would
# silently fall back to its default host instead of the intended deployment.
mongo_uri = os.environ.get("MONGO_DB_CONNECTION")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_DB_CONNECTION is not set; check the dotenv file passed to load_dotenv()."
    )

client = MongoClient(mongo_uri)
db = client.get_database("prismai")

In [2]:
# Handles for the collections used throughout the notebook.
collected_items = db.get_collection("collected_items")
synthesized_texts = db.get_collection("synthesized_texts")
features_prismai = db.get_collection("features_prismai")
features_prismai_old = db.get_collection("features_prismai_old")
# Later cells call transition_scores.find(...) / .aggregate(...), and a
# $lookup reads from the collection named "transition_scores"; without this
# binding those cells raise NameError on a fresh kernel.
transition_scores = db.get_collection("transition_scores")
test_swt = db.get_collection("test_swt")
test_new = db.get_collection("test_new")
dataset_CHEAT = db.get_collection("dataset_CHEAT")

In [3]:
# Re-create on `features_prismai` the key specification of every index found
# on `features_prismai_old`. Only the "key" entry is passed on; no other
# index options are copied.
old_indexes = list(features_prismai_old.list_indexes())
for old_index in tqdm(old_indexes):
    key_spec = old_index.get("key")
    features_prismai.create_index(key_spec)

100%|██████████| 7/7 [00:00<00:00, 64.82it/s]


In [None]:
class Averager:
    """Running arithmetic mean built from values fed in one at a time."""

    def __init__(self):
        # Running total of all values seen and how many there were.
        self.sum = 0
        self.count = 0

    def update(self, value):
        """Add one observation to the running total."""
        self.sum += value
        self.count += 1

    def average(self):
        """Return the mean of all observed values.

        Returns NaN when no value has been added yet, instead of raising
        ZeroDivisionError (relevant when instances are created implicitly,
        e.g. through a defaultdict).
        """
        if self.count == 0:
            return float("nan")
        return self.sum / self.count

In [None]:
from collections import defaultdict
import pandas as pd

In [None]:
from transformers import AutoTokenizer

# "gpt2" tokenizer, requested with use_fast=True; a later cell uses it to
# measure text lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    use_fast=True,
)

In [None]:
# Per-domain token-length statistics (maximum and mean) over a capped sample
# of collected items.
ITEMS_PER_DOMAIN = 1500  # query limit per domain; also the progress-bar total

maximum = {}
averages = defaultdict(Averager)

for domain in tqdm(collected_items.distinct("domain"), position=0):
    maximum[domain] = 0
    domain_items = collected_items.find(
        {"domain": domain},
        # Only the text is tokenized, so do not transfer the other fields.
        projection={"text": 1},
        limit=ITEMS_PER_DOMAIN,
    )
    for item in tqdm(domain_items, total=ITEMS_PER_DOMAIN, position=1):
        text = item.get("text")
        # return_length=True adds a "length" entry to the encoding; its first
        # element is taken as the token count of this text.
        encoding = tokenizer(text, return_length=True)
        length = encoding["length"][0]
        maximum[domain] = max(maximum[domain], length)
        averages[domain].update(length)

In [None]:
# One row per domain with the maximum and the mean token length collected above.
length_rows = [
    {"max": maximum[name], "avg": averages[name].average()} for name in maximum
]
pd.DataFrame(length_rows, columns=["max", "avg"], index=list(maximum))

In [None]:
# Largest `transition_scores.intermediate_logits` array length per
# `document.domain` in `features_prismai`.
logits_size_stage = {
    "$project": {
        "domain": "$document.domain",
        "size": {"$size": "$transition_scores.intermediate_logits"},
    },
}
max_size_per_domain_stage = {
    "$group": {
        "_id": "$domain",
        "size": {"$max": "$size"},
    },
}
list(features_prismai.aggregate([logits_size_stage, max_size_per_domain_stage]))

In [None]:
# Parameters for the grouped feature query below. Several of them
# (source_collection, source_collection_limit, synth_collection,
# pre_processor_type) are not read by the query in this cell.
source_collection = collected_items
source_collection_limit = 1500

synth_collection = "synthesized_texts"
score_collection = "features_prismai"

domain = "cnn_news"
lang = "en-EN"

synth_type = "fulltext"
synth_agent = "gpt-4o-mini"

feature_model = "gpt2"

pre_processor_type = "truncated"

# Keep feature documents of the chosen domain, language and model whose agent
# is null/absent or the chosen synth agent, and whose type is "source" or the
# chosen synth type.
feature_filter = {
    "document.agent": {"$in": [None, synth_agent]},
    "document.domain": domain,
    "document.lang": lang,
    "document.type": {"$in": ["source", synth_type]},
    "model.name": feature_model,
    # "pre_processor.type": pre_processor_type,
}

# One group per referenced document id, counting its feature documents and
# collecting type, split and transition scores of each.
group_by_document = {
    "_id": "$document._id.$id",
    "count": {"$count": {}},
    "features": {
        "$push": {
            "type": "$document.type",
            "split": "$split",
            "transition_scores": "$transition_scores",
        }
    },
}

feature_groups = db.get_collection(score_collection).aggregate(
    [
        {"$match": feature_filter},
        {"$group": group_by_document},
    ],
    allowDiskUse=True,
)

# Display only the first group returned by the pipeline.
next(feature_groups)

In [None]:
# Number of feature documents whose `model.name` is not "gpt2".
non_gpt2_filter = {"model.name": {"$ne": "gpt2"}}
features_prismai.count_documents(non_gpt2_filter)

In [None]:
# Deletes every feature document whose `model.name` is not "gpt2".
# NOTE(review): this is destructive and sits before the `raise RuntimeError`
# guard cell, so "Run All" executes it — confirm that is intended.
non_gpt2_filter = {"model.name": {"$ne": "gpt2"}}
features_prismai.delete_many(non_gpt2_filter)

In [None]:
# Document counts of the Llama-3.2-1B features per (agent, type, domain).
llama_only_stage = {"$match": {"model.name": "meta-llama/Llama-3.2-1B"}}
count_per_combination_stage = {
    "$group": {
        "_id": ["$document.agent", "$document.type", "$document.domain"],
        "count": {"$count": {}},
    }
}
list(features_prismai.aggregate([llama_only_stage, count_per_combination_stage]))

In [None]:
# Unconditionally raises, which stops a "Run All" at this cell.
# NOTE(review): presumably a deliberate guard so the cells below are only run
# by hand — confirm.
raise RuntimeError

In [None]:
# Number of synthesized texts in the "blog_authorship_corpus" domain.
blog_corpus_filter = {"domain": "blog_authorship_corpus"}
synthesized_texts.count_documents(blog_corpus_filter)

In [None]:
# For each domain, the smallest document count among its (domain, type, agent)
# combinations, ignoring documents whose agent is "nemotron".
exclude_nemotron_stage = {"$match": {"agent": {"$ne": "nemotron"}}}
count_per_combination_stage = {
    "$group": {
        "_id": ["$domain", "$type", "$agent"],
        "agent": {"$first": "$agent"},
        "domain": {"$first": "$domain"},
        "type": {"$first": "$type"},
        "count": {"$count": {}},
    }
}
min_count_per_domain_stage = {
    "$group": {"_id": ["$domain"], "count": {"$min": "$count"}}
}

count_ = list(
    synthesized_texts.aggregate(
        [
            exclude_nemotron_stage,
            count_per_combination_stage,
            min_count_per_domain_stage,
        ]
    )
)
count_

In [None]:

# Attach to every collected item the synthesized texts that reference it
# (`_ref_id.$id` == item `_id`), keeping only "fulltext" texts by "gemma2:9b"
# and only their _id/type/agent fields.
synth_sub_pipeline = [
    {
        "$project": {
            "_id": 1,
            "type": 1,
            "agent": 1,
            # "_doc_id": "$_ref_id.$id",
            # "domain": 1,
        }
    },
    {"$match": {"type": "fulltext", "agent": "gemma2:9b"}},
]
synth_lookup_stage = {
    "$lookup": {
        "from": "synthesized_texts",
        "as": "synthesized_texts",
        "localField": "_id",
        "foreignField": "_ref_id.$id",
        "pipeline": synth_sub_pipeline,
    }
}

items = list(collected_items.aggregate([synth_lookup_stage]))
# len(items)
items

In [None]:
# db.drop_collection(transition_scores)

In [None]:
# Ids of up to 1500 collected items from the "house_of_commons" domain.
hoc_cursor = collected_items.find(
    {"domain": "house_of_commons"},
    projection={"_id": 1},
    limit=1500,
)
hoc_ids = [entry["_id"] for entry in hoc_cursor]
len(hoc_ids)

In [None]:
# Referenced document ids (`document._id`'s `.id`) of the Llama-3.2-1B
# "source" entries for the "house_of_commons" domain.
llama_hoc_source_query = {
    "document.domain": "house_of_commons",
    "document.type": "source",
    # "model.name": "gpt2",
    "model.name": "meta-llama/Llama-3.2-1B",
}
llama_hoc_source_cursor = transition_scores.find(
    llama_hoc_source_query,
    projection={"document._id": 1},
)
llama_ids_hoc = [entry["document"]["_id"].id for entry in llama_hoc_source_cursor]
len(llama_ids_hoc)

In [None]:
# How many of the Llama source ids also appear among the collected item ids.
len(set(llama_ids_hoc) & set(hoc_ids))

In [None]:
# Same overlap, but only against the first len(llama_ids_hoc) collected ids.
hoc_ids_prefix = hoc_ids[: len(llama_ids_hoc)]
len(set(llama_ids_hoc) & set(hoc_ids_prefix))

In [None]:
# Difference between 1500 (the limit used for hoc_ids) and the number of
# Llama source ids found.
missing_llama_hoc = 1500 - len(llama_ids_hoc)
missing_llama_hoc

In [None]:
# Ids of all synthesized texts in the "house_of_commons" domain.
hoc_synth_cursor = synthesized_texts.find(
    # {"domain": "euro_court_cases"},
    {"domain": "house_of_commons"},
    projection={"_id": 1},
)
hoc_ids_synth = [entry["_id"] for entry in hoc_synth_cursor]
len(hoc_ids_synth)

In [None]:
# Referenced synthesized-text ids (`document._synth_id`'s `.id`) of the
# Llama-3.2-1B non-"source" entries for the "house_of_commons" domain.
llama_ids_hoc_synth = [
    doc["document"]["_synth_id"].id
    for doc in transition_scores.find(
        {
            "document.domain": "house_of_commons",
            "document.type": {"$ne": "source"},
            "model.name": "meta-llama/Llama-3.2-1B",
        },
        projection={"document._synth_id": 1},
    )
]
# Show the size of the list built in this cell; the original displayed
# len(llama_ids_hoc), the list from an earlier cell (copy-paste slip).
len(llama_ids_hoc_synth)

In [None]:
# How many of the Llama synth ids also appear among the synthesized text ids.
len(set(llama_ids_hoc_synth) & set(hoc_ids_synth))

In [None]:
# Same overlap, but only against the first len(llama_ids_hoc_synth)
# synthesized text ids.
hoc_ids_synth_prefix = hoc_ids_synth[: len(llama_ids_hoc_synth)]
len(set(llama_ids_hoc_synth) & set(hoc_ids_synth_prefix))

In [None]:
# Distinct `agent` values of the synthesized texts, computed by the server
# with `distinct` (as done for "domain" on collected_items) instead of
# streaming every document to the client. Documents without an `agent` field
# are simply not represented rather than raising KeyError.
set(synthesized_texts.distinct("agent"))

In [None]:
# First "chunk" text synthesized by "gemma2:9b", joined with the gpt2 "chunk"
# entries of `transition_scores` that reference it via `document._synth_id.$id`.
chunk_match_stage = {"$match": {"type": "chunk", "agent": "gemma2:9b"}}
chunk_project_stage = {
    "$project": {
        "_id": 1,
        "chunks": 1,
        "start_idx": "$synth_metadata.start_idx",
        "end_idx": "$synth_metadata.end_idx",
    }
}
scores_sub_pipeline = [
    {"$match": {"document.type": "chunk", "model.name": "gpt2"}},
    {
        "$project": {
            "_id": 1,
            "document": 1,
            "model": 1,
            # "transition_scores": 1,
            "metadata": 1,
        }
    },
]
scores_lookup_stage = {
    "$lookup": {
        "from": "transition_scores",
        "as": "transition_scores",
        "localField": "_id",
        "foreignField": "document._synth_id.$id",
        "pipeline": scores_sub_pipeline,
    }
}

chunk_results = synthesized_texts.aggregate(
    [
        chunk_match_stage,
        # {"$limit": 1},
        chunk_project_stage,
        scores_lookup_stage,
    ],
    # allowDiskUse=True,
)
next(chunk_results)

In [None]:
# Count `transition_scores` documents per "domain - model - pre_processor"
# label, where the domain comes from the `collected_items` document
# referenced through `refs._ref_id.$id`.
# NOTE(review): $concat yields null when any of its parts is null or missing,
# so such documents would share a single null group — confirm this is acceptable.
source_lookup_stage = {
    "$lookup": {
        "from": "collected_items",
        "localField": "refs._ref_id.$id",
        "foreignField": "_id",
        "as": "source",
    }
}
drop_heavy_fields_stage = {
    "$project": {
        "transition_scores": 0,
        "source.text": 0,
        "source.chunks": 0,
        "source.synthetization": 0,
        # "_id": 1,
        # "domain": 1,
        # "features._id": 1,
        # "features.model.name": 1,
        # "features.pre_processor.type": 1,
    }
}
unwind_source_stage = {"$unwind": {"path": "$source"}}
group_label = {
    "$concat": [
        "$source.domain",
        " - ",
        "$model.name",
        " - ",
        "$pre_processor.type",
    ]
}
count_per_label_stage = {"$group": {"_id": group_label, "total": {"$sum": 1}}}

list(
    transition_scores.aggregate(
        [
            source_lookup_stage,
            drop_heavy_fields_stage,
            unwind_source_stage,
            count_per_label_stage,
        ]
    )
)

In [None]:
# Count `transition_scores` documents per "domain - model - pre_processor"
# label, where the domain comes from the `synthesized_texts` document
# referenced through `refs._ref_id.$id`.
# NOTE(review): $concat yields null when any of its parts is null or missing,
# so such documents would share a single null group — confirm this is acceptable.
synth_lookup_stage = {
    "$lookup": {
        "from": "synthesized_texts",
        "localField": "refs._ref_id.$id",
        "foreignField": "_id",
        "as": "source",
    }
}
drop_heavy_fields_stage = {
    "$project": {
        "transition_scores": 0,
        "source.text": 0,
        "source.chunks": 0,
        "source.synthetization": 0,
        # "_id": 1,
        # "domain": 1,
        # "features._id": 1,
        # "features.model.name": 1,
        # "features.pre_processor.type": 1,
    }
}
unwind_source_stage = {"$unwind": {"path": "$source"}}
group_label = {
    "$concat": [
        "$source.domain",
        " - ",
        "$model.name",
        " - ",
        "$pre_processor.type",
    ]
}
count_per_label_stage = {"$group": {"_id": group_label, "total": {"$sum": 1}}}

list(
    transition_scores.aggregate(
        [
            synth_lookup_stage,
            drop_heavy_fields_stage,
            unwind_source_stage,
            count_per_label_stage,
        ]
    )
)

In [None]:
# Number of `test_swt` documents for which the lookup into `synthesized_texts`
# (via `refs._ref_id.$id`) found at least one match.
features_lookup_stage = {
    "$lookup": {
        "from": "synthesized_texts",
        "localField": "refs._ref_id.$id",
        "foreignField": "_id",
        "as": "features",
    }
}
has_features_stage = {"$match": {"features": {"$exists": True, "$ne": []}}}
id_fields_stage = {
    "$project": {
        "_id": 1,
        "refs._ref_id.$id": 1,
        "features._id": 1,
        "features.id": 1,
    }
}

matched_docs = test_swt.aggregate(
    [features_lookup_stage, has_features_stage, id_fields_stage]
)
sum(
    1
    for _ in matched_docs
    # if doc["features"]
)

In [None]:
# Total number of documents in `test_swt`, counted by the server instead of
# iterating over a cursor of every _id on the client.
test_swt.count_documents({})

In [None]:
import re

# Raw string: in a plain literal "\d" is an invalid escape sequence, which
# newer Python versions warn about at compile time. The pattern is unchanged.
re.findall(r"\d", "abc")