In [2]:
import os

import pymongo
from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm

# Load environment variables from the env file one directory up.
load_dotenv("../env")

# Fail fast when the connection string is missing: passing None to MongoClient
# makes it fall back to its default host instead of raising, which would only
# surface later as a confusing timeout or an unexpectedly empty database.
mongo_uri = os.environ.get("MONGO_DB_CONNECTION")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_DB_CONNECTION is not set; check the env file passed to load_dotenv."
    )

client = MongoClient(mongo_uri)
db = client.get_database("prismai")

In [3]:
collected_items = db.get_collection("collected_items")
synthesized_texts = db.get_collection("synthesized_texts")
transition_scores = db.get_collection("transition_scores")
test_swt = db.get_collection("test_swt")

In [5]:
# Distinct "domain" values in collected_items. `distinct` is evaluated by the
# server, so only the unique values are transferred (instead of one document
# per item), and documents lacking the field no longer raise a KeyError.
set(collected_items.distinct("domain"))

{'arxiv_papers',
 'blog_authorship_corpus',
 'bundestag',
 'cnn_news',
 'euro_court_cases',
 'gutenberg',
 'house_of_commons',
 'spiegel_articles',
 'student_essays'}

In [None]:
# db.drop_collection(transition_scores)

In [None]:
# `_id` values of at most 1500 "house_of_commons" items from collected_items.
hoc_filter = {"domain": "house_of_commons"}
hoc_ids = [
    item["_id"]
    for item in collected_items.find(hoc_filter, projection={"_id": 1}, limit=1500)
]
len(hoc_ids)

In [None]:
# For every Llama-3.2-1B transition-score entry of a "house_of_commons" source
# document, collect the `.id` of its `document._id` value
# (presumably a DBRef to collected_items — TODO confirm).
llama_source_filter = {
    "document.domain": "house_of_commons",
    "document.type": "source",
    "model.name": "meta-llama/Llama-3.2-1B",  # alternative: "gpt2"
}
llama_ids_hoc = [
    entry["document"]["_id"].id
    for entry in transition_scores.find(
        llama_source_filter, projection={"document._id": 1}
    )
]
len(llama_ids_hoc)

In [None]:
# Number of distinct scored source IDs that also occur in hoc_ids.
len(set(llama_ids_hoc) & set(hoc_ids))

In [None]:
# Same overlap, restricted to the first len(llama_ids_hoc) entries of hoc_ids.
len(set(llama_ids_hoc) & set(hoc_ids[: len(llama_ids_hoc)]))

In [None]:
# Difference between 1500 (the limit used when fetching hoc_ids) and the
# number of scored source entries.
sample_size = 1500
sample_size - len(llama_ids_hoc)

In [None]:
# `_id` values of all "house_of_commons" documents in synthesized_texts.
synth_filter = {"domain": "house_of_commons"}  # alternative: "euro_court_cases"
hoc_ids_synth = [
    item["_id"] for item in synthesized_texts.find(synth_filter, projection={"_id": 1})
]
len(hoc_ids_synth)

In [None]:
# For every Llama-3.2-1B transition-score entry of a non-source
# "house_of_commons" document, collect the `.id` of its `document._synth_id`
# value (presumably a DBRef to synthesized_texts — TODO confirm).
llama_ids_hoc_synth = [
    doc["document"]["_synth_id"].id
    for doc in transition_scores.find(
        {
            "document.domain": "house_of_commons",
            "document.type": {"$ne": "source"},
            "model.name": "meta-llama/Llama-3.2-1B",
        },
        projection={"document._synth_id": 1},
    )
]
# Show the size of the list built in this cell. The previous version displayed
# len(llama_ids_hoc), i.e. the source-document list from an earlier cell.
len(llama_ids_hoc_synth)

In [None]:
# Number of distinct scored synth IDs that also occur in hoc_ids_synth.
len(set(llama_ids_hoc_synth) & set(hoc_ids_synth))

In [None]:
# Same overlap, restricted to the first len(llama_ids_hoc_synth) entries of
# hoc_ids_synth.
len(set(llama_ids_hoc_synth) & set(hoc_ids_synth[: len(llama_ids_hoc_synth)]))

In [None]:
# Unconditionally raises, so a top-to-bottom run halts at this cell.
# NOTE(review): presumably a deliberate guard that keeps the aggregation cells
# below from running automatically — confirm before removing.
raise RuntimeError()

In [None]:
# Fetch one "bundestag" item from collected_items and attach two joined arrays:
#   * synthesized_texts  — "fulltext" syntheses produced by agent "gemma2:9b"
#   * transition_scores  — "gpt2" score entries whose document is not a chunk
#                          and whose document.agent is null or "gpt-4o-mini"

# Join on synthesized_texts._ref_id.$id == collected_items._id.
bundestag_synth_lookup = {
    "$lookup": {
        "from": "synthesized_texts",
        "as": "synthesized_texts",
        "localField": "_id",
        "foreignField": "_ref_id.$id",
        "pipeline": [
            # Keep only the fields that the following $match filters on.
            {"$project": {"_id": 1, "type": 1, "agent": 1}},
            {"$match": {"type": "fulltext", "agent": "gemma2:9b"}},
        ],
    }
}

# Join on transition_scores.document._id.$id == collected_items._id.
bundestag_scores_lookup = {
    "$lookup": {
        "from": "transition_scores",
        "as": "transition_scores",
        "localField": "_id",
        "foreignField": "document._id.$id",
        "pipeline": [
            {"$project": {"_id": 1, "document": 1, "model.name": 1}},
            {
                "$match": {
                    "model.name": "gpt2",  # alternative: "meta-llama/Llama-3.2-1B"
                    "document.type": {"$ne": "chunk"},
                    "document.agent": {"$in": [None, "gpt-4o-mini"]},
                }
            },
        ],
    }
}

next(
    collected_items.aggregate(
        [
            {"$match": {"domain": "bundestag"}},
            {"$limit": 1},
            {"$project": {"_id": 1, "domain": 1}},
            bundestag_synth_lookup,
            bundestag_scores_lookup,
        ]
    )
)

In [None]:
# --- Parameters ---------------------------------------------------------------
source_collection = collected_items
source_collection_limit = 1500

domain, lang = "gutenberg", "en-EN"
synth_type, synth_agent = "fulltext", "gpt-4o-mini"
feature_model = "gpt2"

# --- Pipeline stages ----------------------------------------------------------
# Attach the syntheses of the configured type/agent that reference each item.
synth_lookup_stage = {
    "$lookup": {
        "from": "synthesized_texts",
        "as": "synthesized_texts",
        "localField": "_id",
        "foreignField": "_ref_id.$id",
        "pipeline": [{"$match": {"type": synth_type, "agent": synth_agent}}],
    }
}

# Attach the score entries of the configured model whose document is either the
# source or a synthesis of the configured type, with document.agent null or the
# configured agent.
scores_lookup_stage = {
    "$lookup": {
        "from": "transition_scores",
        "as": "transition_scores",
        "localField": "_id",
        "foreignField": "document._id.$id",
        "pipeline": [
            {
                "$match": {
                    "model.name": feature_model,
                    "document.type": {"$in": ["source", synth_type]},
                    "document.agent": {"$in": [None, synth_agent]},
                }
            }
        ],
    }
}

# --- Run: display only the first joined item ----------------------------------
next(
    source_collection.aggregate(
        [
            {"$match": {"domain": domain, "lang": lang}},
            {"$limit": source_collection_limit},
            {"$project": {"_id": 1}},
            synth_lookup_stage,
            scores_lookup_stage,
        ],
        allowDiskUse=True,
    )
)

In [None]:
# Distinct "agent" values in synthesized_texts. `distinct` is evaluated by the
# server, so only the unique values are transferred (instead of one document
# per text), and documents lacking the field no longer raise a KeyError.
set(synthesized_texts.distinct("agent"))

In [None]:
# Display the first "gemma2:9b" chunk synthesis together with its "gpt2" chunk
# transition-score entries. The pipeline has no $limit stage; only the first
# result is pulled from the cursor.

# Join on transition_scores.document._synth_id.$id == synthesized_texts._id.
chunk_scores_lookup = {
    "$lookup": {
        "from": "transition_scores",
        "as": "transition_scores",
        "localField": "_id",
        "foreignField": "document._synth_id.$id",
        "pipeline": [
            {"$match": {"document.type": "chunk", "model.name": "gpt2"}},
            # The "transition_scores" field itself is not projected.
            {"$project": {"_id": 1, "document": 1, "model": 1, "metadata": 1}},
        ],
    }
}

# Keep the chunks and lift the start/end indices out of synth_metadata.
chunk_projection = {
    "$project": {
        "_id": 1,
        "chunks": 1,
        "start_idx": "$synth_metadata.start_idx",
        "end_idx": "$synth_metadata.end_idx",
    }
}

next(
    synthesized_texts.aggregate(
        [
            {"$match": {"type": "chunk", "agent": "gemma2:9b"}},
            chunk_projection,
            chunk_scores_lookup,
        ]
    )
)

In [None]:
# Count transition_scores documents per
# "<source.domain> - <model.name> - <pre_processor.type>" key, where "source"
# is the collected_items document joined through refs._ref_id.$id.
collected_source_lookup = {
    "$lookup": {
        "from": "collected_items",
        "localField": "refs._ref_id.$id",
        "foreignField": "_id",
        "as": "source",
    }
}

# Exclusion projection: every field not listed here is kept.
collected_excluded_fields = {
    "$project": {
        "transition_scores": 0,
        "source.text": 0,
        "source.chunks": 0,
        "source.synthetization": 0,
    }
}

collected_group_key = {
    "$concat": ["$source.domain", " - ", "$model.name", " - ", "$pre_processor.type"]
}

list(
    transition_scores.aggregate(
        [
            collected_source_lookup,
            collected_excluded_fields,
            # One output document per joined source.
            {"$unwind": {"path": "$source"}},
            {"$group": {"_id": collected_group_key, "total": {"$sum": 1}}},
        ]
    )
)

In [None]:
# Count transition_scores documents per
# "<source.domain> - <model.name> - <pre_processor.type>" key, where "source"
# is the synthesized_texts document joined through refs._ref_id.$id.
synth_source_lookup = {
    "$lookup": {
        "from": "synthesized_texts",
        "localField": "refs._ref_id.$id",
        "foreignField": "_id",
        "as": "source",
    }
}

# Exclusion projection: every field not listed here is kept.
synth_excluded_fields = {
    "$project": {
        "transition_scores": 0,
        "source.text": 0,
        "source.chunks": 0,
        "source.synthetization": 0,
    }
}

synth_group_key = {
    "$concat": ["$source.domain", " - ", "$model.name", " - ", "$pre_processor.type"]
}

list(
    transition_scores.aggregate(
        [
            synth_source_lookup,
            synth_excluded_fields,
            # One output document per joined source.
            {"$unwind": {"path": "$source"}},
            {"$group": {"_id": synth_group_key, "total": {"$sum": 1}}},
        ]
    )
)

In [None]:
# Count the test_swt documents for which the lookup into synthesized_texts
# (refs._ref_id.$id -> _id) yields at least one match.
swt_with_features = test_swt.aggregate(
    [
        {
            "$lookup": {
                "from": "synthesized_texts",
                "localField": "refs._ref_id.$id",
                "foreignField": "_id",
                "as": "features",
            }
        },
        # Drop documents whose joined "features" array is missing or empty.
        {"$match": {"features": {"$exists": True, "$ne": []}}},
        {
            "$project": {
                "_id": 1,
                "refs._ref_id.$id": 1,
                "features._id": 1,
                "features.id": 1,
            }
        },
    ]
)
sum(1 for _ in swt_with_features)

In [None]:
# Total number of documents in test_swt, counted by the server rather than by
# streaming every `_id` to the client and tallying in Python.
test_swt.count_documents({})