In [1]:
import os

from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv("../env")
client = MongoClient(os.environ.get("MONGO_DB_CONNECTION"))
collection = client.get_database("prismai").get_collection("collected_items")

In [2]:
from itertools import batched

import datasets
from datasets import Dataset
from tqdm.auto import tqdm

datasets.disable_progress_bars()

In [3]:
from transition_scores.pre_processor.text import TextPreProcessor
from transition_scores.pre_processor.chunks import RollingWindowChunkPreProcessor
from transition_scores.scorer import OnnxTransitionScorer

scorer = OnnxTransitionScorer(
    "/hot_storage/models/onnx/gpt2_onnx_o4/",
    pre_processor=RollingWindowChunkPreProcessor.from_pretrained(
        "/hot_storage/models/onnx/gpt2_onnx_o4/"
    ),
    batch_size=1,
    device="cuda",
    top_k=4,
)

[0;93m2025-01-23 16:55:10.333259473 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 24 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-01-23 16:55:10.337907913 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-01-23 16:55:10.337916633 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [4]:
from bson import DBRef


total = collection.count_documents({})

tq = tqdm(
    collection.find(
        projection=[
            "text",
            "chunks",
        ],
        batch_size=128,
    ),
    total=total,
)
for batch in batched(tq, 16):
    batch = [
        {
            "ref": {
                "$ref": "collected_items",
                "$id": str(row.pop("_id")),
            }
        }
        | row
        for row in batch
    ]
    dataset = Dataset.from_list(batch)
    dataset = dataset.filter(lambda x: x["text"] and x["chunks"])
    for scores in scorer.process(dataset):
        print(str(scores)[:500])
        raise RuntimeError

  0%|          | 0/802852 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1078 > 1024). Running this sequence through the model will result in indexing errors


{'ref': {'$id': '678fb3abdbe3ac531644d662', '$ref': 'collected_items'}, 'text_sha256': '00de2227c91236bb19430f6780cc26df7e06e098370e9068af6877140db3939f', 'text': 'Herr Präsident! Meine sehr geehrten Damen und Herren!', 'start_idx': 0, 'end_idx': 1, 'start_token_idx': 0, 'prefix_idx': 0, 'transition_scores': [{'target_id': 39, 'target_prob': 0.0, 'top_k_ids': [], 'top_k_scores': []}, {'target_id': 8056, 'target_prob': 5.699131725123152e-05, 'top_k_ids': [13, 11, 198, 12], 'top_k_scores': [0.0634


RuntimeError: 

In [None]:
_scores = scores.copy()

transposed = {"feature_metadata": dict()}
for key in scorer.pre_processor.additional_fields:
    transposed["feature_metadata"][key] = _scores.pop(key)
transposed = _scores | transposed
transposed

In [None]:
one = collection.find_one({"_id": "22c34302-0ec6-4781-8d96-1d6a4fda049e"})
print(one["text"])
print("".join(one["chunks"]))
one

In [None]:
from bson.dbref import DBRef

from transition_scores.data import LogProbs
from transition_scores.mongo import TextTransitionScore, TransitionScoreItem

dict(
    TransitionScoreItem(
        DBRef("a", "b"),
        "gpt2",
        "onnx",
        TextTransitionScore([LogProbs(0, 1.0, [0], [1.0])]),
    )
)

In [None]:
from datasets import Dataset

from transition_scores.pre_processor.text import TextPreProcessor

tokenizer = TextPreProcessor.from_pretrained("gpt2")

dataset = Dataset.from_dict(
    {
        "_id": ["abc-def-123"],
        "text": [
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
        ],
        "chunks": [
            [
                "Lorem ipsum dolor sit amet,",
                "consectetur adipiscing elit.",
                "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
                "Ut enim ad minim veniam,",
                "quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",
            ]
        ],
    }
)
dataset = tokenizer.prepare_dataset(dataset)
dataset

In [None]:
from datasets import Dataset

from transition_scores.pre_processor.text import TextPreProcessor
from transition_scores.pre_processor.chunks import RollingWindowChunkPreProcessor

tokenizer = RollingWindowChunkPreProcessor.from_pretrained("gpt2")

dataset = Dataset.from_dict(
    {
        "_id": ["abc-def-123"],
        "text": [
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
        ],
        "chunks": [
            [
                "Lorem ipsum dolor sit amet,",
                "consectetur adipiscing elit.",
                "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
                "Ut enim ad minim veniam,",
                "quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",
            ]
        ],
    }
)
dataset = tokenizer.prepare_dataset(dataset)
dataset