In [None]:
from datasets import load_dataset

# Source dataset on the Hugging Face Hub and the configuration (subset) to pull from it.
dataset_id, config_name = "HuggingFaceM4/FineVision", "cocotext"

# Downloads (or reuses the local cache of) the selected configuration; the
# result is indexed by split name further down (e.g. train_dataset["train"]).
train_dataset = load_dataset(path=dataset_id, name=config_name)

In [None]:
# Preview the first image of sample 20.
# Index the row first and the column second: column-first access
# (`['images'][20]`) can materialise the whole image column before picking a
# single entry, which is slow and memory hungry for image datasets.
train_dataset['train'][20]['images'][0]

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Hub identifier of the checkpoint; used below to load both the model weights
# and the matching processor.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

In [None]:
# The processor turns chat messages and images into model inputs and decodes
# generated token ids back into text.
processor = AutoProcessor.from_pretrained(model_id)

# Load the checkpoint onto device index 0, letting the library choose the dtype.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map=0,
    torch_dtype="auto",
)

In [None]:
from qwen_vl_utils import process_vision_info


def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    """Generate the model's reply for a single chat-formatted sample.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Object exposing ``apply_chat_template``, a ``__call__`` that
            builds model inputs from text and images, and ``batch_decode``.
        sample: Mapping with a ``"messages"`` entry holding the conversation.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the prepared inputs are moved to before generation.

    Returns:
        The decoded text of the first (and only) generated sequence, with the
        prompt tokens removed.
    """
    messages = sample["messages"]

    # Render the conversation as a prompt string that ends with the generation prompt.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Only the first value returned by the helper is used; the second is ignored.
    images, _ = process_vision_info(messages)

    # Build a batch of one and move it to the requested device.
    batch = processor(text=[prompt], images=images, return_tensors="pt").to(device)

    full_sequences = model.generate(**batch, max_new_tokens=max_new_tokens)

    # Each output sequence starts with its input ids; drop that prefix so only
    # the newly generated part gets decoded.
    completions = []
    for prompt_ids, sequence in zip(batch.input_ids, full_sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [None]:
# System prompt used when asking the model what (or who) is shown in a picture.
# The three sentences are joined with newlines into a single prompt string.
system_message_object = "\n".join(
    [
        "You are a Vision Language Model specialised in interpreting visual data from images and determining objects and people in the picture.",
        "Your task is to analyze the provided image and respond to queries with concise answers, without repeating the question.",
        "Avoid additional explanation unless absolutely necessary.",
    ]
)

In [None]:
# System prompt used when asking the model about the artistic style of a picture.
# The three sentences are joined with newlines into a single prompt string.
system_message_style = "\n".join(
    [
        "You are a Vision Language Model specialised in interpreting an artistic picture style.",
        "Your task is to analyze the provided image and respond to queries with concise answers, without repeating the question.",
        "Avoid additional explanation unless absolutely necessary",
    ]
)

In [None]:
def format_data_object(sample):
    """Build the chat request that asks the model what the picture shows.

    Args:
        sample: Mapping whose ``"images"`` entry is a sequence; only its first
            element is used.

    Returns:
        A dict with ``"images"`` (a one-element list holding that first image)
        and ``"messages"`` (a system turn carrying ``system_message_object``
        followed by a user turn with the image and the question).
    """
    picture = sample["images"][0]

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message_object}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": picture},
            {"type": "text", "text": "Who or what on this picture? Short answer."},
        ],
    }

    return {"images": [picture], "messages": [system_turn, user_turn]}

In [None]:
def format_data_style(sample):
    """Build the chat request that asks the model for the picture's artistic style.

    Args:
        sample: Mapping whose ``"images"`` entry is a sequence; only its first
            element is used.

    Returns:
        A dict with ``"images"`` (a one-element list holding that first image)
        and ``"messages"`` (a system turn carrying ``system_message_style``
        followed by a user turn with the image and the question).
    """
    picture = sample["images"][0]

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message_style}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": picture},
            {"type": "text", "text": "What artistic style is the whole picture?. Short answer."},
        ],
    }

    return {"images": [picture], "messages": [system_turn, user_turn]}

In [None]:
#subset = train_dataset["train"].select(range(2))

In [None]:
from random import shuffle, choice

from datasets import Dataset
import tqdm

from watermark_text import watermark_text
from add_watermark import make_watermark_pattern


# Shuffle the imported watermark texts in place. No random seed is set in this
# notebook, so the resulting order differs between runs.
# NOTE(review): assumes watermark_text is a mutable sequence such as a list — confirm.
shuffle(watermark_text)

In [None]:
# Build the new dataset rows: for every source sample, ask the model for the
# main object and the artistic style, stamp a watermark pattern onto the image
# and store everything as a JSON-encoded assistant answer.
import json  # kept local so this cell stays runnable on its own

data = []
for sample in tqdm.tqdm(train_dataset["train"]):
    source_image = sample["images"][0]
    assistant_value = sample["texts"][0]["assistant"]
    generated_object = generate_text_from_sample(model, processor, format_data_object(sample))
    generated_style = generate_text_from_sample(model, processor, format_data_style(sample))
    # The watermark text is drawn at random for every sample.
    image, count = make_watermark_pattern(source_image, text=choice(watermark_text))

    # Serialise with json.dumps instead of str.format so that string values
    # are quoted and any quotes/newlines inside the generated text are escaped;
    # the previous template produced a string that could not be parsed as JSON.
    # ensure_ascii=False keeps non-ASCII text readable.
    # NOTE(review): `count` is presumably a plain int; default=str is only a
    # fallback for values json cannot encode natively — confirm.
    annotation = json.dumps(
        {
            "watermarks": count,
            "text": assistant_value,
            "main object": generated_object,
            "style": generated_style,
        },
        ensure_ascii=False,
        default=str,
    )
    data.append({"images": [image], "texts": [{"user": "", "assistant": annotation}]})


In [None]:
def gen():
    """Yield the rows collected in the module-level ``data`` list, in order."""
    yield from data

In [None]:
# Materialise the collected rows as a Dataset by consuming the gen() generator.
ds = Dataset.from_generator(gen)

In [None]:
# Preview the watermarked image of row 121.
# Row-first indexing fetches a single row; column-first access
# (`ds['images'][121]`) can load the whole image column before selecting one entry.
ds[121]['images'][0]

In [None]:
# Show the generated annotation for row 121 (row-first indexing, so only that
# one row is read instead of the whole `texts` column).
ds[121]['texts']

In [None]:
# Persist the full generated dataset so the expensive generation loop does not
# have to be re-run.
# NOTE(review): the target lives inside the Hugging Face hub cache directory —
# confirm this location is intended for a hand-made dataset.
ds.save_to_disk('~/.cache/huggingface/hub/my_tmp_dataset')

In [None]:
# Split the generated dataset into 10000 training rows and a test set holding
# the remainder, then persist each part separately.
# The result is bound to a new name instead of overwriting `ds`: rebinding `ds`
# to the split result made this cell fail when re-run and broke the
# `ds[...]` preview cells above. A fixed seed keeps the split reproducible.
splits = ds.train_test_split(train_size=10000, seed=42)
tr_ds = splits['train']
ts_ds = splits['test']
tr_ds.save_to_disk('~/.cache/huggingface/hub/my_tmp_dataset_train')
ts_ds.save_to_disk('~/.cache/huggingface/hub/my_tmp_dataset_test')

In [None]:
from datasets import load_from_disk
# Reload the full dataset from the path it was saved to earlier; this rebinds `ds`.
ds = load_from_disk("~/.cache/huggingface/hub/my_tmp_dataset")