In [2]:
!pip install -qqq git+https://github.com/huggingface/transformers.git@3c2517727ce28a30f5044e01663ee204deb1cdbe datasets trl peft --progress-bar off

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
import torch
import transformers
import trl
import os
os.environ["WANDB_DISABLED"] = "true"

print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"TRL: {trl.__version__}")

PyTorch: 2.9.0+cu126
Transformers: 5.0.0.dev0
TRL: 0.26.2


In [4]:
from google.colab import drive
drive.mount('/content/drive')

!unzip -q /content/drive/MyDrive/data.zip -d /content/
!ls /content/data/lines_augmented | head -5
!wc -l /content/data/transcription_augmented.jsonl.jsonl

Mounted at /content/drive
page_006_line_004_brightness_up.png
page_006_line_004.png
page_006_line_004_rotation_right.png
page_006_line_006_brightness_down.png
page_006_line_006_brightness_up.png
wc: /content/data/transcription_augmented.jsonl.jsonl: No such file or directory


In [5]:
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "LiquidAI/LFM2.5-VL-1.6B"

print("ðŸ“š Chargement processor...")
processor = AutoProcessor.from_pretrained(model_id, max_image_tokens=256, trust_remote_code=True)

print("ðŸ§  Chargement modÃ¨le...")
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype="bfloat16",
    device_map="auto",
    trust_remote_code=True
)

print(f"âœ… ModÃ¨le chargÃ©: {model.num_parameters():,} paramÃ¨tres")

ðŸ“š Chargement processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

ðŸ§  Chargement modÃ¨le...


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/589 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

âœ… ModÃ¨le chargÃ©: 1,596,625,904 paramÃ¨tres


In [6]:
from datasets import load_dataset, Image
import os

base_path = "/content/data/lines_augmented"
jsonl_path = "/content/data/transcription_augmented.jsonl"

raw_ds = load_dataset("json", data_files=jsonl_path)

def fix_image_paths(example):
    example["image"] = os.path.join(base_path, example["image"])
    return example

raw_ds = raw_ds.map(fix_image_paths)
raw_ds = raw_ds.cast_column("image", Image())

split = raw_ds["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

print(f"ðŸ“š Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1926 [00:00<?, ? examples/s]

ðŸ“š Train: 1540 | Eval: 386


In [7]:
system_message = (
    "Tu es un systÃ¨me OCR spÃ©cialisÃ© dans la transcription de manuscrits franÃ§ais anciens. "
    "Transcris fidÃ¨lement le texte manuscrit visible dans l'image."
)

def format_ocr_sample(sample):
    return [
        {"role": "system", "content": system_message},  # string
        {"role": "user", "content": [
            {"type": "image", "image": sample["image"]},
            {"type": "text", "text": "Transcris ce texte manuscrit."},
        ]},
        {"role": "assistant", "content": sample["text"]},  # string
    ]

train_dataset = [format_ocr_sample(s) for s in train_dataset]
eval_dataset = [format_ocr_sample(s) for s in eval_dataset]

In [8]:
from PIL import Image as PILImage

def ensure_rgb_images(dataset, name="dataset"):
    fixed_count = 0
    for conversation in dataset:
        for message in conversation:
            if isinstance(message.get("content"), list):
                for content_part in message["content"]:
                    if content_part.get("type") == "image":
                        img = content_part["image"]
                        if img.mode != "RGB":
                            content_part["image"] = img.convert("RGB")
                            fixed_count += 1
    print(f"âœ… {name}: {fixed_count} images converties en RGB")

ensure_rgb_images(train_dataset, "train")
ensure_rgb_images(eval_dataset, "eval")

âœ… train: 1540 images converties en RGB
âœ… eval: 386 images converties en RGB


In [9]:
def create_fixed_collate_fn(processor):
    def collate_fn(samples):
        all_images = []
        all_texts = []

        for conversation in samples:
            images = []
            text = processor.tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False
            )
            all_texts.append(text)

            for message in conversation:
                if message["role"] == "user" and isinstance(message["content"], list):
                    for item in message["content"]:
                        if isinstance(item, dict) and item.get("type") == "image":
                            images.append(item["image"])
            all_images.append(images)

        batch = processor(
            text=all_texts,
            images=all_images if any(all_images) else None,
            return_tensors="pt",
            padding=True,
        )

        labels = batch["input_ids"].clone()

        # Token ID du marqueur de fin de tour
        im_end_token = processor.tokenizer.encode("<|im_end|>", add_special_tokens=False)[0]

        for i, label_seq in enumerate(labels):
            im_end_positions = (label_seq == im_end_token).nonzero(as_tuple=True)[0]

            # 2Ã¨me <|im_end|> = fin du user message
            if len(im_end_positions) >= 2:
                mask_until = im_end_positions[1].item() + 1
                labels[i, :mask_until] = -100

        labels[labels == processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch
    return collate_fn

collate_fn = create_fixed_collate_fn(processor)

In [10]:
# DEBUG - VÃ©rifie ce que le modÃ¨le apprend
sample = train_dataset[0]
batch = collate_fn([sample])

tokens = processor.tokenizer.convert_ids_to_tokens(batch["input_ids"][0])
labels = batch["labels"][0].tolist()

print("=== DIAGNOSTIC LABELS ===")
for tok, lab in zip(tokens[:100], labels[:100]):  # 100 premiers tokens
    status = "LEARN" if lab != -100 else "MASK"
    print(f"{status}: {tok}")
print('-' * 30)
for tok, lab in zip(tokens[-50:], labels[-50:]):
    status = "LEARN" if lab != -100 else "MASK"
    print(f"{status}: {tok}")

=== DIAGNOSTIC LABELS ===
MASK: <|startoftext|>
MASK: <|im_start|>
MASK: system
MASK: ÄŠ
MASK: Tu
MASK: Ä es
MASK: Ä un
MASK: Ä systÃƒÂ¨me
MASK: Ä O
MASK: CR
MASK: Ä spÃƒÂ©cial
MASK: isÃƒÂ©
MASK: Ä dans
MASK: Ä la
MASK: Ä transcription
MASK: Ä de
MASK: Ä manus
MASK: crit
MASK: s
MASK: Ä franÃƒÂ§ais
MASK: Ä anciens
MASK: .
MASK: Ä Trans
MASK: c
MASK: ris
MASK: Ä fid
MASK: ÃƒÂ¨
MASK: lement
MASK: Ä le
MASK: Ä texte
MASK: Ä manus
MASK: crit
MASK: Ä visible
MASK: Ä dans
MASK: Ä l
MASK: 'image
MASK: .
MASK: <|im_end|>
MASK: ÄŠ
MASK: <|im_start|>
MASK: user
MASK: ÄŠ
MASK: <|image_start|>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK: <image>
MASK:

In [11]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj", "fc1", "fc2", "linear", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 3,671,808 || all params: 1,600,297,712 || trainable%: 0.2294


In [12]:
from trl import SFTConfig, SFTTrainer

sft_config = SFTConfig(
    output_dir="./lfm25-manuscrit",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=5,
    optim="adamw_torch_8bit",
    gradient_checkpointing=True,
    max_length=512,
    dataset_kwargs={"skip_prepare_dataset": True},
    save_strategy="epoch",
    eval_strategy="epoch",
    bf16=True,
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    processing_class=processor.tokenizer,
)

print("ðŸš€ DÃ©but du fine-tuning...")
trainer.train()
print("ðŸŽ‰ TerminÃ©!")

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 7, 'bos_token_id': 1, 'pad_token_id': 0}.


ðŸš€ DÃ©but du fine-tuning...


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.553997,0.496147,4.661392,402629.0,0.894236
2,0.315438,0.332805,4.436414,805258.0,0.930024
3,0.179018,0.273705,4.347076,1207887.0,0.943952
4,0.077733,0.227642,4.486801,1610516.0,0.953737
5,0.113169,0.216858,4.476709,2013145.0,0.957101


ðŸŽ‰ TerminÃ©!


In [13]:
# Sauvegarde SEULEMENT l'adapteur (sans merge)
trainer.model.save_pretrained("./lfm25-manuscrit-epoch1")
processor.save_pretrained("./lfm25-manuscrit-epoch1")

!cp -r ./lfm25-manuscrit-epoch1 /content/drive/MyDrive/
print("âœ… Adapteur sauvegardÃ©")

âœ… Adapteur sauvegardÃ©
