In [1]:
!pip install -qqq git+https://github.com/huggingface/transformers.git@3c2517727ce28a30f5044e01663ee204deb1cdbe datasets trl peft --progress-bar off

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
import transformers
import trl
import os
os.environ["WANDB_DISABLED"] = "true"

print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"TRL: {trl.__version__}")

PyTorch: 2.9.0+cu126
Transformers: 5.0.0.dev0
TRL: 0.26.2


In [3]:
from google.colab import drive
drive.mount('/content/drive')

!unzip -q /content/drive/MyDrive/data.zip -d /content/
!ls /content/data/lines | head -5
!wc -l /content/data/transcription.jsonl

Mounted at /content/drive
page_006_line_002.png
page_006_line_003.png
page_006_line_004.png
page_006_line_005.png
page_006_line_006.png
125 /content/data/transcription.jsonl


In [4]:
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "LiquidAI/LFM2.5-VL-1.6B"

print("📚 Chargement processor...")
processor = AutoProcessor.from_pretrained(model_id, max_image_tokens=256, trust_remote_code=True)

print("🧠 Chargement modèle...")
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype="bfloat16",
    device_map="auto",
    trust_remote_code=True
)

print(f"✅ Modèle chargé: {model.num_parameters():,} paramètres")

📚 Chargement processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

🧠 Chargement modèle...


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/589 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

✅ Modèle chargé: 1,596,625,904 paramètres


In [5]:
from datasets import load_dataset, Image
import os

base_path = "/content/data/lines"
jsonl_path = "/content/data/transcription.jsonl"

raw_ds = load_dataset("json", data_files=jsonl_path)

def fix_image_paths(example):
    example["image"] = os.path.join(base_path, example["image"])
    return example

raw_ds = raw_ds.map(fix_image_paths)
raw_ds = raw_ds.cast_column("image", Image())

split = raw_ds["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

print(f"📚 Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

📚 Train: 100 | Eval: 25


In [6]:
system_message = (
    "Tu es un système OCR spécialisé dans la transcription de manuscrits français anciens. "
    "Transcris fidèlement le texte manuscrit visible dans l'image."
)

def format_ocr_sample(sample):
    return [
        {"role": "system", "content": system_message},  # string
        {"role": "user", "content": [
            {"type": "image", "image": sample["image"]},
            {"type": "text", "text": "Transcris ce texte manuscrit."},
        ]},
        {"role": "assistant", "content": sample["text"]},  # string
    ]

train_dataset = [format_ocr_sample(s) for s in train_dataset]
eval_dataset = [format_ocr_sample(s) for s in eval_dataset]

In [7]:
from PIL import Image as PILImage

def ensure_rgb_images(dataset, name="dataset"):
    fixed_count = 0
    for conversation in dataset:
        for message in conversation:
            if isinstance(message.get("content"), list):
                for content_part in message["content"]:
                    if content_part.get("type") == "image":
                        img = content_part["image"]
                        if img.mode != "RGB":
                            content_part["image"] = img.convert("RGB")
                            fixed_count += 1
    print(f"✅ {name}: {fixed_count} images converties en RGB")

ensure_rgb_images(train_dataset, "train")
ensure_rgb_images(eval_dataset, "eval")

✅ train: 100 images converties en RGB
✅ eval: 25 images converties en RGB


In [8]:
def create_fixed_collate_fn(processor):
    def collate_fn(samples):
        all_images = []
        all_texts = []

        for conversation in samples:
            images = []
            # Construire le texte avec le tokenizer directement
            text = processor.tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False
            )
            all_texts.append(text)

            # Extraire les images des messages user
            for message in conversation:
                if message["role"] == "user" and isinstance(message["content"], list):
                    for item in message["content"]:
                        if isinstance(item, dict) and item.get("type") == "image":
                            images.append(item["image"])
            all_images.append(images)

        # Tokenizer + image processor
        batch = processor(
            text=all_texts,
            images=all_images if any(all_images) else None,
            return_tensors="pt",
            padding=True,
        )

        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch

    return collate_fn

collate_fn = create_fixed_collate_fn(processor)

In [9]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj", "fc1", "fc2", "linear", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 3,671,808 || all params: 1,600,297,712 || trainable%: 0.2294


In [10]:
from trl import SFTConfig, SFTTrainer

sft_config = SFTConfig(
    output_dir="./lfm25-manuscrit",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=5,
    optim="adamw_torch_8bit",
    gradient_checkpointing=True,
    max_length=512,
    dataset_kwargs={"skip_prepare_dataset": True},
    save_strategy="epoch",
    bf16=True,
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    processing_class=processor.tokenizer,
)

print("🚀 Début du fine-tuning...")
trainer.train()
print("🎉 Terminé!")

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 7, 'bos_token_id': 1, 'pad_token_id': 0}.


🚀 Début du fine-tuning...


Step,Training Loss
5,11.438166
10,7.739435
15,2.096268
20,0.884901
25,0.85739
30,0.760478
35,0.647181


🎉 Terminé!


In [11]:
model = trainer.model

if hasattr(model, 'peft_config'):
    print("🔄 Merge LoRA...")
    model = model.merge_and_unload()

model.save_pretrained("./lfm25-manuscrit")
processor.save_pretrained("./lfm25-manuscrit")

!cp -r ./lfm25-manuscrit /content/drive/MyDrive/
print("✅ Modèle sauvegardé sur Google Drive")

🔄 Merge LoRA...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Modèle sauvegardé sur Google Drive


In [20]:
from PIL import Image as PILImage

model.eval()
test_image = PILImage.open("/content/page_011.png").convert("RGB")

conversation = [
    #{"role": "system", "content": "Tu es un système OCR spécialisé dans la transcription de manuscrits français anciens. Transcris fidèlement le texte manuscrit visible dans l'image."},
    {"role": "user", "content": [
        {"type": "image", "image": test_image},
        {"type": "text", "text": "Transcrit ce texte manuscrit."},
    ]},
]

inputs = processor.apply_chat_template(
    [conversation],
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    tokenize=True,
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

result = processor.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print("📝 Transcription:", result)

📝 Transcription: 


In [15]:
# 🧪 Test d'inférence sur le modèle fine-tuné
from transformers import AutoProcessor, AutoModelForMultimodalLM
from PIL import Image
import torch

model_path = "./lfm25-manuscrit"

print("📚 Chargement du modèle fine-tuné...")
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForMultimodalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
model.eval()
print("✅ Modèle chargé!")

# Test sur une image
test_image_path = "/content/data/lines/page_006_line_005.png"
test_image = Image.open(test_image_path).convert("RGB")

messages = [
    {"role": "system", "content": "Tu es un système OCR spécialisé dans la transcription de manuscrits français anciens. Transcris fidèlement le texte manuscrit visible dans l'image."},
    {"role": "user", "content": [
        {"type": "image", "image": test_image},
        {"type": "text", "text": "Transcris ce texte manuscrit."},
    ]},
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)

result = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(f"\n📝 Transcription: {result}")

📚 Chargement du modèle fine-tuné...


Loading weights:   0%|          | 0/589 [00:00<?, ?it/s]

✅ Modèle chargé!


TypeError: string indices must be integers, not 'str'