In [1]:
# ============================
#  1. Install & Import
# ============================
!pip install -U "transformers>=4.43.0" "accelerate>=0.33.0" "bitsandbytes>=0.43" "peft>=0.11" "trl>=0.9.0" datasets tqdm

import torch, json, os
from datasets import load_dataset
from transformers import Idefics3ForConditionalGeneration, AutoProcessor
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.cuda.empty_cache()

# ============================
#  2. Load dataset
# ============================
ds = load_dataset("mychen76/invoices-and-receipts_ocr_v1")
print(ds)

# Inspect one sample
print(ds["train"][0].keys())

# ============================
#  3. Model + Processor
# ============================
model = Idefics3ForConditionalGeneration.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")

# ============================
#  4. Apply LoRA (QLoRA optional)
# ============================
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# ============================
#  5. Preprocess Dataset
# ============================
def preprocess_fn(example):
    parsed = json.loads(example["parsed_data"])
    structured = parsed.get("json", "{}")
    try:
        structured_json = json.loads(structured.replace("'", '"'))
    except:
        structured_json = {"error": "invalid_json"}

    prompt = "Extract all invoice fields and return as JSON."
    target = json.dumps(structured_json)

    # tokenize separately
    inputs = processor.tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    labels = processor.tokenizer(
        target,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    return {
        "input_ids": inputs["input_ids"][0].tolist(),
        "attention_mask": inputs["attention_mask"][0].tolist(),
        "labels": labels["input_ids"][0].tolist()
    }

tokenized_train = ds["train"].map(preprocess_fn, remove_columns=ds["train"].column_names)
tokenized_valid = ds["valid"].map(preprocess_fn, remove_columns=ds["valid"].column_names)

print(tokenized_train[0].keys())

# ============================
#  6. Config for Training
# ============================
sft_config = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=False,          # disable AMP since model is already in FP16
    bf16=True,           # if GPU supports it
    output_dir="./outputs",
    logging_steps=50,
    eval_steps=200,
    save_steps=500,
    report_to="none"
)

# ============================
#  7. Trainer
# ============================
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

trainer.train()

# ============================
#  8. Save Fine-tuned Model
# ============================
model.save_pretrained("./fine_tuned_model")
processor.save_pretrained("./fine_tuned_model")

# ============================
#  9. Quick Evaluation
# ============================
sample = tokenized_valid[0]
input_ids = torch.tensor([sample["input_ids"]]).to("cuda")
labels = torch.tensor([sample["labels"]]).to("cuda")

with torch.no_grad():
    loss = model(input_ids=input_ids, labels=labels).loss
print("Sample loss:", loss.item())


Collecting bitsandbytes>=0.43
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl>=0.9.0
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.2.0-py3-none-any.whl (506 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-c

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/782 [00:00<?, ?B/s]

data/train-00000-of-00001-76ffc8319f74dd(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/test-00000-of-00001-af2d92d1cee2851(…):   0%|          | 0.00/18.8M [00:00<?, ?B/s]

data/valid-00000-of-00001-894b4e1f736b57(…):   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/70 [00:00<?, ? examples/s]

`torch_dtype` is deprecated! Use `dtype` instead!


DatasetDict({
    train: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 2043
    })
    test: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 125
    })
    valid: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 70
    })
})
dict_keys(['image', 'id', 'parsed_data', 'raw_data'])


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Trainable parameters: 2,568,192


Map:   0%|          | 0/2043 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


Truncating train dataset:   0%|          | 0/2043 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 49154, 'bos_token_id': 1, 'pad_token_id': 2}.


Step,Training Loss
50,3.8845
100,3.0393
150,2.8168
200,2.7946
250,2.6444
300,2.5536
350,2.6831
400,2.5775
450,2.5989
500,2.618


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Sample loss: 0.7851240634918213




In [4]:
print(inputs["input_ids"].shape, labels.shape)


torch.Size([1, 9]) torch.Size([1, 8])


In [5]:
# ==========================
# Invoice Parser Evaluation
# ==========================
import torch
import json
from datasets import load_dataset
from transformers import Idefics3ForConditionalGeneration, AutoProcessor

# ---------------------------
# Config
# ---------------------------
model_dir = "./fine_tuned_model"  # path to your trained model
dataset_id = "mychen76/invoices-and-receipts_ocr_v1"
split = "valid"
max_samples = 5
max_len = 512

# ---------------------------
# Load Model & Processor
# ---------------------------
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype="auto",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_dir)
model.eval()

# ---------------------------
# Load Dataset
# ---------------------------
ds = load_dataset(dataset_id)[split]
print(f"Loaded {len(ds)} samples from {dataset_id}/{split}")

# ---------------------------
# Evaluation Loop
# ---------------------------
for i, ex in enumerate(ds.select(range(min(max_samples, len(ds))))):
    parsed_data = json.loads(ex["parsed_data"])
    target_str = parsed_data.get("json", "{}")

    try:
        target_json = json.loads(target_str.replace("'", '"'))
    except Exception:
        target_json = {"error": "invalid_json"}

    prompt = "Extract all invoice fields and return as JSON."
    target_text = json.dumps(target_json)

    # --- Concatenate prompt + target (safe alignment) ---
    full_text = f"{prompt}\n{target_text}"
    enc = processor.tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )

    # Mask prompt tokens so loss only applies to the answer
    labels = enc["input_ids"].clone()
    prompt_len = len(processor.tokenizer(prompt)["input_ids"])
    labels[:, :prompt_len] = -100

    # Move to device
    enc = {k: v.to(model.device) for k, v in enc.items()}
    labels = labels.to(model.device)

    # Compute loss
    with torch.no_grad():
        loss = model(**enc, labels=labels).loss

    print(f"\n[{i}] Loss: {loss.item():.4f}")

    # Generate output
    gen_tokens = model.generate(**enc, max_new_tokens=256)
    pred = processor.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]

    print("Prediction:", pred[:300])
    print("Ground Truth:", target_text[:300])
    print("-" * 80)


Loaded 70 samples from mychen76/invoices-and-receipts_ocr_v1/valid

[0] Loss: 3.9814
Prediction: Extract all invoice fields and return as JSON.
{"error": "invalid_json"}": "", "amount": "100.00", "creditcard": "4000 0000 0000 0000", "creditcard_exp": "12/20", "creditcard_cvv": "123", "address": "1234567890", "city": "New York", "state": "NY", "zipcode": "10000", "phone": "1234567890", "email": 
Ground Truth: {"error": "invalid_json"}
--------------------------------------------------------------------------------

[1] Loss: 3.9814
Prediction: Extract all invoice fields and return as JSON.
{"error": "invalid_json"}": "", "amount": "100.00", "creditcard": "4000 0000 0000 0000", "creditcard_exp": "12/20", "creditcard_cvv": "123", "address": "1234567890", "city": "New York", "state": "NY", "zipcode": "10000", "phone": "1234567890", "email": 
Ground Truth: {"error": "invalid_json"}
--------------------------------------------------------------------------------

[2] Loss: 2.1879
Prediction: