In [1]:
import os

# Set TensorFlow's C++ minimum log level to "3" to cut log noise in later cell output.
# NOTE(review): presumably the most restrictive level — confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

## Setup

### Configure your API keys

To fine-tune Florence-2, you need to provide your HuggingFace Token and Roboflow API key. Follow these steps:

- Open your [`HuggingFace Settings`](https://huggingface.co/settings) page. Click `Access Tokens`, then `New Token`, to generate a new token.
- Go to your [`Roboflow Settings`](https://app.roboflow.com/settings/api) page. Click `Copy`. This will place your private key in the clipboard.
- In Colab, go to the left pane and click on `Secrets` (🔑).
    - Store HuggingFace Access Token under the name `HF_TOKEN`.
    - Store Roboflow API Key under the name `ROBOFLOW_API_KEY`.

### Select the runtime

Let's make sure that we have access to a GPU. We can use the `nvidia-smi` command to do that. In case of any problems, navigate to `Edit` -> `Notebook settings` -> `Hardware accelerator`, set it to `L4 GPU`, and then click `Save`.

In [2]:
# Run nvidia-smi to confirm that a GPU is attached to this runtime.
!nvidia-smi

Sat Jan  4 20:14:11 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Install dependencies

In [None]:
!pip install -q roboflow maestro==0.2.0rc5

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.0/43.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.2/158.2 kB[0m [31m16.3 MB/s[0m eta [

## Download dataset

In [None]:
from google.colab import userdata
from roboflow import Roboflow

# Read the Roboflow key from Colab's Secrets store instead of hard-coding it.
ROBOFLOW_API_KEY = userdata.get("ROBOFLOW_API_KEY")
rf = Roboflow(api_key=ROBOFLOW_API_KEY)

# Resolve workspace -> project -> version 1, then download that version as JSONL.
version = (
    rf.workspace("roboflow-jvuqo")
    .project("chartqa-6pz0e")
    .version(1)
)
dataset = version.download("jsonl")

In [None]:
# Print the first five lines of the training annotations to inspect the raw records.
!head -n 5 {dataset.location}/train/annotations.jsonl

**NOTE:** The downloaded dataset is in raw JSONL format. To remain consistent with the Florence-2 convention, we will prepend the name of the vision task being performed to each `prefix` value. In our case, this will be `<VQA>`.

In [None]:
import json
import os
from typing import Any


def modify_jsonl(file_path: str, task: str) -> None:
    """Prepend a task string to the 'prefix' value of every record in a JSONL file.

    The file is rewritten in place: records are streamed into a sibling
    ``<file_path>.temp`` file, which then replaces the original via ``os.replace``.

    The call is safe to repeat (for example when the notebook cell is re-run):
    a record whose 'prefix' already starts with ``task`` is written back unchanged
    instead of being prefixed a second time. Blank lines are skipped.

    Args:
        file_path (str): Path to the JSONL file to modify.
        task (str): The string to prepend to each 'prefix' value in the JSONL file.

    Returns:
        None

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'prefix' key.
    """
    temp_file_path: str = file_path + ".temp"

    try:
        # Explicit UTF-8 so the result does not depend on the platform's default encoding.
        with open(file_path, encoding="utf-8") as infile, open(temp_file_path, "w", encoding="utf-8") as outfile:
            for line in infile:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. an empty line at the end of the file).
                    continue
                data: dict[str, Any] = json.loads(stripped)
                prefix = data["prefix"]
                # Only prepend when the task tag is not already present, so that
                # running this twice does not produce "<task><task>...".
                if not (isinstance(prefix, str) and prefix.startswith(task)):
                    data["prefix"] = f"{task}{prefix}"
                outfile.write(json.dumps(data) + "\n")
    except BaseException:
        # Do not leave a half-written temp file behind; the original stays untouched.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise

    # Swap the fully written temp file into place only after every record succeeded.
    os.replace(temp_file_path, file_path)

In [None]:
# Tag the annotations of every split with the same task prefix. A single loop
# keeps the three calls in sync (same order as before: train, test, valid).
for split in ("train", "test", "valid"):
    modify_jsonl(f"{dataset.location}/{split}/annotations.jsonl", "<VQA>")

In [None]:
# Print the first five training records again to check the rewritten 'prefix' values.
!head -n 5 {dataset.location}/train/annotations.jsonl

## Fine-tune Florence-2 on a visual question answering (VQA) dataset

In [None]:
# Show the command-line help for the Florence-2 training command.
!maestro florence2 train --help

In [None]:
# Train on the downloaded dataset for 3 epochs (batch size 14, learning rate 2e-6),
# requesting the word error rate and character error rate metrics.
!maestro florence2 train --dataset={dataset.location} \
--epochs=3 --batch_size=14 --lr=2e-6 \
--metrics word_error_rate --metrics character_error_rate

## Evaluate Florence-2 on a visual question answering (VQA) dataset

In [None]:
# List the contents of the metrics directory under training/florence-2/1/.
# NOTE(review): the "1" is hard-coded — presumably a run index; confirm it matches the run you want.
!ls -la training/florence-2/1/metrics/

In [None]:
from IPython.display import Image

# Display loss_plot.png from the metrics directory (rendered with height=600).
Image(filename="training/florence-2/1/metrics/loss_plot.png", height=600)

In [None]:
from IPython.display import Image

# Display wer_plot.png from the metrics directory (rendered with height=600).
Image(filename="training/florence-2/1/metrics/wer_plot.png", height=600)

In [None]:
from IPython.display import Image

# Display cer_plot.png from the metrics directory (rendered with height=600).
Image(filename="training/florence-2/1/metrics/cer_plot.png", height=600)

In [None]:
# List the contents of the checkpoints directory under training/florence-2/1/.
!ls -la training/florence-2/1/checkpoints/

In [None]:
# Evaluate the "best" checkpoint on the downloaded dataset,
# requesting the word error rate and character error rate metrics.
!maestro florence2 evaluate \
--dataset={dataset.location} \
--model_id=/content/training/florence-2/1/checkpoints/best \
--metrics word_error_rate --metrics character_error_rate

In [None]:
# Print evaluation.json (presumably written by the evaluate command in the previous cell).
!cat /content/evaluation/florence-2/metrics/evaluation.json

## Run inference

In [None]:
from maestro.trainer.models.florence_2.checkpoints import load_model

# Load the processor and model from the "best" checkpoint directory.
processor, model = load_model(model_id_or_path="/content/training/florence-2/1/checkpoints/best")

In [None]:
from maestro.trainer.common.data_loaders.datasets import JSONLDataset

# Open the validation split: annotations file plus the directory holding its images.
ds = JSONLDataset(
    jsonl_file_path=f"{dataset.location}/valid/annotations.jsonl", image_directory_path=f"{dataset.location}/valid/"
)

# Take the first validation sample; its "prefix" entry is used as the text prompt.
image, annotations = ds[0]
text = annotations["prefix"]

# Build the model inputs from the prompt and image, and move them to the GPU.
inputs = processor(text=text, images=image, return_tensors="pt").to("cuda")
# Generate with beam search (3 beams), capped at 1024 new tokens.
generated_ids = model.generate(
    input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
)
# Decode the first (only) sequence of the batch, dropping special tokens.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("question:", text)
print("answer:", generated_text)

In [None]:
# Display the validation image that the question in the previous cell refers to.
image