<a href="https://colab.research.google.com/github/ParitKansal/FineTunning/blob/main/Qwen_VL_OCR_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install trl bitsandbytes peft



In [3]:
import pandas as pd

# Spreadsheet of labelled invoice fields stored on the mounted Drive.
INVOICE_LABELS_XLSX = "/content/drive/MyDrive/FSL_2025/new_initiative/Qwen-32b/fsl_invoices_xelp_extraction_15_9_2025.xlsx"
df = pd.read_excel(INVOICE_LABELS_XLSX)

In [4]:
# Peek at the first row to check which columns the sheet provides.
df.head(1)

Unnamed: 0,filename,field_name,ground_truth,cleaned_ground_truth,vendor_name,XELP_OCR_OUTPUT
0,GHX9H09IM.002_004.jpg,INV_VendorFullName,Boston Scientific Corporation,BOSTON SCIENTIFIC CORPORATION,BOSTON SCIENTIFIC CORPORATION,BOSTON SCIENTIFIC CORPORATION


In [5]:
# Keep only the columns used below and expose the cleaned labels under the
# name `ground_truth`.
kept_columns = ['filename', 'vendor_name', 'field_name', 'cleaned_ground_truth']
df = df.loc[:, kept_columns].rename(columns={'cleaned_ground_truth': 'ground_truth'})

In [6]:
# Collapse the long-format label table into {filename: {field_name: value}}.
# Every file gets an entry for every field name seen anywhere in the sheet, in
# the same order, with "" standing in for fields that file does not have.

# Get unique filenames and field names (order of first appearance)
filenames = df['filename'].unique()
field_names = df['field_name'].unique()

# Single pass over the rows: remember the first ground-truth value seen for
# each (filename, field_name) pair. Later duplicates are ignored (first row
# wins; values are NOT joined). This avoids re-filtering the whole frame once
# per file and once more per field.
first_value = {}
for file_name, field_name, value in zip(
    df['filename'].to_numpy(),
    df['field_name'].to_numpy(),
    df['ground_truth'].to_numpy(),
):
    first_value.setdefault((file_name, field_name), value)

# Final dictionary
result = {
    file_name: {
        field_name: first_value.get((file_name, field_name), "")  # "" if not present
        for field_name in field_names
    }
    for file_name in filenames
}

In [7]:
import json
# Serialise each file's label dict to a JSON string.
# Entries that are already strings are skipped so that re-running this cell
# does not JSON-encode the labels a second time.
for key, fields in result.items():
    if not isinstance(fields, str):
        result[key] = json.dumps(fields)

In [8]:
# First vendor name recorded for each filename, gathered in one pass over the
# table instead of re-filtering the whole frame for every file.
first_vendor = {}
for file_name, vendor_name in zip(df['filename'].to_numpy(), df['vendor_name'].to_numpy()):
    first_vendor.setdefault(file_name, vendor_name)

# One vendor per entry of `result`, in the same order; "" when a file has no rows.
vendors = [first_vendor.get(key, "") for key in result]

print(len(vendors))

300


In [9]:
from datasets import Dataset, Features, Image, Value
import pandas as pd

# Folder on Drive that holds the invoice images (trailing slash included so a
# bare filename can simply be appended).
IMAGE_DIR = "/content/drive/MyDrive/FSL_2025/new_initiative/data/Invoice_Data/Invoice_Data/"

# Full path of every labelled image, in the same order as `result`
image_paths = [IMAGE_DIR + fname for fname in result]

# One row per invoice: image path, JSON label string, vendor name
columns = {
    "image": image_paths,
    "ground_truth": list(result.values()),  # JSON strings
    "vendors": vendors,
}
df_result = pd.DataFrame(columns)

# Schema: the path column is typed as an image, the other two as plain strings
features = Features({
    "image": Image(),
    "ground_truth": Value("string"),
    "vendors": Value("string"),
})

# Build the Hugging Face dataset from the frame using that schema
hf_dataset = Dataset.from_pandas(df_result, features=features)

# Show the first example as a sanity check
hf_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2550x3300>,
 'ground_truth': '{"INV_VendorFullName": "BOSTON SCIENTIFIC CORPORATION", "INV_InvoiceNumber": "701719182", "INV_PurchaseOrderNo": "P1412360", "INV_InvoiceDate": "09/05/2024", "INV_VAT": "0.00", "INV_FreightCharge": "17.00", "INV_TotalAmount": "6,832.00", "INV_ExtdPrice_0": "65.00", "INV_Description_0": "FG EMERGE MR US 2.50MM X 20MM", "INV_VendorCatalogNo_0": "H7493918920250", "INV_Quantity_0": "1", "INV_ExtdPrice_1": "65.00", "INV_Description_1": "FG NC EMERGE MR US 3.00MM X 20MM", "INV_VendorCatalogNo_1": "H7493926720300", "INV_Quantity_1": "1", "INV_ExtdPrice_2": "505.00", "INV_Description_2": "SYNERGY XD MR US 3.00X16MM", "INV_VendorCatalogNo_2": "H7493941816300", "INV_Quantity_2": "1", "INV_ExtdPrice_3": "65.00", "INV_Description_3": "FG EMERGE MR US 2.50MM X 12MM", "INV_VendorCatalogNo_3": "H7493918912250", "INV_Quantity_3": "1", "INV_AccountNumber": "", "INV_ShipToAddress1": "", "INV_ShipToCity": "", "

In [10]:
from collections import Counter

# Tally how many invoices each vendor contributes to the dataset.
vendor_counts = Counter()
vendor_counts.update(hf_dataset['vendors'])
print(vendor_counts)

Counter({'BOSTON SCIENTIFIC CORPORATION': 113, 'ARTHREX INC': 96, 'MEDLINE INDUSTRIES INC': 91})


In [11]:
from datasets import Dataset, DatasetDict

# Per-vendor train/eval split: roughly 20% of each vendor's invoices go to
# eval, so every vendor appears in both splits.

# Convert to Pandas for easier grouping
df = hf_dataset.to_pandas()

train_idx = []
eval_idx = []

# Group by vendor
for vendor, group in df.groupby("vendors"):
    # Shuffle deterministically. reset_index() keeps each row's original index
    # label in an "index" column (assumed to equal its position in hf_dataset,
    # i.e. the frame has a default RangeIndex).
    shuffled = group.sample(frac=1, random_state=42).reset_index()

    # Compute 20% size (at least 1 if group is small)
    n_eval = max(1, int(len(shuffled) * 0.2))

    # First n_eval shuffled rows go to eval, the rest to train
    positions = shuffled["index"].tolist()
    eval_idx.extend(positions[:n_eval])
    train_idx.extend(positions[n_eval:])

# Split the dataset
train_dataset = hf_dataset.select(train_idx)
eval_dataset = hf_dataset.select(eval_idx)

# Combine into DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "eval": eval_dataset
})

# Inspect
print(dataset_dict)
print("Eval examples per vendor:")
# Count from the vendor column only. Iterating eval_dataset row by row would
# decode every image once per vendor just to read a string.
eval_vendor_counts = df["vendors"].iloc[eval_idx].value_counts()
for vendor, _ in df.groupby("vendors"):
    print(vendor, int(eval_vendor_counts.get(vendor, 0)))

DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth', 'vendors'],
        num_rows: 241
    })
    eval: Dataset({
        features: ['image', 'ground_truth', 'vendors'],
        num_rows: 59
    })
})
Eval examples per vendor:
ARTHREX INC 19
BOSTON SCIENTIFIC CORPORATION 22
MEDLINE INDUSTRIES INC 18


In [12]:
# Display the eval split's schema and row count.
dataset_dict['eval']

Dataset({
    features: ['image', 'ground_truth', 'vendors'],
    num_rows: 59
})

In [13]:
# System prompt placed at the start of every training conversation.
system_message = """
You are an OCR model expert.
Your task is to analyze images and return the extracted information in a well-structured, nested JSON format.
"""
import json


def format_data(sample):
    """Turn one dataset row into a chat-style training example.

    Args:
        sample: Mapping with an "image" entry and a "ground_truth" entry
            (the label text to be produced by the assistant).

    Returns:
        A dict with "images" (a one-element list holding the image) and
        "messages" (system, user and assistant turns). The user turn carries
        only the image; the assistant turn carries the ground truth wrapped
        in a ```json fenced block.
    """
    image = sample["image"]

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "image", "image": image}],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": f"```json\n{sample['ground_truth']}\n```"}],
    }

    return {
        "images": [image],
        "messages": [system_turn, user_turn, assistant_turn],
    }

# Materialise both splits as plain Python lists of chat-formatted examples.
train_dataset = list(map(format_data, dataset_dict["train"]))
eval_dataset = list(map(format_data, dataset_dict["eval"]))

In [14]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

In [15]:
import gc
import time


def clear_memory(
    names=("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"),
    pause=2,
):
    """Drop large notebook globals and release cached GPU memory.

    Args:
        names: Names of global variables to delete if they are currently
            defined. Names that are not defined are ignored.
        pause: Seconds to sleep between the individual cleanup steps.

    Returns:
        None. Prints the allocated/reserved GPU memory when CUDA is available,
        otherwise a note that the GPU steps were skipped.
    """
    # Delete variables if they exist in the current global scope
    module_globals = globals()
    for name in names:
        module_globals.pop(name, None)
    time.sleep(pause)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(pause)
    # Guard the CUDA calls: torch.cuda.synchronize() raises on a runtime
    # without a usable GPU, which would abort the cleanup half-way.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(pause)
    gc.collect()
    time.sleep(pause)

    if cuda_ready:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; skipped GPU cache cleanup.")

# Free any leftovers from a previous run before the model is loaded below.
clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [16]:
# Identifier of the base checkpoint; passed to from_pretrained for both the model and its processor.
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

In [17]:
from transformers import BitsAndBytesConfig

# BitsAndBytesConfig int-4 config: NF4 quantisation with double quantisation,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised model and its processor.
# `dtype` replaces the deprecated `torch_dtype` keyword, which the installed
# transformers release warns about. NOTE(review): older transformers releases
# only accept `torch_dtype` - confirm the version if this notebook gets pinned.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
processor = AutoProcessor.from_pretrained(model_id)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [18]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-8 adapters attached only to modules named q_proj / v_proj.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the quantised base model with the adapters.
# NOTE(review): get_peft_model is documented to modify the base model in place,
# and the SFTTrainer cell later receives both `model` and `peft_config`, so the
# adapters may be applied twice - confirm which of the two paths is intended.
peft_model = get_peft_model(model, peft_config)

# Report how many parameters remain trainable.
peft_model.print_trainable_parameters()

trainable params: 1,843,200 || all params: 3,756,466,176 || trainable%: 0.0491


In [19]:
from trl import SFTConfig

# Training configuration for the supervised fine-tuning run.
training_args = SFTConfig(
    # Where checkpoints are written
    output_dir="Qwen2.5-3B-Instruct-invoices",

    # Schedule and batch sizing: 2 samples per device x 8 accumulation steps
    # gives 16 samples per optimizer step on each device.
    num_train_epochs=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    # Sequence handling and memory
    max_length=None,  # presumably disables truncation - confirm against the TRL docs
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Optimisation
    optim="adamw_torch_fused",
    learning_rate=3e-5,
    warmup_ratio=0.03,
    max_grad_norm=0.3,

    # Numerics
    bf16=True,

    # Log, evaluate and checkpoint once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",

    # Publish to a private Hugging Face Hub repository
    push_to_hub=True,
    hub_private_repo=True,
)

In [20]:
from trl import SFTTrainer

# Assemble the trainer from the pieces built in the earlier cells.
# NOTE(review): `model` has already been passed through get_peft_model in the
# LoRA cell, and `peft_config` is handed over again here - confirm the adapters
# are not meant to be applied only once.
trainer = SFTTrainer(
    model=model,
    processing_class=processor,
    peft_config=peft_config,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [21]:
# Sanity check: number of formatted training and evaluation examples.
len(train_dataset), len(eval_dataset)

(241, 59)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[34m[1mwandb[0m: Currently logged in as: [33mparitkansal121[0m ([33mparitkansal121-harcourt-butler-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,17.6258,17.57197,2.722033,2763867.0,0.055539
