# OCR evaluation

The OCR techniques used in the research: Tesseract, EasyOCR and Idefics2 need to be evaluated. The metrics used for that are CER(Character Error Rate) and WER (Word Error Rate). According the fact that in the usecases there are no words but codes, the ground truth will be compared with the prediction.

In [13]:
import cv2
from PIL import Image
import pandas as pd
import kagglehub
import easyocr
from tqdm import tqdm
from transformers import Idefics2Processor, Idefics2ForConditionalGeneration
import torch
from transformers import BitsAndBytesConfig
import os
import Levenshtein
from nltk import word_tokenize
from nltk import download
import json

download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\stani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
annotations_path = kagglehub.dataset_download("stanislavlevendeev/hazmat-detection")
annotations_path = annotations_path.replace('\\', '/')
path_video = os.environ['PATH_TO_DATA']
data_path = annotations_path + '/labels_dataframe.csv'
print(data_path)

C:/Users/stani/.cache/kagglehub/datasets/stanislavlevendeev/hazmat-detection/versions/14/labels_dataframe.csv


In [3]:

df_labels = pd.read_csv(data_path)
df_labels.head()

Unnamed: 0,Task ID,Task Name,Job Id,Source,Frames,Absolute Frame,Relative Frame,XTL,YTL,XBR,YBR,Code,Issue
0,138,Task1,133,1690279852.mp4,730,54,54,29.87,506.88,190.69,554.96,83/2789,
1,138,Task1,133,1690279852.mp4,730,55,55,65.26,504.87,225.5,552.95,83/2789,
2,138,Task1,133,1690279852.mp4,730,56,56,131.98,503.67,291.63,551.76,83/2789,
3,138,Task1,133,1690279852.mp4,730,57,57,198.69,502.48,357.76,550.57,83/2789,
4,138,Task1,133,1690279852.mp4,730,58,58,241.62,498.68,400.1,546.77,83/2789,


In [4]:
available_sources = [f for f in os.listdir(path_video) if f.endswith('.mp4')]
available_sources

['1690281365.mp4', '1692830440.mp4', '1696009577.mp4']

In [5]:
# Group by in daaataframe
df_labels_grouped = df_labels.groupby(['Source', 'Relative Frame']).size().reset_index().iloc[:, :4]
df_labels_grouped = df_labels_grouped[df_labels_grouped['Source'].isin(available_sources)]
df_labels_grouped.head()

Unnamed: 0,Source,Relative Frame,0
200,1690281365.mp4,0,1
201,1690281365.mp4,1,1
202,1690281365.mp4,2,1
203,1690281365.mp4,3,1
204,1690281365.mp4,4,1


In [6]:
# Init  idefics2
processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
print('Processor loaded')
# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.float16,
    device_map=device,
    quantization_config=quantization_config,   
    # attn_implementation="flash_attention_2",
)
model = model.to(device)

Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.


Processor loaded
Device: cuda


Loading checkpoint shards: 100%|██████████| 7/7 [00:30<00:00,  4.40s/it]


In [7]:
prompt = """
Analyze the image and extract two key values:

    The UN number visible on the upper part of the placard.
    The code visible on the lower part of the placard, located below the horizontal line separating the two sections.

Both codes are printed in black. If either the upper or lower part cannot be detected, replace the missing value with "0." Output the extracted values as plain text, separated by a comma if multiple codes are present. No additional context or formatting is needed.

Input Examples:

    {98 {line} 4567}
    (not found, {line}, 8901)
    {101 {line} 3345}
    (not found, {line}, {not found})
    {45 {line} 2789}
    {22 {line} 5678}

Desired Output:

    98, 4567
    0, 8901
    101, 3345
    0, 0
    45, 2789
    22, 5678

Expected Transformation:

    For each input example, extract the UN number and the code below the horizontal line.
    If either part is missing (i.e., "not found"), replace it with 0.
    Output the extracted values as plain text, separated by a comma, without any additional context or formatting.
"""
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt},
            {"type": "image"},
        ],
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True)

In [8]:
# loop throuw grouped df_lables_grouped
annotations_path = annotations_path + '/yolo/images'
annotations_path

'C:/Users/stani/.cache/kagglehub/datasets/stanislavlevendeev/hazmat-detection/versions/14/yolo/images'

In [9]:
def read_image(source, frame):
    source = source.split('.')[0]
    frame = str(frame).zfill(5)
    path = f'{annotations_path}/train/{source}_{frame}.jpg'
    image = None
    if os.path.exists(path):
        image = Image.open(path)
    path = f'{annotations_path}/test/{source}_{frame}.jpg'
    if os.path.exists(path):
        image = Image.open(path)
    path = f'{annotations_path}/val/{source}_{frame}.jpg'
    if os.path.exists(path):
        image = Image.open(path)
    if image is None:
        print(f'Image not found: {path}')
    return image
# Define the method
def extract_bounding_box(self, bbox):
    """
    Extract a bounding box from the image.
    
    :param bbox: A tuple of (left, top, right, bottom) coordinates.
    :return: A new Image object containing the cropped region.
    """
    return self.crop(bbox)

def perform_ocr(image): 
    
    inputs = processor(images=image, text=text, return_tensors="pt").to(device)
    generated_text = model.generate(**inputs, max_new_tokens=500)
    generated_text = processor.batch_decode(generated_text, skip_special_tokens=True)[0]
    assistant_output = generated_text.split("Assistant:")[1].strip()
    # Split the output by comma to get the individual numbers
    numbers = assistant_output.split(",")
    # Strip any leading or trailing whitespace from the numbers
    numbers = [number.strip().replace('.','') for number in numbers]
    numbers.append('0000') 
    numbers.append('0000')
    un_number, hin_number = numbers[:2]
    return un_number, hin_number

def calculate_cer(gt, ocr):
    return Levenshtein.distance(gt, ocr) / max(1, len(gt))
def calculate_wer(gt, ocr):
    gt_words = gt.split()
    ocr_words = ocr.split()
    return Levenshtein.distance(" ".join(gt_words), " ".join(ocr_words)) / max(1, len(gt_words))
# Add the method to the Image class
Image.Image.extract_bounding_box = extract_bounding_box
evaluation = {
    'images':[],
    'WER': 0,
    'CER': 0,
}

In [10]:
number_lables = 0
for index, frames in tqdm(df_labels_grouped.iterrows(), total=df_labels_grouped.shape[0]):
    # Read image
    image = read_image(frames['Source'], frames['Relative Frame'])
    if image is None:
        continue
    data_annotation = df_labels[(df_labels['Source'] == frames['Source']) & (df_labels['Relative Frame'] == frames['Relative Frame'])]
    #for loop
    for index, lable in data_annotation.iterrows():
        number_lables +=1
        # Crop image
        cropped = image.extract_bounding_box((int(lable['XTL']), int(lable['YTL']), int(lable['XBR']), int(lable['YBR'])))
        # OCR
        un_number_pr, hin_number_pr = perform_ocr(cropped)
        actual_un_number, actual_hin_number = str(lable['Code']).split('/')
        cer_un = calculate_cer(actual_un_number, un_number_pr)
        cer_hin = calculate_cer(actual_hin_number, hin_number_pr)
        evaluation['CER'] += cer_un + cer_hin
        wer_un = calculate_wer(actual_un_number, un_number_pr)
        wer_hin = calculate_wer(actual_hin_number, hin_number_pr)
        
        evaluation['WER'] += wer_un + wer_hin
        evaluation['images'].append({
            'video': frames['Source'],
            'frame': frames['Relative Frame'],
            'un_number_pr': un_number_pr,
            'hin_number_pr': hin_number_pr,
            'un_number_gt': actual_un_number,
            'hin_number_gt': actual_hin_number,
            'cer_un': cer_un,
            'cer_hin': cer_hin,
        })
        

 57%|█████▋    | 495/867 [1:19:20<59:37,  9.62s/it]  


KeyboardInterrupt: 

In [11]:
print('Evaluation:')
print('Number of lables:', number_lables)
print ('Raw CER:', evaluation['CER'])
print ('Raw WER:', evaluation['WER'])
print('CER:', evaluation['CER']/number_lables)
print('WER:', evaluation['WER']/number_lables)

Evaluation:
Number of lables: 496
Raw CER: 40.91666666666668
Raw WER: 139.0
CER: 0.0824932795698925
WER: 0.28024193548387094


In [12]:
evaluation_df = pd.DataFrame(evaluation['images'])
evaluation_df.head()

Unnamed: 0,video,frame,un_number_pr,hin_number_pr,un_number_gt,hin_number_gt,cer_un,cer_hin
0,1690281365.mp4,0,23,1055,23,1055,0.0,0.0
1,1690281365.mp4,1,23,1055,23,1055,0.0,0.0
2,1690281365.mp4,2,23,1055,23,1055,0.0,0.0
3,1690281365.mp4,3,23,1055,23,1055,0.0,0.0
4,1690281365.mp4,4,2,0,23,1055,0.5,0.75


In [14]:
file_path = 'data.json'

# Write the data to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(evaluation, json_file, indent=4)

: 