# OCR evaluation

The OCR techniques used in the research: Tesseract, EasyOCR and Idefics2 need to be evaluated. The metrics used for that are CER(Character Error Rate) and WER (Word Error Rate). According the fact that in the usecases there are no words but codes, the ground truth will be compared with the prediction.

In [2]:
import cv2
from PIL import Image
import pandas as pd
import kagglehub
import easyocr
from tqdm import tqdm
from transformers import Idefics2Processor, Idefics2ForConditionalGeneration
import torch
from transformers import BitsAndBytesConfig
import os
import Levenshtein
import json
import easyocr
import pytesseract
import regex as re
import numpy as np
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

  from .autonotebook import tqdm as notebook_tqdm


## Initializations

In [3]:
# Models and processors
# Init  idefics2
processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
print('Processor loaded')
# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.float16,
    device_map=device,
    quantization_config=quantization_config,   
    # attn_implementation="flash_attention_2",
)
model = model.to(device)


Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.


Processor loaded
Device: cuda


Loading checkpoint shards: 100%|██████████| 7/7 [00:33<00:00,  4.71s/it]


In [4]:
prompt = """
Analyze the image and extract two key values:

    The UN number visible on the upper part of the placard.
    The code visible on the lower part of the placard, located below the horizontal line separating the two sections.

Both codes are printed in black. If either the upper or lower part cannot be detected, replace the missing value with "0" Output the extracted values as plain text, separated by a comma if multiple codes are present. No additional context or formatting is needed.

Input Examples:

    {98 {line} 4567}
    (not found, {line}, 8901)
    {101 {line} 3345}
    (not found, {line}, {not found})
    {45 {line} 2789}
    {22 {line} 5678}

Desired Output:

    98, 4567
    00, 8901
    101, 3345
    00, 0000
    45, 2789
    22, 5678

Expected Transformation:

    For each input example, extract the UN number and the code below the horizontal line.
    If either part is missing (i.e., "not found"), replace it with 0.
    Output the extracted values as plain text, separated by a comma, without any additional context or formatting.
"""
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt},
            {"type": "image"},
        ],
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True)

In [5]:

evaluation = {
    "idefics": {
        "images": [],
        "WER": 0,
        "CER": 0,
    },
    "easyocr": {
        "images": [],
        "WER": 0,
        "CER": 0,
    },
    "tesseract": {
        "images": [],
        "WER": 0,
        "CER": 0,
    },
}

def initEvaluation():
    global evaluation 
    evaluation = {
        "idefics": {
            "images": [],
            "WER": 0,
            "CER": 0,
        },
        "easyocr": {
            "images": [],
            "WER": 0,
            "CER": 0,
        },
        "tesseract": {
            "images": [],
            "WER": 0,
            "CER": 0,
        },
    }

def read_image(source, frame):
    source = source.split('.')[0]
    frame = str(frame).zfill(5)
    path = f'{annotations_path}/train/{source}_{frame}.jpg'
    image = None
    if os.path.exists(path):
        image = Image.open(path)
    path = f'{annotations_path}/test/{source}_{frame}.jpg'
    if os.path.exists(path):
        image = Image.open(path)
    path = f'{annotations_path}/val/{source}_{frame}.jpg'
    if os.path.exists(path):
        image = Image.open(path)
    if image is None:
        print(f'Image not found: {path}')
    return image
# Define the method
def extract_bounding_box(self, bbox):
    """
    Extract a bounding box from the image.
    
    :param bbox: A tuple of (left, top, right, bottom) coordinates.
    :return: A new Image object containing the cropped region.
    """
    return self.crop(bbox)

def perform_ocr(image): 
    
    inputs = processor(images=image, text=text, return_tensors="pt").to(device)
    generated_text = model.generate(**inputs, max_new_tokens=500)
    generated_text = processor.batch_decode(generated_text, skip_special_tokens=True)[0]
    assistant_output = generated_text.split("Assistant:")[1].strip()
    # Split the output by comma to get the individual numbers
    numbers = assistant_output.split(",")
    # Strip any leading or trailing whitespace from the numbers
    numbers = [number.strip().replace('.','') for number in numbers]
    numbers.append('0000') 
    numbers.append('0000')
    un_number, hin_number = numbers[:2]
    return un_number, hin_number
def get_text_from_image_easyocr(image):
    # Initialize the reader for digits
    reader = easyocr.Reader(["en"])
    result = reader.readtext(image, allowlist="0123456789",detail=0)
    h,w = None, None
    try:
        h, w = image.shape
    except:
        h,w,_ = image.shape
    image_un = image[0:int(h/2), 0:w]
    image_hin = image[int(h/2):h, 0:w]
    result_un = reader.readtext(image_un, allowlist="0123456789",detail=0)
    result_hin = reader.readtext(image_hin, allowlist="0123456789",detail=0)
    result_un = result_un[0] if len(result_un) > 0 else '00'
    result_hin = result_hin[0] if len(result_hin) > 0 else '0000'
    return result_un,result_hin,result
def extract_un_number(text):
    un = re.findall(r'\d{2,}', text)
    un = un[0] if len(un) > 0 else '00'
    return un

def extract_hin_number(text):
    hin = re.findall(r'\d{4,}', text)
    hin = hin[0] if len(hin) > 0 else '0000'
    return hin
def get_text_from_image_ocr(image):
    #split image horizontally in two pieces
    h, w = image.shape[:2]
    image_upper = image[0:int(h/2), 0:w]
    image_lower = image[int(h/2):h, 0:w]
    psm = 6
    option = f"--psm {psm}"
    text_un = pytesseract.image_to_string(image_upper, config=option)
    text_hin = pytesseract.image_to_string(image_lower, config=option)
    
    return extract_un_number(text_un), extract_hin_number(text_hin)
def calculate_cer(gt, ocr):
    return Levenshtein.distance(gt, ocr) / max(1, len(gt))
def calculate_wer(gt, ocr):
    gt_words = gt.split()
    ocr_words = ocr.split()
    return Levenshtein.distance(" ".join(gt_words), " ".join(ocr_words)) / max(1, len(gt_words))
def to_cv2(self):
    """
    Convert a PIL Image to an OpenCV image (numpy array) in grayscale.
    
    :return: Grayscale OpenCV image (numpy array)
    """
    # Convert PIL Image to numpy array
    numpy_image = np.array(self)
    
    # Convert RGB to BGR (OpenCV format)
    opencv_image = cv2.cvtColor(numpy_image, cv2.COLOR_RGB2BGR)
    
    # Convert BGR to Grayscale
    grayscale_image = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2GRAY)
    
    return grayscale_image
def store_evaluation(lable_info, ocr_technique, prediction, gt):
    cer_un = calculate_cer(gt[0], prediction[0])
    wer_un = calculate_wer(gt[0], prediction[0])
    cer_hin = calculate_cer(gt[1], prediction[1])
    wer_hin = calculate_wer(gt[1], prediction[1])
    evaluation[ocr_technique]['WER'] += (wer_un + wer_hin)/2
    evaluation[ocr_technique]['CER'] += (cer_un + cer_hin)/2
    evaluation[ocr_technique]['images'].append({
        **lable_info,
        'prediction': prediction,
        'gt': gt,
        'CER': (cer_un + cer_hin)/2,
        'WER' : (wer_un + wer_hin)/2,
    })
def print_results():
    number_lables = len(evaluation['idefics']['images'])
    print('Evaluation  Idefics:')
    print('Number of lables:', number_lables)
    print ('Raw CER:', evaluation['idefics']['CER'])
    print ('Raw WER:',  evaluation['idefics']['WER'])
    print('CER:',  evaluation['idefics']['CER']/number_lables)
    print('WER:',  evaluation['idefics']['WER']/number_lables)
    print('Evaluation  EasyOCR:')
    print('Number of lables:', number_lables)
    print ('Raw CER:', evaluation['easyocr']['CER'])
    print ('Raw WER:',  evaluation['easyocr']['WER'])
    print('CER:',  evaluation['easyocr']['CER']/number_lables)
    print('WER:',  evaluation['easyocr']['WER']/number_lables)
    print('Evaluation  Tesseract:')
    print('Number of lables:', number_lables)
    print ('Raw CER:',  evaluation['tesseract']['CER'])
    print ('Raw WER:', evaluation['tesseract']['WER'])
    print('CER:', evaluation['tesseract']['CER']/number_lables)
    print('WER:', evaluation['tesseract']['WER']/number_lables)
# Add the method to the Image class
Image.Image.extract_bounding_box = extract_bounding_box
Image.Image.to_cv2 = to_cv2


In [6]:
def store_result(res):
    print('Saving evaluation')
    with open('evaluation.json', 'w') as f:
        json.dump(res, f)
    print('Evaluation saved')


## Evaluation on private dataset

In [5]:

annotations_path = kagglehub.dataset_download("stanislavlevendeev/hazmat-detection")
annotations_path = annotations_path.replace('\\', '/')
path_video = os.environ['PATH_TO_DATA']
data_path = annotations_path + '/labels_dataframe.csv'
print(data_path)

UsageError: Line magic function `%%script` not found.


In [1]:
df_labels = pd.read_csv(data_path)
df_labels.head()

NameError: name 'pd' is not defined

In [None]:
available_sources = [f for f in os.listdir(path_video) if f.endswith('.mp4')]
available_sources

['1690281365.mp4', '1692830440.mp4', '1696009577.mp4']

In [None]:
# Group by in daaataframe
df_labels_grouped = df_labels.groupby(['Source', 'Relative Frame']).size().reset_index().iloc[:, :4]
df_labels_grouped = df_labels_grouped[df_labels_grouped['Source'].isin(available_sources)]
df_labels_grouped.head()

Unnamed: 0,Source,Relative Frame,0
200,1690281365.mp4,0,1
201,1690281365.mp4,1,1
202,1690281365.mp4,2,1
203,1690281365.mp4,3,1
204,1690281365.mp4,4,1


In [None]:
# loop throuw grouped df_lables_grouped
annotations_path = annotations_path + '/yolo/images'
annotations_path

'C:/Users/stani/.cache/kagglehub/datasets/stanislavlevendeev/hazmat-detection/versions/14/yolo/images'

In [None]:
number_lables = 0

for index, frames in tqdm(df_labels_grouped.head(100).iterrows(), total=df_labels_grouped.shape[0]):

    # Read image

    image = read_image(frames['Source'], frames['Relative Frame'])

    if image is None:

        continue

    data_annotation = df_labels[(df_labels['Source'] == frames['Source']) & (df_labels['Relative Frame'] == frames['Relative Frame'])]

    #for loop

    for index, lable in data_annotation.iterrows():

        number_lables +=1

        # Crop image

        cropped = image.extract_bounding_box((int(lable['XTL']), int(lable['YTL']), int(lable['XBR']), int(lable['YBR'])))

        # OCR
        lable_info = {
            'video': frames['Source'],
            'frame': frames['Relative Frame'],
            'XTL': int(lable['XTL']),
            'YTL': int(lable['YTL']),
            'XBR': int(lable['XBR']),
            'YBR': int(lable['YBR']),
        }
        actual_code = str(lable['Code']).split('/')
        cropped_cv2 = cropped.to_cv2()
        # EasyOCR
        un, hin, ocr = get_text_from_image_easyocr(cropped_cv2)
        prediction_easyocr = (un, hin)
        store_evaluation(lable_info, 'easyocr', prediction_easyocr, actual_code)
        # Tesseract
        un, hin = get_text_from_image_ocr(cropped_cv2)
        prediction_ocr = (un, hin)
        store_evaluation(lable_info, 'tesseract', prediction_ocr, actual_code)
        # Idefics
        prediction_idefics = perform_ocr(cropped)
        store_evaluation(lable_info, 'idefics', prediction_idefics, actual_code)

print('Number of lables:', number_lables)     

  0%|          | 0/867 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [None]:
print_results()

Evaluation  Idefics:
Number of lables: 100
Raw CER: 0.75
Raw WER: 2.5
CER: 0.0075
WER: 0.025
Evaluation  EasyOCR:
Number of lables: 100
Raw CER: 3.125
Raw WER: 11.5
CER: 0.03125
WER: 0.115
Evaluation  OCR:
Number of lables: 100
Raw CER: 65.75
Raw WER: 186.0
CER: 0.6575
WER: 1.86
Saving evaluation
Evaluation saved


In [None]:
store_evaluation(evaluation)

## Evaluation on public dataset

In [7]:
public_data_annotation = kagglehub.dataset_download("stanislavlevendeev/haz-mat-signs")
public_data_annotation

Downloading from https://www.kaggle.com/api/v1/datasets/download/stanislavlevendeev/haz-mat-signs?dataset_version_number=7...


100%|██████████| 328M/328M [01:35<00:00, 3.61MB/s] 

Extracting files...





'C:\\Users\\stani\\.cache\\kagglehub\\datasets\\stanislavlevendeev\\haz-mat-signs\\versions\\7'

In [8]:
df_public_labels = pd.read_csv(os.path.join(public_data_annotation, 'images_with_boxes.csv'))
df_public_labels.head()

Unnamed: 0,image_id,image_name,task_id,width,height,box_label,box_source,box_xtl,box_ytl,box_xbr,box_ybr,issue,code
0,10,images/10.png,226,1000,668,hazmat_sign,manual,825.87,413.29,855.01,437.19,low quality,90/3082
1,100,images/100.png,226,809,1080,hazmat_sign,manual,187.3,571.99,248.56,618.26,,22/1965
2,101,images/101.png,226,812,1080,hazmat_sign,manual,541.74,507.06,654.83,606.01,,23/1049
3,102,images/102.png,226,810,1079,hazmat_sign,manual,426.69,474.05,566.98,576.68,,23/1049
4,103,images/103.png,226,808,1080,hazmat_sign,manual,320.08,722.87,451.08,840.67,,30/1202


In [9]:
# list fiels from a directory
files = os.listdir(os.path.join(public_data_annotation, 'images'))
files

['10.png',
 '100.png',
 '101.png',
 '102.png',
 '103.png',
 '104.png',
 '105.png',
 '106.png',
 '107.png',
 '108.png',
 '109.png',
 '11.png',
 '110.png',
 '111.png',
 '112.png',
 '113.png',
 '114.png',
 '115.png',
 '116.png',
 '117.png',
 '118.png',
 '119.png',
 '12.png',
 '120.png',
 '121.png',
 '122.png',
 '123.png',
 '124.png',
 '125.png',
 '126.png',
 '127.png',
 '128.png',
 '129.png',
 '13.png',
 '130.png',
 '131.png',
 '133.png',
 '135.png',
 '136.png',
 '137.png',
 '138.png',
 '139.png',
 '14.png',
 '140.png',
 '141.png',
 '142.png',
 '143.png',
 '144.png',
 '145.png',
 '146.png',
 '147.png',
 '148.png',
 '149.png',
 '15.png',
 '151.png',
 '152.png',
 '154.png',
 '155.png',
 '156.png',
 '157.png',
 '158.png',
 '159.png',
 '16.png',
 '160.png',
 '161.png',
 '163.png',
 '164.png',
 '165.png',
 '166.png',
 '169.png',
 '17.png',
 '170.png',
 '172.png',
 '173.png',
 '174.png',
 '176.png',
 '177.png',
 '178.png',
 '179.png',
 '18.png',
 '183.png',
 '187.png',
 '188.png',
 '189.png',
 

In [12]:
# loop through files using progress bar
for file in tqdm(files):
    image = Image.open(os.path.join(public_data_annotation, 'images', file))
    # get the lables 
    image_id = int(file.split('.')[0])
    print(image_id)
    data_annotation = df_public_labels[df_public_labels['image_id'] == image_id]
    for index, lable in data_annotation.iterrows():
        # Crop image
        xtl, ytl, xbr, ybr = int(lable['box_xtl']), int(lable['box_ytl']), int(lable['box_xbr']), int(lable['box_ybr'])
        cropped = image.extract_bounding_box((xtl, ytl, xbr, ybr))
        # display cropped image
        cropped.show()
        lable_info = {
            'image_path': file,
            'XTL': xtl,
            'YTL': ytl,
            'XBR': xbr,
            'YBR': ybr,
        }
        actual_code = str(lable['code']).split('/')
        cropped_cv2 = cropped.to_cv2()
        # OCR
        # EasyOCR
        un, hin, ocr = get_text_from_image_easyocr(cropped_cv2)
        prediction_easyocr = (un, hin)
        store_evaluation(lable_info, 'easyocr', prediction_easyocr, actual_code)
        # Tesseract
        un, hin = get_text_from_image_ocr(cropped_cv2)
        prediction_ocr = (un, hin)
        store_evaluation(lable_info, 'tesseract', prediction_ocr, actual_code)
        # Idefics
        prediction_idefics = perform_ocr(cropped)
        store_evaluation(lable_info, 'idefics', prediction_idefics, actual_code)

  0%|          | 0/210 [00:00<?, ?it/s]

10


  0%|          | 0/210 [00:13<?, ?it/s]


KeyboardInterrupt: 

In [13]:
print_results()
evaluation['idefics']['images']

Evaluation  Idefics:
Number of lables: 2
Raw CER: 0.25
Raw WER: 0.5
CER: 0.125
WER: 0.25
Evaluation  EasyOCR:
Number of lables: 2
Raw CER: 1.625
Raw WER: 5.0
CER: 0.8125
WER: 2.5
Evaluation  Tesseract:
Number of lables: 2
Raw CER: 1.75
Raw WER: 5.0
CER: 0.875
WER: 2.5


[{'image_path': '10.png',
  'XTL': 825,
  'YTL': 413,
  'XBR': 855,
  'YBR': 437,
  'prediction': ('90', '3082'),
  'gt': ['90', '3082'],
  'CER': 0.0,
  'WER': 0.0},
 {'image_path': '100.png',
  'XTL': 187,
  'YTL': 571,
  'XBR': 248,
  'YBR': 618,
  'prediction': ('23', '1965'),
  'gt': ['22', '1965'],
  'CER': 0.25,
  'WER': 0.5}]

## Reading the results  

In [4]:
# open json file
evaluation = None
with open('evaluation.json') as f:
    evaluation = json.load(f)
#make datagframe
df_idefics = pd.DataFrame(evaluation['idefics']['images'])
df_idefics.head()

FileNotFoundError: [Errno 2] No such file or directory: 'evaluation.json'

In [None]:
df_idefics.describe()

Unnamed: 0,frame,XTL,YTL,XBR,YBR,CER,WER
count,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0
mean,543.25139,1228.155252,1021.255522,1417.614651,1162.310345,0.047857,0.152868
std,345.559928,1159.209151,435.721364,1165.059902,473.340496,0.186367,0.59043
min,0.0,0.0,354.0,57.0,428.0,0.0,0.0
25%,250.0,136.0,532.0,410.0,618.0,0.0,0.0
50%,499.0,884.0,1063.0,1080.0,1208.0,0.0,0.0
75%,812.0,2032.0,1463.0,2304.0,1642.0,0.0,0.0
max,1473.0,3774.0,1731.0,3840.0,1899.0,1.875,4.5


In [None]:
df_ocr = pd.DataFrame(evaluation['tesseract']['images'])
df_ocr.describe()

Unnamed: 0,frame,XTL,YTL,XBR,YBR,CER,WER
count,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0
mean,543.25139,1228.155252,1021.255522,1417.614651,1162.310345,0.562881,1.857461
std,345.559928,1159.209151,435.721364,1165.059902,473.340496,0.325953,1.092048
min,0.0,0.0,354.0,57.0,428.0,0.0,0.0
25%,250.0,136.0,532.0,410.0,618.0,0.25,1.0
50%,499.0,884.0,1063.0,1080.0,1208.0,0.625,2.0
75%,812.0,2032.0,1463.0,2304.0,1642.0,0.875,2.5
max,1473.0,3774.0,1731.0,3840.0,1899.0,1.375,4.0


In [None]:
df_easy = pd.DataFrame(evaluation['easyocr']['images'])
df_easy.describe()

Unnamed: 0,frame,XTL,YTL,XBR,YBR,CER,WER
count,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0,6293.0
mean,543.25139,1228.155252,1021.255522,1417.614651,1162.310345,0.329553,1.109248
std,345.559928,1159.209151,435.721364,1165.059902,473.340496,0.331522,1.108049
min,0.0,0.0,354.0,57.0,428.0,0.0,0.0
25%,250.0,136.0,532.0,410.0,618.0,0.0,0.0
50%,499.0,884.0,1063.0,1080.0,1208.0,0.25,1.0
75%,812.0,2032.0,1463.0,2304.0,1642.0,0.583333,2.0
max,1473.0,3774.0,1731.0,3840.0,1899.0,1.333333,4.5


In [None]:
# display images where the CER is higher than 0.5
df_low_precision = df_idefics[df_idefics['CER'] > 0.5]
for(index, row) in df_low_precision.iterrows():
    image = cv2.imread(row["path"])
    #draw bounding box and draw prediiction and gt array 
    image = cv2.rectangle(image, (row["XTL"], row["YTL"]), (row["XBR"], row["YBR"]), (0, 255, 0), 2)
    cv2.putText(image, f'Prediction: {row["prediction"]}', (row["XTL"], row["YTL"] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    cv2.putText(image, f'GT: {row["gt"]}', (row["XTL"], row["YTL"] - 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    # Display the image in console
    cv2.imshow("Image", image)