## Interactive Pipeline to compare OCR systems

### Environment Set Up

In [6]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
!pip install -r "/content/drive/My Drive/ocr_project/datasets/IAM_data/requirements.txt"



In [8]:
from paddleocr import PaddleOCR, draw_ocr
import cv2
import imutils
import numpy as np
from autocorrect import Speller
import fitz
import glob
import os
from PIL import Image
import json
import shutil
from sklearn.model_selection import train_test_split
import tensorflow as tf
from datasets import load_metric

In [9]:

if tf.test.gpu_device_name():
    print("Default GPU Device: {}".format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


### Helpers

In [5]:
# Function to correct text using autocorrect
def correct_text(text):
    return spell(text)

# Function to sort OCR results
def sort_ocr_results(ocr_result):
    # Sort by top coordinate, then by left coordinate
    # Flatten the list of lines
    all_words = [word_info for line in ocr_result for word_info in line]
    # Sort all words based on their Y coordinate, and then their X coordinate
    sorted_ocr = sorted(all_words, key=lambda x: (np.mean([pt[1] for pt in x[0]]), np.mean([pt[0] for pt in x[0]])))
    return sorted_ocr


### Paddle OCR Test

#### Test on IAM Dataset
IAM dataset contains both hand-writing and printed texts

In [None]:
# Initialize PaddleOCR with use_gpu=True
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

# Initialize the spell checker
spell = Speller(lang='en')

# Directory containing the image files
image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/test_image"

# Directory to save the results
output_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/test_output"
os.makedirs(output_directory, exist_ok=True)

# Directory to save the images with highlighted text
detected_image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/detected_test_images"
os.makedirs(detected_image_directory, exist_ok=True)

# Get list of all PNG files in the directory
image_files = glob.glob(os.path.join(image_directory, "*.png"))

# Process each image
for img_path in image_files:
    image = cv2.imread(img_path)
    base_name = os.path.basename(img_path).split('.')[0]
    detected_image_path = os.path.join(detected_image_directory, f"{base_name}_detected.png")

    # Use PaddleOCR to recognize text on the original image
    result = ocr.ocr(img_path, cls=True)
    sorted_result = sort_ocr_results(result)

    # Draw red rectangles around detected text
    for word_info in sorted_result:
        points = word_info[0]
        cv2.polylines(image, [np.array(points).astype(np.int32)], isClosed=True, color=(0, 0, 255), thickness=2)

    # Save the image with highlighted text
    cv2.imwrite(detected_image_path, image)

    # Initialize a list to store ordered texts
    ordered_texts = []

    # Extract text and append to the list of ordered texts
    for word_info in sorted_result:
        text = word_info[1][0]  # Get the text
        corrected_text = correct_text(text)  # Correct the text
        ordered_texts.append(corrected_text)  # Add corrected text

    # Combine the words into lines based on their Y-coordinate
    lines = []
    current_y = sorted_result[0][0][0][1]
    current_line = []
    y_threshold = 10  # Y threshold to determine a new line

    for word_info in sorted_result:
        word_text = word_info[1][0]
        word_y = np.mean([pt[1] for pt in word_info[0]])

        if abs(word_y - current_y) > y_threshold:
            # New line
            lines.append(' '.join(current_line))
            current_line = [word_text]
            current_y = word_y
        else:
            # Same line
            current_line.append(word_text)

    # Don't forget the last line
    lines.append(' '.join(current_line))

    # Extract the base name of the image file for naming the output file
    output_txt_path = os.path.join(output_directory, f"{base_name}_result.txt")

    # Export the lines into a txt file
    with open(output_txt_path, 'w') as f:
        for line in lines:
            f.write("%s\n" % line)


[2024/01/01 05:22:37] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_dict_path='/

#### Test on Funsd Data
Funsd Data contains form-like strutures that test the OCR system on its ability to extract key informations

In [None]:
# Initialize PaddleOCR with use_gpu=True
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

# Initialize the spell checker
spell = Speller(lang='en')

# Directory containing the image files
image_directory = "/content/drive/My Drive/ocr_project/datasets/funsd_data/testing_data/images"

# Directory to save the results
output_directory = "/content/drive/My Drive/ocr_project/datasets/funsd_data/testing_data/paddle_output"
os.makedirs(output_directory, exist_ok=True)

# Directory to save the images with highlighted text
detected_image_directory = "/content/drive/My Drive/ocr_project/datasets/funsd_data/testing_data/paddle_detected_images"
os.makedirs(detected_image_directory, exist_ok=True)

# Get list of all PNG files in the directory
image_files = glob.glob(os.path.join(image_directory, "*.png"))

# Process each image
for img_path in image_files:
    image = cv2.imread(img_path)
    base_name = os.path.basename(img_path).split('.')[0]
    detected_image_path = os.path.join(detected_image_directory, f"{base_name}_detected.png")

    # Use PaddleOCR to recognize text on the original image
    result = ocr.ocr(img_path, cls=True)
    sorted_result = sort_ocr_results(result)

    # Draw red rectangles around detected text
    for word_info in sorted_result:
        points = word_info[0]
        cv2.polylines(image, [np.array(points).astype(np.int32)], isClosed=True, color=(0, 0, 255), thickness=2)

    # Save the image with highlighted text
    cv2.imwrite(detected_image_path, image)

    # Initialize a list to store ordered texts
    ordered_texts = []

    # Extract text and append to the list of ordered texts
    for word_info in sorted_result:
        text = word_info[1][0]  # Get the text
        corrected_text = correct_text(text)  # Correct the text
        ordered_texts.append(corrected_text)  # Add corrected text

    # Combine the words into lines based on their Y-coordinate
    lines = []
    current_y = sorted_result[0][0][0][1]
    current_line = []
    y_threshold = 10  # Y threshold to determine a new line

    for word_info in sorted_result:
        word_text = word_info[1][0]
        word_y = np.mean([pt[1] for pt in word_info[0]])

        if abs(word_y - current_y) > y_threshold:
            # New line
            lines.append(' '.join(current_line))
            current_line = [word_text]
            current_y = word_y
        else:
            # Same line
            current_line.append(word_text)

    # Don't forget the last line
    lines.append(' '.join(current_line))

    # Extract the base name of the image file for naming the output file
    output_txt_path = os.path.join(output_directory, f"{base_name}_result.txt")

    # Export the lines into a txt file
    with open(output_txt_path, 'w') as f:
        for line in lines:
            f.write("%s\n" % line)


#### Measurement

#### On IAM dataset

In [None]:
# Load the metrics
cer_metric = load_metric('cer')
wer_metric = load_metric('wer')

# Paths to your directories
predictionp_printed_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/printed_output"
predictionp_writing_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/writing_output"
reference_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/printed_label"

# Initialize lists for predictions and references
prediction_printed_texts = []
prediction_writing_texts = []
reference_texts = []

# Iterate over files in the predictions directory
for prediction_file in os.listdir(predictionp_printed_dir):
    if prediction_file.endswith('_printed_output.txt'):
        # Construct file paths
        prediction_path = os.path.join(predictionp_printed_dir, prediction_file)
        base_name = prediction_file.replace('_printed_output.txt', '')
        reference_file = base_name + '_printed_label.txt'
        reference_path = os.path.join(reference_dir, reference_file)

        # Read prediction and reference texts
        with open(prediction_path, 'r') as file:
            prediction_printed_texts.append(file.read().strip())

        with open(reference_path, 'r') as file:
            reference_texts.append(file.read().strip())

for prediction_file in os.listdir(predictionp_writing_dir):
    if prediction_file.endswith('_writing_output.txt'):
        # Construct file paths
        prediction_path = os.path.join(predictionp_writing_dir, prediction_file)
        base_name = prediction_file.replace('_writing_output.txt', '')
        reference_file = base_name + '_printed_label.txt'
        reference_path = os.path.join(reference_dir, reference_file)

        # Read prediction and reference texts
        with open(prediction_path, 'r') as file:
            prediction_writing_texts.append(file.read().strip())

        with open(reference_path, 'r') as file:
            reference_texts.append(file.read().strip())

# Calculate CER and WER for each prediction-reference pair
printed_cer_scores = [cer_metric.compute(predictions=[pred], references=[ref]) for pred, ref in zip(prediction_printed_texts, reference_texts)]
printed_wer_scores = [wer_metric.compute(predictions=[pred], references=[ref]) for pred, ref in zip(prediction_printed_texts, reference_texts)]
writing_cer_scores = [cer_metric.compute(predictions=[pred], references=[ref]) for pred, ref in zip(prediction_writing_texts, reference_texts)]
writing_wer_scores = [wer_metric.compute(predictions=[pred], references=[ref]) for pred, ref in zip(prediction_writing_texts, reference_texts)]

# Calculate the average scores
avg_cer_p = sum(printed_cer_scores) / len(printed_cer_scores)
avg_wer_p= sum(printed_wer_scores) / len(printed_wer_scores)
avg_cer_w= sum(writing_cer_scores) / len(writing_cer_scores)
avg_wer_w= sum(writing_wer_scores) / len(writing_wer_scores)

print(f"Average Character Error Rate (CER) for Printed Texts: {avg_cer_p}")
print(f"Average Word Error Rate (WER) for Printed Texts:{avg_wer_p}")
print(f"Average Character Error Rate (CER) for Hand-writing Texts: {avg_cer_w}")
print(f"Average Word Error Rate (WER) for Hand-writing Texts: {avg_wer_w}")


Average Character Error Rate (CER) for Printed Texts: 0.0860474377899655
Average Word Error Rate (WER) for Printed Texts:0.10537407129326445
Average Character Error Rate (CER) for Hand-writing Texts: 0.4820962185331984
Average Word Error Rate (WER) for Hand-writing Texts: 0.7941585022499669


#### On Funsd Dataset

### Fine Tune

In [None]:
train_list_file = "/content/drive/My Drive/ocr_project/datasets/IAM_data/train_list.txt"
val_list_file = "/content/drive/My Drive/ocr_project/datasets/IAM_data/val_list.txt"

def create_dataset_list(image_directory, label_directory, output_file):
    with open(output_file, 'w') as f:
        for img_file in os.listdir(image_directory):
            label_file = img_file.replace('.png', '.txt')
            f.write(f"{os.path.join(image_directory, img_file)}\t{os.path.join(label_directory, label_file)}\n")

create_dataset_list('/content/drive/My Drive/ocr_project/datasets/IAM_data/train/train_image', '/content/drive/My Drive/ocr_project/datasets/IAM_data/train/train_label', train_list_file)
create_dataset_list('/content/drive/My Drive/ocr_project/datasets/IAM_data/validation/val_image', '/content/drive/My Drive/ocr_project/datasets/IAM_data/validation/val_label', val_list_file)


In [None]:

# Modify the PaddleOCR configuration file
# Create or edit a YAML file to point to your dataset list files, set model parameters
config_file = "/content/drive/My Drive/ocr_project/models/paddle/paddle_config.yml"
# Train the model
# Navigate to the PaddleOCR script directory and run the training command, e.g.:
!python "/content/drive/My Drive/ocr_project/models/paddle/PaddleOCR/tools/train.py" -c "/content/drive/My Drive/ocr_project/models/paddle/paddle_config.yml"


[2024/01/03 04:53:44] ppocr INFO: Architecture : 
[2024/01/03 04:53:44] ppocr INFO:     Backbone : 
[2024/01/03 04:53:44] ppocr INFO:         last_conv_stride : [1, 2]
[2024/01/03 04:53:44] ppocr INFO:         last_pool_type : avg
[2024/01/03 04:53:44] ppocr INFO:         name : MobileNetV1Enhance
[2024/01/03 04:53:44] ppocr INFO:         scale : 0.5
[2024/01/03 04:53:44] ppocr INFO:     Head : 
[2024/01/03 04:53:44] ppocr INFO:         head_list : 
[2024/01/03 04:53:44] ppocr INFO:             CTCHead : 
[2024/01/03 04:53:44] ppocr INFO:                 Head : 
[2024/01/03 04:53:44] ppocr INFO:                     fc_decay : 1e-05
[2024/01/03 04:53:44] ppocr INFO:                 Neck : 
[2024/01/03 04:53:44] ppocr INFO:                     depth : 2
[2024/01/03 04:53:44] ppocr INFO:                     dims : 32
[2024/01/03 04:53:44] ppocr INFO:                     hidden_dims : 60
[2024/01/03 04:53:44] ppocr INFO:                     name : svtr
[2024/01/03 04:53:44] ppocr INFO:    

### Re-Conduct OCR on Test Set

#### Test on IAM dataset

#### Test on Funsd Dataset