### installs and imports

In [None]:
!pip install paddleocr paddlepaddle -q

In [36]:
!pip install jiwer -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/3.2 MB[0m [31m19.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m51.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import paddleocr
import os
import pandas as pd
import numpy as np
from pathlib import Path
from jiwer import wer, cer
from sklearn.model_selection import train_test_split
from paddleocr import PaddleOCR
from tqdm.auto import tqdm

### load trained custom model

In [38]:
custom_model_dir = "drive/MyDrive/paddleocr-files/custom_model"

In [39]:
ocr_engine = PaddleOCR(text_recognition_model_name="en_PP-OCRv5_mobile_rec",text_recognition_model_dir=custom_model_dir)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('en_PP-OCRv5_mobile_rec', 'drive/MyDrive/paddleocr-files/custom_model')[0m


### paths setups and gt csv loading

In [49]:
GT_CSV = Path("drive/MyDrive/paddleocr-files/full_ground_truth_lines.csv")
IMAGE_BASE_DIR = Path("drive/MyDrive/paddleocr-files/images")

In [46]:
try:
    gt_df = pd.read_csv(GT_CSV)
    print(f"Loaded {len(gt_df)} total samples from {GT_CSV}.")

    gt_df = gt_df.drop('Unnamed: 2', axis=1)
    train_df, test_df = train_test_split(gt_df, test_size=0.15, random_state=42)

    print(f"Split into {len(train_df)} training samples and {len(test_df)} testing samples.")

    if 'image_path' not in train_df.columns or 'transcription' not in train_df.columns:
        raise ValueError("CSV must contain 'image_path' and 'transcription' columns.")

    print("\nSample image path from CSV:")
    print(test_df.iloc[0]['image_path'])
    print("\nTraining DataFrame head:")
    print(test_df.head())

except FileNotFoundError:
    print(f"Error: Could not find {GT_CSV}.")
except Exception as e:
    print(f"An error occurred: {e}")

Loaded 870 total samples from drive/MyDrive/paddleocr-files/full_ground_truth_lines.csv.
Split into 739 training samples and 131 testing samples.

Sample image path from CSV:
image_splits/training_set_splits/segmented_lines_per_image/data_70/line_4.png

Training DataFrame head:
                                            image_path  \
394  image_splits/training_set_splits/segmented_lin...   
66   image_splits/training_set_splits/segmented_lin...   
495  image_splits/training_set_splits/segmented_lin...   
67   image_splits/training_set_splits/segmented_lin...   
855  image_splits/training_set_splits/synthetic_seg...   

                         transcription  
394                                 #7  
66                            (Kenzar)  
495       Sig: 1 tab 3x a day x 5 days  
67                                #100  
855  S: 1 tab every 4 hours for 8 days  


### test and show results

In [60]:
results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Running Inference"):
    gt_text = row['transcription']
    img_path = row['image_path']

    try:
        print("Processing: ", img_path)
        result = ocr_engine.predict(str(IMAGE_BASE_DIR) + "/" + str(img_path))

        pred_text = result[0]['rec_texts'][0]
        confidence = result[0]['rec_scores'][0]

        wer_val = wer(gt_text, pred_text)
        cer_val = cer(gt_text, pred_text)

        results.append({
            "image_path": img_path,
            "gt": gt_text,
            "pred": pred_text,
            "confidence": confidence,
            "wer": wer_val,
            "cer": cer_val
        })
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        results.append({
            "image_path": img_path,
            "gt": gt_text,
            "pred": "[ERROR]",
            "confidence": 0,
            "wer": 1.0,
            "cer": 1.0
        })

# --- 3. Save and Show Results ---
results_df = pd.DataFrame(results)
results_df.to_csv("paddle_predictions.csv", index=False)

print("\n--- Inference Complete ---")
print(f"Mean WER: {results_df['wer'].mean():.4f}")
print(f"Mean CER: {results_df['cer'].mean():.4f}")

print("\nSample Predictions:")
print(results_df.head())


Running Inference:   0%|          | 0/131 [00:00<?, ?it/s]

Processing:  image_splits/training_set_splits/segmented_lines_per_image/data_70/line_4.png
Error processing image_splits/training_set_splits/segmented_lines_per_image/data_70/line_4.png: list index out of range
Processing:  image_splits/training_set_splits/segmented_lines_per_image/data_8/line_8.png
Processing:  image_splits/training_set_splits/segmented_lines_per_image/data_101/line_2.png
Processing:  image_splits/training_set_splits/segmented_lines_per_image/data_8/line_9.png
Error processing image_splits/training_set_splits/segmented_lines_per_image/data_8/line_9.png: list index out of range
Processing:  image_splits/training_set_splits/synthetic_segmented/V09/page_2/line_11.png
Processing:  image_splits/training_set_splits/synthetic_segmented/V06/page_2/line_0.png
Processing:  image_splits/training_set_splits/segmented_lines_per_image/data_10/line_7.png
Processing:  image_splits/training_set_splits/synthetic_segmented/V08/page_1/line_3.png
Processing:  image_splits/training_set_spl

### tbr

In [33]:
image_path = "images/line_8.png"

In [56]:
print(str(IMAGE_BASE_DIR) + "/" + str(gt_df['image_path'][0]))

drive/MyDrive/paddleocr-files/images/image_splits/training_set_splits/segmented_lines_per_image/data_1/line_0.png


In [58]:
result = ocr_engine.predict(str(IMAGE_BASE_DIR) + "/" + str(gt_df['image_path'][0]))

In [59]:
print(result[0]['rec_texts'][0])
print(result[0]['rec_scores'][0])

10mg/tob
0.7753366231918335
