In [1]:
import os
import random
import matplotlib.pyplot as plt
import easyocr
import csv
import pandas as pd

from utils.config import get_model_configurations, get_dataset_configurations
from utils.dataset_loader import load_dataset, create_sample_dataset, create_balanced_sample_dataset
from utils.ocr_evaluator import run_ocr_evaluation, calculate_metrics, print_summary_statistics

In [2]:
# Paths - ให้ output อยู่ที่เดียวกับ input
input_file = '/Volumes/BACKUP/data/processed_lmdb_format/lmdb/test/labels.csv'
output_file = '/Volumes/BACKUP/data/processed_lmdb_format/lmdb/test/labels_fixed.csv'

print(f"📥 Input: {input_file}")
print(f"📤 Output: {output_file}")

# Quick fix
try:
    df = pd.read_csv(input_file, encoding='utf-8-sig', on_bad_lines='skip')
    df.to_csv(output_file, index=False)
    print(f"✅ Fixed! {len(df)} rows")
except:
    # Manual fix
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f, \
         open(output_file, 'w', encoding='utf-8', newline='') as out:
        
        writer = csv.writer(out)
        writer.writerow(['filename', 'words'])
        
        count = 0
        for i, line in enumerate(f):
            if i == 0 or not line.strip(): continue
            if ',' in line:
                parts = line.strip().split(',', 1)
                if len(parts) == 2 and parts[0].endswith('.png'):
                    writer.writerow([parts[0].strip(), parts[1].strip()])
                    count += 1
        print(f"✅ Manual fix: {count} rows")

# Test
df = pd.read_csv(output_file)
print(f"Final: {len(df)} rows")
print(df.head(2))

📥 Input: /Volumes/BACKUP/data/processed_lmdb_format/lmdb/test/labels.csv
📤 Output: /Volumes/BACKUP/data/processed_lmdb_format/lmdb/test/labels_fixed.csv
✅ Fixed! 940 rows
Final: 940 rows
                                   filename                       words
0  424dd150-9379-4fd9-94ef-8820550e879b.png  พล.ร.อ.เชิงชาย ชมเชิงแพทย์
1  de791b60-8f12-4587-83f9-2d33fa967c43.png             คุยโขมงบ่าย3โมง


In [None]:
models_to_test = get_model_configurations(models=[
    {
        'name': 'base_model',
        'reader': easyocr.Reader(
            lang_list=['en', 'th'],
            detector=False,
            gpu=True,
            download_enabled=False
        )
    },
    {
        'name': 'out_source_merge_kaggle_data_3gpus',
        'reader': easyocr.Reader(
            lang_list=['en','th'],
            detector=True,
            gpu=True,
            download_enabled=False,
            recog_network='out_source_merge_kaggle_data_3gpus',
            model_storage_directory='./custom_example/model',
            user_network_directory='./custom_example/user_network'
        )
    }
])
datasets_to_test = get_dataset_configurations(datasets=[
    {
        'name': 'data_from_web',
        'type': 'lmdb_format',
        'folder_path': '/Volumes/BACKUP/data/processed_lmdb_format/lmdb/test',
        'df': pd.read_csv('/Volumes/BACKUP/data/processed_lmdb_format/lmdb/test/labels_fixed.csv'),
    }
])

EVALUATION_SAMPLE_SIZE = -1
USE_BALANCED_SAMPLING = True

FileNotFoundError: [Errno 2] No such file or directory: './custom_example/user_network/out_source_merge_kaggle_data_3gpus.yaml'

In [4]:
all_results = []
for dataset in datasets_to_test:
    print(f"\nProcessing dataset: {dataset['name']}")
    if USE_BALANCED_SAMPLING and EVALUATION_SAMPLE_SIZE != -1:
        test_dataset = create_balanced_sample_dataset(dataset, EVALUATION_SAMPLE_SIZE)
        if 'estimated_images' in test_dataset:
            print(f"🎯 Balanced sampling: Using ~{test_dataset['estimated_images']} images to target {EVALUATION_SAMPLE_SIZE} samples")
    else:
        test_dataset = create_sample_dataset(dataset, EVALUATION_SAMPLE_SIZE)
    dataset_results = load_dataset(test_dataset, test_dataset['type'])
    
    if len(dataset_results) == 0:
        print(f"⚠️  No data loaded for {dataset['name']}")
        continue
    
    expected_msg = "full dataset" if EVALUATION_SAMPLE_SIZE == -1 else f"~{EVALUATION_SAMPLE_SIZE} samples"
    print(f"📊 Using {len(dataset_results)} samples for evaluation (target: {expected_msg})")
    
    # Run OCR evaluation with all models
    model_results = run_ocr_evaluation(models_to_test, dataset_results, dataset['name'])
    all_results.extend(model_results)

print(f"\n✓ Total processed results: {len(all_results)}")
sampling_type = "Balanced" if USE_BALANCED_SAMPLING else "Standard"
print(f"📈 {sampling_type} sampling for fairer comparison across datasets")


Processing dataset: data_from_web
Loading dataset: data_from_web (type: lmdb_format)
DataFrame shape: (940, 2)
Using filename column: 'filename', text column: 'words'


Processing lmdb_format: 100%|██████████| 940/940 [00:00<00:00, 3923.71it/s]


Loaded 940 samples
📊 Using 940 samples for evaluation (target: full dataset)
  Testing model: base_model


OCR with base_model: 100%|██████████| 940/940 [01:08<00:00, 13.64it/s]

  Completed 940 images

✓ Total processed results: 940
📈 Balanced sampling for fairer comparison across datasets





In [5]:
print(all_results)
types_list = [result['dataset_name'] for result in all_results]
print(f"🔍 Types of results collected: {set(types_list)}")

img = [x for x in all_results if x['dataset_name'] == 'data_from_outsource' and x['model_name'] == 'base_model']
random.seed(47)
img = random.sample(img, min(5, len(img)))
# for i, in enumerate(img):
for index, val in enumerate(img):
    plt.figure(figsize=(10, 3))
    plt.imshow(val['cropped_image'])
    plt.axis('off')
    os.makedirs("tmp", exist_ok=True)
    plt.savefig(f"tmp/sample_image_{index+1}_{val['dataset_name']}_{val['model_name']}.png", bbox_inches='tight', pad_inches=0.1)
    plt.show()

[{'dataset_name': 'data_from_web', 'model_name': 'base_model', 'source_file': '424dd150-9379-4fd9-94ef-8820550e879b.png', 'box_index': 0, 'cropped_image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=427x51 at 0x30F33B370>, 'ground_truth': 'พล.ร.อ.เชิงชาย ชมเชิงแพทย์', 'ocr_text': 'พล.ธ.อเเชิงชายอมเชิงแพทย์'}, {'dataset_name': 'data_from_web', 'model_name': 'base_model', 'source_file': 'de791b60-8f12-4587-83f9-2d33fa967c43.png', 'box_index': 0, 'cropped_image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=324x150 at 0x30F33AD10>, 'ground_truth': 'คุยโขมงบ่าย3โมง', 'ocr_text': "'คุยยgษๆ"}, {'dataset_name': 'data_from_web', 'model_name': 'base_model', 'source_file': '0d1e30f2-2aed-43e0-84a2-422faed90216.png', 'box_index': 0, 'cropped_image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=692x137 at 0x30F33B940>, 'ground_truth': 'กรรมสิทธิ์เป็นของไฟแนนซ์', 'ocr_text': 'กรรมสิทธิ์เป็นของไฟแนนซ์'}, {'dataset_name': 'data_from_web', 'model_name': 'base_model', 'source_f

In [None]:
print("=== METRICS CALCULATION ===")
if len(all_results) > 0:
    metrics_output_path = './output/evaluation_results.json'
    individual_metrics = calculate_metrics(all_results, metrics_output_path)
    print_summary_statistics(individual_metrics)
    print(f"\n✓ Evaluation complete! Results saved to {metrics_output_path}")
else:
    print("⚠️  No results to calculate metrics")

=== METRICS CALCULATION ===
Loaded semantic model: distiluse-base-multilingual-cased
Calculating metrics for data_from_web_base_model


Metrics for data_from_web_base_model: 100%|██████████| 940/940 [00:19<00:00, 48.12it/s]

Saved 940 metric results to ./evaluation_results.json

=== EVALUATION SUMMARY ===
data_from_web_base_model:
  - Character Accuracy: 75.39%
  - Character Error Rate: 24.61%
  - Word Error Rate: 70.77%
  - Word Error Rate (Thai): 64.07%
  - Semantic Similarity: 0.7795
  - Total samples: 940


✓ Evaluation complete! Results saved to ./evaluation_results.json



