# EasyOCR Thai Training Notebook

**Quick Setup:**
1. Config ในส่วน "🔧 CONFIG" 
2. Check dataset ในส่วน "📊 VALIDATION"
3. Start training ในส่วน "🚀 TRAINING"

In [None]:
# ⚙️ SETUP & IMPORTS
import os, sys, time, torch, yaml, pandas as pd, numpy as np
from datetime import datetime
import torch.backends.cudnn as cudnn
from train import train
from utils import AttrDict

# Favour throughput over reproducibility: cuDNN benchmark mode on,
# deterministic mode off
cudnn.deterministic = False
cudnn.benchmark = True

# Report the runtime environment (CUDA availability is queried once and reused)
cuda_ready = torch.cuda.is_available()
print(f"🐍 PyTorch: {torch.__version__}")
print(f"🔥 CUDA: {'✅' if cuda_ready else '❌'}")
if cuda_ready:
    print(f"📱 GPUs: {torch.cuda.device_count()}")

PyTorch version: 2.7.1
CUDA available: False


## ⚙️ SYSTEM SETUP

In [None]:
# 🎯 GPU SELECTION
if not torch.cuda.is_available():
    # No CUDA runtime detected: fall back to the CPU
    device = torch.device('cpu')
    print("⚠️  Using CPU (no CUDA available)")
else:
    GPU_ID = 0  # 🔧 CHANGE THIS TO SELECT DIFFERENT GPU
    # Make the chosen GPU the current CUDA device and release cached memory
    torch.cuda.set_device(GPU_ID)
    torch.cuda.empty_cache()
    device = torch.device('cuda', GPU_ID)
    print(f"✅ Using GPU {GPU_ID}: {torch.cuda.get_device_name()}")
print(f"🎯 Final device: {device}")

❌ No CUDA available, using CPU
🎯 Final device: cpu


In [None]:
# 📝 LOAD BASE CONFIG
# Reads the YAML config into `opt`, then derives `opt.character` from the
# configured numbers/symbols plus every character found in the training labels.
config_file = 'config_files/thai_auto_config.yaml'

with open(config_file, 'r', encoding='utf8') as f:
    opt = AttrDict(yaml.safe_load(f))

# Auto-generate character set from training data
all_chars = set()
labels_path = os.path.join(opt.train_data, opt.select_data, 'labels.csv')
if os.path.exists(labels_path):
    # The regex separator matches only the leading "<filename>," prefix of a
    # row, so commas inside the label text are not treated as delimiters.
    # dtype=str keeps digit-only labels as text (no numeric inference), which
    # set.update() below relies on.
    df = pd.read_csv(labels_path, sep='^([^,]+),', engine='python',
                     usecols=['filename', 'words'], keep_default_na=False,
                     dtype=str)
    for text in df['words']:
        all_chars.update(text)
else:
    # Surface the problem instead of silently building a set without the
    # characters that occur in the labels.
    print(f"⚠️  Labels file not found: {labels_path} "
          f"(character set will only contain configured numbers/symbols)")

# Build complete character set
# `or ''` also covers keys that are present in the YAML but left empty (None).
numbers = opt.get('number') or ''
symbols = opt.get('symbol') or ''
# dict.fromkeys() removes repeats while keeping first-seen order: a digit or
# symbol that also appears in the labels must occupy a single slot only.
opt.character = ''.join(dict.fromkeys(numbers + symbols + ''.join(sorted(all_chars))))
opt.device = str(device)

print(f"📋 Base config loaded: {opt.experiment_name}")
print(f"🔤 Character set: {len(opt.character)} chars")

## 🔧 CONFIG

In [None]:
# 🔧 CONFIG OVERRIDES
# Every setting is gathered here in one place - no need to edit the YAML file!

CONFIG_OVERRIDES = {
    # 📁 Data paths
    # 'train_data': '/Users/puem/Downloads/thai_lang_ocr_dataset',
    # 'valid_data': '/Users/puem/Downloads/thai_lang_ocr_dataset',
    # 'select_data': '0',

    # 🏋️ Training
    # 'experiment_name': 'my_thai_ocr_v3',
    # 'num_iter': 15000,
    # 'batch_size': 16,
    # 'lr': 0.001,
    # 'workers': 4,

    # 🤖 Model
    # 'saved_model': '',  # Resume from checkpoint
    # 'batch_max_length': 100,
}

# Apply the edited values; entries left as None or '' are treated as
# unfilled placeholders and skipped
active_overrides = {
    option: setting
    for option, setting in CONFIG_OVERRIDES.items()
    if not (setting is None or setting == '')
}
for option, setting in active_overrides.items():
    setattr(opt, option, setting)
overrides_applied = len(active_overrides)

if not overrides_applied:
    print("ℹ️  Using default config (uncomment lines above to override)")
else:
    print(f"✏️  Applied {overrides_applied} config overrides")

🔧 Config overrides:
   ℹ️  No overrides - using config from YAML file
   💡 Uncomment lines above to override config values

📋 Current configuration:
   - Experiment: thai_auto
   - Iterations: 5,000
   - Batch size: 8
   - Learning rate: 0.001
   - Workers: 0
   - Device: cpu

💡 วิธีใช้:
   1. Uncomment บรรทัดที่ต้องการแก้ไข (ลบ # หน้าบรรทัด)
   2. ใส่ค่าที่ต้องการ
   3. Run cell นี้ใหม่
   4. ตัวอย่าง: 'batch_size': 8, 'num_iter': 10000


In [None]:
# Echo the effective configuration, one topic per line
print(f"⚙️  {opt.experiment_name}")

schedule_parts = [
    f"Iterations: {opt.num_iter:,}",
    f"Batch: {opt.batch_size}",
    f"LR: {opt.lr}",
]
print("🔢 " + " | ".join(schedule_parts))

runtime_parts = [f"Device: {opt.device}", f"Workers: {opt.workers}"]
print("🎯 " + " | ".join(runtime_parts))

charset_size = len(opt.character)
print(f"🔤 Characters: {charset_size} total")

# Batches above 16 get a memory warning
if opt.batch_size > 16:
    print("⚠️  Large batch_size - reduce if out of memory")

⚙️ CONFIGURATION SUMMARY: thai_auto
  - Training iterations: 5,000
  - Batch size: 8
  - Learning rate: 0.001
  - Workers: 0
  - Target device: cpu
  - Character set: 92 characters
    - Numbers: 10, Symbols: 34, Thai: 48


In [None]:
# Verify that the expected dataset folders and label files exist, and report
# how many images / label entries each one holds.
print("🔧 Fixing dataset paths based on directory structure...")
DATASET_BASE = "/Users/puem/Downloads/thai_lang_ocr_dataset"
REQUIRED_PATHS = {
    label: os.path.join(DATASET_BASE, leaf)
    for label, leaf in (
        ('train_images', "0"),
        ('val_images', "1"),
        ('train_labels', "train_list.txt"),
        ('val_labels', "val_list.txt"),
    )
}

image_suffixes = ('.jpg', '.jpeg', '.png')

all_good = True
for label, location in REQUIRED_PATHS.items():
    # Guard clause: a missing path marks the whole dataset as not ready
    if not os.path.exists(location):
        print(f"❌ {label}: Not found")
        all_good = False
        continue

    if 'images' in label:
        # Image folders: count files by (case-insensitive) extension
        count = sum(
            1 for entry in os.listdir(location)
            if entry.lower().endswith(image_suffixes)
        )
        print(f"✅ {label}: {count} images")
    else:
        # Label lists: one entry per line
        with open(location, 'r', encoding='utf8') as handle:
            count = sum(1 for _ in handle)
        print(f"✅ {label}: {count} entries")

verdict = '✅ Ready' if all_good else '❌ Fix paths'
print(f"\n🎯 Dataset: {verdict}")

🔧 Fixing dataset paths based on directory structure...
✅ thai_train: OK (99995 images found)
✅ thai_train: OK (99995 images found)
✅ thai_val: OK (100000 images found)
✅ Training labels: OK (197995 entries)
✅ Validation labels: OK (2000 entries)

🎯 Result: ✅ Ready to train!

📁 Dataset structure confirmed:
   Base: /Users/puem/Downloads/thai_lang_ocr_dataset
   Train images: folder '0'
   Val images: folder '1'
   Labels: train_list.txt & val_list.txt
✅ thai_val: OK (100000 images found)
✅ Training labels: OK (197995 entries)
✅ Validation labels: OK (2000 entries)

🎯 Result: ✅ Ready to train!

📁 Dataset structure confirmed:
   Base: /Users/puem/Downloads/thai_lang_ocr_dataset
   Train images: folder '0'
   Val images: folder '1'
   Labels: train_list.txt & val_list.txt


## 📊 VALIDATION

In [None]:
# 🚀 TRAINING
show_samples = 3  # Number of samples to show during validation
use_amp = False  # Mixed precision for faster training

amp_label = 'On' if use_amp else 'Off'
print(f"🚀 Starting: {datetime.now():%H:%M:%S}")
print(f"⚡ AMP: {amp_label} | Samples: {show_samples}")

# A non-empty saved_model is announced as the checkpoint being resumed
if opt.saved_model:
    print(f"🔄 Resuming from: {opt.saved_model}")

print("=" * 50)

try:
    train(opt, show_number=show_samples, amp=use_amp)
except Exception as err:
    # Report the failure, then re-raise so the traceback is not lost
    print(f"\n❌ Training failed: {err}")
    raise
except KeyboardInterrupt:
    # A manual stop is not an error: point at the checkpoint folder instead
    print("\n⚠️  Training interrupted (Ctrl+C)")
    print(f"💾 Models saved in: ./saved_models/{opt.experiment_name}/")
finally:
    # Always stamp the end time, whatever the outcome
    print(f"\n🏁 Ended: {datetime.now():%H:%M:%S}")

🚀 Starting training at: 2025-06-29 03:04:19
📊 Training samples to show: 3
⚡ Mixed precision (AMP): Disabled
🆕 Starting training from scratch

TRAINING LOG
Filtering the images containing characters which are not in opt.character
Filtering the images whose label is longer than opt.batch_max_length
--------------------------------------------------------------------------------
dataset_root: all_data
opt.select_data: ['thai_train']
opt.batch_ratio: ['1']
--------------------------------------------------------------------------------
dataset_root:    all_data	 dataset: thai_train
all_data/thai_train
sub-directory:	/thai_train	 num samples: 80
num total samples of thai_train: 80 x 1.0 (total_data_usage_ratio) = 80
num samples of thai_train per batch: 8 x 1.0 (batch_ratio) = 8
--------------------------------------------------------------------------------
Total_batch_size: 8 = 8
--------------------------------------------------------------------------------
dataset_root:    all_data/thai

  scaler = GradScaler()



⚠️  Training interrupted by user (Ctrl+C)
Model checkpoints are saved in: ./saved_models/thai_auto/

🏁 Training session ended at: 2025-06-29 03:04:56


## 🚀 TRAINING

In [None]:
# 📊 MONITORING TOOLS
def check_progress(experiment_name):
    """Print a short status report for a training run.

    Looks in ``./saved_models/<experiment_name>`` (relative to the current
    working directory) and reports the number of ``.pth`` files, the two most
    recently modified ones with their sizes, and the last line of
    ``log_train.txt`` when that file exists and is not empty.

    Args:
        experiment_name: Name of the run's folder under ``./saved_models``.

    Returns:
        None. Everything is written to stdout.
    """
    log_dir = f"./saved_models/{experiment_name}"
    # isdir (not exists) so that a stray file with this name is reported as
    # "no logs" instead of crashing os.listdir below.
    if not os.path.isdir(log_dir):
        print(f"❌ No logs: {log_dir}")
        return

    # Check models
    checkpoints = [
        os.path.join(log_dir, entry)
        for entry in os.listdir(log_dir)
        if entry.endswith('.pth')
    ]
    if checkpoints:
        print(f"💾 {len(checkpoints)} models saved")
        # Order by modification time (path breaks ties). A plain name sort
        # puts e.g. "iter_10000.pth" before "iter_5000.pth", so the tail of
        # the list would not be the newest checkpoints.
        checkpoints.sort(key=lambda path: (os.path.getmtime(path), path))
        for checkpoint in checkpoints[-2:]:  # Last 2
            size_mb = os.path.getsize(checkpoint) / (1024 * 1024)
            print(f"   {os.path.basename(checkpoint)} ({size_mb:.1f}MB)")

    # Check log
    log_file = os.path.join(log_dir, "log_train.txt")
    if os.path.exists(log_file):
        # Stream the file and remember only the final line; the log can grow
        # large over a long run.
        last_line = None
        with open(log_file, 'r', encoding='utf8') as f:
            for last_line in f:
                pass
        if last_line is not None:
            print(f"📄 Last log: {last_line.strip()}")

def quick_log(experiment_name, lines=3):
    """Print the last few lines of a run's training log.

    Reads ``./saved_models/<experiment_name>/log_train.txt`` relative to the
    current working directory.

    Args:
        experiment_name: Name of the run's folder under ``./saved_models``.
        lines: Number of trailing log lines to print. Zero or a negative
            value prints nothing.

    Returns:
        None. Output goes to stdout; a notice is printed when the log file
        does not exist.
    """
    # Function-scope import: the helper needs nothing beyond `os` from the
    # surrounding notebook.
    from collections import deque

    log_file = f"./saved_models/{experiment_name}/log_train.txt"
    if not os.path.exists(log_file):
        print("❌ No log found")
        return

    # A bounded deque keeps just the tail while the file is streamed, and it
    # handles lines=0 correctly (a slice such as log_lines[-0:] would select
    # every line). Negative requests are clamped to zero.
    with open(log_file, 'r', encoding='utf8') as f:
        tail = deque(f, maxlen=max(lines, 0))
    for line in tail:
        print(line.strip())

# Remind the user how to invoke the two monitoring helpers
helper_calls = ("check_progress(opt.experiment_name)", "quick_log(opt.experiment_name)")
print("📊 Use: " + " | ".join(helper_calls))

## 📊 MONITORING