# PaddleOCR Text Recognition Training on Amazon SageMaker

This notebook demonstrates how to train **Text Recognition models only** using PaddleOCR on Amazon SageMaker with GPU support.

## Key Features:
- **เฉพาะ Text Recognition** (ไม่รวม Detection)
- ใช้ `tools/train.py` script ของ PaddleOCR ร่วมกับ recognition config
- รูปแบบ annotation: `image_path\ttext_content`
- รองรับ CRNN, SVTR, PP-OCRv4 architectures
- S3 integration สำหรับ data และ model management

## Requirements:
- Amazon SageMaker Notebook Instance ที่รองรับ GPU
- PaddlePaddle GPU version
- การเข้าถึง S3 bucket สำหรับเก็บข้อมูลและ models

## 1. Environment Setup & GPU Check

ตรวจสอบสภาพแวดล้อมและติดตั้ง dependencies ที่จำเป็นสำหรับการเทรน Text Recognition

In [None]:
# ===== ENVIRONMENT SETUP & INITIALIZATION =====
# รัน cell นี้เพียงครั้งเดียวเพื่อตั้งค่าทั้งหมด

import sys
import os
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime

# Guard: skip the setup when an earlier run of this cell finished successfully.
# ENVIRONMENT_INITIALIZED is only set at the very end of the else-branch, so a
# run that failed half-way (pip install, imports, AWS check) is retried in full.
if 'ENVIRONMENT_INITIALIZED' in globals():
    print("⚠️  Environment already initialized. Skipping setup...")
    print(f"✅ S3 bucket: {S3_BUCKET}")
    print(f"✅ Region: {AWS_REGION}")
else:
    import importlib  # local import: used to refresh import caches after pip installs

    print("🔧 Environment Setup for PaddleOCR Recognition Training")
    print("=" * 60)

    # 1. Report the Python environment
    print(f"📍 Python version: {sys.version}")
    print(f"📍 Working directory: {os.getcwd()}")

    # 2. Load the AWS configuration
    try:
        with open('aws-config.json', 'r', encoding='utf-8') as f:
            aws_config = json.load(f)

        # Export the credentials as environment variables for the AWS SDK
        credentials = aws_config['credentials']
        aws_settings = aws_config['aws_settings']

        os.environ['AWS_ACCESS_KEY_ID'] = credentials['aws_access_key_id']
        os.environ['AWS_SECRET_ACCESS_KEY'] = credentials['aws_secret_access_key']
        # A session token only exists for temporary credentials; long-lived
        # access keys have none, so treat it as optional instead of raising.
        session_token = credentials.get('aws_session_token')
        if session_token:
            os.environ['AWS_SESSION_TOKEN'] = session_token
        os.environ['AWS_DEFAULT_REGION'] = aws_settings['region']

        # Notebook-wide globals (used by the later cells)
        S3_BUCKET = aws_settings['s3_bucket_name']
        AWS_REGION = aws_settings['region']
        SAGEMAKER_REGION = aws_settings['sagemaker_region']
        S3_DATA_PREFIX = 'recognition-data'

        print("✅ AWS credentials loaded")
        print(f"✅ S3 bucket: {S3_BUCKET}")
        print(f"✅ Region: {AWS_REGION}")

    except FileNotFoundError:
        print("❌ aws-config.json not found! Please run setup first.")
        raise
    except Exception as e:
        print(f"❌ Error loading AWS config: {e}")
        raise

    # 3. Install missing dependencies.
    # Maps pip distribution name -> importable module name; the two differ for
    # several packages, so the module name cannot be derived from the pip name.
    print("\n📦 Installing required packages...")
    packages_to_install = {
        "paddlepaddle-gpu": "paddle",
        "boto3": "boto3",
        "sagemaker": "sagemaker",
        "opencv-python": "cv2",
        "pillow": "PIL",
        "numpy": "numpy",
        "PyYAML": "yaml",
        "tqdm": "tqdm",
    }

    installed_any = False
    for package, module_name in packages_to_install.items():
        try:
            __import__(module_name)
            print(f"  ✅ {package} already installed")
        except ImportError:
            print(f"  📦 Installing {package}...")
            subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], check=True)
            installed_any = True

    if installed_any:
        # Make freshly installed packages visible to the running interpreter.
        importlib.invalidate_caches()

    # 4. Import everything the later cells rely on (after installation)
    try:
        import boto3
        import sagemaker
        import paddle
        import cv2
        import numpy as np
        import yaml
        from tqdm import tqdm
        print("✅ All imports successful")
    except ImportError as e:
        print(f"❌ Import error: {e}")
        raise

    # 5. Check GPU availability (informational only; never aborts the setup)
    try:
        if paddle.is_compiled_with_cuda():
            print("✅ PaddlePaddle GPU support available")
            gpu_count = paddle.device.cuda.device_count()
            print(f"✅ Available GPUs: {gpu_count}")
        else:
            print("⚠️  PaddlePaddle CPU version detected")
    except Exception as e:
        print(f"⚠️  Could not check GPU status: {e}")

    # 6. Verify the AWS connection and S3 bucket access
    try:
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print("✅ AWS connection successful")
        print(f"   Account: {identity['Account']}")

        s3 = boto3.client('s3')
        s3.head_bucket(Bucket=S3_BUCKET)
        print(f"✅ S3 bucket accessible: {S3_BUCKET}")

    except Exception as e:
        print(f"❌ AWS connection failed: {e}")
        raise

    # Every step succeeded: only now mark the environment as initialized.
    ENVIRONMENT_INITIALIZED = True

    print("\n🎯 Environment ready for PaddleOCR Recognition training!")
    print(f"📁 Data location: s3://{S3_BUCKET}/{S3_DATA_PREFIX}/")
    print(f"🏋️ Training will use: {SAGEMAKER_REGION} region")
    print("\n⚠️  Note: Other cells will check this initialization before running.")

In [None]:
# ===== PADDLEOCR REPOSITORY SETUP =====
# Clone และตั้งค่า PaddleOCR repository

# Require the environment setup cell to have completed first.
if 'ENVIRONMENT_INITIALIZED' not in globals():
    print("❌ Please run Environment Setup cell first!")
    raise RuntimeError("Environment not initialized")

print("📥 PaddleOCR Repository Setup")
print("=" * 40)

# Local checkout location, relative to the notebook's working directory
PADDLEOCR_DIR = Path("PaddleOCR")

if PADDLEOCR_DIR.exists() and 'PADDLEOCR_READY' in globals():
    # Already verified earlier in this kernel session
    print("✅ PaddleOCR already set up and ready")
    print(f"📁 Directory: {PADDLEOCR_DIR}")
    print(f"⚙️ Training tools: PaddleOCR/tools/train.py")
    print(f"📋 Config files: PaddleOCR/configs/rec/")
else:
    # Clone the repository, or update an existing checkout
    if PADDLEOCR_DIR.exists():
        print("✅ PaddleOCR directory exists, updating...")
        try:
            # Run git inside the checkout via cwd= rather than os.chdir(), so the
            # notebook's working directory is never changed, even on failure.
            result = subprocess.run(["git", "pull", "origin", "main"],
                                    cwd=str(PADDLEOCR_DIR),
                                    capture_output=True, text=True, timeout=60)
            if result.returncode == 0:
                print("✅ Repository updated successfully")
            else:
                print(f"⚠️  Update failed: {result.stderr}")
        except Exception as e:
            # Updating is best-effort: keep going with the existing checkout
            print(f"⚠️  Could not update: {e}")
    else:
        print("📥 Cloning PaddleOCR repository...")
        try:
            result = subprocess.run(["git", "clone",
                                     "https://github.com/PaddlePaddle/PaddleOCR.git"],
                                    capture_output=True, text=True, timeout=300)
            if result.returncode == 0:
                print("✅ Repository cloned successfully")
            else:
                print(f"❌ Clone failed: {result.stderr}")
                raise RuntimeError("Failed to clone PaddleOCR repository")
        except subprocess.TimeoutExpired:
            print("❌ Clone timeout - please check internet connection")
            raise
        except Exception as e:
            print(f"❌ Clone error: {e}")
            raise

    # Verify the parts of the checkout that the training cells depend on
    if not PADDLEOCR_DIR.exists():
        print(f"❌ PaddleOCR directory not found after setup!")
        raise RuntimeError("PaddleOCR repository setup failed")

    important_paths = [
        "PaddleOCR/configs/rec",
        "PaddleOCR/tools/train.py",
        "PaddleOCR/ppocr"
    ]

    print(f"\n📂 Verifying important directories and files:")
    all_paths_exist = True
    for path in important_paths:
        if Path(path).exists():
            print(f"  ✅ {path}")
        else:
            print(f"  ❌ {path} - NOT FOUND")
            all_paths_exist = False

    if not all_paths_exist:
        print(f"\n❌ PaddleOCR repository incomplete!")
        raise RuntimeError("PaddleOCR repository setup failed")

    # List the recognition configs that ship with the checkout
    rec_configs_dir = Path("PaddleOCR/configs/rec")
    rec_configs = list(rec_configs_dir.glob("*.yml"))
    print(f"\n📋 Available Recognition configs ({len(rec_configs)}):")
    for config in sorted(rec_configs)[:5]:  # show only the first five
        print(f"  📄 {config.name}")
    if len(rec_configs) > 5:
        print(f"  ... และอีก {len(rec_configs) - 5} ไฟล์")

    # Mark the checkout as verified for the rest of the session
    PADDLEOCR_READY = True
    print(f"\n✅ PaddleOCR repository ready!")

print(f"📁 Training tools: PaddleOCR/tools/train.py")
print(f"⚙️ Config files: PaddleOCR/configs/rec/")

## 2. Download Training Data from S3

ดาวน์โหลดข้อมูลการเทรนจาก S3 รวมถึง images, annotations และ character dictionary

In [None]:
# ===== S3 DATA MANAGEMENT =====
# ใช้ variables ที่ตั้งค่าไว้แล้วใน environment setup

# Require the environment setup cell to have completed first.
if 'ENVIRONMENT_INITIALIZED' not in globals():
    print("❌ Please run Environment Setup cell first!")
    raise RuntimeError("Environment not initialized")

print("📡 S3 Data Management Setup")
print("=" * 40)

# Skip the download when an earlier run of this cell already fetched the data.
if 'DATA_DOWNLOADED' in globals():
    print("✅ Data already downloaded and ready")
    print(f"📁 Training annotation: s3_data/annotations/train_annotation.txt")
    print(f"📁 Validation annotation: s3_data/annotations/val_annotation.txt")
    print(f"📁 Character dictionary: character_dict.txt")
else:
    # Local directories that receive the downloaded data
    data_dirs = [
        "s3_data/images/train",
        "s3_data/images/val",
        "s3_data/annotations",
        "s3_data/metadata"
    ]

    for dir_path in data_dirs:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print(f"📁 Created: {dir_path}")

    s3 = boto3.client('s3')

    def download_s3_folder(bucket, s3_prefix, local_dir):
        """Download every object under ``s3_prefix`` into ``local_dir``.

        Args:
            bucket: Name of the S3 bucket.
            s3_prefix: Key prefix to list; the part of each key after this
                prefix becomes the path relative to ``local_dir``.
            local_dir: Local destination directory.

        Returns:
            Number of files that were downloaded successfully. Individual
            failures are printed and skipped, and are not counted.
        """
        print(f"📥 Downloading {s3_prefix} to {local_dir}...")

        # List all object keys under the prefix, skipping directory markers
        paginator = s3.get_paginator('list_objects_v2')
        files_to_download = [
            obj['Key']
            for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix)
            for obj in page.get('Contents', [])
            if not obj['Key'].endswith('/')
        ]

        print(f"📊 Found {len(files_to_download)} files to download")

        if not files_to_download:
            print("⚠️  No files found to download")
            return 0

        downloaded = 0
        for s3_key in tqdm(files_to_download, desc="Downloading"):
            # Drop only the leading prefix; str.replace() would also remove
            # later occurrences of the same text inside the key.
            relative_path = s3_key[len(s3_prefix):].lstrip('/')
            local_path = Path(local_dir) / relative_path
            local_path.parent.mkdir(parents=True, exist_ok=True)

            try:
                s3.download_file(bucket, s3_key, str(local_path))
            except Exception as e:
                print(f"❌ Failed to download {s3_key}: {e}")
            else:
                downloaded += 1

        return downloaded

    def _download_named_files(s3_keys):
        """Download individually named keys below S3_DATA_PREFIX into s3_data/; return the success count."""
        succeeded = 0
        for s3_key in s3_keys:
            # Keys are built as "<S3_DATA_PREFIX>/<relative path>"
            local_path = f"s3_data/{s3_key[len(S3_DATA_PREFIX) + 1:]}"
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)
            try:
                s3.download_file(S3_BUCKET, s3_key, local_path)
            except Exception as e:
                print(f"  ❌ {s3_key}: {e}")
            else:
                print(f"  ✅ {local_path}")
                succeeded += 1
        return succeeded

    def _count_lines(path):
        """Return the number of lines in a UTF-8 text file."""
        with open(path, 'r', encoding='utf-8') as f:
            return sum(1 for _ in f)

    total_downloaded = 0

    print("\n1️⃣ Downloading annotation files...")
    annotation_files = [
        f"{S3_DATA_PREFIX}/annotations/train_annotation.txt",
        f"{S3_DATA_PREFIX}/annotations/val_annotation.txt"
    ]
    total_downloaded += _download_named_files(annotation_files)

    print("\n2️⃣ Downloading metadata files...")
    metadata_files = [
        f"{S3_DATA_PREFIX}/metadata/character_dict.txt",
        f"{S3_DATA_PREFIX}/metadata/dataset_info.json"
    ]
    total_downloaded += _download_named_files(metadata_files)

    # Copy the character dictionary to the notebook root for PaddleOCR
    if Path("s3_data/metadata/character_dict.txt").exists():
        import shutil
        shutil.copy("s3_data/metadata/character_dict.txt", "character_dict.txt")
        print("✅ Character dictionary copied to root directory")

    print("\n3️⃣ Downloading training images...")
    train_downloaded = download_s3_folder(S3_BUCKET, f"{S3_DATA_PREFIX}/images/train/", "s3_data/images/train")

    print("\n4️⃣ Downloading validation images...")
    val_downloaded = download_s3_folder(S3_BUCKET, f"{S3_DATA_PREFIX}/images/val/", "s3_data/images/val")

    total_downloaded += train_downloaded + val_downloaded

    # Download summary
    print(f"\n📊 Download Summary:")
    print(f"  📥 Total files downloaded: {total_downloaded}")
    print(f"  🏋️ Training images: {train_downloaded}")
    print(f"  ✅ Validation images: {val_downloaded}")

    # Report annotation sizes; read as UTF-8 because labels may be non-ASCII
    if Path("s3_data/annotations/train_annotation.txt").exists():
        train_lines = _count_lines("s3_data/annotations/train_annotation.txt")
        print(f"  📋 Training annotations: {train_lines}")

    if Path("s3_data/annotations/val_annotation.txt").exists():
        val_lines = _count_lines("s3_data/annotations/val_annotation.txt")
        print(f"  📋 Validation annotations: {val_lines}")

    # Only mark the data as downloaded when the annotation files are present;
    # otherwise a re-run of this cell would be skipped despite failed downloads.
    missing_required = [
        path for path in (
            "s3_data/annotations/train_annotation.txt",
            "s3_data/annotations/val_annotation.txt",
        )
        if not Path(path).exists()
    ]

    if missing_required:
        print(f"\n❌ Data download incomplete - missing required files:")
        for path in missing_required:
            print(f"  - {path}")
        print("Check the S3 bucket contents/permissions and run this cell again.")
    else:
        print(f"\n✅ Data download completed!")
        print(f"📁 Local data directory: ./s3_data/")
        print(f"🔤 Character dictionary: ./character_dict.txt")

        DATA_DOWNLOADED = True

## 3. Prepare Recognition Annotation Files

สร้างและตรวจสอบไฟล์ annotation สำหรับ Text Recognition ในรูปแบบ: `image_path\ttext_content`

In [None]:
import os
from pathlib import Path

def create_recognition_annotation_file(annotation_path, num_samples=10):
    """
    Write a sample Text Recognition annotation file.

    Each of the ``num_samples`` lines has the form ``image_path\ttext_content``;
    the sample texts are reused cyclically and the lines are joined with
    newlines (no trailing newline). Returns ``annotation_path``.
    """
    # Pool of sample recognition texts, cycled through by index
    text_pool = (
        "สวัสดีครับ", "PaddleOCR", "Text Recognition", "1234567890",
        "Hello World", "ยินดีต้อนรับ", "深度学习", "Machine Learning",
        "Amazon SageMaker", "การรู้จำตัวอักษร",
    )
    pool_size = len(text_pool)

    # One "image_path<TAB>text_content" row per sample
    rows = [
        f"recognition_images/word_{index:03d}.jpg\t{text_pool[index % pool_size]}"
        for index in range(num_samples)
    ]

    with open(annotation_path, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(rows))

    print(f"✅ Created recognition annotation file: {annotation_path}")
    return annotation_path

# ===== ANNOTATION FILE VALIDATION =====
# Validate and preview the annotation files downloaded from S3.

# Refuse to continue until the S3 download cell has set its completion flag.
data_is_downloaded = 'DATA_DOWNLOADED' in globals()
if not data_is_downloaded:
    print("❌ Please run S3 Data Management cell first!")
    raise RuntimeError("Data not downloaded")

for banner_line in ("📋 Recognition Annotation Validation", "=" * 40):
    print(banner_line)

def validate_recognition_annotation_format(annotation_file):
    """
    Validate a Text Recognition annotation file.

    Every non-blank line must have the form ``image_path\ttext_content`` with a
    non-empty image path and non-empty text. Only line terminators are stripped
    before splitting, so an empty field next to the tab is detected and reported
    as such. Trailing empty tab-separated fields after the text are tolerated.

    Args:
        annotation_file: Path of the UTF-8 annotation file to check.

    Returns:
        ``(True, valid_line_count)`` when every non-blank line is well formed,
        otherwise ``(False, 0)`` (first malformed line, or unreadable file).
    """
    try:
        with open(annotation_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"❌ Error validating annotation file: {e}")
        return False, 0

    print(f"📄 File: {annotation_file}")
    print(f"📊 Total lines: {len(lines)}")

    valid_lines = 0
    for line_num, raw_line in enumerate(lines, 1):
        # Strip only the line terminator: stripping all whitespace would also
        # remove a leading/trailing tab and hide an empty field.
        line = raw_line.rstrip('\r\n')
        if not line.strip():
            continue

        parts = line.split('\t')
        # Tolerate trailing tabs after the text field
        while len(parts) > 2 and not parts[-1].strip():
            parts.pop()

        if len(parts) != 2:
            print(f"❌ Error at line {line_num}: Expected 2 parts separated by tab, got {len(parts)}")
            print(f"   Line content: {repr(line)}")
            return False, 0

        image_path, text_content = parts

        if not image_path.strip():
            print(f"❌ Error at line {line_num}: Empty image path")
            return False, 0

        if not text_content.strip():
            print(f"❌ Error at line {line_num}: Empty text content")
            return False, 0

        valid_lines += 1

        if line_num <= 3:  # preview entries found within the first three lines
            print(f"   Line {line_num}: {image_path} -> {text_content}")

    print(f"✅ Valid annotation lines: {valid_lines}")
    return True, valid_lines

# Paths of the annotation files downloaded from S3
train_annotation_file = "s3_data/annotations/train_annotation.txt"
val_annotation_file = "s3_data/annotations/val_annotation.txt"

print("🔍 Validating downloaded annotation files...")

# Validate the training annotation file (a missing file counts as invalid)
if Path(train_annotation_file).exists():
    train_valid, train_count = validate_recognition_annotation_format(train_annotation_file)
else:
    print(f"❌ Training annotation file not found: {train_annotation_file}")
    train_valid, train_count = False, 0

print()

# Validate the validation annotation file (a missing file counts as invalid)
if Path(val_annotation_file).exists():
    val_valid, val_count = validate_recognition_annotation_format(val_annotation_file)
else:
    print(f"❌ Validation annotation file not found: {val_annotation_file}")
    val_valid, val_count = False, 0

if train_valid and val_valid:
    print(f"\n✅ All annotation files are valid!")
    print(f"📊 Training samples: {train_count}")
    print(f"📊 Validation samples: {val_count}")
    print(f"📊 Total samples: {train_count + val_count}")
    
    # Report on the character dictionary; a missing dictionary only warns here
    char_dict_file = "character_dict.txt"
    if Path(char_dict_file).exists():
        with open(char_dict_file, 'r', encoding='utf-8') as f:
            chars = f.read().strip()
        # NOTE(review): len(chars) counts every character of the file, including
        # any newlines between entries - confirm this is the intended figure.
        print(f"🔤 Character dictionary: {len(chars)} characters")
        print(f"   Characters: {chars[:50]}{'...' if len(chars) > 50 else ''}")
    else:
        print(f"⚠️  Character dictionary not found: {char_dict_file}")
    
    # Flag checked by later cells before they run
    ANNOTATIONS_VALIDATED = True
else:
    print(f"\n❌ Annotation validation failed!")
    raise RuntimeError("Invalid annotation files")

# Create sample annotation files.
# NOTE(review): LOCAL_RECOGNITION_DATA_DIR is not defined anywhere in the visible
# part of this notebook, so the next line raises NameError unless it is set
# elsewhere. These assignments also rebind train_annotation_file and
# val_annotation_file away from the S3 files validated above - confirm whether
# this sample-data section is still wanted.
train_annotation_file = os.path.join(LOCAL_RECOGNITION_DATA_DIR, "train_recognition.txt")
val_annotation_file = os.path.join(LOCAL_RECOGNITION_DATA_DIR, "val_recognition.txt")

print("📝 Creating sample recognition annotation files...")
create_recognition_annotation_file(train_annotation_file, 20)
create_recognition_annotation_file(val_annotation_file, 5)

# The return values of these two validation calls are discarded.
print("\n🔍 Validating annotation files...")
validate_recognition_annotation_format(train_annotation_file)
validate_recognition_annotation_format(val_annotation_file)

# Launch PaddleOCR recognition training from the prepared config file.
import subprocess
import sys  # sys.executable: run training with the same interpreter as this kernel
from datetime import datetime

print("🚀 Starting PaddleOCR Recognition Training")
print("=" * 50)

# Files that must exist before training can start
required_files = [
    "recognition_training_config.yml",
    "character_dict.txt",
    "s3_data/annotations/train_annotation.txt",
    "s3_data/annotations/val_annotation.txt"
]

missing_files = []
for file_path in required_files:
    if not Path(file_path).exists():
        missing_files.append(file_path)
    else:
        print(f"✅ {file_path}")

if missing_files:
    print(f"\n❌ Missing required files:")
    for file_path in missing_files:
        print(f"  - {file_path}")
    print("Please run previous cells first!")
else:
    print(f"\n📋 Training Configuration:")
    print(f"  ⚙️ Config: recognition_training_config.yml")
    print(f"  🔤 Character dict: character_dict.txt")
    print(f"  🏋️ Training data: s3_data/annotations/train_annotation.txt")
    print(f"  ✅ Validation data: s3_data/annotations/val_annotation.txt")

    # Timestamped output directory so runs never overwrite each other
    output_dir = f"./output/rec_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    print(f"  💾 Output directory: {output_dir}")

    # Point the config at this run's output directory
    import yaml
    with open("recognition_training_config.yml", 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    config['Global']['save_model_dir'] = output_dir

    def _save_training_config():
        """Write the in-memory ``config`` back to recognition_training_config.yml."""
        with open("recognition_training_config.yml", 'w', encoding='utf-8') as f:
            yaml.safe_dump(config, f, default_flow_style=False, allow_unicode=True)

    def _run_training(cmd):
        """Run ``cmd`` while streaming its combined stdout/stderr, then report the result."""
        print(f"⏰ Training started at: {datetime.now()}")
        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1
            )

            # Stream output line by line so long runs show progress live
            for line in process.stdout:
                print(line.rstrip())

            process.wait()
        except Exception as e:
            print(f"\n❌ Training error: {e}")
            return

        if process.returncode == 0:
            print(f"\n🎉 Training completed successfully!")
            print(f"📁 Model saved in: {output_dir}")
        else:
            print(f"\n❌ Training failed with return code: {process.returncode}")

    _save_training_config()

    print(f"\n🎯 Training Command:")
    train_cmd = [
        sys.executable, "PaddleOCR/tools/train.py",
        "-c", "recognition_training_config.yml"
    ]
    print(f"  {' '.join(train_cmd)}")

    # Run options
    print(f"\n🔧 Training Options:")
    print(f"1. 🚀 Quick Demo Training (3 epochs)")
    print(f"2. 🏋️ Full Production Training (10 epochs)")
    print(f"3. 🛠️ Custom Training (specify options)")
    print(f"4. 📋 Just show command (don't run)")

    choice = input("\nSelect training option (1-4): ").strip()

    if choice == "1":
        print(f"\n🚀 Starting Quick Demo Training (3 epochs)...")
        # Shorten the run and checkpoint every epoch for the demo
        config['Global']['epoch_num'] = 3
        config['Global']['save_epoch_step'] = 1
        _save_training_config()
        _run_training(train_cmd)

    elif choice == "2":
        print(f"\n🏋️ Starting Full Production Training (10 epochs)...")
        # Set the epoch count explicitly: the config file is rewritten in place,
        # so a previous demo/custom run would otherwise leave its value behind.
        config['Global']['epoch_num'] = 10
        _save_training_config()
        _run_training(train_cmd)

    elif choice == "3":
        raw_epochs = input("Enter number of epochs (default 10): ").strip() or "10"
        try:
            epochs = int(raw_epochs)
        except ValueError:
            epochs = 0  # rejected by the range check below

        if epochs < 1:
            print(f"⚠️ Invalid number of epochs: {raw_epochs!r}. Please run this cell again.")
        else:
            config['Global']['epoch_num'] = epochs
            _save_training_config()

            print(f"\n🛠️ Starting Custom Training ({epochs} epochs)...")
            _run_training(train_cmd)

    elif choice == "4":
        print(f"\n📋 Training Command to run manually:")
        print(f"cd {os.getcwd()}")
        print(f"{' '.join(train_cmd)}")

    else:
        print(f"⚠️ Invalid choice. Please run this cell again.")

    print(f"\n✅ Training setup completed!")

# สร้าง Training Configuration สำหรับ PaddleOCR Recognition
import yaml
from pathlib import Path
import shutil

def create_recognition_config():
    """Build the PaddleOCR recognition training configuration (CRNN with CTC).

    Loads the ``rec_mv3_none_bilstm_ctc.yml`` base config, then overrides its
    Global, Architecture, Loss, Optimizer, PostProcess, Metric, Train and Eval
    sections. Data paths, epochs, batch size and max text length are read from
    the module-level ``PROJECT_CONFIG`` dict.
    NOTE(review): PROJECT_CONFIG is not defined in the visible part of this
    notebook - confirm where it is set before relying on this cell.

    Returns:
        The configuration dict, or ``None`` when the base config file is missing.
    """
    # The repository is cloned to ./PaddleOCR, so look there first; the bare
    # path is kept as a fallback for runs started inside the checkout.
    candidate_paths = [
        "PaddleOCR/configs/rec/rec_mv3_none_bilstm_ctc.yml",
        "configs/rec/rec_mv3_none_bilstm_ctc.yml",
    ]
    base_config_path = next((p for p in candidate_paths if Path(p).exists()), None)

    if base_config_path is None:
        print(f"❌ Base config not found: {candidate_paths[0]}")
        return None

    print(f"📋 Loading base config: {base_config_path}")

    with open(base_config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

    LOCAL_DATA_DIR = PROJECT_CONFIG['LOCAL_DATA_DIR']
    training_settings = PROJECT_CONFIG['TRAINING_CONFIG']

    def _rec_transforms():
        """Return a fresh transform pipeline shared by the Train and Eval sections."""
        # A new list per call: sharing one object between both sections would
        # make yaml.dump emit anchors/aliases in the saved config file.
        return [
            {'DecodeImage': {'img_mode': 'BGR', 'channel_first': False}},
            {'CTCLabelEncode': None},
            {'RecResizeImg': {'image_shape': [3, 32, 320]}},
            {'KeepKeys': {'keep_keys': ['image', 'label', 'length']}}
        ]

    # Global settings (tolerate a base config without a Global section)
    config.setdefault('Global', {}).update({
        'epoch_num': training_settings['epochs'],
        'log_smooth_window': 20,
        'print_batch_step': 10,
        'save_model_dir': './output/rec_mv3_ctc',
        'save_epoch_step': 10,
        'eval_batch_step': [0, 500],
        'cal_metric_during_training': True,
        'pretrained_model': None,
        'checkpoints': None,
        'use_visualdl': True,
        'infer_img': f"{LOCAL_DATA_DIR}/images/val",
        'character_dict_path': f"{LOCAL_DATA_DIR}/metadata/character_dict.txt",
        'character_type': 'ch',
        'max_text_length': training_settings['max_text_length'],
        'use_space_char': False,
        'save_res_path': './output/rec/predicts_rec.txt'
    })

    # Architecture settings
    config['Architecture'] = {
        'model_type': 'rec',
        'algorithm': 'CRNN',
        'Transform': None,
        'Backbone': {
            'name': 'MobileNetV3',
            'scale': 0.5,
            'model_name': 'small'
        },
        'Neck': {
            'name': 'SequenceEncoder',
            'encoder_type': 'rnn',
            'hidden_size': 48
        },
        'Head': {
            'name': 'CTCHead',
            'fc_decay': 0.00001
        }
    }

    # Loss settings
    config['Loss'] = {
        'name': 'CTCLoss'
    }

    # Optimizer settings
    config['Optimizer'] = {
        'name': 'Adam',
        'beta1': 0.9,
        'beta2': 0.999,
        'lr': {
            'name': 'Piecewise',
            'decay_epochs': [10, 20],
            'values': [0.001, 0.0001, 0.00001]
        },
        'regularizer': {
            'name': 'L2',
            'factor': 0.00001
        }
    }

    # PostProcess settings
    config['PostProcess'] = {
        'name': 'CTCLabelDecode'
    }

    # Metric settings
    config['Metric'] = {
        'name': 'RecMetric',
        'main_indicator': 'acc'
    }

    # Training dataset
    config['Train'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': f"{LOCAL_DATA_DIR}/images/train",
            'label_file_list': [f"{LOCAL_DATA_DIR}/annotations/train_annotation.txt"]
        },
        'loader': {
            'shuffle': True,
            'batch_size_per_card': training_settings['batch_size'],
            'drop_last': True,
            'num_workers': 4,
            'use_shared_memory': False
        },
        'transforms': _rec_transforms()
    }

    # Evaluation dataset
    config['Eval'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': f"{LOCAL_DATA_DIR}/images/val",
            'label_file_list': [f"{LOCAL_DATA_DIR}/annotations/val_annotation.txt"]
        },
        'loader': {
            'shuffle': False,
            'drop_last': False,
            'batch_size_per_card': training_settings['batch_size'],
            'num_workers': 4,
            'use_shared_memory': False
        },
        'transforms': _rec_transforms()
    }

    return config

print("⚙️  Creating Training Configuration...")
print("="*50)

# Build the configuration dict (None signals that the base config was not found)
config = create_recognition_config()

if config is None:
    raise Exception("Failed to create training configuration")

# Save the configuration as YAML, keeping the key order of the dict
# NOTE(review): the file is saved as "train_recognition_config.yml", while the
# training step in this notebook requires "recognition_training_config.yml" -
# confirm which name is intended.
config_save_path = "train_recognition_config.yml"
with open(config_save_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"✅ Configuration saved: {config_save_path}")

# Print a summary of the key settings
print(f"\n📋 Training Configuration Summary:")
print(f"   📁 Output dir: {config['Global']['save_model_dir']}")
print(f"   🔄 Epochs: {config['Global']['epoch_num']}")
print(f"   📦 Batch size: {config['Train']['loader']['batch_size_per_card']}")
print(f"   🎯 Max text length: {config['Global']['max_text_length']}")
print(f"   📊 Architecture: {config['Architecture']['algorithm']}")
print(f"   🔤 Character dict: {config['Global']['character_dict_path']}")

# Check that the files referenced by the config exist on disk
print(f"\n🔍 Verifying required files...")
required_files = [
    config['Global']['character_dict_path'],
    config['Train']['dataset']['label_file_list'][0],
    config['Eval']['dataset']['label_file_list'][0]
]

all_exist = True
for file_path in required_files:
    if Path(file_path).exists():
        print(f"✅ {file_path}")
    else:
        print(f"❌ {file_path} - Missing!")
        all_exist = False

# Missing files are only reported here; execution continues either way
if all_exist:
    print(f"\n🎉 Training configuration ready!")
    print(f"📝 Config file: {Path(config_save_path).absolute()}")
else:
    print(f"\n❌ Some required files are missing!")
    print("Please check data download step")

# Keep the config path for later steps (set even when files are missing)
TRAINING_CONFIG_PATH = config_save_path
print(f"\n💡 Use this config for training: {TRAINING_CONFIG_PATH}")

## 4. Create Training Configuration

สร้าง configuration file สำหรับการเทรน PaddleOCR Recognition model

In [None]:
# ===== TRAINING CONFIGURATION CREATION =====
# สร้าง configuration file สำหรับการเทรน PaddleOCR Recognition

# ตรวจสอบว่า annotation files ถูก validate แล้ว
if 'ANNOTATIONS_VALIDATED' not in globals():
    print("❌ Please run Annotation Validation cell first!")
    raise RuntimeError("Annotations not validated")

print("⚙️ Creating PaddleOCR Recognition Training Configuration")
print("=" * 55)

def create_recognition_config():
    """Build the training configuration for the PaddleOCR recognition model.

    The first existing base config from the local ``PaddleOCR`` checkout is
    loaded (the PP-OCRv3 English recognition config first, then
    ``rec_mv3_none_bilstm_ctc.yml``). Its ``Global`` section is updated, and
    the ``Architecture``, ``Loss``, ``Optimizer``, ``PostProcess``,
    ``Metric``, ``Train`` and ``Eval`` sections are replaced with a CRNN/CTC
    setup that reads images and labels from ``s3_data/``.

    Returns:
        dict | None: The configuration dictionary, or ``None`` when no base
        config exists. In that case every ``*.yml`` found under
        ``PaddleOCR/configs/rec`` (including subdirectories) is printed.
    """
    # Base configs to try, in order of preference.
    candidate_paths = (
        "PaddleOCR/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml",
        "PaddleOCR/configs/rec/rec_mv3_none_bilstm_ctc.yml",
    )
    base_config_path = next(
        (path for path in candidate_paths if Path(path).exists()), None
    )

    if base_config_path is None:
        print(f"❌ Base config not found: {' or '.join(candidate_paths)}")
        print("Available configs:")
        config_dir = Path("PaddleOCR/configs/rec")
        if config_dir.exists():
            # Search recursively: many configs live in per-model subfolders.
            for config_file in sorted(config_dir.rglob("*.yml")):
                print(f"  📄 {config_file}")
        return None

    print(f"📋 Loading base config: {base_config_path}")

    with open(base_config_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; fall back to an empty mapping.
        config = yaml.safe_load(f) or {}

    # Global settings: update in place so unrelated base keys are kept.
    # A missing or empty ``Global`` section is tolerated.
    global_settings = config.get('Global') or {}
    global_settings.update({
        'epoch_num': 10,  # number of training epochs
        'log_smooth_window': 20,
        'print_batch_step': 10,
        'save_model_dir': './output/rec_training',
        'save_epoch_step': 5,
        'eval_batch_step': [0, 500],
        'cal_metric_during_training': True,
        'pretrained_model': None,
        'checkpoints': None,
        'use_visualdl': True,
        'character_dict_path': 'character_dict.txt',
        'character_type': 'en',  # used for the digits 0-9
        'max_text_length': 10,   # maximum length of a label
        'use_space_char': False,
        'save_res_path': './output/rec/predicts_rec.txt'
    })
    config['Global'] = global_settings

    # Architecture settings (CRNN for recognition).
    config['Architecture'] = {
        'model_type': 'rec',
        'algorithm': 'CRNN',
        'Transform': None,
        'Backbone': {
            'name': 'MobileNetV3',
            'scale': 0.5,
            'model_name': 'small'
        },
        'Neck': {
            'name': 'SequenceEncoder',
            'encoder_type': 'rnn',
            'hidden_size': 48
        },
        'Head': {
            'name': 'CTCHead',
            'fc_decay': 0.00001
        }
    }

    # Loss settings.
    config['Loss'] = {
        'name': 'CTCLoss'
    }

    # Optimizer settings.
    config['Optimizer'] = {
        'name': 'Adam',
        'beta1': 0.9,
        'beta2': 0.999,
        'lr': {
            'name': 'Piecewise',
            'decay_epochs': [5, 8],
            'values': [0.001, 0.0001, 0.00001]
        },
        'regularizer': {
            'name': 'L2',
            'factor': 0.00001
        }
    }

    # PostProcess settings.
    config['PostProcess'] = {
        'name': 'CTCLabelDecode'
    }

    # Metric settings.
    config['Metric'] = {
        'name': 'RecMetric',
        'main_indicator': 'acc'
    }

    def build_transforms():
        """Return a fresh transform pipeline shared by Train and Eval.

        A new list is built on every call so the two sections never share
        objects; shared objects would make ``yaml.dump`` emit anchors/aliases.
        """
        return [
            {'DecodeImage': {'img_mode': 'BGR', 'channel_first': False}},
            {'CTCLabelEncode': None},
            {'RecResizeImg': {'image_shape': [3, 32, 320]}},
            {'KeepKeys': {'keep_keys': ['image', 'label', 'length']}}
        ]

    # Training dataset.
    config['Train'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': 's3_data/images/train',
            'label_file_list': ['s3_data/annotations/train_annotation.txt']
        },
        'loader': {
            'shuffle': True,
            'batch_size_per_card': 8,  # small batch size for the demo
            'drop_last': True,
            'num_workers': 2,
            'use_shared_memory': False
        },
        'transforms': build_transforms()
    }

    # Evaluation dataset.
    config['Eval'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': 's3_data/images/val',
            'label_file_list': ['s3_data/annotations/val_annotation.txt']
        },
        'loader': {
            'shuffle': False,
            'drop_last': False,
            'batch_size_per_card': 8,
            'num_workers': 2,
            'use_shared_memory': False
        },
        'transforms': build_transforms()
    }

    return config

# Build the configuration; stop the cell when no base config could be loaded.
config = create_recognition_config()

if config is None:
    raise Exception("Failed to create training configuration")

# Write the configuration to disk (original key order, Unicode written as-is).
config_save_path = "recognition_training_config.yml"
with open(config_save_path, mode='w', encoding='utf-8') as f:
    yaml.dump(
        config,
        f,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )

print(f"✅ Configuration saved: {config_save_path}")

# Show a short summary of the most relevant training settings.
print("\n📋 Training Configuration Summary:")
global_cfg = config['Global']
print(f"   📁 Output dir: {global_cfg['save_model_dir']}")
print(f"   🔄 Epochs: {global_cfg['epoch_num']}")
print(f"   📦 Batch size: {config['Train']['loader']['batch_size_per_card']}")
print(f"   🎯 Max text length: {global_cfg['max_text_length']}")
print(f"   📊 Architecture: {config['Architecture']['algorithm']}")
print(f"   🔤 Character dict: {global_cfg['character_dict_path']}")

# Check that every file the configuration points at is present on disk.
print("\n🔍 Verifying required files...")
required_files = [
    global_cfg['character_dict_path'],
    config['Train']['dataset']['label_file_list'][0],
    config['Eval']['dataset']['label_file_list'][0],
]

all_exist = True
for file_path in required_files:
    if not Path(file_path).exists():
        print(f"❌ {file_path} - Missing!")
        all_exist = False
        continue
    print(f"✅ {file_path}")

if not all_exist:
    print("\n❌ Some required files are missing!")
    print("Please check data download step")
else:
    print("\n🎉 Training configuration ready!")
    print(f"📝 Config file: {Path(config_save_path).absolute()}")

    # Publish the config path and the "created" flag for the following cells.
    TRAINING_CONFIG_PATH = config_save_path
    CONFIG_CREATED = True

## 5. Start PaddleOCR Recognition Training

เริ่มการเทรน Text Recognition model ด้วย PaddleOCR

In [None]:
# ===== TRAINING EXECUTION =====
# Launches PaddleOCR recognition training with the generated configuration.
#
# Notebook globals set on success (read by the model-testing cell):
#   TRAINING_COMPLETED - True once a training run exits with code 0
#   LATEST_MODEL_DIR   - output directory of that run

# Guard: the configuration-creation cell must have completed successfully.
if 'CONFIG_CREATED' not in globals():
    print("❌ Please run Training Configuration Creation cell first!")
    raise RuntimeError("Training config not created")

import subprocess
import sys
from datetime import datetime

print("🚀 PaddleOCR Recognition Training Execution")
print("=" * 50)


def _save_training_config(cfg, path):
    """Write ``cfg`` to ``path`` as UTF-8 YAML, keeping the key order."""
    with open(path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(
            cfg, f, default_flow_style=False, allow_unicode=True, sort_keys=False
        )


def _run_training(cmd):
    """Run ``cmd`` and echo its combined stdout/stderr line by line.

    Returns:
        int | None: The process exit code, or ``None`` when the process could
        not be started.
    """
    try:
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding='utf-8',
            errors='replace',  # never abort streaming on an undecodable byte
            bufsize=1,
        ) as process:
            # Stream output in real time so long runs show progress.
            for line in process.stdout:
                print(line.rstrip())
        # Leaving the ``with`` block waits for the process to finish.
        return process.returncode
    except (OSError, subprocess.SubprocessError) as e:
        print(f"\n❌ Training error: {e}")
        return None


# Verify the files training depends on.
config_file = "recognition_training_config.yml"
required_files = [
    config_file,
    "character_dict.txt",
    "s3_data/annotations/train_annotation.txt",
    "s3_data/annotations/val_annotation.txt"
]

missing_files = []
for file_path in required_files:
    if not Path(file_path).exists():
        missing_files.append(file_path)
    else:
        print(f"✅ {file_path}")

if missing_files:
    print(f"\n❌ Missing required files:")
    for file_path in missing_files:
        print(f"  - {file_path}")
    print("Please run previous cells first!")
    raise RuntimeError("Missing required files")

# Create a timestamped output directory so runs never overwrite each other.
output_dir = f"./output/rec_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
Path(output_dir).mkdir(parents=True, exist_ok=True)
print(f"💾 Output directory: {output_dir}")

# Point the config at this run's output directory.
with open(config_file, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

config['Global']['save_model_dir'] = output_dir
_save_training_config(config, config_file)

print(f"📋 Training Configuration:")
print(f"  ⚙️ Config: {config_file}")
print(f"  🔤 Character dict: character_dict.txt")
print(f"  🏋️ Training data: s3_data/annotations/train_annotation.txt")
print(f"  ✅ Validation data: s3_data/annotations/val_annotation.txt")
print(f"  💾 Output directory: {output_dir}")

print(f"\n🎯 Training Command:")
train_cmd = [
    # Use the kernel's own interpreter; a bare "python" on PATH may be a
    # different environment.
    sys.executable or "python", "PaddleOCR/tools/train.py",
    "-c", config_file
]
print(f"  {' '.join(train_cmd)}")

# Run options.
print(f"\n🔧 Training Options:")
print(f"1. 🚀 Quick Demo Training (3 epochs)")
print(f"2. 🏋️ Full Production Training (10 epochs)")
print(f"3. 🛠️ Custom Training (specify options)")
print(f"4. 📋 Just show command (don't run)")

choice = input("\nSelect training option (1-4): ").strip()

# Epoch count for this run; None means "do not train".
epoch_num = None

if choice == "1":
    print(f"\n🚀 Starting Quick Demo Training (3 epochs)...")
    epoch_num = 3
    config['Global']['save_epoch_step'] = 1

elif choice == "2":
    print(f"\n🏋️ Starting Full Production Training (10 epochs)...")
    # Set explicitly: an earlier demo run may have left epoch_num=3 in the
    # config file.
    epoch_num = 10

elif choice == "3":
    epochs = input("Enter number of epochs (default 10): ").strip() or "10"
    try:
        epoch_num = int(epochs)
    except ValueError:
        epoch_num = None
    if epoch_num is None or epoch_num < 1:
        print(f"❌ Invalid number of epochs: {epochs!r}. Please run this cell again.")
        epoch_num = None
    else:
        print(f"\n🛠️ Starting Custom Training ({epoch_num} epochs)...")

elif choice == "4":
    print(f"\n📋 Training Command to run manually:")
    print(f"cd {os.getcwd()}")
    print(f"{' '.join(train_cmd)}")

else:
    print(f"⚠️ Invalid choice. Please run this cell again.")

if epoch_num is not None:
    config['Global']['epoch_num'] = epoch_num
    _save_training_config(config, config_file)

    print(f"⏰ Training started at: {datetime.now()}")
    return_code = _run_training(train_cmd)

    if return_code == 0:
        print(f"\n🎉 Training completed successfully!")
        print(f"📁 Model saved in: {output_dir}")
        # Flags read by the model-testing cell.
        TRAINING_COMPLETED = True
        LATEST_MODEL_DIR = output_dir
        print(f"\n✅ Training execution completed!")
    elif return_code is not None:
        print(f"\n❌ Training failed with return code: {return_code}")

## 6. Test Trained Model

ทดสอบ Recognition model ที่เทรนเสร็จแล้วกับ sample images

In [None]:
# ===== MODEL TESTING =====
# Runs inference with the trained recognition model and optionally uploads
# the model files to S3.

import subprocess
import sys

# Check that training has finished (or that a model path was set manually).
if 'TRAINING_COMPLETED' not in globals():
    print("❌ Please run Training Execution cell first!")
    print("📋 You can also manually set model path if you have a trained model:")
    print("   LATEST_MODEL_DIR = './output/your_model_directory'")
    print("   TRAINING_COMPLETED = True")
    # To test an existing model, set LATEST_MODEL_DIR and TRAINING_COMPLETED
    # here (as shown in the messages above) and re-run this cell.

if 'TRAINING_COMPLETED' in globals() and 'LATEST_MODEL_DIR' in globals():
    print("🔬 Testing Trained Recognition Model")
    print("=" * 40)

    # Check the model directory.
    model_dir = Path(LATEST_MODEL_DIR)
    model_dir_exists = model_dir.exists()
    if not model_dir_exists:
        print(f"❌ Model directory not found: {model_dir}")
    else:
        print(f"📁 Model directory: {model_dir}")

        # Prefer the "latest" checkpoint; otherwise take the most recently
        # modified one, since glob order is arbitrary.
        latest_checkpoint = model_dir / "latest.pdparams"
        if not latest_checkpoint.exists():
            checkpoint_files = list(model_dir.glob("*.pdparams"))
            latest_checkpoint = (
                max(checkpoint_files, key=lambda p: p.stat().st_mtime)
                if checkpoint_files else None
            )

        if latest_checkpoint is not None:
            print(f"📄 Using checkpoint: {latest_checkpoint}")

            # Build the inference command. All overrides go behind ONE "-o":
            # PaddleOCR declares the option with nargs="+", so a second "-o"
            # would replace the first and drop the pretrained_model override.
            inference_cmd = [
                sys.executable or "python", "PaddleOCR/tools/infer_rec.py",
                "-c", "recognition_training_config.yml",
                "-o",
                # The weights are passed as a path prefix without ".pdparams".
                f"Global.pretrained_model={latest_checkpoint.with_suffix('')}",
                "Global.infer_img=s3_data/images/val/"
            ]

            print(f"\n🎯 Inference Command:")
            print(f"  {' '.join(inference_cmd)}")

            # Running inference is optional.
            run_inference = input("\nDo you want to run inference now? (y/n): ").strip().lower()

            if run_inference == 'y':
                print(f"\n🚀 Running inference...")
                try:
                    result = subprocess.run(inference_cmd, capture_output=True, text=True, timeout=120)

                    if result.returncode == 0:
                        print(f"✅ Inference completed successfully!")
                        print("\nOutput:")
                        print(result.stdout)
                    else:
                        print(f"❌ Inference failed:")
                        # Fall back to stdout in case the error was logged there.
                        print(result.stderr or result.stdout)

                except subprocess.TimeoutExpired:
                    print("⏰ Inference timeout - please try with fewer images")
                except OSError as e:
                    print(f"❌ Inference error: {e}")
            else:
                print("📋 You can run inference manually with the command above")
        else:
            print(f"❌ No checkpoint files found in {model_dir}")
            print("Available files:")
            for file in model_dir.iterdir():
                print(f"  📄 {file.name}")

    # Summary.
    print(f"\n📊 Training Summary:")
    print(f"  📁 Model saved in: {LATEST_MODEL_DIR}")
    print(f"  ⚙️ Config file: recognition_training_config.yml")
    print(f"  🔤 Character dict: character_dict.txt")
    print(f"  💾 S3 location: s3://{S3_BUCKET}/recognition-data/")

    # Optional upload of the trained model back to S3. Skipped when the model
    # directory is missing, because there is nothing to upload.
    if model_dir_exists:
        upload_model = input("\nDo you want to upload trained model to S3? (y/n): ").strip().lower()

        if upload_model == 'y':
            print(f"\n📤 Uploading model to S3...")

            try:
                s3 = boto3.client('s3')
                uploaded_count = 0

                # Upload the files directly inside the model directory.
                for file_path in model_dir.glob("*"):
                    if file_path.is_file():
                        s3_key = f"recognition-models/{model_dir.name}/{file_path.name}"
                        s3.upload_file(str(file_path), S3_BUCKET, s3_key)
                        print(f"  ✅ Uploaded: {s3_key}")
                        uploaded_count += 1

                if uploaded_count:
                    print(f"🎉 Model uploaded to S3 successfully!")
                    print(f"📍 S3 location: s3://{S3_BUCKET}/recognition-models/{model_dir.name}/")
                else:
                    print(f"⚠️ No files found to upload in {model_dir}")

            except Exception as e:
                # Cell-level boundary: report any upload failure to the user
                # instead of aborting with a traceback.
                print(f"❌ Upload failed: {e}")

    print(f"\n✅ Model testing and management completed!")
else:
    print("⏭️  Please run training first or manually set model path above")