# PaddleOCR Text Recognition Training on Amazon SageMaker

This notebook demonstrates how to train **Text Recognition models only** using PaddleOCR on Amazon SageMaker with GPU support.

## Key Features:
- **เฉพาะ Text Recognition** (ไม่รวม Detection)
- ใช้ `tools/train.py` script
- รูปแบบ annotation: `image_path\ttext_content`
- รองรับ CRNN, SVTR, PP-OCRv4 architectures
- S3 integration สำหรับ data และ model management

## Requirements:
- Amazon SageMaker Notebook Instance ที่รองรับ GPU
- PaddlePaddle GPU version
- การเข้าถึง S3 bucket สำหรับเก็บข้อมูลและ models

## 1. Environment Setup & GPU Check

ตรวจสอบสภาพแวดล้อมและติดตั้ง dependencies ที่จำเป็นสำหรับการเทรน Text Recognition

In [None]:
# ===== ENVIRONMENT SETUP & INITIALIZATION =====
# Run this cell once per kernel session. Later cells refuse to run until the
# ENVIRONMENT_INITIALIZED flag exists; it is set only after every step below
# has succeeded, so a failed run can simply be re-executed.

import importlib
import sys
import os
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime

# Skip the whole setup when a previous run of this cell already completed.
if 'ENVIRONMENT_INITIALIZED' in globals():
    print("‚ö†Ô∏è  Environment already initialized. Skipping setup...")
    print(f"‚úÖ S3 bucket: {S3_BUCKET}")
    print(f"‚úÖ Region: {AWS_REGION}")
else:
    print("üîß Environment Setup for PaddleOCR Recognition Training")
    print("=" * 60)

    # 1. Report the Python environment.
    print(f"üìç Python version: {sys.version}")
    print(f"üìç Working directory: {os.getcwd()}")

    # 2. Load the AWS configuration.
    try:
        with open('aws-config.json', 'r', encoding='utf-8') as f:
            aws_config = json.load(f)

        # Export the credentials so boto3 picks them up from the environment.
        credentials = aws_config['credentials']
        aws_settings = aws_config['aws_settings']

        os.environ['AWS_ACCESS_KEY_ID'] = credentials['aws_access_key_id']
        os.environ['AWS_SECRET_ACCESS_KEY'] = credentials['aws_secret_access_key']
        # A session token only exists for temporary credentials; long-lived
        # keys have none, so it is exported only when present.
        session_token = credentials.get('aws_session_token')
        if session_token:
            os.environ['AWS_SESSION_TOKEN'] = session_token
        os.environ['AWS_DEFAULT_REGION'] = aws_settings['region']

        # Global variables used by every later cell.
        S3_BUCKET = aws_settings['s3_bucket_name']
        AWS_REGION = aws_settings['region']
        SAGEMAKER_REGION = aws_settings['sagemaker_region']
        S3_DATA_PREFIX = 'recognition-data'

        print(f"‚úÖ AWS credentials loaded")
        print(f"‚úÖ S3 bucket: {S3_BUCKET}")
        print(f"‚úÖ Region: {AWS_REGION}")

    except FileNotFoundError:
        print("‚ùå aws-config.json not found! Please run setup first.")
        raise
    except Exception as e:
        print(f"‚ùå Error loading AWS config: {e}")
        raise

    # 3. Install missing dependencies.
    print("\nüì¶ Installing required packages...")
    packages_to_install = [
        "paddlepaddle-gpu", "boto3", "sagemaker",
        "opencv-python", "pillow", "numpy", "PyYAML", "tqdm"
    ]
    # The pip distribution name is not always the importable module name;
    # checking the pip name would fail and trigger a reinstall on every run.
    import_names = {
        "paddlepaddle-gpu": "paddle",
        "opencv-python": "cv2",
        "pillow": "PIL",
        "PyYAML": "yaml",
    }

    installed_any = False
    for package in packages_to_install:
        module_name = import_names.get(package, package.replace('-', '_'))
        try:
            importlib.import_module(module_name)
            print(f"  ‚úÖ {package} already installed")
        except ImportError:
            print(f"  üì¶ Installing {package}...")
            subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], check=True)
            installed_any = True

    if installed_any:
        # Make packages installed while the kernel is running importable.
        importlib.invalidate_caches()

    # 4. Import everything the notebook needs (after installation).
    try:
        import boto3
        import sagemaker
        import paddle
        import cv2
        import numpy as np
        import yaml
        from tqdm import tqdm
        print("‚úÖ All imports successful")
    except ImportError as e:
        print(f"‚ùå Import error: {e}")
        raise

    # 5. Check GPU support (informational only; never aborts the setup).
    try:
        if paddle.is_compiled_with_cuda():
            print("‚úÖ PaddlePaddle GPU support available")
            gpu_count = paddle.device.cuda.device_count()
            print(f"‚úÖ Available GPUs: {gpu_count}")
        else:
            print("‚ö†Ô∏è  PaddlePaddle CPU version detected")
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not check GPU status: {e}")

    # 6. Verify the AWS connection and S3 access.
    try:
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print(f"‚úÖ AWS connection successful")
        print(f"   Account: {identity['Account']}")

        s3 = boto3.client('s3')
        s3.head_bucket(Bucket=S3_BUCKET)
        print(f"‚úÖ S3 bucket accessible: {S3_BUCKET}")

    except Exception as e:
        print(f"‚ùå AWS connection failed: {e}")
        raise

    # Mark as initialized only now that every step above has succeeded.
    ENVIRONMENT_INITIALIZED = True

    print(f"\nüéØ Environment ready for PaddleOCR Recognition training!")
    print(f"üìÅ Data location: s3://{S3_BUCKET}/recognition-data/")
    print(f"üèãÔ∏è Training will use: {SAGEMAKER_REGION} region")
    print(f"\n‚ö†Ô∏è  Note: Other cells will check this initialization before running.")

In [None]:
# ===== PADDLEOCR REPOSITORY SETUP =====
# Clone (or update) the PaddleOCR repository and verify the files that the
# training cells rely on. Sets PADDLEOCR_READY once verification succeeds.

# Refuse to run until the environment setup cell has completed.
if 'ENVIRONMENT_INITIALIZED' not in globals():
    print("‚ùå Please run Environment Setup cell first!")
    raise RuntimeError("Environment not initialized")

print("üì• PaddleOCR Repository Setup")
print("=" * 40)

PADDLEOCR_DIR = Path("PaddleOCR")

if PADDLEOCR_DIR.exists() and 'PADDLEOCR_READY' in globals():
    # Already verified in this session; nothing to do.
    print("‚úÖ PaddleOCR already set up and ready")
    print(f"üìÅ Directory: {PADDLEOCR_DIR}")
    print(f"‚öôÔ∏è Training tools: PaddleOCR/tools/train.py")
    print(f"üìã Config files: PaddleOCR/configs/rec/")
else:
    if PADDLEOCR_DIR.exists():
        # Best-effort update: a failed pull is reported but does not abort,
        # because the existing checkout may still be usable.
        print("‚úÖ PaddleOCR directory exists, updating...")
        try:
            # cwd= runs git inside the repository without changing the
            # notebook's own working directory, so there is nothing to
            # restore if the command fails or times out.
            result = subprocess.run(["git", "pull", "origin", "main"],
                                    cwd=str(PADDLEOCR_DIR),
                                    capture_output=True, text=True, timeout=60)
            if result.returncode == 0:
                print("‚úÖ Repository updated successfully")
            else:
                print(f"‚ö†Ô∏è  Update failed: {result.stderr}")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not update: {e}")
    else:
        print("üì• Cloning PaddleOCR repository...")
        try:
            result = subprocess.run(["git", "clone",
                                     "https://github.com/PaddlePaddle/PaddleOCR.git"],
                                    capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            print("‚ùå Clone timeout - please check internet connection")
            raise
        except Exception as e:
            print(f"‚ùå Clone error: {e}")
            raise

        # Checked outside the try block so a failed clone is reported once
        # instead of also being caught and re-reported as a "Clone error".
        if result.returncode != 0:
            print(f"‚ùå Clone failed: {result.stderr}")
            raise RuntimeError("Failed to clone PaddleOCR repository")
        print("‚úÖ Repository cloned successfully")

    # Verify the parts of the repository that later cells depend on.
    if PADDLEOCR_DIR.exists():
        important_paths = [
            "PaddleOCR/configs/rec",
            "PaddleOCR/tools/train.py",
            "PaddleOCR/ppocr"
        ]

        print(f"\nüìÇ Verifying important directories and files:")
        all_paths_exist = True
        for path in important_paths:
            if Path(path).exists():
                print(f"  ‚úÖ {path}")
            else:
                print(f"  ‚ùå {path} - NOT FOUND")
                all_paths_exist = False

        if all_paths_exist:
            # List a few of the available recognition configs.
            rec_configs_dir = Path("PaddleOCR/configs/rec")
            rec_configs = list(rec_configs_dir.glob("*.yml"))
            print(f"\nüìã Available Recognition configs ({len(rec_configs)}):")
            for config in sorted(rec_configs)[:5]:  # show only the first 5
                print(f"  üìÑ {config.name}")
            if len(rec_configs) > 5:
                print(f"  ... ‡πÅ‡∏•‡∏∞‡∏≠‡∏µ‡∏Å {len(rec_configs) - 5} ‡πÑ‡∏ü‡∏•‡πå")

            # Mark as ready
            PADDLEOCR_READY = True
            print(f"\n‚úÖ PaddleOCR repository ready!")
        else:
            print(f"\n‚ùå PaddleOCR repository incomplete!")
            raise RuntimeError("PaddleOCR repository setup failed")
    else:
        print(f"‚ùå PaddleOCR directory not found after setup!")
        raise RuntimeError("PaddleOCR repository setup failed")

print(f"üìÅ Training tools: PaddleOCR/tools/train.py")
print(f"‚öôÔ∏è Config files: PaddleOCR/configs/rec/")

## 2. Download Training Data from S3

ดาวน์โหลดข้อมูลการเทรนจาก S3 รวมถึง images, annotations และ character dictionary

In [None]:
# ===== S3 DATA MANAGEMENT =====
# Relies on the globals created by the environment setup cell.

# Refuse to run until the environment setup cell has completed.
if not ('ENVIRONMENT_INITIALIZED' in globals()):
    print("‚ùå Please run Environment Setup cell first!")
    raise RuntimeError("Environment not initialized")

print("üì° S3 Data Management Setup")
print(40 * "=")

# Skip the download when this cell already ran in the current session.
if 'DATA_DOWNLOADED' in globals():
    print(
        "‚úÖ Data already downloaded and ready",
        f"üìÅ Training annotation: s3_data/annotations/train_annotation.txt",
        f"üìÅ Validation annotation: s3_data/annotations/val_annotation.txt",
        f"üìÅ Character dictionary: character_dict.txt",
        sep="\n",
    )
else:
    # Local directories that receive the downloaded data.
    data_dirs = [
        f"s3_data/{sub_dir}"
        for sub_dir in ("images/train", "images/val", "annotations", "metadata")
    ]

    for dir_path in data_dirs:
        os.makedirs(dir_path, exist_ok=True)
        print(f"üìÅ Created: {dir_path}")

    s3 = boto3.client('s3')

    def download_s3_folder(bucket, s3_prefix, local_dir):
        """Download every object stored under an S3 prefix into a local folder.

        Uses the module-level ``s3`` client and ``tqdm`` progress bar.

        Args:
            bucket: Name of the S3 bucket to read from.
            s3_prefix: Key prefix to list; it is removed from the start of
                each key to build the local relative path.
            local_dir: Local directory that receives the files.

        Returns:
            int: Number of files downloaded successfully. Objects whose
            download failed are reported and not counted.
        """
        print(f"üì• Downloading {s3_prefix} to {local_dir}...")

        # List the objects under the prefix, skipping "directory" placeholders.
        paginator = s3.get_paginator('list_objects_v2')
        files_to_download = [
            obj['Key']
            for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix)
            for obj in page.get('Contents', [])
            if not obj['Key'].endswith('/')
        ]

        print(f"üìä Found {len(files_to_download)} files to download")

        if not files_to_download:
            print("‚ö†Ô∏è  No files found to download")
            return 0

        downloaded = 0
        progress_bar = tqdm(files_to_download, desc="Downloading")
        try:
            for s3_key in progress_bar:
                # Strip only the leading prefix. str.replace would also remove
                # later occurrences of the same text inside the key.
                if s3_key.startswith(s3_prefix):
                    relative_path = s3_key[len(s3_prefix):]
                else:
                    relative_path = s3_key
                local_path = Path(local_dir) / relative_path.lstrip('/')

                # Create the target directory when needed.
                local_path.parent.mkdir(parents=True, exist_ok=True)

                try:
                    s3.download_file(bucket, s3_key, str(local_path))
                    downloaded += 1
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print(f"‚ùå Failed to download {s3_key}: {e}")
        finally:
            progress_bar.close()

        return downloaded

    # Download the annotation files, metadata and images, then summarise.
    total_downloaded = 0

    annotation_files = [
        f"{S3_DATA_PREFIX}/annotations/train_annotation.txt",
        f"{S3_DATA_PREFIX}/annotations/val_annotation.txt"
    ]
    metadata_files = [
        f"{S3_DATA_PREFIX}/metadata/character_dict.txt",
        f"{S3_DATA_PREFIX}/metadata/dataset_info.json"
    ]

    # Both groups are fetched the same way: one object at a time, mirrored
    # under s3_data/ with the data prefix removed.
    single_file_groups = [
        ("\n1Ô∏è‚É£ Downloading annotation files...", annotation_files),
        ("\n2Ô∏è‚É£ Downloading metadata files...", metadata_files),
    ]
    for heading, s3_keys in single_file_groups:
        print(heading)
        for s3_key in s3_keys:
            # Every key above starts with "<S3_DATA_PREFIX>/"; drop exactly that.
            local_path = f"s3_data/{s3_key[len(S3_DATA_PREFIX) + 1:]}"
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)
            try:
                s3.download_file(S3_BUCKET, s3_key, local_path)
                print(f"  ‚úÖ {local_path}")
                total_downloaded += 1
            except Exception as e:
                # Best effort: report and continue with the next file.
                print(f"  ‚ùå {s3_key}: {e}")

    # Copy the character dictionary to the working directory for PaddleOCR.
    if Path("s3_data/metadata/character_dict.txt").exists():
        import shutil
        shutil.copy("s3_data/metadata/character_dict.txt", "character_dict.txt")
        print("‚úÖ Character dictionary copied to root directory")

    print("\n3Ô∏è‚É£ Downloading training images...")
    train_downloaded = download_s3_folder(S3_BUCKET, f"{S3_DATA_PREFIX}/images/train/", "s3_data/images/train")

    print("\n4Ô∏è‚É£ Downloading validation images...")
    val_downloaded = download_s3_folder(S3_BUCKET, f"{S3_DATA_PREFIX}/images/val/", "s3_data/images/val")

    total_downloaded += train_downloaded + val_downloaded

    # Download summary
    print(f"\nüìä Download Summary:")
    print(f"  üì• Total files downloaded: {total_downloaded}")
    print(f"  üèãÔ∏è Training images: {train_downloaded}")
    print(f"  ‚úÖ Validation images: {val_downloaded}")

    # Count annotation lines. The files are read as UTF-8 explicitly so the
    # result does not depend on the platform's default encoding.
    if Path("s3_data/annotations/train_annotation.txt").exists():
        with open("s3_data/annotations/train_annotation.txt", 'r', encoding='utf-8') as f:
            train_lines = sum(1 for _ in f)
        print(f"  üìã Training annotations: {train_lines}")

    if Path("s3_data/annotations/val_annotation.txt").exists():
        with open("s3_data/annotations/val_annotation.txt", 'r', encoding='utf-8') as f:
            val_lines = sum(1 for _ in f)
        print(f"  üìã Validation annotations: {val_lines}")

    # Set the "already downloaded" flag only when both annotation files are
    # present. Setting it after a failed download would make every re-run of
    # this cell skip the download, leaving no way to retry.
    missing_annotations = [
        path for path in (
            "s3_data/annotations/train_annotation.txt",
            "s3_data/annotations/val_annotation.txt",
        )
        if not Path(path).exists()
    ]

    if missing_annotations:
        print(f"\n‚ùå Data download incomplete - missing: {', '.join(missing_annotations)}")
        print("Please fix the problem above and run this cell again.")
    else:
        print(f"\n‚úÖ Data download completed!")
        print(f"üìÅ Local data directory: ./s3_data/")
        print(f"üî§ Character dictionary: ./character_dict.txt")

        # Flag that the data has been downloaded.
        DATA_DOWNLOADED = True

## 3. Prepare Recognition Annotation Files

สร้างและตรวจสอบไฟล์ annotation สำหรับ Text Recognition ในรูปแบบ: `image_path\ttext_content`

In [None]:
import os
from pathlib import Path

def create_recognition_annotation_file(annotation_path, num_samples=10):
    """Write a demo annotation file for text recognition.

    Each line has the form ``image_path<TAB>text_content``. Image paths are
    ``recognition_images/word_NNN.jpg`` and the labels cycle through a fixed
    list of sample texts. Lines are joined with newlines and no trailing
    newline is written.

    Args:
        annotation_path: Path of the file to create (overwritten if present).
        num_samples: Number of annotation lines to write.

    Returns:
        The ``annotation_path`` that was passed in.
    """
    # Sample labels, reused cyclically when num_samples exceeds their count.
    sample_texts = (
        "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ‡∏Ñ‡∏£‡∏±‡∏ö", "PaddleOCR", "Text Recognition", "1234567890",
        "Hello World", "‡∏¢‡∏¥‡∏ô‡∏î‡∏µ‡∏ï‡πâ‡∏≠‡∏ô‡∏£‡∏±‡∏ö", "Ê∑±Â∫¶Â≠¶‰π†", "Machine Learning",
        "Amazon SageMaker", "‡∏Å‡∏≤‡∏£‡∏£‡∏π‡πâ‡∏à‡∏≥‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£",
    )
    label_count = len(sample_texts)

    # One "image_path<TAB>text_content" row per sample.
    rows = [
        "recognition_images/word_{:03d}.jpg\t{}".format(index, sample_texts[index % label_count])
        for index in range(num_samples)
    ]

    with open(annotation_path, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(rows))

    print(f"‚úÖ Created recognition annotation file: {annotation_path}")
    return annotation_path

# ===== ANNOTATION FILE VALIDATION =====
# Check and display the annotation files that were downloaded.

# Refuse to run until the S3 data management cell has set its flag.
if not ('DATA_DOWNLOADED' in globals()):
    print("‚ùå Please run S3 Data Management cell first!")
    raise RuntimeError("Data not downloaded")

print("üìã Recognition Annotation Validation")
print(40 * "=")

def validate_recognition_annotation_format(annotation_file):
    """Check that an annotation file follows ``image_path<TAB>text_content``.

    Blank lines are ignored. Each remaining line is stripped of surrounding
    whitespace and must split on tabs into exactly two parts, the second of
    which must be non-empty. Validation stops at the first bad line. The
    first three line numbers are echoed as samples when they are valid.

    Args:
        annotation_file: Path of the UTF-8 annotation file to check.

    Returns:
        tuple: ``(True, valid_line_count)`` when every non-blank line is
        valid, otherwise ``(False, 0)``. Any exception (for example a missing
        file) is reported and also yields ``(False, 0)``.
    """
    failure = (False, 0)

    try:
        with open(annotation_file, 'r', encoding='utf-8') as handle:
            raw_lines = list(handle)

        print(f"üìÑ File: {annotation_file}")
        print(f"üìä Total lines: {len(raw_lines)}")

        accepted = 0
        for number, raw in enumerate(raw_lines, start=1):
            record = raw.strip()
            if record == "":
                continue

            # Split on the tab separator.
            fields = record.split('\t')
            field_count = len(fields)
            if field_count != 2:
                print(f"‚ùå Error at line {number}: Expected 2 parts separated by tab, got {field_count}")
                print(f"   Line content: {record!r}")
                return failure

            image_path, text_content = fields

            # The label must contain text.
            if text_content.strip() == "":
                print(f"‚ùå Error at line {number}: Empty text content")
                return failure

            accepted += 1

            if number < 4:  # echo the first 3 line numbers as samples
                print(f"   Line {number}: {image_path} -> {text_content}")

        print(f"‚úÖ Valid annotation lines: {accepted}")
        return True, accepted

    except Exception as exc:
        print(f"‚ùå Error validating annotation file: {exc}")
        return failure

# Validate the downloaded annotation files and report the dictionary size.
train_annotation_file = "s3_data/annotations/train_annotation.txt"
val_annotation_file = "s3_data/annotations/val_annotation.txt"

print("üîç Validating downloaded annotation files...")

# Training annotations
if Path(train_annotation_file).exists():
    train_valid, train_count = validate_recognition_annotation_format(train_annotation_file)
else:
    print(f"‚ùå Training annotation file not found: {train_annotation_file}")
    train_valid, train_count = False, 0

print()

# Validation annotations
if Path(val_annotation_file).exists():
    val_valid, val_count = validate_recognition_annotation_format(val_annotation_file)
else:
    print(f"‚ùå Validation annotation file not found: {val_annotation_file}")
    val_valid, val_count = False, 0

if train_valid and val_valid:
    print(f"\n‚úÖ All annotation files are valid!")
    print(f"üìä Training samples: {train_count}")
    print(f"üìä Validation samples: {val_count}")
    print(f"üìä Total samples: {train_count + val_count}")

    # Character dictionary summary
    char_dict_file = "character_dict.txt"
    if Path(char_dict_file).exists():
        with open(char_dict_file, 'r', encoding='utf-8') as f:
            # Count dictionary entries rather than the length of the raw file
            # text, which would count every newline as a character.
            # NOTE(review): assumes one entry per line (the PaddleOCR
            # dictionary layout) - confirm against the actual file.
            # Only line terminators are removed so a space entry survives.
            chars = [line.rstrip('\r\n') for line in f]
        chars = [entry for entry in chars if entry]
        print(f"üî§ Character dictionary: {len(chars)} characters")
        print(f"   Characters: {''.join(chars[:50])}{'...' if len(chars) > 50 else ''}")
    else:
        print(f"‚ö†Ô∏è  Character dictionary not found: {char_dict_file}")

    # Flag that the annotation files are ready for the following cells.
    ANNOTATIONS_VALIDATED = True
else:
    print(f"\n‚ùå Annotation validation failed!")
    raise RuntimeError("Invalid annotation files")

# Create sample annotation files (demo data) and validate them.
# NOTE(review): LOCAL_RECOGNITION_DATA_DIR is not assigned in the part of the
# notebook visible here, so referencing it directly raises NameError. It is
# used when defined; otherwise a local fallback directory is used - confirm
# the intended location.
sample_data_dir = globals().get("LOCAL_RECOGNITION_DATA_DIR", "sample_recognition_data")
# open(..., 'w') does not create missing parent directories.
os.makedirs(sample_data_dir, exist_ok=True)

# NOTE: these assignments replace the paths of the downloaded annotation
# files held in the same variables.
train_annotation_file = os.path.join(sample_data_dir, "train_recognition.txt")
val_annotation_file = os.path.join(sample_data_dir, "val_recognition.txt")

print("üìù Creating sample recognition annotation files...")
create_recognition_annotation_file(train_annotation_file, 20)
create_recognition_annotation_file(val_annotation_file, 5)

print("\nüîç Validating annotation files...")
validate_recognition_annotation_format(train_annotation_file)
validate_recognition_annotation_format(val_annotation_file)

# Launch PaddleOCR recognition training.
# Checks the required files, points the config at a fresh timestamped output
# directory, asks which training mode to use and runs tools/train.py.
# NOTE(review): this cell reads recognition_training_config.yml, while the
# configuration cell further down in this file saves
# train_recognition_config.yml - confirm which cell produces the file used here.
import subprocess
import sys
from datetime import datetime

import yaml

print("üöÄ Starting PaddleOCR Recognition Training")
print("=" * 50)


def _save_training_config(config):
    """Write the training config back to recognition_training_config.yml as UTF-8."""
    # allow_unicode=True emits non-ASCII text verbatim, so the file must be
    # opened with an explicit UTF-8 encoding.
    with open("recognition_training_config.yml", 'w', encoding='utf-8') as f:
        yaml.safe_dump(config, f, default_flow_style=False, allow_unicode=True)


def _run_training(train_cmd, output_dir):
    """Run the training command, echoing its combined stdout/stderr line by line."""
    print(f"‚è∞ Training started at: {datetime.now()}")
    try:
        with subprocess.Popen(
            train_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            bufsize=1,
        ) as process:
            # Stream the output so progress is visible during long runs and
            # nothing has to be buffered in memory.
            for line in process.stdout:
                print(line.rstrip())

        # Leaving the with-block waits for the process to finish.
        if process.returncode == 0:
            print(f"\nüéâ Training completed successfully!")
            print(f"üìÅ Model saved in: {output_dir}")
        else:
            print(f"\n‚ùå Training failed with return code: {process.returncode}")

    except Exception as e:
        print(f"\n‚ùå Training error: {e}")


# Files that must exist before training can start.
required_files = [
    "recognition_training_config.yml",
    "character_dict.txt",
    "s3_data/annotations/train_annotation.txt",
    "s3_data/annotations/val_annotation.txt"
]

missing_files = []
for file_path in required_files:
    if not Path(file_path).exists():
        missing_files.append(file_path)
    else:
        print(f"‚úÖ {file_path}")

if missing_files:
    print(f"\n‚ùå Missing required files:")
    for file_path in missing_files:
        print(f"  - {file_path}")
    print("Please run previous cells first!")
else:
    print(f"\nüìã Training Configuration:")
    print(f"  ‚öôÔ∏è Config: recognition_training_config.yml")
    print(f"  üî§ Character dict: character_dict.txt")
    print(f"  üèãÔ∏è Training data: s3_data/annotations/train_annotation.txt")
    print(f"  ‚úÖ Validation data: s3_data/annotations/val_annotation.txt")

    # Create a timestamped output directory for this run.
    output_dir = f"./output/rec_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    print(f"  üíæ Output directory: {output_dir}")

    # Point the config at the new output directory.
    with open("recognition_training_config.yml", 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    config['Global']['save_model_dir'] = output_dir
    _save_training_config(config)

    print(f"\nüéØ Training Command:")
    train_cmd = [
        # sys.executable guarantees the same interpreter (and installed
        # packages) as the notebook kernel; a bare "python" may resolve to a
        # different environment.
        sys.executable, "PaddleOCR/tools/train.py",
        "-c", "recognition_training_config.yml"
    ]
    print(f"  {' '.join(train_cmd)}")

    # Training modes
    print(f"\nüîß Training Options:")
    print(f"1. üöÄ Quick Demo Training (3 epochs)")
    print(f"2. üèãÔ∏è Full Production Training (10 epochs)")
    print(f"3. üõ†Ô∏è Custom Training (specify options)")
    print(f"4. üìã Just show command (don't run)")

    choice = input("\nSelect training option (1-4): ").strip()

    if choice == "1":
        print(f"\nüöÄ Starting Quick Demo Training (3 epochs)...")
        # Shorten the run and checkpoint after every epoch for the demo.
        config['Global']['epoch_num'] = 3
        config['Global']['save_epoch_step'] = 1
        _save_training_config(config)
        _run_training(train_cmd, output_dir)

    elif choice == "2":
        print(f"\nüèãÔ∏è Starting Full Production Training (10 epochs)...")
        # Set the epoch count explicitly: the config file is rewritten in
        # place, so a previous demo run would otherwise leave it at 3.
        config['Global']['epoch_num'] = 10
        _save_training_config(config)
        _run_training(train_cmd, output_dir)

    elif choice == "3":
        epochs = input("Enter number of epochs (default 10): ").strip() or "10"
        try:
            epoch_count = int(epochs)
        except ValueError:
            epoch_count = 0  # treated as invalid below

        if epoch_count < 1:
            print(f"‚ö†Ô∏è Invalid number of epochs: {epochs!r}. Please run this cell again.")
        else:
            config['Global']['epoch_num'] = epoch_count
            _save_training_config(config)

            print(f"\nüõ†Ô∏è Starting Custom Training ({epochs} epochs)...")
            _run_training(train_cmd, output_dir)

    elif choice == "4":
        print(f"\nüìã Training Command to run manually:")
        print(f"cd {os.getcwd()}")
        print(f"{' '.join(train_cmd)}")

    else:
        print(f"‚ö†Ô∏è Invalid choice. Please run this cell again.")

print(f"\n‚úÖ Training setup completed!")

# ‡∏™‡∏£‡πâ‡∏≤‡∏á Training Configuration ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö PaddleOCR Recognition
import yaml
from pathlib import Path
import shutil

def create_recognition_config():
    """Build the recognition training configuration from a PaddleOCR base config.

    Loads ``rec_mv3_none_bilstm_ctc.yml``, then overrides the Global settings
    and replaces the Architecture, Loss, Optimizer, PostProcess, Metric, Train
    and Eval sections. Data paths, epochs, batch size and max text length come
    from the module-level ``PROJECT_CONFIG`` (not defined in this part of the
    file; it must exist before the function is called).

    NOTE: a second function with the same name is defined further down in
    this file.

    Returns:
        dict | None: The configuration dictionary, or ``None`` when the base
        config file cannot be found.
    """

    # The other cells of this notebook address the repository as
    # "PaddleOCR/..." relative to the notebook directory, so that location is
    # tried as well. The original relative path stays first for callers that
    # run from inside the repository.
    candidate_paths = [
        "configs/rec/rec_mv3_none_bilstm_ctc.yml",
        "PaddleOCR/configs/rec/rec_mv3_none_bilstm_ctc.yml",
    ]
    base_config_path = next((p for p in candidate_paths if Path(p).exists()), None)

    if base_config_path is None:
        print(f"‚ùå Base config not found: {candidate_paths[0]}")
        return None

    print(f"üìã Loading base config: {base_config_path}")

    with open(base_config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Project-specific values
    LOCAL_DATA_DIR = PROJECT_CONFIG['LOCAL_DATA_DIR']
    training_settings = PROJECT_CONFIG['TRAINING_CONFIG']

    def _rec_transforms():
        """Return a fresh transform pipeline (shared by Train and Eval)."""
        # A new list on every call: sharing one object between the two
        # sections would make yaml.dump emit anchors/aliases instead of two
        # plain lists.
        return [
            {'DecodeImage': {'img_mode': 'BGR', 'channel_first': False}},
            {'CTCLabelEncode': None},
            {'RecResizeImg': {'image_shape': [3, 32, 320]}},
            {'KeepKeys': {'keep_keys': ['image', 'label', 'length']}}
        ]

    # Global settings
    config['Global'].update({
        'epoch_num': training_settings['epochs'],
        'log_smooth_window': 20,
        'print_batch_step': 10,
        'save_model_dir': './output/rec_mv3_ctc',
        'save_epoch_step': 10,
        'eval_batch_step': [0, 500],
        'cal_metric_during_training': True,
        'pretrained_model': None,
        'checkpoints': None,
        'use_visualdl': True,
        'infer_img': f"{LOCAL_DATA_DIR}/images/val",
        'character_dict_path': f"{LOCAL_DATA_DIR}/metadata/character_dict.txt",
        'character_type': 'ch',
        'max_text_length': training_settings['max_text_length'],
        'use_space_char': False,
        'save_res_path': './output/rec/predicts_rec.txt'
    })

    # Architecture settings
    config['Architecture'] = {
        'model_type': 'rec',
        'algorithm': 'CRNN',
        'Transform': None,
        'Backbone': {
            'name': 'MobileNetV3',
            'scale': 0.5,
            'model_name': 'small'
        },
        'Neck': {
            'name': 'SequenceEncoder',
            'encoder_type': 'rnn',
            'hidden_size': 48
        },
        'Head': {
            'name': 'CTCHead',
            'fc_decay': 0.00001
        }
    }

    # Loss settings
    config['Loss'] = {
        'name': 'CTCLoss'
    }

    # Optimizer settings
    config['Optimizer'] = {
        'name': 'Adam',
        'beta1': 0.9,
        'beta2': 0.999,
        'lr': {
            'name': 'Piecewise',
            'decay_epochs': [10, 20],
            'values': [0.001, 0.0001, 0.00001]
        },
        'regularizer': {
            'name': 'L2',
            'factor': 0.00001
        }
    }

    # PostProcess settings
    config['PostProcess'] = {
        'name': 'CTCLabelDecode'
    }

    # Metric settings
    config['Metric'] = {
        'name': 'RecMetric',
        'main_indicator': 'acc'
    }

    # Training dataset
    config['Train'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': f"{LOCAL_DATA_DIR}/images/train",
            'label_file_list': [f"{LOCAL_DATA_DIR}/annotations/train_annotation.txt"]
        },
        'loader': {
            'shuffle': True,
            'batch_size_per_card': training_settings['batch_size'],
            'drop_last': True,
            'num_workers': 4,
            'use_shared_memory': False
        },
        'transforms': _rec_transforms()
    }

    # Evaluation dataset
    config['Eval'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': f"{LOCAL_DATA_DIR}/images/val",
            'label_file_list': [f"{LOCAL_DATA_DIR}/annotations/val_annotation.txt"]
        },
        'loader': {
            'shuffle': False,
            'drop_last': False,
            'batch_size_per_card': training_settings['batch_size'],
            'num_workers': 4,
            'use_shared_memory': False
        },
        'transforms': _rec_transforms()
    }

    return config

print("‚öôÔ∏è  Creating Training Configuration...")
print(50 * "=")

# Build the configuration; abort when the base config could not be loaded.
config = create_recognition_config()
if config is None:
    raise Exception("Failed to create training configuration")

# Save the configuration as YAML, keeping the section order as built.
config_save_path = "train_recognition_config.yml"
with open(config_save_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"‚úÖ Configuration saved: {config_save_path}")

# Summary of the most important settings
print(
    f"\nüìã Training Configuration Summary:",
    f"   üìÅ Output dir: {config['Global']['save_model_dir']}",
    f"   üîÑ Epochs: {config['Global']['epoch_num']}",
    f"   üì¶ Batch size: {config['Train']['loader']['batch_size_per_card']}",
    f"   üéØ Max text length: {config['Global']['max_text_length']}",
    f"   üìä Architecture: {config['Architecture']['algorithm']}",
    f"   üî§ Character dict: {config['Global']['character_dict_path']}",
    sep="\n",
)

# Check that the files referenced by the configuration exist.
print(f"\nüîç Verifying required files...")
required_files = [
    config['Global']['character_dict_path'],
    config['Train']['dataset']['label_file_list'][0],
    config['Eval']['dataset']['label_file_list'][0]
]

all_exist = True
for file_path in required_files:
    if not Path(file_path).exists():
        print(f"‚ùå {file_path} - Missing!")
        all_exist = False
        continue
    print(f"‚úÖ {file_path}")

if not all_exist:
    print(f"\n‚ùå Some required files are missing!")
    print("Please check data download step")
else:
    print(f"\nüéâ Training configuration ready!")
    print(f"üìù Config file: {Path(config_save_path).absolute()}")

# Keep the config path for the following steps.
TRAINING_CONFIG_PATH = config_save_path
print(f"\nüí° Use this config for training: {TRAINING_CONFIG_PATH}")

## 4. Create Training Configuration

สร้าง configuration file สำหรับการเทรน PaddleOCR Recognition model

In [None]:
# ===== TRAINING CONFIGURATION CREATION =====
# Build the configuration file used to train the PaddleOCR recognition model.

# Refuse to run until the annotation validation cell has set its flag.
if not ('ANNOTATIONS_VALIDATED' in globals()):
    print("‚ùå Please run Annotation Validation cell first!")
    raise RuntimeError("Annotations not validated")

print("‚öôÔ∏è Creating PaddleOCR Recognition Training Configuration")
print(55 * "=")

def create_recognition_config():
    """Build the PaddleOCR text-recognition training configuration.

    A base recognition config from the local PaddleOCR checkout is loaded
    (the PP-OCRv3 English config first, then the generic
    ``rec_mv3_none_bilstm_ctc.yml``) and its sections are overridden with a
    small CRNN + CTC setup that points at the local ``s3_data`` dataset.

    Returns:
        dict | None: The configuration mapping, or ``None`` when neither base
        config exists (the configs that are available are printed first).
    """

    def _rec_transforms():
        # Build a fresh pipeline on every call: sharing one list between the
        # Train and Eval sections would make yaml.dump emit anchors/aliases.
        return [
            {'DecodeImage': {'img_mode': 'BGR', 'channel_first': False}},
            {'CTCLabelEncode': None},
            {'RecResizeImg': {'image_shape': [3, 32, 320]}},
            {'KeepKeys': {'keep_keys': ['image', 'label', 'length']}}
        ]

    # Base configs to try, in order of preference.
    candidate_paths = (
        "PaddleOCR/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml",
        "PaddleOCR/configs/rec/rec_mv3_none_bilstm_ctc.yml",
    )
    base_config_path = next(
        (path for path in candidate_paths if Path(path).exists()), None
    )

    if base_config_path is None:
        print(f"‚ùå Base config not found: {candidate_paths[-1]}")
        print("Available configs:")
        config_dir = Path("PaddleOCR/configs/rec")
        if config_dir.exists():
            # Recurse: most recognition configs live in sub-folders
            # (e.g. PP-OCRv3/), which a flat glob would not show.
            for config_file in sorted(config_dir.rglob("*.yml")):
                print(f"  üìÑ {config_file}")
        return None

    print(f"üìã Loading base config: {base_config_path}")

    with open(base_config_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; fall back to an empty mapping.
        config = yaml.safe_load(f) or {}

    # Override the base configuration for our data.

    # Global settings (tolerate a base config without a Global section).
    global_settings = config.get('Global') or {}
    global_settings.update({
        'epoch_num': 10,  # number of training epochs
        'log_smooth_window': 20,
        'print_batch_step': 10,
        'save_model_dir': './output/rec_training',
        'save_epoch_step': 5,
        'eval_batch_step': [0, 500],
        'cal_metric_during_training': True,
        'pretrained_model': None,
        'checkpoints': None,
        'use_visualdl': True,
        'character_dict_path': 'character_dict.txt',
        'character_type': 'en',  # used for digits 0-9
        'max_text_length': 10,   # maximum length of a text label
        'use_space_char': False,
        'save_res_path': './output/rec/predicts_rec.txt'
    })
    config['Global'] = global_settings

    # Architecture settings (CRNN for recognition)
    config['Architecture'] = {
        'model_type': 'rec',
        'algorithm': 'CRNN',
        'Transform': None,
        'Backbone': {
            'name': 'MobileNetV3',
            'scale': 0.5,
            'model_name': 'small'
        },
        'Neck': {
            'name': 'SequenceEncoder',
            'encoder_type': 'rnn',
            'hidden_size': 48
        },
        'Head': {
            'name': 'CTCHead',
            'fc_decay': 0.00001
        }
    }

    # Loss settings
    config['Loss'] = {
        'name': 'CTCLoss'
    }

    # Optimizer settings
    config['Optimizer'] = {
        'name': 'Adam',
        'beta1': 0.9,
        'beta2': 0.999,
        'lr': {
            'name': 'Piecewise',
            'decay_epochs': [5, 8],
            'values': [0.001, 0.0001, 0.00001]
        },
        'regularizer': {
            'name': 'L2',
            'factor': 0.00001
        }
    }

    # PostProcess settings
    config['PostProcess'] = {
        'name': 'CTCLabelDecode'
    }

    # Metric settings
    config['Metric'] = {
        'name': 'RecMetric',
        'main_indicator': 'acc'
    }

    # Training dataset.
    # PaddleOCR reads the transform pipeline from <mode>.dataset.transforms,
    # so it must be nested inside 'dataset', not placed beside it.
    config['Train'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': 's3_data/images/train',
            'label_file_list': ['s3_data/annotations/train_annotation.txt'],
            'transforms': _rec_transforms()
        },
        'loader': {
            'shuffle': True,
            'batch_size_per_card': 8,  # small batch size for the demo
            'drop_last': True,
            'num_workers': 2,
            'use_shared_memory': False
        }
    }

    # Evaluation dataset (same pipeline, nested under 'dataset' as well)
    config['Eval'] = {
        'dataset': {
            'name': 'SimpleDataSet',
            'data_dir': 's3_data/images/val',
            'label_file_list': ['s3_data/annotations/val_annotation.txt'],
            'transforms': _rec_transforms()
        },
        'loader': {
            'shuffle': False,
            'drop_last': False,
            'batch_size_per_card': 8,
            'num_workers': 2,
            'use_shared_memory': False
        }
    }

    return config

# Build the config; stop the cell when no base config could be loaded.
config = create_recognition_config()

if config is None:
    # RuntimeError matches the guards used by the other notebook cells.
    raise RuntimeError("Failed to create training configuration")

# Save the config file.
config_save_path = "recognition_training_config.yml"
with open(config_save_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"‚úÖ Configuration saved: {config_save_path}")

# Show a summary of the configuration.
print(f"\nüìã Training Configuration Summary:")
print(f"   üìÅ Output dir: {config['Global']['save_model_dir']}")
print(f"   üîÑ Epochs: {config['Global']['epoch_num']}")
print(f"   üì¶ Batch size: {config['Train']['loader']['batch_size_per_card']}")
print(f"   üéØ Max text length: {config['Global']['max_text_length']}")
print(f"   üìä Architecture: {config['Architecture']['algorithm']}")
print(f"   üî§ Character dict: {config['Global']['character_dict_path']}")

# Check that the files referenced by the config exist on disk.
print(f"\nüîç Verifying required files...")
required_files = [
    config['Global']['character_dict_path'],
    config['Train']['dataset']['label_file_list'][0],
    config['Eval']['dataset']['label_file_list'][0]
]

all_exist = True
for file_path in required_files:
    if Path(file_path).exists():
        print(f"‚úÖ {file_path}")
    else:
        print(f"‚ùå {file_path} - Missing!")
        all_exist = False

if all_exist:
    print(f"\nüéâ Training configuration ready!")
    print(f"üìù Config file: {Path(config_save_path).absolute()}")

    # Keep the config path and mark the step as done for the next cells.
    TRAINING_CONFIG_PATH = config_save_path
    CONFIG_CREATED = True
else:
    print(f"\n‚ùå Some required files are missing!")
    print("Please check data download step")

    # Drop flags left over from an earlier successful run of this cell so the
    # training cell's CONFIG_CREATED guard does not pass on stale state.
    globals().pop('CONFIG_CREATED', None)
    globals().pop('TRAINING_CONFIG_PATH', None)

## 5. Start PaddleOCR Recognition Training

เริ่มการเทรน Text Recognition model ด้วย PaddleOCR

In [None]:
# ===== TRAINING EXECUTION =====
# Run PaddleOCR recognition training as a subprocess of this notebook and
# stream its log output into the cell.

# Guard: the configuration cell must have finished successfully.
if 'CONFIG_CREATED' not in globals():
    print("‚ùå Please run Training Configuration Creation cell first!")
    raise RuntimeError("Training config not created")

import subprocess
import sys
from datetime import datetime

print("üöÄ PaddleOCR Recognition Training Execution")
print("=" * 50)

config_file = "recognition_training_config.yml"


def _write_training_config(cfg, path):
    """Write *cfg* to *path* as UTF-8 YAML, keeping the existing key order."""
    with open(path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(cfg, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def _run_training(cmd):
    """Run *cmd* and echo its combined stdout/stderr line by line.

    Returns:
        int | None: The exit code of the process, or ``None`` when the
        process could not be started.
    """
    try:
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            errors='replace',  # never abort the stream on undecodable bytes
            bufsize=1,
        ) as process:
            for line in process.stdout:
                # rstrip only: keep the indentation of the training log.
                print(line.rstrip())
        # Leaving the ``with`` block waits for the process to exit.
        return process.returncode
    except OSError as e:
        print(f"\n‚ùå Training error: {e}")
        return None


# Check that the required files exist.
required_files = [
    config_file,
    "character_dict.txt",
    "s3_data/annotations/train_annotation.txt",
    "s3_data/annotations/val_annotation.txt"
]

missing_files = []
for file_path in required_files:
    if not Path(file_path).exists():
        missing_files.append(file_path)
    else:
        print(f"‚úÖ {file_path}")

if missing_files:
    print(f"\n‚ùå Missing required files:")
    for file_path in missing_files:
        print(f"  - {file_path}")
    print("Please run previous cells first!")
    raise RuntimeError("Missing required files")

# Create a timestamped output directory for this run.
output_dir = f"./output/rec_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
Path(output_dir).mkdir(parents=True, exist_ok=True)
print(f"üíæ Output directory: {output_dir}")

# Point the saved config at this run's output directory.
with open(config_file, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

config['Global']['save_model_dir'] = output_dir
_write_training_config(config, config_file)

print(f"üìã Training Configuration:")
print(f"  ‚öôÔ∏è Config: recognition_training_config.yml")
print(f"  üî§ Character dict: character_dict.txt")
print(f"  üèãÔ∏è Training data: s3_data/annotations/train_annotation.txt")
print(f"  ‚úÖ Validation data: s3_data/annotations/val_annotation.txt")
print(f"  üíæ Output directory: {output_dir}")

print(f"\nüéØ Training Command:")
# sys.executable keeps training in the same interpreter/environment as the
# notebook kernel; a bare "python" may resolve to a different installation.
train_cmd = [
    sys.executable, "PaddleOCR/tools/train.py",
    "-c", config_file
]
print(f"  {' '.join(train_cmd)}")

# Run options
print(f"\nüîß Training Options:")
print(f"1. üöÄ Quick Demo Training (3 epochs)")
print(f"2. üèãÔ∏è Full Production Training (10 epochs)")
print(f"3. üõ†Ô∏è Custom Training (specify options)")
print(f"4. üìã Just show command (don't run)")

choice = input("\nSelect training option (1-4): ").strip()

start_training = False

if choice == "1":
    print(f"\nüöÄ Starting Quick Demo Training (3 epochs)...")
    # Shorten the run for the demo and save a checkpoint every epoch.
    config['Global']['epoch_num'] = 3
    config['Global']['save_epoch_step'] = 1
    start_training = True

elif choice == "2":
    print(f"\nüèãÔ∏è Starting Full Production Training (10 epochs)...")
    # Restore the full-run values explicitly: an earlier demo run rewrites
    # the config file on disk with 3 epochs, which would otherwise be reused.
    config['Global']['epoch_num'] = 10
    config['Global']['save_epoch_step'] = 5
    start_training = True

elif choice == "3":
    epochs_text = input("Enter number of epochs (default 10): ").strip() or "10"
    try:
        epochs = int(epochs_text)
    except ValueError:
        epochs = 0
    if epochs > 0:
        config['Global']['epoch_num'] = epochs
        print(f"\nüõ†Ô∏è Starting Custom Training ({epochs} epochs)...")
        start_training = True
    else:
        print(f"‚ùå Invalid number of epochs: {epochs_text!r} (expected a positive integer)")

elif choice == "4":
    print(f"\nüìã Training Command to run manually:")
    print(f"cd {os.getcwd()}")
    print(f"{' '.join(train_cmd)}")

else:
    print(f"‚ö†Ô∏è Invalid choice. Please run this cell again.")

if start_training:
    _write_training_config(config, config_file)
    print(f"‚è∞ Training started at: {datetime.now()}")

    return_code = _run_training(train_cmd)

    if return_code == 0:
        print(f"\nüéâ Training completed successfully!")
        print(f"üìÅ Model saved in: {output_dir}")
        # Flags read by the model testing cell.
        TRAINING_COMPLETED = True
        LATEST_MODEL_DIR = output_dir
        print(f"\n‚úÖ Training execution completed!")
    elif return_code is not None:
        print(f"\n‚ùå Training failed with return code: {return_code}")

## 6. Test Trained Model

ทดสอบ Recognition model ที่เทรนเสร็จแล้วกับ sample images

In [None]:
# ===== MODEL TESTING =====
# Run the trained recognition model on the validation images and optionally
# upload the files in the model directory to S3.

import subprocess
import sys

# Check that training has finished (or that a model path was set manually).
if 'TRAINING_COMPLETED' not in globals():
    print("‚ùå Please run Training Execution cell first!")
    print("üìã You can also manually set model path if you have a trained model:")
    print("   LATEST_MODEL_DIR = './output/your_model_directory'")
    print("   TRAINING_COMPLETED = True")
    # Uncomment line below and set correct path if you have a trained model
    # LATEST_MODEL_DIR = './output/rec_training_20250101_120000'
    # TRAINING_COMPLETED = True

if 'TRAINING_COMPLETED' in globals() and 'LATEST_MODEL_DIR' in globals():
    print("üî¨ Testing Trained Recognition Model")
    print("=" * 40)

    # Check the model directory.
    model_dir = Path(LATEST_MODEL_DIR)
    model_dir_found = model_dir.is_dir()
    if not model_dir_found:
        print(f"‚ùå Model directory not found: {model_dir}")
    else:
        print(f"üìÅ Model directory: {model_dir}")

        # Pick the checkpoint: latest.pdparams when present, otherwise the
        # most recently modified *.pdparams file (glob order is arbitrary).
        latest_checkpoint = model_dir / "latest.pdparams"
        if not latest_checkpoint.is_file():
            latest_checkpoint = max(
                model_dir.glob("*.pdparams"),
                key=lambda p: p.stat().st_mtime,
                default=None,
            )

        if latest_checkpoint is not None:
            print(f"üìÑ Using checkpoint: {latest_checkpoint}")

            # Build the inference command. Both overrides follow a single
            # "-o": the option takes a list of values, so a second "-o"
            # would replace the first and drop Global.pretrained_model.
            inference_cmd = [
                sys.executable, "PaddleOCR/tools/infer_rec.py",
                "-c", "recognition_training_config.yml",
                "-o",
                f"Global.pretrained_model={latest_checkpoint.with_suffix('')}",
                "Global.infer_img=s3_data/images/val/"
            ]

            print(f"\nüéØ Inference Command:")
            print(f"  {' '.join(inference_cmd)}")

            # Running inference is optional; ask the user.
            run_inference = input("\nDo you want to run inference now? (y/n): ").strip().lower()

            if run_inference == 'y':
                print(f"\nüöÄ Running inference...")
                try:
                    result = subprocess.run(inference_cmd, capture_output=True, text=True, timeout=120)

                    if result.returncode == 0:
                        print(f"‚úÖ Inference completed successfully!")
                        print("\nOutput:")
                        print(result.stdout)
                    else:
                        print(f"‚ùå Inference failed:")
                        # Fall back to stdout when nothing was written to stderr.
                        print(result.stderr or result.stdout)

                except subprocess.TimeoutExpired:
                    print("‚è∞ Inference timeout - please try with fewer images")
                except Exception as e:
                    print(f"‚ùå Inference error: {e}")
            else:
                print("üìã You can run inference manually with the command above")
        else:
            print(f"‚ùå No checkpoint files found in {model_dir}")
            print("Available files:")
            for file in model_dir.iterdir():
                print(f"  üìÑ {file.name}")

    # Show the summary.
    print(f"\nüìä Training Summary:")
    print(f"  üìÅ Model saved in: {LATEST_MODEL_DIR}")
    print(f"  ‚öôÔ∏è Config file: recognition_training_config.yml")
    print(f"  üî§ Character dict: character_dict.txt")
    print(f"  üíæ S3 location: s3://{S3_BUCKET}/recognition-data/")

    if not model_dir_found:
        # Nothing to upload: do not prompt or report a successful upload.
        print(f"\n‚ùå Skipping S3 upload - model directory not found: {model_dir}")
    else:
        # Optionally upload the model files back to S3.
        upload_model = input("\nDo you want to upload trained model to S3? (y/n): ").strip().lower()

        if upload_model == 'y':
            print(f"\nüì§ Uploading model to S3...")

            try:
                # Client creation is inside the try so credential or
                # configuration errors are reported like upload errors.
                s3 = boto3.client('s3')

                # Upload the files directly inside the model directory.
                uploaded_count = 0
                for file_path in sorted(model_dir.iterdir()):
                    if file_path.is_file():
                        s3_key = f"recognition-models/{model_dir.name}/{file_path.name}"
                        s3.upload_file(str(file_path), S3_BUCKET, s3_key)
                        print(f"  ‚úÖ Uploaded: {s3_key}")
                        uploaded_count += 1

                if uploaded_count:
                    print(f"üéâ Model uploaded to S3 successfully!")
                    print(f"üìç S3 location: s3://{S3_BUCKET}/recognition-models/{model_dir.name}/")
                else:
                    print(f"‚ùå No files found to upload in {model_dir}")

            except Exception as e:
                print(f"‚ùå Upload failed: {e}")

    print(f"\n‚úÖ Model testing and management completed!")
else:
    print("‚è≠Ô∏è  Please run training first or manually set model path above")