# Thai OCR Dataset Converter for EasyOCR

**คำอธิบาย:**
Notebook นี้จะแปลงข้อมูล Thai OCR Dataset ให้อยู่ในรูปแบบที่ EasyOCR ใช้

**ขั้นตอน:**
1. 📚 Import libraries
2. 📁 ตั้งค่า path ข้อมูลเข้าและออก
3. 📊 อ่านและตรวจสอบข้อมูลต้นทาง
4. 🔄 แปลงข้อมูลให้อยู่ในรูปแบบ EasyOCR
5. 💾 บันทึกข้อมูลที่แปลงแล้ว
6. ✅ ตรวจสอบผลลัพธ์

**โครงสร้างเป้าหมาย:**
```
output_dataset/
├── train/
│   ├── images/
│   └── labels.txt
└── val/
    ├── images/ 
    └── labels.txt
```

## 📚 1. Import Required Libraries

In [6]:
import os
import shutil
import random
from pathlib import Path
from sklearn.model_selection import train_test_split # pip install scikit-learn
from datetime import datetime
import pandas as pd
from tqdm import tqdm

print("📚 Libraries imported successfully!")
print(f"⏰ Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

📚 Libraries imported successfully!
⏰ Current time: 2025-06-29 05:02:04


## 📁 2. Set Input and Output File Paths

In [7]:
# 🔧 INPUT PATHS - แก้ไขตรงนี้ให้ตรงกับข้อมูลของคุณ
INPUT_BASE = "all_data/thai_lang_ocr_dataset"  # 📁 โฟลเดอร์ข้อมูลต้นทาง
INPUT_TRAIN_LIST = os.path.join(INPUT_BASE, "train_list.txt")
INPUT_VAL_LIST = os.path.join(INPUT_BASE, "val_list.txt")

# 🎯 OUTPUT PATHS
OUTPUT_BASE = "all_data/thai_easyocr_format"  # 📁 โฟลเดอร์ผลลัพธ์
OUTPUT_TRAIN_DIR = os.path.join(OUTPUT_BASE, "train")
OUTPUT_VAL_DIR = os.path.join(OUTPUT_BASE, "val")

# 📊 SETTINGS
TRAIN_RATIO = 0.8  # 80% สำหรับ training
VAL_RATIO = 0.2    # 20% สำหรับ validation
RANDOM_SEED = 42   # สำหรับทำซ้ำได้

print(f"📁 Input dataset: {INPUT_BASE}")
print(f"📁 Output dataset: {OUTPUT_BASE}")
print(f"📊 Split ratio: Train {TRAIN_RATIO*100:.0f}% | Val {VAL_RATIO*100:.0f}%")

# ตรวจสอบว่าข้อมูลต้นทางมีอยู่หรือไม่
if os.path.exists(INPUT_BASE):
    print(f"✅ Input directory exists")
else:
    print(f"❌ Input directory not found: {INPUT_BASE}")
    print("   Please update INPUT_BASE path above")

📁 Input dataset: all_data/thai_lang_ocr_dataset
📁 Output dataset: all_data/thai_easyocr_format
📊 Split ratio: Train 80% | Val 20%
✅ Input directory exists


## 📊 3. Read and Inspect Input Files

In [8]:
def read_label_file(file_path):
    """อ่านไฟล์ label และคืนค่าเป็น list ของ (image_path, text)"""
    data = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if line:
                    parts = line.split('\t')
                    if len(parts) >= 2:
                        image_path = parts[0].replace('th_img/', '')  # ลบ 'th_img/' ออก
                        text = parts[1]
                        data.append((image_path, text))
                    else:
                        print(f"⚠️  Line {line_num} has incorrect format: {line}")
    return data

# อ่านข้อมูลจากไฟล์ต้นทาง
print("📖 Reading input files...")
train_data = read_label_file(INPUT_TRAIN_LIST)
val_data = read_label_file(INPUT_VAL_LIST)

print(f"📊 Data summary:")
print(f"   Train data: {len(train_data)} samples")
print(f"   Val data: {len(val_data)} samples")
print(f"   Total: {len(train_data) + len(val_data)} samples")

# แสดงตัวอย่างข้อมูล
if train_data:
    print(f"\n📝 Sample from train data:")
    for i, (img_path, text) in enumerate(train_data[:3]):
        print(f"   {i+1}. Image: {img_path}")
        print(f"      Text: {text}")
        
        # ตรวจสอบว่าไฟล์รูปมีอยู่จริงหรือไม่
        full_img_path = os.path.join(INPUT_BASE, img_path)
        exists = "✅" if os.path.exists(full_img_path) else "❌"
        print(f"      File exists: {exists}")
        print()

if val_data:
    print(f"📝 Sample from val data:")
    for i, (img_path, text) in enumerate(val_data[:2]):
        print(f"   {i+1}. Image: {img_path}")
        print(f"      Text: {text}")
        
        # ตรวจสอบว่าไฟล์รูปมีอยู่จริงหรือไม่
        full_img_path = os.path.join(INPUT_BASE, img_path)
        exists = "✅" if os.path.exists(full_img_path) else "❌"
        print(f"      File exists: {exists}")
        print()

📖 Reading input files...
📊 Data summary:
   Train data: 197995 samples
   Val data: 2000 samples
   Total: 199995 samples

📝 Sample from train data:
   1. Image: 0/ILSVRC2012_val_00012640_12215.jpg
      Text: ด้านรองหัวหน้า
      File exists: ✅

   2. Image: 0/ILSVRC2012_val_00013060_17187.jpg
      Text: รวมกับสำนวนของ
      File exists: ✅

   3. Image: 0/ILSVRC2012_val_00014897_10839.jpg
      Text: โดยบอกให้เธอ
      File exists: ✅

📝 Sample from val data:
   1. Image: 1/ILSVRC2012_val_00018359_10720.jpg
      Text: gia
      File exists: ✅

   2. Image: 1/ILSVRC2012_val_00011177_2324.jpg
      Text: (imperialism)
      File exists: ✅



## 🔄 4. Transform Data to EasyOCR Format

In [9]:
def convert_to_easyocr_format():
    """แปลงข้อมูลให้อยู่ในรูปแบบที่ EasyOCR ใช้"""
    
    # 🚀 FAST CONVERSION - EasyOCR Format
    print("🔄 Converting to EasyOCR format...")

    # รวมข้อมูลทั้งหมด
    all_data = train_data + val_data
    print(f"📊 Total: {len(all_data)} samples")

    # แบ่งข้อมูล 80-20
    random.seed(RANDOM_SEED)
    random.shuffle(all_data)
    split_idx = int(len(all_data) * 0.8)
    train_set = all_data[:split_idx]
    val_set = all_data[split_idx:]

    print(f"Train: {len(train_set)} | Val: {len(val_set)}")

    # สร้างโฟลเดอร์
    os.makedirs(f"{OUTPUT_BASE}/train", exist_ok=True)
    os.makedirs(f"{OUTPUT_BASE}/val", exist_ok=True)

    def create_dataset(data_list, output_dir, prefix):
        """สร้าง dataset แบบเร็ว พร้อมทั้ง labels.txt และ labels.csv"""
        labels_txt = []
        labels_csv_data = []
        count = 0
        
        for i, (img_path, text) in enumerate(tqdm(data_list, desc=f"Processing {prefix}")):
            source = os.path.join(INPUT_BASE, img_path)
            if os.path.exists(source):
                # คัดลอกรูป
                ext = os.path.splitext(img_path)[1] or '.jpg'
                new_name = f"{prefix}_{i:06d}{ext}"
                target = os.path.join(output_dir, new_name)
                shutil.copy2(source, target)
                
                # เพิ่ม label สำหรับ labels.txt (tab-separated)
                labels_txt.append(f"{new_name}\t{text}")
                
                # เพิ่มข้อมูลสำหรับ labels.csv 
                labels_csv_data.append([new_name, text])
                
                count += 1
        
        # เขียน labels.txt
        with open(f"{output_dir}/labels.txt", 'w', encoding='utf8') as f:
            f.write('\n'.join(labels_txt))
        
        # เขียน labels.csv พร้อม header
        df = pd.DataFrame(labels_csv_data, columns=['filename', 'words'])
        df.to_csv(f"{output_dir}/labels.csv", index=False, encoding='utf8')
        
        print(f"   ✅ Created {count} samples in {output_dir}")
        print(f"   📄 Generated labels.txt ({len(labels_txt)} entries)")
        print(f"   📊 Generated labels.csv ({len(labels_csv_data)} entries)")
        
        return count

    # สร้าง datasets
    print("🔄 Creating train set...")
    train_count = create_dataset(train_set, f"{OUTPUT_BASE}/train", "train")

    print("🔄 Creating val set...")
    val_count = create_dataset(val_set, f"{OUTPUT_BASE}/val", "val")

    print(f"\n✅ Conversion Complete!")
    print(f"📊 Summary: Train: {train_count} | Val: {val_count}")
    print(f"📁 Output: {OUTPUT_BASE}")
    print(f"🎯 Both labels.txt and labels.csv files created for EasyOCR compatibility")
    
    return train_count, val_count

# เริ่มการแปลง
if train_data or val_data:
    train_count, val_count = convert_to_easyocr_format()
else:
    print("❌ No data to convert. Please check input files.")

🔄 Converting to EasyOCR format...
📊 Total: 199995 samples
Train: 159996 | Val: 39999
🔄 Creating train set...


Processing train: 100%|██████████| 159996/159996 [01:41<00:00, 1571.23it/s]


   ✅ Created 159996 samples in all_data/thai_easyocr_format/train
   📄 Generated labels.txt (159996 entries)
   📊 Generated labels.csv (159996 entries)
🔄 Creating val set...


Processing val: 100%|██████████| 39999/39999 [00:25<00:00, 1596.78it/s]

   ✅ Created 39999 samples in all_data/thai_easyocr_format/val
   📄 Generated labels.txt (39999 entries)
   📊 Generated labels.csv (39999 entries)

✅ Conversion Complete!
📊 Summary: Train: 159996 | Val: 39999
📁 Output: all_data/thai_easyocr_format
🎯 Both labels.txt and labels.csv files created for EasyOCR compatibility





## 💾 5. Dataset Structure Summary

In [11]:
def show_dataset_structure():
    """แสดงโครงสร้างของ dataset ที่สร้างขึ้น"""
    
    print("📁 Dataset Structure:")
    print(f"{OUTPUT_BASE}/")
    
    # ตรวจสอบโฟลเดอร์ train
    train_images_dir = os.path.join(OUTPUT_TRAIN_DIR, "images")
    train_labels_file = os.path.join(OUTPUT_TRAIN_DIR, "labels.txt")
    
    if os.path.exists(train_images_dir):
        train_image_count = len([f for f in os.listdir(train_images_dir) 
                               if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
        print(f"├── train/")
        print(f"│   ├── images/ ({train_image_count} images)")
        
        if os.path.exists(train_labels_file):
            with open(train_labels_file, 'r', encoding='utf8') as f:
                train_label_count = len(f.readlines())
            print(f"│   └── labels.txt ({train_label_count} entries)")
        else:
            print(f"│   └── labels.txt (❌ not found)")
    
    # ตรวจสอบโฟลเดอร์ val
    val_images_dir = os.path.join(OUTPUT_VAL_DIR, "images")
    val_labels_file = os.path.join(OUTPUT_VAL_DIR, "labels.txt")
    
    if os.path.exists(val_images_dir):
        val_image_count = len([f for f in os.listdir(val_images_dir) 
                             if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
        print(f"└── val/")
        print(f"    ├── images/ ({val_image_count} images)")
        
        if os.path.exists(val_labels_file):
            with open(val_labels_file, 'r', encoding='utf8') as f:
                val_label_count = len(f.readlines())
            print(f"    └── labels.txt ({val_label_count} entries)")
        else:
            print(f"    └── labels.txt (❌ not found)")

# แสดงโครงสร้าง dataset
if os.path.exists(OUTPUT_BASE):
    show_dataset_structure()
    
    # 📊 QUICK CHECK
    train_images = len([f for f in os.listdir(f"{OUTPUT_BASE}/train") if f.endswith(('.jpg', '.png'))])
    val_images = len([f for f in os.listdir(f"{OUTPUT_BASE}/val") if f.endswith(('.jpg', '.png'))])
    
    print(f"📁 {OUTPUT_BASE}/")
    print(f"├── train/ ({train_images} images + labels.txt)")
    print(f"└── val/ ({val_images} images + labels.txt)")
    
    print(f"\n🎯 For EasyOCR training, use:")
    print(f"'train_data': '{OUTPUT_BASE}',")
    print(f"'valid_data': '{OUTPUT_BASE}',")
    print(f"'select_data': 'train',")
else:
    print("❌ Output dataset not found. Please run the transformation step first.")

📁 Dataset Structure:
all_data/thai_easyocr_format/
📁 all_data/thai_easyocr_format/
├── train/ (159996 images + labels.txt)
└── val/ (39999 images + labels.txt)

🎯 For EasyOCR training, use:
'train_data': 'all_data/thai_easyocr_format',
'valid_data': 'all_data/thai_easyocr_format',
'select_data': 'train',


## ✅ 6. Verify Output Dataset

In [None]:
def verify_output_dataset():
    """ตรวจสอบความถูกต้องของ dataset ที่สร้างขึ้น"""
    
    print("🔍 Verifying output dataset...")
    
    # ตรวจสอบว่า output dataset มีอยู่จริง
    if os.path.exists(OUTPUT_BASE):
        # ตรวจสอบ training data
        train_images_dir = os.path.join(OUTPUT_BASE, "train")
        train_images = [f for f in os.listdir(train_images_dir) 
                       if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        
        with open(os.path.join(train_images_dir, 'labels.txt'), 'r', encoding='utf8') as f:
            train_labels = [line.strip() for line in f if line.strip()]
        
        print(f"\n📊 Training Data Verification:")
        print(f"   📸 Images: {len(train_images)}")
        print(f"   🏷️  Labels: {len(train_labels)}")
        
        if len(train_images) == len(train_labels):
            print(f"   ✅ Images and labels count match")
        else:
            print(f"   ❌ Images and labels count mismatch")
        
        # แสดงตัวอย่าง labels
        print(f"   📝 Sample labels:")
        for i, label in enumerate(train_labels[:3]):
            print(f"      {i+1}. {label}")
        
        # ตรวจสอบ validation data
        val_images_dir = os.path.join(OUTPUT_BASE, "val")
        val_images = [f for f in os.listdir(val_images_dir) 
                     if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        
        with open(os.path.join(val_images_dir, 'labels.txt'), 'r', encoding='utf8') as f:
            val_labels = [line.strip() for line in f if line.strip()]
        
        print(f"\n📊 Validation Data Verification:")
        print(f"   📸 Images: {len(val_images)}")
        print(f"   🏷️  Labels: {len(val_labels)}")
        
        if len(val_images) == len(val_labels):
            print(f"   ✅ Images and labels count match")
        else:
            print(f"   ❌ Images and labels count mismatch")
        
        # แสดงตัวอย่าง labels
        print(f"   📝 Sample labels:")
        for i, label in enumerate(val_labels[:3]):
            print(f"      {i+1}. {label}")
        
        # สรุปผลลัพธ์
        total_train = len(train_images)
        total_val = len(val_images)
        total_all = total_train + total_val
        
        print(f"\n🎯 Final Summary:")
        print(f"   📊 Total dataset: {total_all} samples")
        print(f"   🏋️  Training: {total_train} samples ({total_train/total_all*100:.1f}%)" if total_all > 0 else "   🏋️  Training: 0 samples")
        print(f"   🔬 Validation: {total_val} samples ({total_val/total_all*100:.1f}%)" if total_all > 0 else "   🔬 Validation: 0 samples")
        
        if total_all > 0:
            print(f"   ✅ Dataset ready for EasyOCR training!")
            print(f"   📁 Location: {OUTPUT_BASE}")
        else:
            print(f"   ❌ Dataset verification failed")
    else:
        print("❌ No dataset found")

# เริ่มการตรวจสอบ
verify_output_dataset()