# Create Label

In [2]:
import os
import pandas as pd

# File extensions (lower-case, with leading dot) that are treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif')


def _read_label_text(text_path: str, text_name: str, image_name: str):
    """Return the stripped content of a label text file, or None if unusable.

    None is returned when the file does not exist, cannot be read or decoded
    as UTF-8 (a warning is printed), or is empty after stripping whitespace
    (an info message is printed).
    """
    if not os.path.exists(text_path):
        return None
    try:
        with open(text_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except (OSError, UnicodeError) as e:
        print(f"Warning: Error reading text file '{text_name}' for image '{image_name}': {e}")
        return None
    if not content:
        print(f"Info: Text file '{text_name}' for image '{image_name}' is empty.")
        return None
    return content


def create_labels_csv(image_folder: str, output_csv_path: str) -> None:
    """Pair every image in a folder with its text file and write a CSV.

    For an image ``<name>.<ext>`` the label is looked up first in
    ``<name>_text.txt`` and then in ``<name>.txt`` (same folder). The first
    candidate with non-empty content wins. Images without a usable label are
    counted and reported but not written.

    Args:
        image_folder: Directory containing the images and their text files.
        output_csv_path: Destination CSV with columns ``image_path`` and
            ``text``. Missing parent directories are created; a bare file
            name (no directory part) is written to the current directory.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    print(f"Scanning folder: {image_folder}")
    if not os.path.isdir(image_folder):
        print(f"Error: Image folder not found or is not a directory: {image_folder}")
        return

    image_files = []
    text_contents = []
    missing_text_for_images = 0

    # os.listdir() order is arbitrary; sorting makes the CSV (and any seeded
    # split computed from it later) reproducible across runs and machines.
    for filename in sorted(os.listdir(image_folder)):
        file_basename, file_extension = os.path.splitext(filename)
        if file_extension.lower() not in IMAGE_EXTENSIONS:
            continue

        # Candidate label files, in priority order.
        candidate_names = (f"{file_basename}_text.txt", f"{file_basename}.txt")
        candidate_paths = [os.path.join(image_folder, name) for name in candidate_names]

        text_to_add = None
        for text_name, text_path in zip(candidate_names, candidate_paths):
            text_to_add = _read_label_text(text_path, text_name, filename)
            if text_to_add is not None:
                break

        if text_to_add is not None:
            image_files.append(os.path.join(image_folder, filename))
            text_contents.append(text_to_add)
            continue

        missing_text_for_images += 1
        # Empty/unreadable files were already reported by the helper; only
        # report here when neither candidate exists at all.
        if not any(os.path.exists(path) for path in candidate_paths):
            print(f"Info: No corresponding text file ('{candidate_names[0]}' or '{candidate_names[1]}') found for image '{filename}'.")

    if not image_files:
        print("No image-text pairs found. Please check file naming conventions and folder content.")
        return

    df = pd.DataFrame({'image_path': image_files, 'text': text_contents})
    try:
        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, so only create the directory when there is one to create.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_csv_path, index=False)
    except OSError as e:
        print(f"Error: Could not write CSV file to '{output_csv_path}': {e}")
        return

    print(f"\nSuccessfully created '{output_csv_path}' with {len(image_files)} image-text pairs.")
    if missing_text_for_images > 0:
        print(f"Could not find or process valid text for {missing_text_for_images} images.")

if __name__ == "__main__":
    # --- Configuration ---
    # Folder holding the images and their per-image text files.
    image_folder_path = "/home/jupyter/advanced/ocr/"

    # Output directory for labels.csv. This must be the directory the later
    # cells read labels.csv from ("/home/jupyter/ocr_output/"); the previous
    # value ("/home/jupyter/PaddleOCR_Training/ocr_output/") did not match,
    # so the follow-up steps could not find the file this cell produced.
    writable_output_directory = "/home/jupyter/ocr_output/"
    output_csv_file = os.path.join(writable_output_directory, "labels.csv")

    # --- Run the script ---
    create_labels_csv(image_folder_path, output_csv_file)

Scanning folder: /home/jupyter/advanced/ocr/

Successfully created '/home/jupyter/ocr_output/labels.csv' with 4500 image-text pairs.


# Create Paddle Labels

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

def _write_paddle_label_file(frame, output_path):
    """Write one ``image_path<TAB>text`` line per row of *frame* to *output_path*."""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{image_path}\t{text}\n"
            for image_path, text in zip(frame['image_path'], frame['text'])
        )


def convert_and_split_labels(csv_path, output_dir, train_filename="rec_gt_train.txt", eval_filename="rec_gt_eval.txt", test_size=0.2, random_state=42):
    """Split a labels CSV into train/eval label files of tab-separated lines.

    Args:
        csv_path: CSV file with ``image_path`` and ``text`` columns.
        output_dir: Directory for the two label files (created if missing).
        train_filename: File name of the training label file.
        eval_filename: File name of the evaluation label file.
        test_size: Passed to ``train_test_split`` to size the evaluation set.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    if not os.path.exists(csv_path):
        print(f"Error: CSV file not found at {csv_path}")
        return

    # Read every cell as a literal string: with pandas' defaults, labels such
    # as "NA", "null" or "nan" become NaN (and would be written as "nan"),
    # and an all-numeric text column would lose leading zeros ("007" -> 7).
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    if 'image_path' not in df.columns or 'text' not in df.columns:
        print("Error: CSV must contain 'image_path' and 'text' columns.")
        return

    if df.empty:
        # train_test_split raises on an empty input; report it like the other
        # input errors instead.
        print(f"Error: CSV file at {csv_path} contains no rows to split.")
        return

    os.makedirs(output_dir, exist_ok=True)
    train_df, eval_df = train_test_split(df, test_size=test_size, random_state=random_state)

    output_train_path = os.path.join(output_dir, train_filename)
    _write_paddle_label_file(train_df, output_train_path)
    print(f"Successfully created training label file: {output_train_path} with {len(train_df)} entries.")

    output_eval_path = os.path.join(output_dir, eval_filename)
    _write_paddle_label_file(eval_df, output_eval_path)
    print(f"Successfully created evaluation label file: {output_eval_path} with {len(eval_df)} entries.")

if __name__ == "__main__":
    # --- Configuration ---
    # CSV produced by the label-creation step.
    labels_csv_file = "/home/jupyter/ocr_output/labels.csv"

    # Destination directory for rec_gt_train.txt and rec_gt_eval.txt.
    paddle_labels_output_dir = "/home/jupyter/ocr_output/paddle_labels/"

    # --- Run the script ---
    # Without the input CSV there is nothing to split; print a hint instead.
    if not os.path.exists(labels_csv_file):
        print(f"Input CSV '{labels_csv_file}' not found. Please run create_label.py first.")
    else:
        convert_and_split_labels(labels_csv_file, paddle_labels_output_dir)

Successfully created training label file: /home/jupyter/ocr_output/paddle_labels/rec_gt_train.txt with 3600 entries.
Successfully created evaluation label file: /home/jupyter/ocr_output/paddle_labels/rec_gt_eval.txt with 900 entries.


# Create Char Dict

In [None]:
import pandas as pd
import os

def create_char_dictionary(csv_path, output_dict_path="custom_char_dict.txt"):
    """Write the sorted set of characters used in the CSV's ``text`` column.

    The dictionary file has one character per line. Line-break characters are
    left out because they cannot be represented in that layout.

    Args:
        csv_path: CSV file with a ``text`` column.
        output_dict_path: Destination file. Missing parent directories are
            created; a bare file name is written to the current directory.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    if not os.path.exists(csv_path):
        print(f"Error: CSV file not found at {csv_path}")
        return

    # Read every cell as a literal string: with pandas' defaults, labels such
    # as "NA" or "null" become NaN (contributing the characters of "nan"
    # instead of their own), and numeric-looking labels may be reformatted.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    if 'text' not in df.columns:
        print("Error: CSV must contain a 'text' column.")
        return

    # A "\n" or "\r" entry would show up as a blank/broken line in the
    # one-character-per-line file, so those two are excluded.
    unique_chars = sorted(set("".join(df['text'])) - {'\n', '\r'})

    # os.path.dirname() is '' for a bare file name (including the default
    # value of output_dict_path) and os.makedirs('') raises.
    output_dir = os.path.dirname(output_dict_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_dict_path, 'w', encoding='utf-8') as f:
        f.writelines(char + '\n' for char in unique_chars)

    print(f"Successfully created character dictionary: {output_dict_path} with {len(unique_chars)} unique characters.")

if __name__ == "__main__":
    # --- Configuration ---
    # CSV produced by the label-creation step.
    labels_csv_file = "/home/jupyter/ocr_output/labels.csv"

    # Destination of the generated character dictionary.
    char_dict_file = "/home/jupyter/ocr_output/paddle_labels/custom_char_dict.txt"

    # --- Run the script ---
    # Without the input CSV there is nothing to scan; print a hint instead.
    if not os.path.exists(labels_csv_file):
        print(f"Input CSV '{labels_csv_file}' not found. Please run create_label.py first.")
    else:
        create_char_dictionary(labels_csv_file, char_dict_file)