In [23]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_1 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_1'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_1 .. character_54.
folder_start_number = 0

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 112 sheets from folder: dataset/Lipi_1
Processing sheet: dataset/Lipi_1\DATASET_0001.jpg
Processing sheet: dataset/Lipi_1\DATASET_0002.jpg
Processing sheet: dataset/Lipi_1\DATASET_0003.jpg
Processing sheet: dataset/Lipi_1\DATASET_0004.jpg
Processing sheet: dataset/Lipi_1\DATASET_0005.jpg
Processing sheet: dataset/Lipi_1\DATASET_0006.jpg
Processing sheet: dataset/Lipi_1\DATASET_0007.jpg
Processing sheet: dataset/Lipi_1\DATASET_0008.jpg
Processing sheet: dataset/Lipi_1\DATASET_0009.jpg
Processing sheet: dataset/Lipi_1\DATASET_0010.jpg
Processing sheet: dataset/Lipi_1\DATASET_0011.jpg
Processing sheet: dataset/Lipi_1\DATASET_0012.jpg
Processing sheet: dataset/Lipi_1\DATASET_0013.jpg
Processing sheet: dataset/Lipi_1\DATASET_0014.jpg
Processing sheet: dataset/Lipi_1\DATASET_0015.jpg
Processing sheet: dataset/Lipi_1\DATASET_0016.jpg
Processing sheet: dataset/Lipi_1\DATASET_0017.jpg
Processing sheet: dataset/Lipi_1\DATASET_0018.jpg
Processing sheet: dataset/Lipi_1\DATASET_0019.jpg


In [25]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_2 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_2'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_17 .. character_70.
# NOTE(review): the Lipi_1 run (folder_start_number = 0) fills
# character_1 .. character_54 in the same output folder, so folders 17-54 are
# shared and files 1.png .. 9.png there get overwritten by this run -
# confirm that an offset of 16 is intended.
folder_start_number = 16

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 9 sheets from folder: dataset/Lipi_2
Processing sheet: dataset/Lipi_2\Lipi 1_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 2_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 3_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 4_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 5_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 6_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 7_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 8_0001_page-0001.jpg
Processing sheet: dataset/Lipi_2\Lipi 9_0001_page-0001.jpg


In [27]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_3 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_3'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_71 .. character_124.
folder_start_number = 70

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 9 sheets from folder: dataset/Lipi_3
Processing sheet: dataset/Lipi_3\Lipi 1_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 2_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 3_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 4_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 5_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 6_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 7_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 8_0001_page-0002.jpg
Processing sheet: dataset/Lipi_3\Lipi 9_0001_page-0002.jpg


In [29]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_4 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_4'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_125 .. character_178.
folder_start_number = 124

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 9 sheets from folder: dataset/Lipi_4
Processing sheet: dataset/Lipi_4\Lipi 1_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 2_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 3_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 4_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 5_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 6_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 7_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 8_0001_page-0003.jpg
Processing sheet: dataset/Lipi_4\Lipi 9_0001_page-0003.jpg


In [31]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_5 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_5'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_179 .. character_232.
folder_start_number = 178

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 9 sheets from folder: dataset/Lipi_5
Processing sheet: dataset/Lipi_5\Lipi 1_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 2_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 3_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 4_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 5_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 6_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 7_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 8_0001_page-0004.jpg
Processing sheet: dataset/Lipi_5\Lipi 9_0001_page-0004.jpg


In [33]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, slice and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The zero-based position in
    that order is passed on as ``sheet_index`` even when an earlier sheet
    could not be loaded, so output file numbers follow the sorted position.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files;
            the extension match is case-insensitive.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows handed to ``extract_characters``.
        cols: Grid columns handed to ``extract_characters``.
        folder_start_number: Folder offset handed to ``save_characters``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return
    # A regular file passes the exists() test but would make os.listdir raise.
    if not os.path.isdir(input_folder):
        print(f"Input path is not a folder: {input_folder}")
        return

    sheet_files = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')
    )
    if not sheet_files:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(sheet_files)} sheets from folder: {input_folder}")

    for sheet_index, sheet_file in enumerate(sheet_files):
        sheet_path = os.path.join(input_folder, sheet_file)
        print(f"Processing sheet: {sheet_path}")

        binarized_image = preprocess_sheet(sheet_path)
        if binarized_image is None:
            # Unreadable sheet: skip it, the load error was already printed.
            continue

        characters = extract_characters(binarized_image, rows, cols)
        save_characters(characters, output_folder, sheet_index, folder_start_number)

# Run configuration for the Lipi_6 sheets.
rows, cols = 6, 9  # grid layout: 6 x 9 = 54 cells per sheet
input_folder = 'dataset/Lipi_6'  # where the scanned sheets are read from
output_folder = 'preprocessing'  # root of the character_<n> output folders
# Cell i of a sheet is stored in character_{folder_start_number + i + 1},
# so this run fills character_233 .. character_286.
folder_start_number = 232

# Slice every sheet in input_folder and store the cells under output_folder.
process_all_sheets(
    input_folder=input_folder,
    output_folder=output_folder,
    rows=rows,
    cols=cols,
    folder_start_number=folder_start_number,
)

Processing 9 sheets from folder: dataset/Lipi_6
Processing sheet: dataset/Lipi_6\Lipi 1_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 2_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 3_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 4_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 5_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 6_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 7_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 8_0001_page-0005.jpg
Processing sheet: dataset/Lipi_6\Lipi 9_0001_page-0005.jpg


In [35]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a scanned sheet and binarize it to dark strokes on a white background.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` (max value 255), or ``None`` when
        ``cv2.imread`` cannot read the file (an error line is printed).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    # Grayscale, then a 5x5 Gaussian blur to damp noise before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Mean adaptive threshold over an 11x11 neighbourhood with offset 8.
    # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
    # bitwise_not produced before, without the extra full-image pass.
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 8
    )

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell crops.

    Every cell is ``width // cols`` by ``height // rows`` pixels and is
    trimmed on each side by ``crop_percent`` of the cell size. Pixels left
    over by the integer division (right/bottom edge) belong to no cell.

    Args:
        binary_image: Image array indexed as ``[y, x]``; a trailing channel
            axis is tolerated. ``None`` yields an empty list.
        rows: Number of grid rows; must be positive.
        cols: Number of grid columns; must be positive.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of ``rows * cols`` slices of ``binary_image`` in row-major
        order (all columns of row 0 first).

    Raises:
        ValueError: If ``rows``/``cols`` is not positive, or the grid and
            crop settings would leave cells with no pixels.
    """
    if binary_image is None:
        return []
    if rows <= 0 or cols <= 0:
        raise ValueError(f"rows and cols must be positive, got rows={rows}, cols={cols}")

    # shape[:2] also accepts images that still carry a channel axis.
    height, width = binary_image.shape[:2]
    cell_width_pixels = width // cols
    cell_height_pixels = height // rows

    crop_pixels_x = int(cell_width_pixels * crop_percent)
    crop_pixels_y = int(cell_height_pixels * crop_percent)

    # Fail early instead of returning empty crops that cannot be saved later.
    if (cell_width_pixels - 2 * crop_pixels_x <= 0
            or cell_height_pixels - 2 * crop_pixels_y <= 0):
        raise ValueError(
            f"Grid {rows}x{cols} with crop_percent={crop_percent} leaves empty cells "
            f"for an image of {width}x{height} pixels"
        )

    extracted_characters = []
    for row in range(rows):
        # Clamp so a negative crop_percent cannot push a slice out of the image.
        y_start = max(row * cell_height_pixels + crop_pixels_y, 0)
        y_end = min((row + 1) * cell_height_pixels - crop_pixels_y, height)
        for col in range(cols):
            x_start = max(col * cell_width_pixels + crop_pixels_x, 0)
            x_end = min((col + 1) * cell_width_pixels - crop_pixels_x, width)
            extracted_characters.append(binary_image[y_start:y_end, x_start:x_end])

    return extracted_characters

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Write each cell image to ``character_<n>/<sheet>.png`` under ``output_folder``.

    The cell at position ``i`` goes to the folder
    ``character_{folder_start_number + i + 1}`` and is named
    ``{sheet_index + 1}.png``. Missing folders are created.

    Args:
        characters: Iterable of images accepted by ``cv2.imwrite``.
        output_folder: Root folder that receives the ``character_*`` folders.
        sheet_index: Zero-based sheet position; the file name uses it plus one.
        folder_start_number: Offset added to the cell position to number folders.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder = os.path.join(
            output_folder, f'character_{folder_start_number + index + 1}'
        )
        os.makedirs(character_folder, exist_ok=True)
        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value; report it
        # rather than silently leaving a hole in the dataset.
        if not cv2.imwrite(character_path, character):
            print(f"Error: Unable to write image at {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, split and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The position of a sheet in
    that sorted list is passed on as its sheet index, so a sheet that fails to
    load still occupies its index.

    Args:
        input_folder: Folder that holds the sheet images.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows per sheet.
        cols: Grid columns per sheet.
        folder_start_number: Folder-number offset handed to ``save_characters``.

    Returns:
        None. Prints a message and returns early when the folder is missing or
        contains no file ending in ``.jpg`` (case-insensitive).
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return

    jpg_names = [name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')]
    jpg_names.sort()
    if len(jpg_names) == 0:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(jpg_names)} sheets from folder: {input_folder}")

    for position, name in enumerate(jpg_names):
        full_path = os.path.join(input_folder, name)
        print(f"Processing sheet: {full_path}")

        sheet = preprocess_sheet(full_path)
        if sheet is not None:
            cells = extract_characters(sheet, rows, cols)
            save_characters(cells, output_folder, position, folder_start_number)

# Parameters
input_folder = 'dataset/Lipi_7'  # Folder scanned for .jpg sheet images
output_folder = 'preprocessing'  # Root folder that receives the character_<n> sub-folders
rows = 6  # Grid rows per sheet
cols = 9  # Grid columns per sheet
# Folder-number offset, not the first folder: save_characters adds (cell index + 1),
# so the 6 * 9 = 54 cells of each sheet land in character_287 ... character_340.
folder_start_number = 286

# Process all sheets (each sheet contributes one PNG to every character folder)
process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number)

Processing 9 sheets from folder: dataset/Lipi_7
Processing sheet: dataset/Lipi_7\Lipi 1_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 2_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 3_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 4_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 5_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 6_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 7_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 8_0001_page-0006.jpg
Processing sheet: dataset/Lipi_7\Lipi 9_0001_page-0006.jpg


In [37]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a sheet image from disk and turn it into a two-level image.

    Pipeline: read with OpenCV, convert BGR to grayscale, smooth with a 5x5
    Gaussian blur, apply a mean adaptive threshold (inverted, block size 11,
    constant 8) and finally flip the result with ``bitwise_not``.

    Args:
        image_path: Path of the sheet image.

    Returns:
        The thresholded single-channel image, or None (after printing an
        error) when OpenCV cannot read the file.
    """
    sheet = cv2.imread(image_path)
    if sheet is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    smoothed = cv2.GaussianBlur(cv2.cvtColor(sheet, cv2.COLOR_BGR2GRAY), (5, 5), 0)

    inverted = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        8,
    )

    # Undo the inversion: pixels the threshold marked 255 (at or below their
    # local mean minus 8) become 0 and all others become 255.
    return cv2.bitwise_not(inverted)

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell images.

    Every cell is ``width // cols`` by ``height // rows`` pixels; a margin of
    ``crop_percent`` of the cell size is trimmed from each of its four sides.

    Args:
        binary_image: 2-D array holding the sheet, or None.
        rows: Number of grid rows on the sheet.
        cols: Number of grid columns on the sheet.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of array slices in row-major order (left to right, then top to
        bottom); an empty list when ``binary_image`` is None.
    """
    if binary_image is None:
        return []

    sheet_h, sheet_w = binary_image.shape
    cell_w = sheet_w // cols
    cell_h = sheet_h // rows
    margin_x = int(cell_w * crop_percent)
    margin_y = int(cell_h * crop_percent)

    cells = []
    for r in range(rows):
        # Vertical extent is shared by every cell of this row.
        top = max(r * cell_h + margin_y, 0)
        bottom = min((r + 1) * cell_h - margin_y, sheet_h)
        for c in range(cols):
            left = max(c * cell_w + margin_x, 0)
            right = min((c + 1) * cell_w - margin_x, sheet_w)
            cells.append(binary_image[top:bottom, left:right])

    return cells

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Save each extracted cell image as a PNG inside its own character folder.

    The cell at 0-based position ``index`` in ``characters`` is written to
    ``<output_folder>/character_<folder_start_number + index + 1>/<sheet_index + 1>.png``.

    Args:
        characters: Iterable of cell images, in grid order.
        output_folder: Root folder for the ``character_<n>`` sub-folders.
        sheet_index: 0-based index of the source sheet; names the PNG file.
        folder_start_number: Offset added to ``index + 1`` to number the folder.

    Returns:
        None. A write that OpenCV reports as failed is announced with a
        printed warning instead of passing silently.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder_number = folder_start_number + index + 1
        character_folder = os.path.join(output_folder, f'character_{character_folder_number}')
        os.makedirs(character_folder, exist_ok=True)

        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(character_path, character):
            print(f"Warning: could not write {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, split and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The position of a sheet in
    that sorted list is passed on as its sheet index, so a sheet that fails to
    load still occupies its index.

    Args:
        input_folder: Folder that holds the sheet images.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows per sheet.
        cols: Grid columns per sheet.
        folder_start_number: Folder-number offset handed to ``save_characters``.

    Returns:
        None. Prints a message and returns early when the folder is missing or
        contains no file ending in ``.jpg`` (case-insensitive).
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return

    jpg_names = [name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')]
    jpg_names.sort()
    if len(jpg_names) == 0:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(jpg_names)} sheets from folder: {input_folder}")

    for position, name in enumerate(jpg_names):
        full_path = os.path.join(input_folder, name)
        print(f"Processing sheet: {full_path}")

        sheet = preprocess_sheet(full_path)
        if sheet is not None:
            cells = extract_characters(sheet, rows, cols)
            save_characters(cells, output_folder, position, folder_start_number)

# Parameters
input_folder = 'dataset/Lipi_8'  # Folder scanned for .jpg sheet images
output_folder = 'preprocessing'  # Root folder that receives the character_<n> sub-folders
rows = 6  # Grid rows per sheet
cols = 9  # Grid columns per sheet
# Folder-number offset, not the first folder: save_characters adds (cell index + 1),
# so the 6 * 9 = 54 cells of each sheet land in character_341 ... character_394.
folder_start_number = 340

# Process all sheets (each sheet contributes one PNG to every character folder)
process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number)

Processing 9 sheets from folder: dataset/Lipi_8
Processing sheet: dataset/Lipi_8\Lipi 1_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 2_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 3_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 4_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 5_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 6_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 7_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 8_0001_page-0007.jpg
Processing sheet: dataset/Lipi_8\Lipi 9_0001_page-0007.jpg


In [39]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a sheet image from disk and turn it into a two-level image.

    Pipeline: read with OpenCV, convert BGR to grayscale, smooth with a 5x5
    Gaussian blur, apply a mean adaptive threshold (inverted, block size 11,
    constant 8) and finally flip the result with ``bitwise_not``.

    Args:
        image_path: Path of the sheet image.

    Returns:
        The thresholded single-channel image, or None (after printing an
        error) when OpenCV cannot read the file.
    """
    sheet = cv2.imread(image_path)
    if sheet is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    smoothed = cv2.GaussianBlur(cv2.cvtColor(sheet, cv2.COLOR_BGR2GRAY), (5, 5), 0)

    inverted = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        8,
    )

    # Undo the inversion: pixels the threshold marked 255 (at or below their
    # local mean minus 8) become 0 and all others become 255.
    return cv2.bitwise_not(inverted)

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell images.

    Every cell is ``width // cols`` by ``height // rows`` pixels; a margin of
    ``crop_percent`` of the cell size is trimmed from each of its four sides.

    Args:
        binary_image: 2-D array holding the sheet, or None.
        rows: Number of grid rows on the sheet.
        cols: Number of grid columns on the sheet.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of array slices in row-major order (left to right, then top to
        bottom); an empty list when ``binary_image`` is None.
    """
    if binary_image is None:
        return []

    sheet_h, sheet_w = binary_image.shape
    cell_w = sheet_w // cols
    cell_h = sheet_h // rows
    margin_x = int(cell_w * crop_percent)
    margin_y = int(cell_h * crop_percent)

    cells = []
    for r in range(rows):
        # Vertical extent is shared by every cell of this row.
        top = max(r * cell_h + margin_y, 0)
        bottom = min((r + 1) * cell_h - margin_y, sheet_h)
        for c in range(cols):
            left = max(c * cell_w + margin_x, 0)
            right = min((c + 1) * cell_w - margin_x, sheet_w)
            cells.append(binary_image[top:bottom, left:right])

    return cells

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Save each extracted cell image as a PNG inside its own character folder.

    The cell at 0-based position ``index`` in ``characters`` is written to
    ``<output_folder>/character_<folder_start_number + index + 1>/<sheet_index + 1>.png``.

    Args:
        characters: Iterable of cell images, in grid order.
        output_folder: Root folder for the ``character_<n>`` sub-folders.
        sheet_index: 0-based index of the source sheet; names the PNG file.
        folder_start_number: Offset added to ``index + 1`` to number the folder.

    Returns:
        None. A write that OpenCV reports as failed is announced with a
        printed warning instead of passing silently.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder_number = folder_start_number + index + 1
        character_folder = os.path.join(output_folder, f'character_{character_folder_number}')
        os.makedirs(character_folder, exist_ok=True)

        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(character_path, character):
            print(f"Warning: could not write {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, split and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The position of a sheet in
    that sorted list is passed on as its sheet index, so a sheet that fails to
    load still occupies its index.

    Args:
        input_folder: Folder that holds the sheet images.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows per sheet.
        cols: Grid columns per sheet.
        folder_start_number: Folder-number offset handed to ``save_characters``.

    Returns:
        None. Prints a message and returns early when the folder is missing or
        contains no file ending in ``.jpg`` (case-insensitive).
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return

    jpg_names = [name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')]
    jpg_names.sort()
    if len(jpg_names) == 0:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(jpg_names)} sheets from folder: {input_folder}")

    for position, name in enumerate(jpg_names):
        full_path = os.path.join(input_folder, name)
        print(f"Processing sheet: {full_path}")

        sheet = preprocess_sheet(full_path)
        if sheet is not None:
            cells = extract_characters(sheet, rows, cols)
            save_characters(cells, output_folder, position, folder_start_number)

# Parameters
input_folder = 'dataset/Lipi_9'  # Folder scanned for .jpg sheet images
output_folder = 'preprocessing'  # Root folder that receives the character_<n> sub-folders
rows = 6  # Grid rows per sheet
cols = 9  # Grid columns per sheet
# Folder-number offset, not the first folder: save_characters adds (cell index + 1),
# so the 6 * 9 = 54 cells of each sheet land in character_395 ... character_448.
folder_start_number = 394

# Process all sheets (each sheet contributes one PNG to every character folder)
process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number)

Processing 9 sheets from folder: dataset/Lipi_9
Processing sheet: dataset/Lipi_9\Lipi 1_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 2_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 3_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 4_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 5_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 6_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 7_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 8_0001_page-0008.jpg
Processing sheet: dataset/Lipi_9\Lipi 9_0001_page-0008.jpg


In [41]:
# PREPROCESSING
import cv2
import numpy as np
import os

def preprocess_sheet(image_path):
    """Load a sheet image from disk and turn it into a two-level image.

    Pipeline: read with OpenCV, convert BGR to grayscale, smooth with a 5x5
    Gaussian blur, apply a mean adaptive threshold (inverted, block size 11,
    constant 8) and finally flip the result with ``bitwise_not``.

    Args:
        image_path: Path of the sheet image.

    Returns:
        The thresholded single-channel image, or None (after printing an
        error) when OpenCV cannot read the file.
    """
    sheet = cv2.imread(image_path)
    if sheet is None:
        print(f"Error: Unable to load image at {image_path}")
        return None

    smoothed = cv2.GaussianBlur(cv2.cvtColor(sheet, cv2.COLOR_BGR2GRAY), (5, 5), 0)

    inverted = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        8,
    )

    # Undo the inversion: pixels the threshold marked 255 (at or below their
    # local mean minus 8) become 0 and all others become 255.
    return cv2.bitwise_not(inverted)

def extract_characters(binary_image, rows, cols, crop_percent=0.1):
    """Cut a sheet image into a ``rows`` x ``cols`` grid of cell images.

    Every cell is ``width // cols`` by ``height // rows`` pixels; a margin of
    ``crop_percent`` of the cell size is trimmed from each of its four sides.

    Args:
        binary_image: 2-D array holding the sheet, or None.
        rows: Number of grid rows on the sheet.
        cols: Number of grid columns on the sheet.
        crop_percent: Fraction of the cell width/height removed per side.

    Returns:
        A list of array slices in row-major order (left to right, then top to
        bottom); an empty list when ``binary_image`` is None.
    """
    if binary_image is None:
        return []

    sheet_h, sheet_w = binary_image.shape
    cell_w = sheet_w // cols
    cell_h = sheet_h // rows
    margin_x = int(cell_w * crop_percent)
    margin_y = int(cell_h * crop_percent)

    cells = []
    for r in range(rows):
        # Vertical extent is shared by every cell of this row.
        top = max(r * cell_h + margin_y, 0)
        bottom = min((r + 1) * cell_h - margin_y, sheet_h)
        for c in range(cols):
            left = max(c * cell_w + margin_x, 0)
            right = min((c + 1) * cell_w - margin_x, sheet_w)
            cells.append(binary_image[top:bottom, left:right])

    return cells

def save_characters(characters, output_folder, sheet_index, folder_start_number):
    """Save each extracted cell image as a PNG inside its own character folder.

    The cell at 0-based position ``index`` in ``characters`` is written to
    ``<output_folder>/character_<folder_start_number + index + 1>/<sheet_index + 1>.png``.

    Args:
        characters: Iterable of cell images, in grid order.
        output_folder: Root folder for the ``character_<n>`` sub-folders.
        sheet_index: 0-based index of the source sheet; names the PNG file.
        folder_start_number: Offset added to ``index + 1`` to number the folder.

    Returns:
        None. A write that OpenCV reports as failed is announced with a
        printed warning instead of passing silently.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(output_folder, exist_ok=True)

    for index, character in enumerate(characters):
        character_folder_number = folder_start_number + index + 1
        character_folder = os.path.join(output_folder, f'character_{character_folder_number}')
        os.makedirs(character_folder, exist_ok=True)

        character_path = os.path.join(character_folder, f'{sheet_index + 1}.png')

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(character_path, character):
            print(f"Warning: could not write {character_path}")

def process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number=1):
    """Binarize, split and save every ``.jpg`` sheet found in ``input_folder``.

    Sheets are handled in sorted file-name order. The position of a sheet in
    that sorted list is passed on as its sheet index, so a sheet that fails to
    load still occupies its index.

    Args:
        input_folder: Folder that holds the sheet images.
        output_folder: Root folder handed to ``save_characters``.
        rows: Grid rows per sheet.
        cols: Grid columns per sheet.
        folder_start_number: Folder-number offset handed to ``save_characters``.

    Returns:
        None. Prints a message and returns early when the folder is missing or
        contains no file ending in ``.jpg`` (case-insensitive).
    """
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        return

    jpg_names = [name for name in os.listdir(input_folder) if name.lower().endswith('.jpg')]
    jpg_names.sort()
    if len(jpg_names) == 0:
        print(f"No .jpg files found in the input folder: {input_folder}")
        return

    print(f"Processing {len(jpg_names)} sheets from folder: {input_folder}")

    for position, name in enumerate(jpg_names):
        full_path = os.path.join(input_folder, name)
        print(f"Processing sheet: {full_path}")

        sheet = preprocess_sheet(full_path)
        if sheet is not None:
            cells = extract_characters(sheet, rows, cols)
            save_characters(cells, output_folder, position, folder_start_number)

# Parameters
input_folder = 'dataset/Lipi_10'  # Folder scanned for .jpg sheet images
output_folder = 'preprocessing'  # Root folder that receives the character_<n> sub-folders
rows = 6  # Grid rows per sheet
cols = 9  # Grid columns per sheet
# Folder-number offset, not the first folder: save_characters adds (cell index + 1),
# so the 6 * 9 = 54 cells of each sheet land in character_449 ... character_502.
folder_start_number = 448

# Process all sheets (each sheet contributes one PNG to every character folder)
process_all_sheets(input_folder, output_folder, rows, cols, folder_start_number)

Processing 9 sheets from folder: dataset/Lipi_10
Processing sheet: dataset/Lipi_10\Lipi 1_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 2_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 3_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 4_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 5_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 6_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 7_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 8_0001_page-0009.jpg
Processing sheet: dataset/Lipi_10\Lipi 9_0001_page-0009.jpg


In [67]:
# BINARIZATION
import cv2
import numpy as np
import os

def preprocess_character_image(image, target_size=(50, 50)):
    """Binarize one character image, resize it and scale it to ``[0, 1]``.

    Steps: convert to grayscale when the array has three dimensions, apply a
    5x5 Gaussian blur, run an inverted mean adaptive threshold (block size 11,
    constant 8), resize to ``target_size`` with area interpolation, then
    divide by 255.0.

    Args:
        image: Character image as an array (grayscale or 3-channel BGR).
        target_size: Size handed to ``cv2.resize``; defaults to ``(50, 50)``.

    Returns:
        The resized image as floats (pixel values divided by 255.0).
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image

    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    thresholded = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        8,
    )

    # Area interpolation averages source pixels, so the resized image can
    # contain intermediate grey levels, not only 0 and 255.
    scaled = cv2.resize(thresholded, target_size, interpolation=cv2.INTER_AREA)

    return scaled / 255.0

def process_dataset(input_folder, output_folder, target_size=(64, 64)):
    """Binarize every character image and mirror the folder layout on output.

    Each sub-folder of ``input_folder`` is recreated under ``output_folder``;
    every readable image inside is passed through
    ``preprocess_character_image`` and saved under the same file name.

    Args:
        input_folder: Folder containing one sub-folder per character.
        output_folder: Destination root; created when missing.
        target_size: Size forwarded to ``preprocess_character_image``.

    Returns:
        None. Files that cannot be read or written are reported with a
        printed message and skipped.
    """
    os.makedirs(output_folder, exist_ok=True)

    for character_folder in os.listdir(input_folder):
        character_path = os.path.join(input_folder, character_folder)

        # Only sub-folders hold character images; skip loose files.
        if not os.path.isdir(character_path):
            continue

        character_output_folder = os.path.join(output_folder, character_folder)
        os.makedirs(character_output_folder, exist_ok=True)

        for img_file in os.listdir(character_path):
            img_path = os.path.join(character_path, img_file)

            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                print(f"Error loading image: {img_path}")
                continue

            preprocessed_image = preprocess_character_image(image, target_size)

            # The image arrives scaled to [0, 1]. Round before casting back to
            # uint8: a plain astype() truncates, so a value that the float
            # round-trip left just below an integer would lose a grey level.
            pixels = np.clip(np.rint(preprocessed_image * 255), 0, 255).astype(np.uint8)

            output_image_path = os.path.join(character_output_folder, img_file)
            # cv2.imwrite signals failure through its return value.
            if not cv2.imwrite(output_image_path, pixels):
                print(f"Error saving image: {output_image_path}")

input_folder = 'preprocessing'   # Input: character_<n> folders written by the preprocessing cells
output_folder = 'binarization'  # Output root; the same sub-folder names are recreated here
# Passed explicitly, so this overrides process_dataset's own (64, 64) default.
target_size = (50, 50)  # Target size for each character image

# Process the dataset
process_dataset(input_folder, output_folder, target_size)

In [69]:
# AUGMENTATION
import cv2
import os
import numpy as np
def augment_images_with_slant(input_folder, output_folder, target_size=(500, 500)):
    """Write a resized copy and two rotated variants of every image.

    Walks ``input_folder`` recursively and mirrors its sub-folder layout under
    ``output_folder``. For each file three images are saved:
    ``original_<file>``, ``left_slanted_<file>`` (rotated by +10 degrees) and
    ``right_slanted_<file>`` (rotated by -10 degrees), all at ``target_size``.

    Args:
        input_folder: Root folder to walk.
        output_folder: Root folder for the augmented images.
        target_size: ``(width, height)`` used for resizing and warping.

    Returns:
        None. A file that cannot be read or processed is reported with a
        printed message and skipped; failed writes are reported as well.
    """
    left_slant_angle = 10  # degrees
    right_slant_angle = -10  # degrees

    # OpenCV takes the rotation centre as (x, y). target_size is (width, height),
    # so the centre is (width // 2, height // 2); the swapped order only
    # happened to work for square sizes.
    center = (target_size[0] // 2, target_size[1] // 2)

    # The matrices depend only on target_size, so build them once.
    left_rotation_matrix = cv2.getRotationMatrix2D(center, left_slant_angle, 1)
    right_rotation_matrix = cv2.getRotationMatrix2D(center, right_slant_angle, 1)

    for root, _dirs, files in os.walk(input_folder):
        output_subfolder = os.path.relpath(root, input_folder)
        output_subfolder_path = os.path.join(output_folder, output_subfolder)

        for file in files:
            input_path = os.path.join(root, file)

            # Ensure the output subfolder exists
            os.makedirs(output_subfolder_path, exist_ok=True)

            try:
                original_image = cv2.imread(input_path)
                if original_image is None:
                    # imread returns None instead of raising for unreadable files.
                    print(f"Error processing image {input_path}: file could not be read as an image")
                    continue

                resized_image = cv2.resize(original_image, target_size)

                # Mask that is 255 wherever the grayscale value is <= 1 and 0 elsewhere.
                gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
                _, binary_mask = cv2.threshold(gray_image, 1, 255, cv2.THRESH_BINARY)
                binary_mask = cv2.bitwise_not(binary_mask)

                variants = [("original", resized_image)]
                for prefix, matrix in (
                    ("left_slanted", left_rotation_matrix),
                    ("right_slanted", right_rotation_matrix),
                ):
                    # Rotate the image (border filled with black) and the mask
                    # (border filled with 255) with the same transform.
                    slanted_image = cv2.warpAffine(
                        resized_image, matrix, target_size,
                        borderMode=cv2.BORDER_CONSTANT, borderValue=(0, 0, 0),
                    )
                    slanted_mask = cv2.warpAffine(
                        binary_mask, matrix, target_size,
                        borderMode=cv2.BORDER_CONSTANT, borderValue=255,
                    )
                    # Force every pixel the rotated mask marks as 255 to black.
                    slanted_image[slanted_mask == 255] = (0, 0, 0)
                    variants.append((prefix, slanted_image))

                for prefix, variant in variants:
                    variant_path = os.path.join(output_subfolder_path, f"{prefix}_{file}")
                    # cv2.imwrite signals failure through its return value.
                    if not cv2.imwrite(variant_path, variant):
                        print(f"Error processing image {input_path}: could not write {variant_path}")

            except Exception as e:
                # Best effort: report the failing file and continue with the rest.
                print(f"Error processing image {input_path}: {e}")

if __name__ == "__main__":
    # Input: the binarized character folders; output: the same folder layout
    # with three images (original / left_slanted / right_slanted) per input file.
    input_folder = 'binarization' # Change this to your input images folder path
    output_folder = 'augmentation' # Change this to your output folder for augmented images

    # No target_size is passed, so the function's (500, 500) default applies
    # and every saved image is resized to that size.
    augment_images_with_slant(input_folder, output_folder)

In [71]:
# MAPPING
import os
import json

def map_tulu_to_kannada(root_folder, output_file="tulu_to_kannada_mapping.json"):
    """Assign a Kannada label to every character folder and save it as JSON.

    Sub-folders of ``root_folder`` are ordered by the number formed from the
    decimal digits in their name (``character_2`` before ``character_10``);
    folders without any digit are placed after all numbered ones, sorted by
    name. The folder at position ``i`` is labelled with entry ``i`` of the
    built-in Kannada character list.

    Args:
        root_folder: Folder whose sub-folders are the character classes.
        output_file: Path of the UTF-8 JSON file to write.

    Returns:
        The ``{folder_name: kannada_label}`` dict, or None (after printing an
        error) when there are more folders than labels.
    """
    # Kannada characters list (ensure it matches the dataset size)
    kannada_characters = [
        "_", "ಅ", "ಆ", "ಇ", "ಈ", "ಉ", "ಊ", "ಋ", "ೠ", "ಎ", "ಏ", "ಐ", "ಒ", "ಔ", "ಅಂ", "ಅಃ",
        # ಕ family
        "ಕ", "ಕಾ", "ಕಿ", "ಕೀ", "ಕು", "ಕೂ", "ಕೃ", "ಕೆ", "ಕೈ", "ಕೊ", "ಕೌ", "ಕಂ", "ಕಃ",
        # ಖ family
        "ಖ", "ಖಾ", "ಖಿ", "ಖೀ", "ಖು", "ಖೂ", "ಖೃ", "ಖೆ", "ಖೈ", "ಖೊ", "ಖೌ", "ಖಂ", "ಖಃ",
        # ಗ family
        "ಗ", "ಗಾ", "ಗಿ", "ಗೀ", "ಗು", "ಗೂ", "ಗೃ", "ಗೆ", "ಗೈ", "ಗೊ", "ಗೌ", "ಗಂ", "ಗಃ",
        # ಘ family
        "ಘ", "ಘಾ", "ಘಿ", "ಘೀ", "ಘು", "ಘೂ", "ಘೃ", "ಘೆ", "ಘೈ", "ಘೊ", "ಘೌ", "ಘಂ", "ಘಃ",
        # ಙ family
        "ಙ", "ಙಾ", "ಙಿ", "ಙೀ", "ಙು", "ಙೂ", "ಙೃ", "ಙೆ", "ಙೈ", "ಙೊ", "ಙೌ", "ಙಂ", "ಙಃ",
        # ಚ family
        "ಚ", "ಚಾ", "ಚಿ", "ಚೀ", "ಚು", "ಚೂ", "ಚೃ", "ಚೆ", "ಚೈ", "ಚೊ", "ಚೌ", "ಚಂ", "ಚಃ",
        # ಛ family
        "ಛ", "ಛಾ", "ಛಿ", "ಛೀ", "ಛು", "ಛೂ", "ಛೃ", "ಛೆ", "ಛೈ", "ಛೊ", "ಛೌ", "ಛಂ", "ಛಃ",
        # ಜ family
        "ಜ", "ಜಾ", "ಜಿ", "ಜೀ", "ಜು", "ಜೂ", "ಜೃ", "ಜೆ", "ಜೈ", "ಜೊ", "ಜೌ", "ಜಂ", "ಜಃ",
        # ಝ family
        "ಝ", "ಝಾ", "ಝಿ", "ಝೀ", "ಝು", "ಝೂ", "ಝೃ", "ಝೆ", "ಝೈ", "ಝೊ", "ಝೌ", "ಝಂ", "ಝಃ",
        # ಞ family
        "ಞ", "ಞಾ", "ಞಿ", "ಞೀ", "ಞು", "ಞೂ", "ಞೃ", "ಞೆ", "ಞೈ", "ಞೊ", "ಞೌ", "ಞಂ", "ಞಃ",
        # ಟ family
        "ಟ", "ಟಾ", "ಟಿ", "ಟೀ", "ಟು", "ಟೂ", "ಟೃ", "ಟೆ", "ಟೈ", "ಟೊ", "ಟೌ", "ಟಂ", "ಟಃ",
        # ಠ family
        "ಠ", "ಠಾ", "ಠಿ", "ಠೀ", "ಠು", "ಠೂ", "ಠೃ", "ಠೆ", "ಠೈ", "ಠೊ", "ಠೌ", "ಠಂ", "ಠಃ",
        # ಡ family (12th entry was "ಡುಂ", a typo: every family uses consonant + ಂ here)
        "ಡ", "ಡಾ", "ಡಿ", "ಡೀ", "ಡು", "ಡೂ", "ಡೃ", "ಡೆ", "ಡೈ", "ಡೊ", "ಡೌ", "ಡಂ", "ಡಃ",
        # ಢ family
        "ಢ", "ಢಾ", "ಢಿ", "ಢೀ", "ಢು", "ಢೂ", "ಢೃ", "ಢೆ", "ಢೈ", "ಢೊ", "ಢೌ", "ಢಂ", "ಢಃ",
        # ಣ family
        "ಣ", "ಣಾ", "ಣಿ", "ಣೀ", "ಣು", "ಣೂ", "ಣೃ", "ಣೆ", "ಣೈ", "ಣೊ", "ಣೌ", "ಣಂ", "ಣಃ",
        # ತ family
        "ತ", "ತಾ", "ತಿ", "ತೀ", "ತು", "ತೂ", "ತೃ", "ತೆ", "ತೈ", "ತೊ", "ತೌ", "ತಂ", "ತಃ",
        # ಥ family
        "ಥ", "ಥಾ", "ಥಿ", "ಥೀ", "ಥು", "ಥೂ", "ಥೃ", "ಥೆ", "ಥೈ", "ಥೊ", "ಥೌ", "ಥಂ", "ಥಃ",
        # ದ family
        "ದ", "ದಾ", "ದಿ", "ದೀ", "ದು", "ದೂ", "ದೃ", "ದೆ", "ದೈ", "ದೊ", "ದೌ", "ದಂ", "ದಃ",
        # ಧ family
        "ಧ", "ಧಾ", "ಧಿ", "ಧೀ", "ಧು", "ಧೂ", "ಧೃ", "ಧೆ", "ಧೈ", "ಧೊ", "ಧೌ", "ಧಂ", "ಧಃ",
        # ನ family
        "ನ", "ನಾ", "ನಿ", "ನೀ", "ನು", "ನೂ", "ನೃ", "ನೆ", "ನೈ", "ನೊ", "ನೌ", "ನಂ", "ನಃ",
        # ಪ family
        "ಪ", "ಪಾ", "ಪಿ", "ಪೀ", "ಪು", "ಪೂ", "ಪೃ", "ಪೆ", "ಪೈ", "ಪೊ", "ಪೌ", "ಪಂ", "ಪಃ",
        # ಫ family
        "ಫ", "ಫಾ", "ಫಿ", "ಫೀ", "ಫು", "ಫೂ", "ಫೃ", "ಫೆ", "ಫೈ", "ಫೊ", "ಫೌ", "ಫಂ", "ಫಃ",
        # ಬ family
        "ಬ", "ಬಾ", "ಬಿ", "ಬೀ", "ಬು", "ಬೂ", "ಬೃ", "ಬೆ", "ಬೈ", "ಬೊ", "ಬೌ", "ಬಂ", "ಬಃ",
        # ಭ family
        "ಭ", "ಭಾ", "ಭಿ", "ಭೀ", "ಭು", "ಭೂ", "ಭೃ", "ಭೆ", "ಭೈ", "ಭೊ", "ಭೌ", "ಭಂ", "ಭಃ",
        # ಮ family
        "ಮ", "ಮಾ", "ಮಿ", "ಮೀ", "ಮು", "ಮೂ", "ಮೃ", "ಮೆ", "ಮೈ", "ಮೊ", "ಮೌ", "ಮಂ", "ಮಃ",
        # ಯ family
        "ಯ", "ಯಾ", "ಯಿ", "ಯೀ", "ಯು", "ಯೂ", "ಯೃ", "ಯೆ", "ಯೈ", "ಯೊ", "ಯೌ", "ಯಂ", "ಯಃ",
        # ರ family
        "ರ", "ರಾ", "ರಿ", "ರೀ", "ರು", "ರೂ", "ರೃ", "ರೆ", "ರೈ", "ರೊ", "ರೌ", "ರಂ", "ರಃ",
        # ಲ family
        "ಲ", "ಲಾ", "ಲಿ", "ಲೀ", "ಲು", "ಲೂ", "ಲೃ", "ಲೆ", "ಲೈ", "ಲೊ", "ಲೌ", "ಲಂ", "ಲಃ",
        # ವ family
        "ವ", "ವಾ", "ವಿ", "ವೀ", "ವು", "ವೂ", "ವೃ", "ವೆ", "ವೈ", "ವೊ", "ವೌ", "ವಂ", "ವಃ",
        # ಶ family
        "ಶ", "ಶಾ", "ಶಿ", "ಶೀ", "ಶು", "ಶೂ", "ಶೃ", "ಶೆ", "ಶೈ", "ಶೊ", "ಶೌ", "ಶಂ", "ಶಃ",
        # ಷ family
        "ಷ", "ಷಾ", "ಷಿ", "ಷೀ", "ಷು", "ಷೂ", "ಷೃ", "ಷೆ", "ಷೈ", "ಷೊ", "ಷೌ", "ಷಂ", "ಷಃ",
        # ಸ family
        "ಸ", "ಸಾ", "ಸಿ", "ಸೀ", "ಸು", "ಸೂ", "ಸೃ", "ಸೆ", "ಸೈ", "ಸೊ", "ಸೌ", "ಸಂ", "ಸಃ",
        # ಹ family
        "ಹ", "ಹಾ", "ಹಿ", "ಹೀ", "ಹು", "ಹೂ", "ಹೃ", "ಹೆ", "ಹೈ", "ಹೊ", "ಹೌ", "ಹಂ", "ಹಃ",
        # ಳ family
        "ಳ", "ಳಾ", "ಳಿ", "ಳೀ", "ಳು", "ಳೂ", "ಳೃ", "ಳೆ", "ಳೈ", "ಳೊ", "ಳೌ", "ಳಂ", "ಳಃ"
    ]

    def _folder_sort_key(name):
        # Always return a tuple of the same shape so that numbered and
        # un-numbered names can be compared; mixing int and str keys raises
        # TypeError during sorting.
        digits = ''.join(ch for ch in name if ch.isdecimal())
        if digits:
            return (0, int(digits), name)
        return (1, 0, name)

    # Get all folder names in the root directory
    folder_names = [f for f in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, f))]

    # Numeric ordering for numbered folders; un-numbered folders go last so
    # they cannot shift the labels of the numbered ones.
    folder_names_sorted = sorted(folder_names, key=_folder_sort_key)

    # Check if we have enough Kannada characters
    if len(folder_names_sorted) > len(kannada_characters):
        print("Error: Not enough Kannada characters available for mapping.")
        return

    # zip stops at the shorter sequence, i.e. at the last folder.
    mapping = dict(zip(folder_names_sorted, kannada_characters))

    # ensure_ascii=False keeps the Kannada text readable in the JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=4)

    print(f"Mapping saved to '{output_file}'.")
    return mapping

# Example usage: build the folder -> Kannada label mapping for the augmented dataset
if __name__ == "__main__":
    root_folder = 'augmentation'  # Update with your dataset path
    # Writes tulu_to_kannada_mapping.json (the function's default output_file).
    mapping = map_tulu_to_kannada(root_folder)
    # To inspect the result, uncomment: print(mapping)

Mapping saved to 'tulu_to_kannada_mapping.json'.


In [73]:
# MODEL TRAINING
import os
import cv2
import numpy as np
import json
import joblib
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load Dataset with Mapped Kannada Labels
def load_dataset(root_folder, mapping_file, img_size=(64, 64)):
    """Load every mapped character image together with its Kannada label.

    Args:
        root_folder: Folder containing one sub-folder per character.
        mapping_file: UTF-8 JSON file mapping folder name to Kannada label.
        img_size: Size every image is resized to.

    Returns:
        ``(images, labels)`` as NumPy arrays. Folders listed in the mapping
        but missing on disk are skipped; a file that raises while being read
        or resized is reported with a printed message and skipped.
    """
    with open(mapping_file, 'r', encoding='utf-8') as handle:
        label_by_folder = json.load(handle)

    images = []
    labels = []

    for folder in tqdm(label_by_folder, desc="Loading Data"):
        folder_path = os.path.join(root_folder, folder)
        if not os.path.isdir(folder_path):
            continue

        for img_name in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_name)
            try:
                # imread yields None for unreadable files; resize then raises
                # and the file ends up in the except branch.
                gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                gray = cv2.resize(gray, img_size)
                images.append(gray)
                labels.append(label_by_folder[folder])
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")

    return np.array(images), np.array(labels)

# Extract Features using HOG
def extract_features(images):
    """Compute a HOG descriptor for every image.

    Args:
        images: Iterable of 2-D grayscale images of identical size.

    Returns:
        NumPy array with one HOG feature vector per image, in input order.
    """
    hog_settings = dict(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
    )
    return np.array([
        hog(image, **hog_settings)
        for image in tqdm(images, desc="Extracting Features")
    ])

# Train and Evaluate SVM Model
def train_svm(features, labels):
    """Fit a linear SVM on an 80/20 split and print its evaluation.

    Args:
        features: Feature matrix, one row per sample.
        labels: Label for each row of ``features``.

    Returns:
        The fitted ``SVC`` (linear kernel, C=1.0). Accuracy and the
        classification report for the held-out 20 % are printed.
    """
    # Fixed random_state keeps the split identical between runs.
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    classifier = SVC(kernel='linear', C=1.0)
    print("\nTraining SVM Model...")
    classifier.fit(train_X, train_y)

    predicted = classifier.predict(test_X)

    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy_score(test_y, predicted):.4f}")
    report = classification_report(test_y, predicted, zero_division=0)
    print("Classification Report:\n", report)

    return classifier

# Save Model to File
def save_model(model, filename="tulu_ocr_model.pkl"):
    """Serialize ``model`` to ``filename`` with joblib and print a confirmation.

    Args:
        model: Object to persist (the trained classifier).
        filename: Destination path; defaults to ``tulu_ocr_model.pkl``.
    """
    joblib.dump(model, filename)
    confirmation = f"\nModel saved as '{filename}'."
    print(confirmation)

# Main execution: load -> HOG features -> train/evaluate SVM -> save model
if __name__ == "__main__":
    root_folder = 'augmentation'  # Update dataset path
    mapping_file = "tulu_to_kannada_mapping.json"  # Written by the mapping cell

    # Load images and labels (img_size is left at load_dataset's (64, 64) default)
    images, labels = load_dataset(root_folder, mapping_file)

    # Extract HOG features
    features = extract_features(images)

    # Train SVM model (prints accuracy and the classification report)
    model = train_svm(features, labels)

    # Save trained model (default file name: tulu_ocr_model.pkl)
    save_model(model)

Loading Data: 100%|██████████████████████████████████████████████████████████████████| 458/458 [03:36<00:00,  2.11it/s]
Extracting Features: 100%|██████████████████████████████████████████████████████| 17310/17310 [00:30<00:00, 576.12it/s]



Training SVM Model...

Model Evaluation:
Accuracy: 0.7158
Classification Report:
               precision    recall  f1-score   support

           _       0.81      0.84      0.83        76
           ಅ       0.62      0.85      0.72        66
          ಅಂ       0.71      0.66      0.68        80
          ಅಃ       0.58      0.73      0.65        63
           ಆ       0.83      0.90      0.86        58
           ಇ       0.69      0.84      0.76        69
           ಈ       0.73      0.81      0.77        73
           ಉ       0.66      0.81      0.72        52
           ಊ       0.78      0.82      0.80        71
           ಋ       0.87      0.92      0.90        66
           ಎ       0.81      0.85      0.83        66
           ಏ       0.81      0.88      0.84        73
           ಐ       0.84      0.85      0.85        62
           ಒ       0.86      0.85      0.86        73
           ಔ       0.79      0.77      0.78        62
           ಕ       0.71      0.83      0.77         

In [75]:
# PREDICTION
import cv2
import numpy as np
import joblib
from skimage.feature import hog
import json

# Load trained model
# NOTE: joblib.load unpickles the file, which can run arbitrary code - only
# load model files that you produced yourself or otherwise trust.
model_filename = "tulu_ocr_model.pkl"
svm_model = joblib.load(model_filename)

# Load mapping file
# NOTE(review): character_mapping is not referenced by the functions defined in
# this cell; the model already predicts Kannada labels directly. Confirm
# whether it is still needed.
with open("tulu_to_kannada_mapping.json", "r", encoding="utf-8") as f:
    character_mapping = json.load(f)

# Function to extract features
def extract_features(image_path, img_size=(64, 64)):
    """Read one image file and return its HOG descriptor as a single row.

    Note: this redefines the ``extract_features`` of the training cell, which
    takes an array of images instead of a path.

    Args:
        image_path: Path of the image to describe.
        img_size: Size the image is resized to before HOG extraction.

    Returns:
        Array of shape ``(1, n_features)``, or None (after printing an error)
        when the image cannot be read.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        print(f"Error: Could not load image at {image_path}")
        return None

    hog_vector = hog(
        cv2.resize(gray, img_size),
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
    )

    # Reshape to one row, the layout passed to svm_model.predict.
    return np.array(hog_vector).reshape(1, -1)

# Function to predict character
def predict_character(image_path):
    """Predict the Kannada character shown in the image at ``image_path``.

    Args:
        image_path: Path of the character image.

    Returns:
        The label predicted by ``svm_model``, or the string ``"Unknown"`` when
        the image cannot be loaded. The prediction is also printed.
    """
    features = extract_features(image_path)

    if features is None:
        print("Prediction aborted due to image loading error.")
        return "Unknown"

    prediction_character = svm_model.predict(features)[0]  # Get predicted label

    print(f"Predicted Kannada Character: {prediction_character}")

    # Return the label on success too, so both branches give callers a result
    # (previously the success path returned None).
    return prediction_character


# Example usage: classify one image from the augmented dataset
if __name__ == "__main__":
    # NOTE(review): this file comes from the folder tree the model was trained
    # on, so it may have been part of the training split - use an unseen image
    # for a meaningful check.
    test_image_path = "augmentation/character_5/left_slanted_3.png"
    predict_character(test_image_path)

Predicted Kannada Characetr: ಈ
