In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [5]:
# Ensure a directory (and any missing parents) exists
def create_dir(directory):
    """Create ``directory`` together with any missing parent directories.

    Calling this on a directory that already exists is a no-op.
    ``exist_ok=True`` is used instead of checking ``os.path.exists`` first,
    because the separate check could race with another process creating the
    same directory and then fail inside ``os.makedirs``.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

In [6]:

# Convert one coordinate token of a trace into the integer used for drawing
def _parse_axis_value(raw_value):
    """Convert a single coordinate token to an integer.

    Integer-valued tokens are rounded as they are; every other token is
    multiplied by 10000 before rounding.

    NOTE(review): the two cases use different scales, so a trace that mixes
    whole and fractional values ends up distorted — confirm this is intended.
    """
    value = float(raw_value)
    return round(value) if value.is_integer() else round(value * 10000)


# Extract trace data from an InkML file
def get_traces_data(inkml_file_abs_path):
    """Read the strokes of an InkML file, grouped by symbol when possible.

    Args:
        inkml_file_abs_path: Path of the InkML file to parse.

    Returns:
        A list of dicts. When the document has a top-level ``traceGroup``
        element, each ``traceGroup`` directly inside it yields
        ``{'label': <annotation text>, 'trace_group': [stroke, ...]}``; the
        ``'label'`` key is left out when the group has no annotation text.
        Otherwise every trace yields ``{'trace_group': [stroke]}``, ordered
        by numeric trace id. A stroke is a list of points and a point is a
        list of ints, one per value found for that point.

    Raises:
        KeyError: If a ``traceView`` references a trace id that is not
            present in the file.
    """
    doc_namespace = "{http://www.w3.org/2003/InkML}"
    root = ET.parse(inkml_file_abs_path).getroot()

    # Index strokes by their numeric id. Looking them up by id (rather than
    # by position in an id-sorted list) keeps traceDataRef correct even when
    # the ids do not form the exact sequence 0..n-1.
    coords_by_id = {}
    for trace_tag in root.findall(doc_namespace + 'trace'):
        points = []
        # Points are comma separated; the values of one point are separated
        # by any whitespace. Empty chunks (empty trace, trailing comma) are
        # skipped instead of being fed to float().
        for point_text in (trace_tag.text or '').split(','):
            tokens = point_text.split()
            if tokens:
                points.append([_parse_axis_value(token) for token in tokens])
        coords_by_id[int(trace_tag.get('id'))] = points

    trace_group_wrapper = root.find(doc_namespace + 'traceGroup')
    if trace_group_wrapper is None:
        # No grouping information: every trace becomes its own group.
        return [{'trace_group': [coords_by_id[trace_id]]}
                for trace_id in sorted(coords_by_id)]

    traces_data = []
    for trace_group in trace_group_wrapper.findall(doc_namespace + 'traceGroup'):
        entry = {}
        annotation = trace_group.find(doc_namespace + 'annotation')
        if annotation is not None and annotation.text is not None:
            entry['label'] = annotation.text

        strokes = []
        for trace_view in trace_group.findall(doc_namespace + 'traceView'):
            trace_ref = int(trace_view.get('traceDataRef'))
            if trace_ref not in coords_by_id:
                raise KeyError(
                    f"traceDataRef {trace_ref} does not match any trace id "
                    f"in {inkml_file_abs_path}")
            strokes.append(coords_by_id[trace_ref])
        entry['trace_group'] = strokes
        traces_data.append(entry)

    return traces_data

In [7]:
# Function to convert InkML to PNG while maintaining the structure
def inkml_to_png(input_path, output_base_path, relative_subdir):
    """Render every trace group of an InkML file to its own PNG image.

    Each image is written to
    ``<output_base_path>/<relative_subdir>/<label>/<file stem>_<idx>.png``,
    where ``idx`` is the position of the trace group in the file and
    ``label`` is the group's label ('unknown' when it has none).

    Args:
        input_path: Path of the InkML file to render.
        output_base_path: Root directory of the generated images.
        relative_subdir: Sub-directory, relative to ``output_base_path``,
            in which the label folders are created.
    """
    traces = get_traces_data(input_path)
    # splitext only strips the extension, so a name such as "a.b.inkml" keeps
    # its full stem "a.b" instead of being cut at the first dot.
    file_base_name = os.path.splitext(os.path.basename(input_path))[0]

    # Combine the output_base_path with the relative subdirectory to maintain the structure
    output_folder = os.path.join(output_base_path, relative_subdir)
    create_dir(output_folder)

    for idx, elem in enumerate(traces):
        label = elem.get('label') or 'unknown'
        if label in ['A', 'B', 'C', 'F', 'X', 'Y']:
            label = 'capital_' + label
        # A label containing a path separator (e.g. "/") would make
        # os.path.join discard output_folder or create nested folders, so
        # separators are replaced before the label is used as a folder name.
        for separator in (os.sep, os.altsep):
            if separator:
                label = label.replace(separator, '_')

        # Save in the respective label folder within the subdirectory
        labeled_output_folder = os.path.join(output_folder, label)
        create_dir(labeled_output_folder)

        # Use file_base_name with the trace-group index for unique filenames
        output_file_path = os.path.join(labeled_output_folder, f"{file_base_name}_{idx}.png")

        # One explicit figure per trace group; the finally clause closes it
        # even when plotting or saving fails, so figures never accumulate.
        fig, ax = plt.subplots()
        try:
            ax.invert_yaxis()
            ax.set_aspect('equal', adjustable='box')
            ax.axis('off')

            for stroke in elem['trace_group']:
                data = np.array(stroke)
                # Skip strokes that cannot be drawn (no points, or points
                # with fewer than two values).
                if data.ndim != 2 or data.shape[1] < 2:
                    continue
                # Only the first two values of a point are plotted; traces
                # may carry additional values per point.
                ax.plot(data[:, 0], data[:, 1], linewidth=2, color='black')

            fig.savefig(output_file_path, bbox_inches='tight', dpi=100)
        finally:
            plt.close(fig)

In [10]:
# Main processing function
def process_inkml_files(input_dir, output_dir):
    """Convert every ``.inkml`` file found under ``input_dir`` to PNG images.

    The directory tree of ``input_dir`` is mirrored below ``output_dir``;
    each file is rendered by ``inkml_to_png`` into the sub-directory that
    corresponds to its location in the input tree.

    Args:
        input_dir: Root directory that is walked recursively for InkML files.
        output_dir: Root directory that receives the generated images.
    """
    create_dir(output_dir)

    for root, _, files in os.walk(input_dir):
        # Calculate the relative subdirectory path from the input directory
        relative_subdir = os.path.relpath(root, input_dir)
        create_dir(os.path.join(output_dir, relative_subdir))

        # Filter first so the progress bar counts only the files that are
        # actually converted.
        inkml_files = [file for file in files if file.endswith('.inkml')]

        for file in tqdm(inkml_files):
            # Convert and save the PNGs in the corresponding output subdirectory.
            # No early return here: every file of every directory is processed.
            inkml_to_png(os.path.join(root, file), output_dir, relative_subdir)

In [None]:

# Example usage in Kaggle: convert the InkML files found under input_path
# and write the resulting PNG tree to output_path.
input_path = '/kaggle/input/handwritten-mathematical-expressions'
output_path = '/kaggle/working/CHROME_png1'
process_inkml_files(input_dir=input_path, output_dir=output_path)