In [12]:
import xml.etree.ElementTree as ET

def extract_annotations(inkml_path):
    """Read annotations, annotationXML blocks and pen traces from an InkML file.

    Only elements that are direct children of the document root and live in
    the InkML namespace (``http://www.w3.org/2003/InkML``) are collected.

    Args:
        inkml_path: File name or file object accepted by ``ET.parse``.

    Returns:
        A 3-tuple ``(annotations, annotation_xml, traces)``:

        * ``annotations`` -- dict mapping each ``<annotation>``'s ``type``
          attribute to its text (``None`` when the element has no text).
        * ``annotation_xml`` -- dict mapping each ``<annotationXML>``'s
          ``type`` attribute to the element serialized as a unicode string.
        * ``traces`` -- list of ``(trace_id, points)`` in document order,
          where ``points`` is a list of float tuples, one tuple per
          comma-separated point. A trace without text yields ``[]``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
        ValueError: If a trace coordinate is not a valid float.
    """
    root = ET.parse(inkml_path).getroot()
    namespace = {'ink': 'http://www.w3.org/2003/InkML'}

    # When two elements share a type, the later one wins.
    annotations = {
        node.get('type'): node.text
        for node in root.findall('ink:annotation', namespace)
    }

    annotation_xml = {
        node.get('type'): ET.tostring(node, encoding='unicode')
        for node in root.findall('ink:annotationXML', namespace)
    }

    traces = []
    for trace in root.findall('ink:trace', namespace):
        # Split on the bare comma rather than ', ' so that "x y,x y",
        # "x y,\nx y" and a trailing comma all parse; blank chunks are skipped.
        # An empty <trace/> has text None and is treated as having no points.
        raw_points = (trace.text or '').split(',')
        trace_points = [
            tuple(map(float, point.split()))
            for point in raw_points
            if point.strip()
        ]
        traces.append((trace.get('id'), trace_points))

    return annotations, annotation_xml, traces

In [13]:
import os

# Directories holding the InkML sources used for the training split.
train_folders = [
    "archive/CROHME_training_2011/",
    "archive/MatricesTrain2014/",
    "archive/trainData_2012_part1/",
    "archive/trainData_2012_part2/",
    "archive/TrainINKML_2013/",
]

# Directories holding the InkML sources used for the test split.
test_folders = [
    "archive/CROHME_test_2011",
    "archive/MatricesTest2014/",
    "archive/testData_2012/",
    "archive/TestINKML_2013/",
]

# Fail fast, before any conversion work starts, if an expected directory
# is missing from the working tree.
for folder in (*train_folders, *test_folders):
    assert os.path.exists(folder), f"Path {folder} does not exist"

In [15]:
import csv
import os
from inkml2png import inkml2img

# Convert every training InkML file to a numbered PNG and record its
# annotations in a CSV index, one row per successfully converted file.
train_counter = 0  # index of the next PNG; also used as its file name stem
train_output_folder = 'data/train/'
csv_file_path = 'data/annotations.csv'
successes = 0
failures = 0
failed_files = []  # input paths that could not be converted

# Annotation types copied into the CSV, in column order. Note the lookup key
# is 'UI' (upper case) while the CSV header column is named 'ui'.
annotation_keys = ['age', 'gender', 'hand', 'writer', 'truth', 'UI', 'copyright']

os.makedirs(train_output_folder, exist_ok=True)

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['path', 'age', 'gender', 'hand', 'writer', 'truth', 'ui', 'copyright'])

    for folder in train_folders:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # PNG numbering reproducible across runs and machines.
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith('.inkml'):
                continue

            input_path = os.path.join(folder, filename)
            output_path = os.path.join(train_output_folder, f'{train_counter}.png')

            try:
                # Parse first: it is the cheap step, and failing here means
                # no image is rendered for a file that gets no CSV row.
                annotations, _, _ = extract_annotations(input_path)
                inkml2img(input_path, output_path)
                csv_writer.writerow(
                    [f'{train_counter}.png']
                    + [annotations.get(key, '') for key in annotation_keys]
                )
            except Exception as exc:
                # Best-effort batch: record the failure and keep going, but
                # report why it failed and let KeyboardInterrupt/SystemExit
                # propagate (a bare `except:` would swallow those too).
                print(f'Failed: {input_path} ({type(exc).__name__}: {exc})')
                failures += 1
                failed_files.append(input_path)
                # Remove any image left at this index so the output folder
                # only contains PNGs that have a matching CSV row.
                if os.path.exists(output_path):
                    os.remove(output_path)
            else:
                print(f'Converted: {input_path} -> {output_path}')
                train_counter += 1
                successes += 1

print(f'Successes: {successes}, Failures: {failures}')

Converted: archive/CROHME_training_2011/TrainData1_0_sub_1.inkml -> data/train/0.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_12.inkml -> data/train/1.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_15.inkml -> data/train/2.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_19.inkml -> data/train/3.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_21.inkml -> data/train/4.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_24.inkml -> data/train/5.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_28.inkml -> data/train/6.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_3.inkml -> data/train/7.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_5.inkml -> data/train/8.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_7.inkml -> data/train/9.png
Converted: archive/CROHME_training_2011/TrainData1_0_sub_9.inkml -> data/train/10.png
Converted: archive/CROHME_training_2011/TrainData1_1_sub_1

<Figure size 640x480 with 0 Axes>