# Step 1: Download the CROHME Dataset

You can download the CROHME dataset from a publicly available source like the following:

CROHME Official Competition Dataset

Alternatively, you can search for “CROHME Dataset” to find additional sources or archives.

In [None]:
import requests
import zipfile
import os
from google.colab.patches import cv2_imshow
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization

# Step 2: Unzip the Dataset

Once you download the dataset, unzip it using Python.

In [None]:
# URL of the dataset
url = 'https://ddfe.curtin.edu.au/563B05F210B82/CROHME.zip'

# Path to save the zip file
zip_file_path = 'CROHME.zip'

# Download the file from the URL.
# The body is streamed to disk in chunks so the whole archive is never held in
# memory, and raise_for_status() stops the cell on an HTTP error instead of
# saving an error page under the name "CROHME.zip".
print("Downloading the dataset...")
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(zip_file_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
print(f"Downloaded dataset to {zip_file_path}")

# Extract the downloaded ZIP file
print("Extracting the dataset...")
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('crohme_dataset')  # Extract to 'crohme_dataset' folder

print("Extraction completed!")

# Clean up the zip file if no longer needed
os.remove(zip_file_path)
print("ZIP file removed.")

Downloading the dataset...
Downloaded dataset to CROHME.zip
Extracting the dataset...
Extraction completed!
ZIP file removed.


# Step 3: Load the Dataset in Python

Assuming the dataset contains .inkml files (the standard format for handwritten mathematical symbols), we need to parse them. We can use libraries like xml.etree.ElementTree for parsing XML-like structures. Alternatively, if the dataset is in another format like images (e.g., .png or .jpg), we can load them using OpenCV.

Here's how you can start with the parsing and loading:

Since the dataset contains image files:

In [None]:
from google.colab.patches import cv2_imshow

# Define the path where the dataset is stored
dataset_path = 'crohme_dataset/archive/'

# The two triple-quoted blocks below are disabled code kept as string literals:
# Python evaluates them as bare string expressions, so nothing inside them runs.
# Disabled snippet 1: list the image files directly inside dataset_path and show
# the first one in an OpenCV window.
'''
# Load all images from the directory
image_files = [f for f in os.listdir(dataset_path) if f.endswith('.png') or f.endswith('.jpg')]

# Check if any image files were found
if image_files:
    # Load and display the first image as an example
    image = cv2.imread(os.path.join(dataset_path, image_files[0]), cv2.IMREAD_GRAYSCALE)
    cv2.imshow('First Image', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
else:
    print(f"No image files found in {dataset_path}")
'''
# Disabled snippet 2: walk every subfolder of dataset_path and read each image
# in grayscale; its display calls are themselves commented out.
'''
# Traverse subfolders and load images
for subdir, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith('.png') or file.endswith('.jpg'):
            # Construct the full image path
            image_path = os.path.join(subdir, file)

            # Load and display the image
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            #cv2.imshow(f"Image: {image_path}", image)
            # cv2_imshow(image) #debug
            cv2.waitKey(0)
            cv2.destroyAllWindows()
'''

'\n# Traverse subfolders and load images\nfor subdir, dirs, files in os.walk(dataset_path):\n    for file in files:\n        if file.endswith(\'.png\') or file.endswith(\'.jpg\'):\n            # Construct the full image path\n            image_path = os.path.join(subdir, file)\n\n            # Load and display the image\n            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)\n            #cv2.imshow(f"Image: {image_path}", image)\n            # cv2_imshow(image) #debug\n            cv2.waitKey(0)\n            cv2.destroyAllWindows()\n'

After unzipping the dataset, the images of digits, letters, and symbols are organized into subfolders (one subfolder per class) under the `crohme_dataset/archive` directory.

## Code to Load and Organize the Dataset

We will now write a script to traverse these sub-folders, load the images, and categorize them for training. This script will load all the images from their respective folders and associate them with their corresponding labels.

In [None]:
import os
import cv2
import numpy as np

# Directory where the dataset is located
dataset_dir = 'crohme_dataset/archive'


def load_crohme_dataset(dataset_dir):
    """Load every image found one level below *dataset_dir*.

    Each immediate subfolder is treated as one class. Its position in the
    alphabetically sorted folder list is the integer label, so the mapping
    between label and folder name is the same on every run and every machine.

    Args:
        dataset_dir: Directory whose subfolders each hold the images of one
            symbol class.

    Returns:
        A tuple ``(images, labels, class_names)``:
        ``images`` is a float32 array of shape ``(n, 64, 64, 1)`` scaled to
        ``[0, 1]``; ``labels`` is an int64 array of length ``n`` indexing into
        ``class_names``; ``class_names`` is the sorted list of folder names.

    Raises:
        OSError: If *dataset_dir* cannot be listed (for example, it does not
            exist).
    """
    image_extensions = ('.png', '.jpg')
    images = []
    labels = []
    class_names = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # folder -> label index assignment reproducible.
    for folder_name in sorted(os.listdir(dataset_dir)):
        folder_path = os.path.join(dataset_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Assign a label to each folder (symbol)
        class_names.append(folder_name)
        label = len(class_names) - 1

        # Load all images from the folder
        for file_name in sorted(os.listdir(folder_path)):
            # Case-insensitive match so files such as "x.PNG" are not dropped
            if not file_name.lower().endswith(image_extensions):
                continue

            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                continue

            img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals an unreadable file by returning None
            if img is None:
                continue

            # Resize all images to the same size and normalize to [0, 1]
            img_resized = cv2.resize(img, (64, 64))
            images.append(img_resized.astype('float32') / 255.0)
            labels.append(label)

    # Explicit dtypes keep the result types stable even when nothing was loaded
    images = np.asarray(images, dtype='float32').reshape(-1, 64, 64, 1)
    labels = np.asarray(labels, dtype='int64')

    return images, labels, class_names

# Load the dataset
# load_crohme_dataset returns the image array, one integer label per image,
# and the list of folder names that those integer labels index into.
images, labels, class_names = load_crohme_dataset(dataset_dir)

# Report how many images and how many classes (folders) were loaded
print(f"Loaded {len(images)} images with {len(class_names)} unique classes.")


Loaded 34310 images with 61 unique classes.


If the dataset contains .inkml files (common for CROHME):

You’ll need an XML parser to load the .inkml files:

In [None]:
# Disabled example kept as a string literal (it is evaluated as a bare string,
# not executed): parse one .inkml file with ElementTree and print the tag and
# attributes of each top-level element.
'''
import xml.etree.ElementTree as ET

# Path to a sample .inkml file
inkml_file_path = 'crohme_dataset/sample.inkml'

# Load and parse the .inkml file
tree = ET.parse(inkml_file_path)
root = tree.getroot()

# Display the structure of the .inkml file
for elem in root:
    print(elem.tag, elem.attrib)

'''

"\nimport xml.etree.ElementTree as ET\n\n# Path to a sample .inkml file\ninkml_file_path = 'crohme_dataset/sample.inkml'\n\n# Load and parse the .inkml file\ntree = ET.parse(inkml_file_path)\nroot = tree.getroot()\n\n# Display the structure of the .inkml file\nfor elem in root:\n    print(elem.tag, elem.attrib)\n\n"

# Step 4: Convert .inkml Data to Images (If Applicable)

In CROHME, .inkml files contain stroke data that you may want to convert into images. This step might involve converting the stroke coordinates into an image grid using matplotlib or OpenCV for visualization.

Here's an example of visualizing stroke data using matplotlib:

In [None]:
# Disabled example kept as a string literal (not executed): draw the strokes
# of an .inkml file with matplotlib.
# NOTE: the snippet calls ET.parse but does not import ET itself; enabling it
# also requires `import xml.etree.ElementTree as ET`.
'''
import matplotlib.pyplot as plt

def visualize_inkml(inkml_file_path):
    tree = ET.parse(inkml_file_path)
    root = tree.getroot()

    # Extract strokes (traces)
    strokes = []
    for trace in root.findall('{http://www.w3.org/2003/InkML}trace'):
        stroke_data = trace.text.strip().split(',')
        stroke_points = [tuple(map(float, point.split())) for point in stroke_data]
        strokes.append(stroke_points)

    # Plot strokes
    for stroke in strokes:
        x, y = zip(*stroke)
        plt.plot(x, y, color='black')

    plt.gca().invert_yaxis()  # Invert Y axis to match the image coordinate system
    plt.show()

# Visualize an inkml file
visualize_inkml('crohme_dataset/sample.inkml')
'''

"\nimport matplotlib.pyplot as plt\n\ndef visualize_inkml(inkml_file_path):\n    tree = ET.parse(inkml_file_path)\n    root = tree.getroot()\n\n    # Extract strokes (traces)\n    strokes = []\n    for trace in root.findall('{http://www.w3.org/2003/InkML}trace'):\n        stroke_data = trace.text.strip().split(',')\n        stroke_points = [tuple(map(float, point.split())) for point in stroke_data]\n        strokes.append(stroke_points)\n    \n    # Plot strokes\n    for stroke in strokes:\n        x, y = zip(*stroke)\n        plt.plot(x, y, color='black')\n    \n    plt.gca().invert_yaxis()  # Invert Y axis to match the image coordinate system\n    plt.show()\n\n# Visualize an inkml file\nvisualize_inkml('crohme_dataset/sample.inkml')\n"

Next Steps:

* Training and Validation Sets: Organize your dataset into training, validation, and testing sets.
* Preprocessing: Use techniques like resizing and normalization, especially if using machine learning models.
* Training: You can now feed this data to your CNN for recognizing mathematical symbols.

In [None]:
# Define number of output classes (digits + letters + symbols/operators).
# 61 is the class count reported when only the top-level folders are loaded.
# The output layer must have exactly one unit per distinct training label, so
# this value has to match the labels that are eventually passed to model.fit().
num_classes = 61  # for digits (0-9) + letters + symbols + operators (+, -, *, /, =, etc.)

# Building the CNN model.
# The input shape is declared with an explicit Input object rather than an
# `input_shape=` argument on the first Conv2D, which is the form Keras warns
# against for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(64, 64, 1)),  # 64x64 single-channel (grayscale) images

    # First Conv Layer with BatchNorm and MaxPooling
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),  # Normalize the activations of the previous layer
    MaxPooling2D(pool_size=(2, 2)),

    # Second Conv Layer with BatchNorm and MaxPooling
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Third Conv Layer for deeper feature extraction
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Flattening the output for the Dense Layers
    Flatten(),

    # Fully Connected Layers, each followed by Dropout for regularization
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),

    # Output Layer: one softmax probability per class
    Dense(num_classes, activation='softmax'),
])

# Model Summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (which
# expects one-hot encoded targets), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Path to the dataset
dataset_path = 'crohme_dataset/archive/'

# Initialize lists to store the data and labels
data = []
labels = []
skipped_files = []  # image paths that OpenCV could not decode

# Load the dataset.
# os.walk() yields directories and files in arbitrary order. Sorting both makes
# the sample order, and therefore the random_state=42 split below, reproducible.
for subdir, dirs, files in os.walk(dataset_path):
    dirs.sort()  # in-place, so os.walk descends into subfolders in sorted order
    for file in sorted(files):
        # Case-insensitive match so files such as "x.PNG" are not dropped
        if not file.lower().endswith(('.png', '.jpg')):
            continue

        # Load the image
        image_path = os.path.join(subdir, file)
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None when the file cannot be read; record it so
        # that dropped samples are reported instead of vanishing silently.
        if image is None:
            skipped_files.append(image_path)
            continue

        # Resize the image to 64x64 and normalize the pixel values to [0, 1]
        image = cv2.resize(image, (64, 64))
        data.append(image.astype('float32') / 255.0)

        # The name of the folder that directly contains the file is the label
        labels.append(os.path.basename(subdir))

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable image file(s).")

# Convert data and labels to numpy arrays.
# NOTE: labels stay as folder-name strings here; they must be integer / one-hot
# encoded before being passed to model.fit().
data = np.array(data, dtype='float32').reshape(-1, 64, 64, 1)
labels = np.array(labels)

# Check if the number of data points matches the labels
print(f"Number of images: {data.shape[0]}, Number of labels: {labels.shape[0]}")

# Fail with a clear message instead of an opaque error from train_test_split
if data.shape[0] == 0:
    raise FileNotFoundError(f"No .png/.jpg images found under {dataset_path!r}")

# Proceed with the split if the lengths are equal
if data.shape[0] == labels.shape[0]:
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    print(f"Shape of training data: {X_train.shape}, {y_train.shape}")
    print(f"Shape of test data: {X_test.shape}, {y_test.shape}")
else:
    print("Mismatch between data and labels. Please check the dataset.")


# The two triple-quoted blocks below are disabled alternatives kept as string
# literals; they are evaluated as bare strings and never executed.
# Disabled alternative 1: split `images` / `labels` without one-hot encoding.
'''
# Assuming your images and labels are loaded in variables 'images' and 'labels'
# No one-hot encoding here, using integer labels

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

print(f"Shape of training data: {X_train.shape}, {y_train.shape}")
print(f"Shape of test data: {X_test.shape}, {y_test.shape}")
'''
# Disabled alternative 2: label-encode and one-hot encode before splitting.
# NOTE(review): it hard-codes num_classes=88, while the recorded training error
# reports 87 label columns; confirm the class count before enabling.
'''
# Encode labels (Assuming you already have a dictionary or mapping for the class labels)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

print (labels_encoded)
# One-hot encode the labels
labels_encoded = to_categorical(labels_encoded, num_classes=88)

# After one-hot encoding
print(f"Encoded labels shape: {labels_encoded.shape}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)
'''

Number of images: 45438, Number of labels: 45438
Shape of training data: (36350, 64, 64, 1), (36350,)
Shape of test data: (9088, 64, 64, 1), (9088,)


'\n# Encode labels (Assuming you already have a dictionary or mapping for the class labels)\nfrom sklearn.preprocessing import LabelEncoder\nle = LabelEncoder()\nlabels_encoded = le.fit_transform(labels)\n\nprint (labels_encoded)\n# One-hot encode the labels\nlabels_encoded = to_categorical(labels_encoded, num_classes=88)\n\n# After one-hot encoding\nprint(f"Encoded labels shape: {labels_encoded.shape}")\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)\n'

In [None]:
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# ... [rest of your code for loading and preprocessing images] ...

# Convert data and labels to numpy arrays
data = np.array(data).reshape(-1, 64, 64, 1)
labels = np.array(labels)

# Check if the number of data points matches the labels
print(f"Number of images: {data.shape[0]}, Number of labels: {labels.shape[0]}")

# Proceed with the split if the lengths are equal
if data.shape[0] == labels.shape[0]:
    # Fit the encoder on the complete label set before splitting. Fitting on
    # the training split alone makes transform() fail for any class that
    # happens to appear only in the test split.
    le = LabelEncoder()
    labels_encoded = le.fit_transform(labels)

    # Calculate the actual number of classes
    num_classes = len(le.classes_)

    # The split depends only on the sample count and random_state, so the
    # same rows are selected as when splitting the raw string labels.
    X_train, X_test, y_train, y_test = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)

    print(f"Shape of training data: {X_train.shape}, {y_train.shape}")
    print(f"Shape of test data: {X_test.shape}, {y_test.shape}")

    # Assigning to `units` on an already-built Dense layer does not resize its
    # weights, which is what produced the "target (None, 87) vs output
    # (None, 61)" shape error. Replace the output layer instead, then recompile
    # so the new layer is part of the training graph.
    if model.output_shape[-1] != num_classes:
        model.pop()
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # One-hot encode the labels (required by categorical_crossentropy)
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_test = to_categorical(y_test, num_classes=num_classes)

    # Set up callbacks: stop after 5 epochs without val_loss improvement and
    # keep the best weights both in memory and on disk.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss')

    # Train the model.
    # NOTE: the held-out test split doubles as the validation set here.
    history = model.fit(X_train, y_train,
                        epochs=50, validation_data=(X_test, y_test),
                        callbacks=[early_stopping, model_checkpoint]
                        )
else:
    print("Mismatch between data and labels. Please check the dataset.")

Number of images: 45438, Number of labels: 45438
Shape of training data: (36350, 64, 64, 1), (36350,)
Shape of test data: (9088, 64, 64, 1), (9088,)
Epoch 1/50


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 87), output.shape=(None, 61)

# Step 5: Training the Model

In [None]:
# Keras cannot train on raw string labels (this is the "Invalid dtype: str..."
# failure). If the labels are still 1-D class names, encode them to one-hot
# vectors here; labels that are already 2-D are left untouched.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
if y_train.ndim == 1:
    from sklearn.preprocessing import LabelEncoder
    from keras.utils import to_categorical

    # Fit on train + test together so every class in either split gets an index
    label_encoder = LabelEncoder().fit(np.concatenate([y_train, y_test]))
    class_count = len(label_encoder.classes_)
    y_train = to_categorical(label_encoder.transform(y_train), num_classes=class_count)
    y_test = to_categorical(label_encoder.transform(y_test), num_classes=class_count)

# The softmax output layer needs exactly one unit per label column. Replace it
# and recompile when the model was built for a different class count.
if model.output_shape[-1] != y_train.shape[1]:
    model.pop()
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up callbacks: stop after 5 epochs without val_loss improvement and keep
# the best weights both in memory and on disk.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss')

# Train the model
history = model.fit(X_train, y_train,
                    epochs=50, validation_data=(X_test, y_test),
                    callbacks=[early_stopping, model_checkpoint]
                    )


ValueError: Invalid dtype: str416

In [None]:
def sort_contours(contours):
    """Return a new list of the contours ordered by bounding-box x-coordinate."""

    def left_edge(contour):
        # Element 0 of the bounding rectangle is its x-coordinate
        return cv2.boundingRect(contour)[0]

    # Copy first so the caller's sequence is left untouched, then sort in place
    ordered = list(contours)
    ordered.sort(key=left_edge)
    return ordered

# NOTE(review): `contours` is not defined anywhere in the visible code.
# Presumably it should come from a contour-detection step on a preprocessed
# image (e.g. cv2.findContours); confirm before running this cell.
sorted_contours = sort_contours(contours)

In [None]:
def recognize_equation(image_path):
    """Recognize the symbols in an equation image and print the result.

    The image is preprocessed and segmented into characters, each character is
    resized to the input size of the global ``model``, and all characters are
    classified in one batch.

    Args:
        image_path: Path of the equation image; passed to ``preprocess_image``.

    Returns:
        The recognized equation as a string (empty when no characters were
        found). The same string is also printed.
    """
    preprocessed_image = preprocess_image(image_path)
    characters = list(segment_characters(preprocessed_image))

    # The model takes 64x64x1 input, so the previous 28x28 resize could not be
    # fed to it. Read the size from the model so the two cannot drift apart.
    input_height, input_width = model.input_shape[1:3]

    symbols = []
    if characters:
        # cv2.resize takes the target size as (width, height)
        batch = np.stack([
            cv2.resize(character, (input_width, input_height)).astype('float32') / 255
            for character in characters
        ]).reshape(-1, input_height, input_width, 1)

        # One predict call for the whole equation instead of one per character
        predictions = model.predict(batch)

        # Map each predicted label to the actual character (digit or operator)
        symbols = [label_to_symbol(label) for label in np.argmax(predictions, axis=1)]

    equation = ''.join(symbols)
    print(f'Recognized Equation: {equation}')
    return equation


In [None]:
# Example usage
# Runs the recognition pipeline on a sample image. The path is relative, so the
# file must be present in the current working directory.
recognize_equation('scanned_math_equation.png')