In [4]:
import pylab
import random
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import io
from PIL import Image
import pandas

In [5]:
def crop_image(img, margin=5):
    """Crop ``img`` to the bounding box of its non-white pixels plus a margin.

    A pixel counts as non-white when any of its values differs from 255.

    Args:
        img: Image object that ``np.array`` can convert and that provides a
            ``crop((left, upper, right, lower))`` method.
        margin: Number of pixels kept around the bounding box on every side.
            The default of 5 matches the previously hard-coded offsets.

    Returns:
        The cropped image, or ``img`` itself when no non-white pixel exists.
    """
    img_array = np.array(img)

    # Reduce any channel axes so the mask is strictly (height, width).
    non_white = img_array != 255
    if non_white.ndim > 2:
        non_white = non_white.any(axis=tuple(range(2, non_white.ndim)))

    rows = np.flatnonzero(non_white.any(axis=1))
    cols = np.flatnonzero(non_white.any(axis=0))

    if rows.size == 0 or cols.size == 0:
        return img  # No non-white pixels found

    height, width = non_white.shape

    # Clamp to the image bounds: content closer than ``margin`` pixels to an
    # edge would otherwise produce negative / out-of-range box coordinates.
    left = max(int(cols[0]) - margin, 0)
    upper = max(int(rows[0]) - margin, 0)
    # Right and lower edges are exclusive, hence the extra +1.
    right = min(int(cols[-1]) + 1 + margin, width)
    lower = min(int(rows[-1]) + 1 + margin, height)

    return img.crop((left, upper, right, lower))

In [6]:
# plt.rcParams.update({
#     "text.usetex": False,  # Ensure Matplotlib does NOT use external LaTeX
#     "mathtext.fontset": "cm",  # Use Computer Modern font
#     "font.family": "serif",
#     "font.serif": ["Computer Modern Roman"]
# })

def gen_image(tex_formula):
    """Render a math-mode formula to an image via matplotlib.

    The formula is wrapped in ``$...$`` and drawn centred on an axis-free
    figure, which is saved as a 300 dpi PNG into an in-memory buffer.

    Args:
        tex_formula: Formula source, without the surrounding ``$``.

    Returns:
        The image opened by ``Image.open`` from the in-memory PNG buffer.

    Raises:
        Any exception raised while drawing or saving the figure is propagated
        unchanged; the figure is closed first.
    """
    fig, ax = plt.subplots()
    try:
        ax.text(0.5, 0.5, f"${tex_formula}$", fontsize=20, ha='center', va='center')
        ax.axis('off')  # Hide axes

        # Save this specific figure (not the implicit "current" one) to memory.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.2, dpi=300)
    finally:
        # Always release the figure: callers probe formulas that may fail to
        # render, and every unclosed figure would stay registered in pyplot.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

In [None]:
# Candidate symbols come from column '0' of labels.csv.
label_csv = pandas.read_csv('labels.csv')
allchars = list(label_csv['0'])

# Keep only the symbols that gen_image can render; the rendered image itself
# is discarded, the call is purely a probe.
chars = []
for c in allchars:
    try:
        gen_image(c)
    except Exception as e:
        print(f"Error generating image for {c}: {e}")
    else:
        chars.append(c)

In [8]:
def brac_var(a):
    """Return ``a`` enclosed in curly braces."""
    return ''.join(('{', a, '}'))

def brac_equa(a):
    """Return ``a`` enclosed in parentheses."""
    return ''.join(('(', a, ')'))

def sum(a, b):
    """Return the formula text ``a+b``.

    Note: this definition shadows the built-in ``sum`` in this namespace.
    """
    return '+'.join((a, b))

def sub(a, b):
    """Return the formula text ``a-b``."""
    return '-'.join((a, b))

def mul(a, b):
    """Return ``a`` and ``b`` joined by the ``\\times`` command (no added spaces)."""
    return '\\times'.join((a, b))

def div(a, b):
    """Return ``a`` and ``b`` joined by the ``\\div`` command (no added spaces)."""
    return '\\div'.join((a, b))

def fraction(a, b):
    """Return a ``\\frac`` command with ``a`` as numerator and ``b`` as denominator."""
    return ''.join(('\\frac{', a, '}{', b, '}'))

def square_root(a):
    """Return ``a`` wrapped in a ``\\sqrt{...}`` command."""
    return ''.join(('\\sqrt{', a, '}'))

def exp(a, b):
    """Return the formula text ``a^b`` (``a`` raised to ``b``)."""
    return '^'.join((a, b))

def underline(a):
    """Return ``a`` wrapped in an ``\\underline{...}`` command."""
    return ''.join(('\\underline{', a, '}'))

def overline(a):
    """Return ``a`` wrapped in an ``\\overline{...}`` command."""
    return ''.join(('\\overline{', a, '}'))

# Formula builders that gen_formula draws from at random.
functions = [
    sum,
    sub,
    mul,
    div,
    fraction,
    square_root,
    exp,
    underline,
    overline,
]

# Sizes of the builder list and of the renderable-symbol list.
lenf, lenc = len(functions), len(chars)

def gen_formula(numv=1):
    """Build a random formula string by combining randomly drawn symbols.

    ``numv`` symbols are drawn (with replacement) from the module-level
    ``chars``. While more than one expression remains, a random entry of
    ``functions`` is applied to one or two randomly chosen expressions, each
    wrapped by ``brac_var``, and the result is put back into the pool.

    Args:
        numv: Number of starting symbols. With the default of 1 a single
            symbol is returned and no combination step runs.

    Returns:
        ``(formula, (operands, func))``: the final formula string, plus the
        operand list and function of the last combination applied (both are
        ``None`` when no combination happened).

    Raises:
        ValueError: If ``numv`` is smaller than 1.
    """
    if numv < 1:
        raise ValueError("numv must be at least 1")

    last_formula, last_fun = None, None
    pool = random.choices(chars, k=numv)

    while len(pool) > 1:
        x = random.choice(pool)
        f = random.choice(functions)
        # co_argcount counts declared parameters only; co_varnames would also
        # count any local variables defined inside the builder.
        arity = f.__code__.co_argcount
        pool.remove(x)

        if arity == 1:
            operands = [x]
        else:
            # A second operand always exists here: the pool held at least two
            # expressions before ``x`` was removed.
            y = random.choice(pool)
            pool.remove(y)
            operands = [x, y]

        last_formula, last_fun = operands, f
        wrapped = [brac_var(operand) for operand in operands]
        pool.append(f(*wrapped))

    return (pool[0], (last_formula, last_fun))

In [9]:
def gen_test():
    """Produce one sample: a cropped rendering, its formula and its target.

    Returns:
        ``(image, formula, target)`` where ``formula`` and ``target`` are the
        two values returned by ``gen_formula`` and ``image`` is the formula
        rendered by ``gen_image`` and trimmed by ``crop_image``.
    """
    formula, target = gen_formula()
    rendered = crop_image(gen_image(formula))
    return rendered, formula, target

In [10]:
import datasets

train_size = 10000
test_size = 1000


def _as_dataset(samples):
    """Convert a list of (image, formula, target) triples into a Dataset."""
    columns = {
        'input': [sample[0] for sample in samples],
        'label': [sample[1] for sample in samples],
        'output': [sample[2] for sample in samples],
    }
    return datasets.Dataset.from_dict(columns)


# Training split; progress is reported on the 1st, 101st, 201st, ... sample.
train_data = []
for step in range(train_size):
    train_data.append(gen_test())
    if step % 100 == 0:
        print(f"Generated {len(train_data)} training samples")

# Test split, reported the same way.
test_data = []
for step in range(test_size):
    test_data.append(gen_test())
    if step % 100 == 0:
        print(f"Generated {len(test_data)} test samples")

# From here on both names refer to Dataset objects instead of plain lists.
train_data = _as_dataset(train_data)
test_data = _as_dataset(test_data)

ds = datasets.DatasetDict({'train': train_data, 'test': test_data})
ds.save_to_disk('dataset')

  from .autonotebook import tqdm as notebook_tqdm


Generated 1 training samples
Generated 101 training samples
Generated 201 training samples
Generated 301 training samples
Generated 401 training samples
Generated 501 training samples
Generated 601 training samples
Generated 701 training samples
Generated 801 training samples
Generated 901 training samples
Generated 1001 training samples
Generated 1101 training samples
Generated 1201 training samples
Generated 1301 training samples
Generated 1401 training samples
Generated 1501 training samples
Generated 1601 training samples
Generated 1701 training samples
Generated 1801 training samples
Generated 1901 training samples
Generated 2001 training samples
Generated 2101 training samples
Generated 2201 training samples
Generated 2301 training samples
Generated 2401 training samples
Generated 2501 training samples
Generated 2601 training samples
Generated 2701 training samples
Generated 2801 training samples
Generated 2901 training samples
Generated 3001 training samples
Generated 3101 train

Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 64493.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 45824.36 examples/s]
