In [169]:
import matplotlib.pyplot as plt
from PIL import Image


def latex_to_image(latex_str, filename="output.png"):
    """
    Renders a LaTeX expression to an image with a white background.

    Args:
    latex_str (str): The LaTeX string to render.
    filename (str): The filename where the image will be saved.
    """
    # Configure Matplotlib: use white background and remove axes
    plt.rcParams.update(
        {
            "text.usetex": True,
            "text.latex.preamble": r"\usepackage{amsmath}",
            "axes.facecolor": "white",
            "axes.edgecolor": "white",
            "axes.linewidth": 0,
            "xtick.bottom": False,
            "xtick.top": False,
            "ytick.left": False,
            "ytick.right": False,
            "xtick.labelbottom": False,
            "xtick.labeltop": False,
            "ytick.labelleft": False,
            "ytick.labelright": False,
        }
    )
    plt.figure(figsize=(2, 1))
    plt.text(
        0, 0, f"${latex_str}$", fontsize=40, ha="center", va="center", color="white"
    )
    plt.axis("off")
    plt.gca().set_axis_off()
    plt.subplots_adjust(top=1.5, bottom=0, right=0.5, left=0, hspace=0, wspace=0)
    plt.margins(0, 0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    # Plot color is white
    plt.gca().set_facecolor("white")

    # Save the image
    plt.savefig(filename, bbox_inches="tight", pad_inches=0, dpi=300)
    plt.close()

    # Open the image and invert the colors, then save it

    img = Image.open(filename)
    img = img.convert("L")
    img = img.point(lambda p: 255 - p)
    img.save(filename)


# Test
latex_to_image(r"\frac{1}{2} \cdot \frac{3}{4} = \frac{3}{8}", "output.png")

In [157]:
# Generate arbitrary LaTeX strings

# Singletons
numbers = [str(i) for i in range(100)]
# Alphabet
alphabet = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)]
# Greek alphabet LaTeX symbols (with \)
greek_lower = [
    "\\alpha",
    "\\beta",
    "\\gamma",
    "\\delta",
    "\\epsilon",
    "\\zeta",
    "\\eta",
    "\\theta",
    "\\iota",
    "\\kappa",
    "\\lambda",
    "\\mu",
    "\\nu",
    "\\xi",
    "\\omicron",
    "\\pi",
    "\\rho",
    "\\sigma",
    "\\tau",
    "\\upsilon",
    "\\phi",
    "\\chi",
    "\\psi",
    "\\omega",
]

greek_lower = ["{" + ch + "}" for ch in greek_lower]

special_singleton = [
    "{\\infty}",
    "{\\emptyset}",
]

# Two element operators
two_op = {
    "#+#": 1,
    "#-#": 1,
    "#\\cdot #": 1,
    "#\\times #": 1,
    "##": 4,
    "#/#": 2,
    "\\frac{#}{#}": 3,
    "{#}^{#}": 3,
    "{#}_{#}": 3,
}

# One element operators
one_op = {
    "-#": 2,
    "\\sum_{i=0}^{n}#": 0.2,
    "\\sum_{n=1}^{\\infty}#": 0.2,
    "\\int_{-\\infty}^{\\infty}#": 0.2,
    "\\int_{0}^{1}#": 0.2,
    "\\sqrt{#}": 1,
    "\\sin{#}": 1,
    "\\cos{#}": 1,
    "\\tan{#}": 1,
    "\\sec{#}": 1,
    "\\csc{#}": 1,
    "\\cot{#}": 1,
}

# Three element operators
three_op = {"\\int_{#}^{#}#": 3, "\\sum_{#}^{#}#": 2, "###": 4}

singletons = numbers + alphabet + greek_lower + special_singleton
one_ops = one_op
two_ops = two_op
three_ops = three_op

In [158]:
# Construct a random LaTeX expression
import random


def sample_from_dict(d):
    # Weights are not normalized
    total = sum(d.values())
    rand = random.random() * total
    for k, v in d.items():
        rand -= v
        if rand <= 0:
            return k
    return k


# Methodology: Start with a #. Now until it has no more #, replace the first # with an expresson. Expression is singleton with probablity 0.5, one_op with probability 0.3, two_op with probability 0.1, three_op with probability 0.1
def generate_expression():
    expression = "#"
    while "#" in expression:
        rand = random.random()
        expressions = [
            random.choice(singletons),
            sample_from_dict(one_ops),
            sample_from_dict(two_ops),
            sample_from_dict(three_ops),
        ]

        if rand < 0.6:
            expression = expression.replace("#", expressions[0], 1)
        elif rand < 0.7:
            expression = expression.replace("#", expressions[1], 1)
        elif rand < 0.95:
            expression = expression.replace("#", expressions[2], 1)
        else:
            expression = expression.replace("#", expressions[3], 1)
    return expression


# expression = ""
# for i in range(10):
#     while len(expression) < 400:
#         expression = generate_expression()
#     print("$" + expression + "$")
#     expression = ""

$\frac{{{{\frac{{\frac{28}{6}}^{\frac{15\cdot {\sqrt{45}\cdot --\frac{{n}^{\frac{h}{75}}}{{\int_{91}^{O}78\sec{20go\sum_{-{\sec{{\beta}}}^{{\omega}}}^{{\alpha}}81}}^{3}}}^{{l}_{\sqrt{94}}}\frac{\tan{b}+\int_{\sum_{i=0}^{n}18}^{23}q}{g}55I-45L}{B}}}{{\mu}{R}_{\sum_{{\upsilon}-{\frac{78+e}{717}}^{70}rF85}^{\frac{5}{\cos{\int_{0}^{1}28/87}}}38}56}}_{88}/80}_{80}}^{\frac{70}{\cos{-24\times j+\frac{89}{57\cdot 28}}}}}{C+96}$
$\frac{{\sin{{h}_{\frac{\sum_{{17-54}_{37\cdot w}{{\cot{87}}^{48}\times \frac{21}{10}/q}_{\frac{58}{\sin{H}}23/{36/{\rho}\cdot 46/\sum_{98}^{89}{\beta}}^{70}M}\cdot \int_{n}^{{\cot{-{\sin{\cot{35}}}^{92}}}^{58}}U\times 75}^{{23}^{{440}_{16}/7}}48}{\int_{{\lambda}}^{6{\upsilon}+I}56}\int_{\cos{\sum_{n=1}^{\infty}{\mu}}}^{{42}_{{\chi}}}86}}}_{{71+{\kappa}{\frac{45}{\frac{\cot{{90}^{17}}}{{{\cot{A}}_{39}}^{{\upsilon}}}}}_{\cos{{\psi}}69}}_{h-17-P17\cdot 1}}}{\int_{g}^{{60}_{\cot{\frac{J\frac{1}{W}\times 25}{{2}^{54}}}}}12}$
$\frac{85}{\frac{S{40}_{S}}{{{\lambda}}^{{G}^{{a}

In [168]:
f = open("dataset/latex.txt", "w")

for i in range(1000):
    desired_min_length = {
        0: 0,
        200: 5,
        400: 10,
        600: 15,
        800: 20,
        1000: 25,
    }
    expression = ""
    i_round = (i // 200) * 200
    while (
        len(expression) < desired_min_length[i_round]
        or len(expression) >= desired_min_length[i_round + 200]
    ):
        expression = generate_expression()
    latex_to_image(expression, filename=f"dataset/{i}.png")
    # Open dataset/latex.txt and append the expression
    f.write(expression + "\n")

f.close()

RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error --output-directory=tmpzdmqsrw5 3bf72ffa63d104e20350d217b3050879.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(./3bf72ffa63d104e20350d217b3050879.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22>
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/article.cls
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2023/texmf-dist/tex/latex/type1cm/type1cm.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/cm-super/type1ec.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/t1cmr.fd))
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/inputenc.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty)))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/underscore/underscore.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/textcomp.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-dvips.def)
No file 3bf72ffa63d104e20350d217b3050879.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <40> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <27.99988> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\rmfamily $$}
                   %
No pages of output.
Transcript written on tmpzdmqsrw5/3bf72ffa63d104e20350d217b3050879.log.




Error in callback <function _draw_all_if_interactive at 0x13e651ee0> (for post_execute), with arguments args (),kwargs {}:


RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error --output-directory=tmpxnonhsqh 3bf72ffa63d104e20350d217b3050879.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(./3bf72ffa63d104e20350d217b3050879.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22>
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/article.cls
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2023/texmf-dist/tex/latex/type1cm/type1cm.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/cm-super/type1ec.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/t1cmr.fd))
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/inputenc.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty)))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/underscore/underscore.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/textcomp.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-dvips.def)
No file 3bf72ffa63d104e20350d217b3050879.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <40> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <27.99988> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\rmfamily $$}
                   %
No pages of output.
Transcript written on tmpxnonhsqh/3bf72ffa63d104e20350d217b3050879.log.




RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error --output-directory=tmpat0w3oxl 3bf72ffa63d104e20350d217b3050879.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(./3bf72ffa63d104e20350d217b3050879.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22>
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/article.cls
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2023/texmf-dist/tex/latex/type1cm/type1cm.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/cm-super/type1ec.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/t1cmr.fd))
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/inputenc.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty)))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/underscore/underscore.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/textcomp.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-dvips.def)
No file 3bf72ffa63d104e20350d217b3050879.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <40> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <27.99988> not available
(Font)              size <24.88> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\rmfamily $$}
                   %
No pages of output.
Transcript written on tmpat0w3oxl/3bf72ffa63d104e20350d217b3050879.log.




<Figure size 200x100 with 1 Axes>