**Mathkov: Math-generation NLP software that uses Markov chains**

In [26]:
import re
import random
import spacy
import markovify

In [27]:
# File locations and the regex flags used when compiling the cleaning patterns.
IN = "math.txt"  # input file: source mathematics text
OUT = "mathkov.txt"  # output file: generated problems
CLEAN = "cleaned_" + IN  # cleaned copy of the input corpus
FLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL

In [28]:
# Asymptote and LaTeX cleaning patterns applied before the text goes to NLP.
ASY = re.compile(r"\[asy\].*?\[/asy\]", FLAGS)  # [asy]...[/asy] blocks
LATEX_DOLLAR = re.compile(r"\$(.*?)\$", FLAGS)  # $...$
LATEX_PAREN = re.compile(r"\\\((.*?)\\\)", FLAGS)  # \(...\)
LATEX_BRACK = re.compile(r"\\\[(.*?)\\\]", FLAGS)  # \[...\]
LATEX_BRACE = re.compile(r"\{(.*?)\}", FLAGS)  # {...}
LATEX_BSLASH = re.compile(r"\\", FLAGS)  # any single backslash
OP = re.compile(r"(\+|\-|=|/|\*|\^|_|<|>)")  # single-character operators

# AoPS wiki formatting

WIKI_PROBLEM_NUM = re.compile(r"^Problem \d+?$", FLAGS)  # "Problem N" lines
WIKI_SOLUTION = re.compile(r"^Solution$", FLAGS)  # "Solution" lines
# Whole lines ending in ".png". MULTILINE is required so ^ and $ anchor at
# every line instead of only at the ends of the entire text; DOTALL is
# deliberately omitted so `.` cannot run across line breaks.
IMAGE = re.compile(r"^.*?\.png$", re.MULTILINE)


def aops_wiki_clean(text: str) -> str:
    """Return *text* with AoPS Wiki markup removed and whitespace collapsed.

    The module-level patterns are applied one after another, in the order
    listed below; afterwards every run of whitespace is replaced by a single
    space and leading/trailing whitespace is dropped.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (ASY, ""),
        (LATEX_DOLLAR, r"\1"),
        (LATEX_PAREN, r"\1"),
        (LATEX_BRACK, r"\1"),
        (LATEX_BRACE, r" \1"),
        (LATEX_BSLASH, " "),
        (OP, r" \1 "),
        (WIKI_PROBLEM_NUM, ""),
        (WIKI_SOLUTION, ""),
        (IMAGE, ""),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    words = text.split()
    return " ".join(words)

In [29]:
# Read the raw corpus and clean it. The encoding is passed explicitly so the
# result does not depend on the platform's locale default.
with open(IN, "r", encoding="utf-8") as reader:
    cleaned = aops_wiki_clean(reader.read())

# Save the cleaned corpus to CLEAN.
with open(CLEAN, "w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned)

In [31]:
# Load the `en_core_web_sm` spaCy pipeline and run it over the cleaned corpus.
nlp = spacy.load("en_core_web_sm")
doc = nlp(cleaned)

# Keep the sentences whose text is longer than one character and join them
# back together with single spaces.
sentences = " ".join(
    sent.text for sent in doc.sents if len(sent.text) > 1
)

In [33]:
# Build a Markov model over the sentence text, using 3-word states.
gen = markovify.Text(sentences, state_size=3)

# Collect the output in pieces and join once at the end.
pieces = []

for prob in range(1, 16):
    pieces.append(f"Problem {prob}\n")
    # Each problem asks the model for 2 to 4 sentences.
    for _ in range(random.randrange(2, 5)):
        sentence = gen.make_sentence()
        # make_sentence() returns None when it cannot produce an acceptable
        # sentence; skip those rather than failing on `None + " "`.
        if sentence is not None:
            pieces.append(sentence + " ")
    pieces.append("\n\n")

# Drop surrounding whitespace and end the file with exactly one newline.
output = "".join(pieces).strip() + "\n"

# Explicit encoding keeps the written file independent of the locale default.
with open(OUT, "w", encoding="utf-8") as output_file:
    output_file.write(output)