# The Math Library - Semantic search for competition math

In [None]:
from datasets import load_dataset
from txtai.embeddings import Embeddings

The search database will be [NuminaMath 1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5), which contains around 900k competition-level math problems and their solutions.

In [33]:
# Fetch the "default" configuration of NuminaMath 1.5 and keep its train split.
# For a quick dry run, append `.select(range(10**4))` to work on a subset.
ds = load_dataset("AI-MO/NuminaMath-1.5", name="default")["train"]

# Preview the first five rows.
print(ds[:5])

We also initialize our embedding database with [nli-mpnet-base-v2](https://huggingface.co/sentence-transformers/nli-mpnet-base-v2), a 100M-parameter model that maps sentences to 768-dimensional vectors.

In [34]:
# Create the txtai embeddings database, configured with the
# sentence-transformers model given under "path".
embeddings = Embeddings(
    {
        "path": "sentence-transformers/nli-mpnet-base-v2",
    }
)

Since the embedding process takes a few hours, we save the resulting index to disk. Setting `load_from_disk = True` loads the saved index instead of recomputing it; set it to `False` to rebuild the index from scratch.

In [45]:
# True  -> load the previously saved index archive from disk.
# False -> embed the whole dataset (takes a few hours) and save a new archive.
load_from_disk = True

# Single location for the archive so that save and load cannot drift apart.
index_path = "numina-math.tar.gz"

if load_from_disk:
    embeddings.load(index_path)
else:
    # Stream the documents with a generator rather than materialising a list
    # holding one tuple per dataset row before indexing starts.
    # Each document is (id, data, tags): the id is the row position in `ds`,
    # the problem statement goes under "text", the solution under "answer".
    documents = (
        (uid, {"text": row["problem"], "answer": row["solution"]}, None)
        for uid, row in enumerate(ds)
    )
    embeddings.index(documents)
    embeddings.save(index_path)

Now we can query for similar math problems using our embedding database:

In [46]:
def question(text, limit=5):
    """Print the best matches for ``text`` from the embedding database.

    Queries the module-level ``embeddings`` index and, for every
    ``(uid, score)`` pair it returns, prints the score followed by the
    dataset entry ``ds[uid]``.

    Args:
        text: Query string to search for.
        limit: Maximum number of results to request (default 5).

    Returns:
        None. Results are written to standard output, one per line.
    """
    for uid, score in embeddings.search(text, limit):
        print(score, ds[uid])

# Example query: print the top matches (default limit of 5) for "Integral".
question("Integral")

0.4288303256034851 {'problem': 'XLIII OM - I - Task 4\n\nCalculate\n\n\n\n\n\n\n\n\n\n\n\n', 'solution': 'Denoting the $ n $-th term of the considered sum by $ a_n $, we obtain the equality\n\n\n\nThus, $ a_n $, as a positive number, equals $ (\\sqrt{n+1}- \\sqrt{n-1})/\\sqrt{2} $. Therefore,\n\n\n\n\nWe add all these equalities side by side, reduce what can be reduced, divide both sides by $ \\sqrt{2} $, and obtain the result:', 'answer': 'notfound', 'problem_type': 'Other', 'question_type': 'math-word-problem', 'problem_is_valid': 'Incomplete', 'solution_is_valid': 'Yes', 'source': 'olympiads', 'synthetic': False}
0.4279802441596985 {'problem': 'XXII OM - II - Zadanie 6\n\nDany jest ciąg nieskończony $ \\{a_n\\} $. Dowieść, że jeżeli \n\n\n\n\n\n\nto\n\n\n\n\n\n\ndla $ n = 1, 2, \\ldots $.\n', 'solution': '\nNierówność $ a_n + a_{n+2} \\geq 2a_{n+1} $ daną w założeniu przekształcamy do postaci równoważnej\n\n\n\n\n\n\nOznaczając $ b_{n+1} = a_{n+1} - a_n $ dla $ n = 1, 2, \\ldots $ n

(c) Mia Müßig