In [None]:
import os 
import re
import pandas as pd 

In [None]:
# Empty frame that fixes the output schema: one row per problem, holding the
# problem's URL and its statement text.
df = pd.DataFrame(columns=['problem_link', 'problem_statement'])

In [None]:
# Ordered table of regex -> plain-text substitutions for LaTeX markup.
# The entries are applied top to bottom (dicts keep insertion order), and the
# order is significant:
#   * every `\command` pattern must run BEFORE the generic clean-up rules;
#     stripping all backslashes first would leave nothing for them to match;
#   * when one command is a prefix of another (\text / \textrm,
#     \subset / \subseteq, \in / \infty) the longer one must come first.
latex_replacements = {
    # Sized delimiters and angle brackets
    r'\\left\[': '[',
    r'\\right\]': ']',
    r'\\left\(': '(',
    r'\\right\)': ')',
    r'\\langle': '⟨',
    r'\\rangle': '⟩',
    # Add more bracket types as needed
    # Text commands (the braces around their argument are stripped further down)
    r'\\mod': 'mod',
    r'\\textrm': '',
    r'\\text': '',
    # Operators
    r'\\times': '×',
    r'\\otimes': '⊗',
    r'\\oplus': '⊕',
    r'\\cdot': '·',
    r'\\circ': '°',
    # Relations
    r'\\leq': '≤',
    r'\\geq': '≥',
    r'\\neq': '≠',
    r'\\approx': '≈',
    r'\\equiv': '≡',
    # Greek letters
    r'\\lambda': 'λ',
    r'\\alpha': 'α',
    r'\\beta': 'β',
    # Add more Greek letters as needed
    # Miscellaneous (\infty is listed before \in on purpose)
    r'\\infty': '∞',
    r'\\nabla': '∇',
    r'\\partial': '∂',
    # Add more miscellaneous symbols as needed
    # Set symbols
    r'\\emptyset': '∅',
    # The lookahead keeps \in from eating the start of \int, \inf, ...
    r'\\in(?![A-Za-z])': '∈',
    r'\\notin': '∉',
    r'\\subseteq': '⊆',
    r'\\supseteq': '⊇',
    r'\\subset': '⊂',
    r'\\supset': '⊃',
    # Add more set symbols as needed
    # Logic
    r'\\land': '∧',
    r'\\lor': '∨',
    r'\\lnot': '¬',
    r'\\forall': '∀',
    r'\\exists': '∃',
    # Add more logic symbols as needed
    # Arrows
    r'\\rightarrow': '→',
    r'\\leftarrow': '←',
    r'\\Rightarrow': '⇒',
    r'\\Leftarrow': '⇐',
    r'\\uparrow': '↑',
    r'\\downarrow': '↓',
    # Add more arrows as needed
    # Generic clean-up: must stay last, after every command above has been
    # given the chance to match.
    r'\$': '',    # math-mode delimiters
    r'_': '',     # subscript markers (a_i -> ai)
    r'\\': '',    # backslashes of commands that have no entry above
    r'\{': '',
    r'\}': '',
    r'\^': '^',   # carets are kept as-is
    # Remove unwanted spaces
    r' +': ' ',
}

def replace_latex(text, replacements):
    """Apply a table of regex substitutions to ``text`` and return the result.

    ``replacements`` maps a regular-expression pattern to its replacement
    string. The entries are applied one after another in the mapping's
    iteration order, each one operating on the output of the previous one.
    """
    result = text
    for pattern in replacements:
        result = re.sub(pattern, replacements[pattern], result)
    return result


In [None]:
def clean(text):
    """Reduce a raw task.md body to a single line of plain statement text.

    Steps, in order:
      1. keep only what precedes the ``## @{keyword.constraints}`` heading;
      2. drop every ``@{lang.ja}`` ... ``@{lang.end}`` section;
      3. convert LaTeX markup via ``replace_latex`` / ``latex_replacements``;
      4. delete everything from an ``@`` to the end of its line;
      5. delete runs of ``#``;
      6. collapse all whitespace runs to one space and trim both ends.
    """
    before_constraints = re.split(r'## @{keyword\.constraints}', text, maxsplit=1)
    statement = before_constraints[0]

    without_japanese = re.sub(
        r'@{lang\.ja}.*?@{lang\.end}', '', statement, flags=re.DOTALL
    )
    plain = replace_latex(without_japanese, latex_replacements)

    # Remaining markup is removed with simple pattern -> replacement passes.
    for pattern, substitute in ((r'\@.+', ''), (r'#+', ''), (r'\s+', ' ')):
        plain = re.sub(pattern, substitute, plain)

    return plain.strip()

In [None]:
repo_path = 'repos/yosupo06'
root = "https://judge.yosupo.jp/problem/"

# Gather one record per problem and build the frame once at the end. This keeps
# the cell idempotent (re-running it does not append duplicate rows) and avoids
# growing a DataFrame row by row.
records = []
for root_dir, _dir_names, files in os.walk(repo_path):
    if "task.md" not in files:
        continue

    # The directory that holds task.md names the problem; basename() works with
    # whatever path separator the platform uses.
    problem_name = os.path.basename(root_dir)
    problem_link = root + problem_name

    file_path = os.path.join(root_dir, "task.md")
    # clean() strips @{lang.ja} sections, so these files carry non-ASCII text:
    # decode explicitly instead of relying on the platform default.
    # NOTE(review): UTF-8 is assumed for the repository's markdown — confirm.
    with open(file_path, encoding="utf-8") as statement_file:
        problem_statement = clean(statement_file.read())

    records.append({
        'problem_link': problem_link,
        'problem_statement': problem_statement
    })

df = pd.DataFrame(records, columns=['problem_link', 'problem_statement'])

In [None]:
# Persist the collected problems as CSV. The DataFrame index is written as well
# (pandas default), so the file starts with an unnamed index column.
# NOTE(review): the '../datasets' directory is assumed to exist already — confirm.
df.to_csv('../datasets/yosupo_problems.csv')