# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

First, replace user-defined commands (`\newcommand`, `\def`) with their definitions so that pylatexenc can parse the source reliably.

In [1]:
import re
def replaceNewCommand(orig_command, match_command):
    """Return orig_command followed by the text captured in group 1 of match_command.

    Args:
        orig_command: Replacement text (the body of a user-defined command).
        match_command: A regex match object whose first group is appended
            unchanged after the replacement text.
    """
    trailing_text = match_command.group(1)
    return ''.join((orig_command, trailing_text))

def preprocessing(orig):
    r"""Inline simple user-defined macros so the parser sees plain LaTeX.

    Each single-line definition of the form ``\newcommand{\name}{body}`` or
    ``\def\name{body}`` is deleted from the text and every use of ``\name``
    is replaced by ``body``.  All ``\newcommand`` definitions are processed
    first, then all ``\def`` definitions.  Definitions that take arguments
    (``\newcommand{\name}[1]{...}``, ``\def\name#1{...}``) normally do not
    match the search patterns and are left untouched.

    Args:
        orig: The LaTeX source as a string.

    Returns:
        The source with the matched definitions removed and their uses
        expanded.
    """
    definition_patterns = (
        r'\\newcommand\{(.*?)\}\{(.*)\}',
        r'\\def([^\#\n]*?)\{(.*)\}',
    )
    mod = orig
    for pattern in definition_patterns:
        while True:
            match = re.search(pattern, mod)
            if match is None:
                break
            name = match.group(1).strip()
            body = match.group(2)
            # Drop the definition itself (every identical copy of it); this
            # also guarantees the loop makes progress.
            mod = mod.replace(match.group(), '')
            # Only expand when the captured name is a plain control word.
            # Anything else (e.g. the "inecolor" captured from \definecolor)
            # would corrupt unrelated text if substituted.
            if re.fullmatch(r'\\[A-Za-z]+', name) is None:
                continue
            # A control word ends at the first non-letter.  Using a lookahead
            # instead of consuming that character lets back-to-back uses
            # (\R\R) and uses at end of input be expanded as well.
            usage = re.compile(re.escape(name) + r'(?![A-Za-z])')
            # A callable replacement keeps backslashes in the body literal.
            mod = usage.sub(lambda _use, body=body: body, mod)
    return mod

In [2]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexCommentNode, LatexEnvironmentNode, LatexGroupNode, LatexMacroNode, LatexMathNode, LatexSpecialsNode

# Ask interactively for the path of the LaTeX file to translate.
input_file = input()
with open(input_file) as f:
    latex_orig = f.read()
# Run the macro preprocessing step on the raw source before parsing.
latex_mod = preprocessing(latex_orig)
w = LatexWalker(latex_mod)
# Parse the preprocessed source starting at offset 0; the returned tuple is
# unpacked into the node list plus a position and a length.
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 SSinos.tex


## Look for \begin{document} ... \end{document} environment

Temporary: search for the document environment recursively, node by node.

In [3]:
def searchDocument(nl):
    """Search a node list depth-first for the ``document`` environment.

    Chars, comment and specials nodes are skipped.  Macro nodes are searched
    through their parsed arguments (when they have any); every other node is
    searched through its ``nodelist``.

    Args:
        nl: A list of pylatexenc nodes; entries may be None.

    Returns:
        The first LatexEnvironmentNode named 'document', or an empty list
        when there is none.
    """
    for node in nl:
        if node is None:
            continue
        if node.isNodeType(LatexEnvironmentNode) and node.environmentname == 'document':
            return node
        is_leaf = (node.isNodeType(LatexCharsNode)
                   or node.isNodeType(LatexCommentNode)
                   or node.isNodeType(LatexSpecialsNode))
        if is_leaf:
            continue
        if node.isNodeType(LatexMacroNode):
            if node.nodeargd is None:
                continue
            children = node.nodeargd.argnlist
        else:
            children = node.nodelist
        found = searchDocument(children)
        if found != []:
            return found
    return []

In [4]:
# Locate the document environment in the parsed tree.  searchDocument returns
# an empty list (not None) when no such environment exists.
document = searchDocument(nodelist)
# Earlier lookup kept for reference: it only inspects the top-level nodes.
# env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
# doc = [e for e in env if e.environmentname=='document']
# if len(doc)==1:
#    doc = doc[0]
#else:
#    print('Unexpected format with more than one document environment')

## Replace LaTeX special expressions with tags of the form N(from)T(to), merging successive ones

In [5]:
def replaceSpecial(i, node):
    """Render one node as text for the translator.

    Args:
        i: Index of the node in its parent list; encoded into the tag.
        node: A pylatexenc node, or None.

    Returns:
        '' for None, the raw characters for a chars node, a single space for
        the '~' specials node, and otherwise the placeholder
        ' N<i>T<i> ' with the index zero-padded to eight digits.
    """
    if node is None:
        return ''
    if node.isNodeType(LatexCharsNode):
        return node.chars
    if node.isNodeType(LatexSpecialsNode) and node.specials_chars == '~':
        return ' '
    # Every other node type (comments included) is hidden behind a tag.
    index = '{:008}'.format(i)
    return ' N' + index + 'T' + index + ' '

def replaceSuccessiveTags(match):
    """Build one tag from a match of two adjacent tags.

    Group 1 of the match supplies the "from" digits and group 2 the "to"
    digits of the resulting ``N<from>T<to>`` tag.
    """
    first, last = match.group(1), match.group(2)
    return ''.join(('N', first, 'T', last))

def replaceTags(dnl):
    str_list = [replaceSpecial(i, n) for i, n in enumerate(dnl)]
    latex_out = ''.join(str_list)
    while re.search('N\d{8}T\d{8} [ \n]* N\d{8}T\d{8}', latex_out) is not None:
        latex_out = re.sub('N(\d{8})T\d{8} [ \n]* N\d{8}T(\d{8})', replaceSuccessiveTags, latex_out)
    return latex_out

## For postprocessing: replace N(from)T(to) tags with the original LaTeX expressions

In [6]:
def outputLatex(nl):
    """Serialise a node, or a list of nodes, back into LaTeX source text.

    Handles chars, environment, group, macro, math and specials nodes and
    recurses into their children and parsed arguments.  None entries and
    any other node type (e.g. comment nodes) contribute nothing.

    Args:
        nl: A single pylatexenc node, None, or a list/tuple of such items.

    Returns:
        The reconstructed LaTeX string.
    """
    if not isinstance(nl, (list, tuple)):
        nl = [nl]
    pieces = []
    for node in nl:
        if node is None:
            continue
        if node.isNodeType(LatexCharsNode):
            pieces.append(node.chars)
        elif node.isNodeType(LatexEnvironmentNode):
            evn = node.environmentname
            pieces.append('\\begin{' + evn + '}')
            # Arguments directly after \begin{...}, e.g. \begin{tabular}{ccc}.
            # Guard against a missing nodeargd, as the macro branch does.
            if node.nodeargd is not None:
                pieces.append(outputLatex(node.nodeargd.argnlist))
            pieces.append(outputLatex(node.nodelist))
            pieces.append('\\end{' + evn + '}')
        elif node.isNodeType(LatexGroupNode) or node.isNodeType(LatexMathNode):
            # Both node kinds are "opening delimiter + children + closing delimiter".
            pieces.append(node.delimiters[0])
            pieces.append(outputLatex(node.nodelist))
            pieces.append(node.delimiters[1])
        elif node.isNodeType(LatexMacroNode):
            pieces.append('\\' + node.macroname)
            pieces.append(node.macro_post_space)
            if node.nodeargd is not None:
                pieces.append(outputLatex(node.nodeargd.argnlist))
        elif node.isNodeType(LatexSpecialsNode):
            pieces.append(node.specials_chars)
    return ''.join(pieces)

def replaceRule(dnl, match):
    """Expand one placeholder tag back into LaTeX.

    Groups 1 and 2 of match are read as integer indices; the nodes
    dnl[from] .. dnl[to] (inclusive) are serialised with outputLatex.
    """
    first = int(match.group(1))
    last = int(match.group(2))
    covered_nodes = dnl[first:last + 1]
    return outputLatex(covered_nodes)

## Preprocess the string, translate using DeepL, and postprocess the string

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

# Command-line flags passed to the Chrome instance driven by Selenium.
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

# NOTE(review): the `executable_path=` keyword and the `find_element_by_*`
# helpers used below belong to the older Selenium API and are not available
# in recent Selenium 4 releases -- confirm the Selenium version this notebook
# is expected to run with.
DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

# Open the DeepL web translator (the URL fragment presumably selects
# English -> Japanese).
load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

# Save the current clipboard contents; the clipboard is used to paste text
# into the page and is restored from this value at the end of the notebook.
clipboard = ppc.paste()
# Text area that receives the text to translate.
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
# Text area from which the translated text is read.
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [8]:
def translateParagraph(par):
    """Translate one paragraph through the open translator page.

    The paragraph is copied to the clipboard and pasted into the source
    text area; the target text area is then polled once per second until it
    is non-empty.  Finally the source text area is cleared for the next call.

    Args:
        par: Paragraph text.  Empty or whitespace-only (spaces/newlines)
            paragraphs are returned unchanged without touching the browser.

    Returns:
        The text read from the target text area.
    """
    import sys  # local import: only needed to pick the keyboard modifier

    if par == '' or re.fullmatch('[ \n]+', par) is not None:
        return par
    # Paste / select-all use Command on macOS and Control elsewhere; the
    # original hard-coded Command, which does nothing on Linux/Windows.
    modifier = Keys.COMMAND if sys.platform == 'darwin' else Keys.CONTROL
    ppc.copy(par)
    stextarea.send_keys(modifier, 'v')
    translated_text = ''
    # NOTE(review): there is no timeout -- if the page never produces output
    # this loop waits forever.
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    # Clear the source area so the next paragraph starts from an empty box.
    stextarea.send_keys(modifier, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

def node2tag(node):
    """Format a (from, to) pair of digit strings as the tag text ``N<from>T<to>``."""
    start, end = node[0], node[1]
    return ''.join(('N', start, 'T', end))
        
def translateOneLevel(doc):
    r"""Translate the direct children of *doc* while keeping LaTeX constructs intact.

    Non-text children are replaced by ``N<from>T<to>`` placeholder tags
    (replaceTags), the text is translated one paragraph at a time
    (paragraphs are separated by blank lines), and the tags are finally
    expanded back into the original LaTeX (replaceRule).  Tags that did not
    survive translation are listed in a warning appended to the translated
    paragraph they came from.

    Args:
        doc: A pylatexenc node.  Chars and macro nodes are treated as a
            one-element node list; any other node contributes its ``nodelist``.

    Returns:
        A tuple ``(paras_ja, latex_ja)``: the translated paragraphs (with
        warnings appended, tags not expanded) and the final text with the
        tags expanded.
    """
    # translate paragraph by paragraph
    if doc.isNodeType(LatexCharsNode) or doc.isNodeType(LatexMacroNode):
        dnl = [doc]
    else:
        dnl = doc.nodelist
    latex_rep = replaceTags(dnl)
    paras = [p.replace('\n', ' ') for p in re.split('\n\n', latex_rep)]
    paras_ja = [translateParagraph(par) for par in paras]
    latex_tmp = '\n\n'.join(paras_ja)

    # extract missing tag information
    tag0 = re.findall(r'N(\d{8})T(\d{8})', latex_rep)
    # A tag counts as surviving in either spelling that is restored below:
    # the original upper-case form or the fully lower-cased form.
    surviving = set(re.findall(r'N(\d{8})T(\d{8})', latex_tmp))
    surviving.update(re.findall(r'n(\d{8})t(\d{8})', latex_tmp))
    # dict.fromkeys de-duplicates while keeping document order, so the
    # warnings are emitted deterministically.
    tag_miss = [node2tag(t) for t in dict.fromkeys(tag0) if t not in surviving]

    # add missing expression warning
    warning = '\nLatex2DeepL missing expresion warning: '
    for tag in tag_miss:
        # First source paragraph that contained the tag.
        index = next((k for k, p in enumerate(paras) if tag in p), None)
        if index is None:
            continue
        if warning not in paras_ja[index]:
            paras_ja[index] += warning + '"' + tag + '", '
        else:
            paras_ja[index] += '"' + tag + '", '

    # convert back tags into original formula (this also expands the tags
    # quoted inside the warnings, so the warning shows the missing LaTeX)
    latex_ja = '\n\n'.join(paras_ja)
    latex_ja = re.sub(r'N(\d{8})T(\d{8})', lambda wrapper: replaceRule(dnl, wrapper), latex_ja)
    # DeepL sometimes turns the capital letters of a tag into small ones
    latex_ja = re.sub(r'n(\d{8})t(\d{8})', lambda wrapper: replaceRule(dnl, wrapper), latex_ja)
    return paras_ja, latex_ja

## Modules for specific environments e.g. \caption{}, \begin{abstract}, etc.

In [9]:
def translateContent(node):
    """Translate node's content and store it back on the node.

    The translated LaTeX text returned by translateOneLevel is wrapped in a
    single LatexCharsNode, which becomes the node's new ``nodelist``.  The
    node is modified in place; nothing is returned.
    """
    _paragraphs, translated_latex = translateOneLevel(node)
    replacement = LatexCharsNode(
        parsing_state=None,
        pos=0,
        len=0,
        chars=translated_latex,
    )
    node.nodelist = [replacement]

In [10]:
def translateAllContents(nl):
    """Walk a node list and translate selected environments and macro arguments in place.

    * ``abstract``, ``itemize`` and ``enumerate`` environments are translated
      as a whole (translateContent) and not descended into.
    * For the sectioning macros, ``footnote``, ``textit`` and ``textbf``, the
      last argument that is present is translated.
    * Other environments, macros with arguments, and remaining container
      nodes are searched recursively.
    * Chars, comment and specials nodes, None entries, and macros without
      parsed arguments are skipped.

    Args:
        nl: List of pylatexenc nodes (entries may be None).
    """
    whole_environments = ('abstract', 'itemize', 'enumerate')
    text_macros = (
        'section', 'subsection', 'subsubsection',
        'chapter', 'subchapter', 'subsubchapter', 'footnote',
        'textit', 'textbf',
    )
    for node in nl:
        if node is None:
            continue
        if node.isNodeType(LatexEnvironmentNode):
            if node.environmentname in whole_environments:
                translateContent(node)
                continue
            children = node.nodelist
        elif node.isNodeType(LatexMacroNode):
            if node.nodeargd is None:
                continue
            if node.macroname in text_macros:
                args = node.nodeargd.argnlist or []
                # The argument list presumably starts with the star /
                # optional-argument slots for macros such as \section, and
                # those slots are None when absent.  Translating only
                # args[0] would then skip the title, so take the last
                # argument that is actually present instead.
                text_arg = next((a for a in reversed(args) if a is not None), None)
                if text_arg is not None:
                    translateContent(text_arg)
                continue
            children = node.nodeargd.argnlist
        elif (node.isNodeType(LatexCharsNode)
              or node.isNodeType(LatexCommentNode)
              or node.isNodeType(LatexSpecialsNode)):
            continue
        else:
            children = node.nodelist
        translateAllContents(children)

In [11]:
import copy
# NOTE(review): copy.copy is a shallow copy -- doc_rep shares its nodelist and
# every child node with `document`, so the in-place edits made below are also
# visible through `document`.  Confirm this is intended (copy.deepcopy would
# keep the original tree untouched).
doc_rep = copy.copy(document)
# Translate the selected environments and macro arguments inside the document body, in place.
translateAllContents(doc_rep.nodelist)

## Debug: comparing original tags and "translated" ones

In [12]:
#target = [p for p in paras if tag_miss[0] in p]
#paras.index(target[0])

In [13]:
#paras[2]

In [14]:
#paras_ja[2]

## Replace N(from)T(to) tags with the original LaTeX expressions

In [15]:
# NOTE(review): only the second child of the document environment
# (doc_rep.nodelist[1]) is translated here -- presumably a leftover from
# debugging; confirm whether doc_rep itself should be passed instead.
paras_ja, latex_ja = translateOneLevel(doc_rep.nodelist[1])
# Prepend everything that precedes the document environment in the
# preprocessed source (presumably the preamble) and re-wrap the translated body.
latex_ja = latex_mod[:document.pos]+'\\begin{document}\n'+latex_ja+'\n\\end{document}'

AttributeError: 'LatexMacroNode' object has no attribute 'nodelist'

In [None]:
# Put the saved `clipboard` value back on the system clipboard.
ppc.copy(clipboard)
# Close the browser and end the WebDriver session.
driver.quit()

In [None]:
import os

# Build the output path "<name>_ja.<ext>" next to the input file.
# os.path.splitext only splits at the final extension, so paths with several
# dots (e.g. "./dir/paper.v2.tex") or with no extension at all no longer fail
# the two-value unpacking that re.split('\.', ...) required.
head, dot_ext = os.path.splitext(input_file)
ext = dot_ext[1:]  # extension without the leading dot, as before
output_file = head + '_ja' + dot_ext
# Write the translated LaTeX source.
with open(output_file, mode='w') as f:
    f.write(latex_ja)