# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

First replace newly defined commands by original commands to help pylatexenc parsing well

In [1]:
import re
def replaceNewCommand(orig_command, match_command):
    return orig_command + match_command.group(1)

def preprocessing(orig):
    mod = orig
    while True:
        match = re.search(r'\\newcommand\{(.*?)\}\{(.*)\}', mod)
        if match is None:
            break
        mod = mod.replace(match.group(), '')
        mod = re.sub(match.group(1).replace('\\','\\\\')+'([\\\\ \n\{\}\(\)\^\$_\+\-\*\/=,.])', lambda wrapper: replaceNewCommand(match.group(2), wrapper), mod)
    while True:
        match = re.search(r'\\def([^\#\n]*?)\{(.*)\}', mod)
        if match is None:
            break
        mod = mod.replace(match.group(), '')
        mod = re.sub(match.group(1).replace('\\','\\\\')+'([\\\\ \n\{\}\(\)\^\$_\+\-\*\/=,.])', lambda wrapper: replaceNewCommand(match.group(2), wrapper), mod)
    return mod

In [2]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexCommentNode, LatexEnvironmentNode, LatexGroupNode, LatexMacroNode, LatexMathNode, LatexSpecialsNode

input_file = input()
with open(input_file) as f:
    latex_orig = f.read()
#############################################
# latex_mod = preprocessing(latex_orig)
latex_mod = latex_orig
#############################################
w = LatexWalker(latex_mod)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 v0.tex


## Look for \begin{document} ... \end{document} environment

Tempolary: search for the document environment step by step

In [3]:
def searchDocument(nl):
    for i in range(len(nl)):
        if nl[i] is None:
            continue
        if nl[i].isNodeType(LatexEnvironmentNode) and nl[i].environmentname == 'document':
            return nl[i]
            # return [i]
        else:
            if nl[i].isNodeType(LatexCharsNode) or nl[i].isNodeType(LatexCommentNode) or nl[i].isNodeType(LatexSpecialsNode):
                continue
            elif nl[i].isNodeType(LatexMacroNode):
                if nl[i].nodeargd is None:
                    continue
                else:
                    next = nl[i].nodeargd.argnlist
            else:
                next = nl[i].nodelist
            res = searchDocument(next)
            if res != []:
                return res
                # return [i,res]
    return []

In [4]:
doc = searchDocument(nodelist)
# env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
# doc = [e for e in env if e.environmentname=='document']
# if len(doc)==1:
#    doc = doc[0]
#else:
#    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format Node(from)T(to)

In [5]:
format8 = '{:008}'
def replaceSpecial(i, node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
#    elif node.isNodeType(LatexCommentNode):
#        s = ''
    elif node.isNodeType(LatexSpecialsNode) and node.specials_chars == '~':
        s = ' '
    else:
        s = ' Node'+format8.format(i)+'T'+format8.format(i)+' '
    return s

In [6]:
def replaceSuccessiveTags(match):
    return 'Node'+match.group(1)+'T'+match.group(2)

In [7]:
dnl = doc.nodelist
str_list = [replaceSpecial(i, n) for i, n in enumerate(dnl)]
latex_rep = ''.join(str_list)
while re.search('Node\d{8}T\d{8} [ \n]* Node\d{8}T\d{8}', latex_rep) is not None:
    latex_rep = re.sub('Node(\d{8})T\d{8} [ \n]* Node\d{8}T(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [8]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [10]:
def translateParagraph(par):
    if par == '' or re.fullmatch('[ \n]+', par) is not None:
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [11]:
paras_ja = [translateParagraph(par) for par in paras]
latex_ja = '\n\n'.join(paras_ja)

In [12]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [13]:
def node2tag(node):
    return 'Node'+node[0]+'T'+node[1]

In [14]:
pattern = re.compile('Node(\d{8})T(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_ja) + re.compile('Node\d{8}T\d{8}').findall(latex_ja)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [node2tag(t) for t in tag_miss]
(tag_miss)

['Node00000145T00000145',
 'Node00000936T00000936',
 'Node00000296T00000296',
 'Node00000523T00000523',
 'Node00000664T00000664',
 'Node00000219T00000219',
 'Node00000644T00000644',
 'Node00000386T00000386',
 'Node00000363T00000363',
 'Node00000198T00000201',
 'Node00000409T00000409',
 'Node00000388T00000388',
 'Node00000510T00000510',
 'Node00000384T00000384',
 'Node00000944T00000947',
 'Node00000502T00000502',
 'Node00000390T00000390',
 'Node00000344T00000344',
 'Node00000298T00000301',
 'Node00000677T00000677',
 'Node00000967T00000967',
 'Node00000942T00000942',
 'Node00000361T00000361',
 'Node00000143T00000143',
 'Node00000406T00000406',
 'Node00000217T00000217',
 'Node00001040T00001040',
 'Node00000989T00000989']

In [15]:
target = [p for p in paras if tag_miss[0] in p]
paras.index(target[0])

4

In [16]:
#paras[9]

In [17]:
#paras_ja[9]

In [18]:
warning1 = '\nLatex2DeepL missing expresion warning: '
warning2 = ', '
def addMissingExpressionWarning(tag):
    target = [p for p in paras if tag in p]
    index = paras.index(target[0])
    if re.search(warning1, paras_ja[index]) is None:
        paras_ja[index] += warning1+'\"'+tag+'\", '
    else:
        paras_ja[index] += '\"'+tag+'\", '
for t in tag_miss:
    addMissingExpressionWarning(t)

In [19]:
latex_ja = '\n\n'.join(paras_ja)

## Replace Node#(from)to(to) to original LaTeX formula

In [20]:
def outputLatex(nl):
    latex = ''
    if type(nl) is not list:
        nl = [nl]
    for i in range(len(nl)):
        if nl[i] is None:
            continue
        if nl[i].isNodeType(LatexCharsNode):
            latex += nl[i].chars
        #elif nl[i].isNodeType(LatexCommentNode):
        #    print(nl[i])
        elif nl[i].isNodeType(LatexEnvironmentNode):
            evn = nl[i].environmentname
            latex += '\\begin{'+evn+'}'
            if nl[i].nodeargd.argnlist != [] and nl[i].nodeargd.argnlist != [None]:  # e.g., \begin{tabular}{ccc}
                latex += outputLatex(nl[i].nodeargd.argnlist)
            latex += outputLatex(nl[i].nodelist)
            latex += '\\end{'+evn+'}'
        elif nl[i].isNodeType(LatexGroupNode):
            latex += nl[i].delimiters[0]
            latex += outputLatex(nl[i].nodelist)
            latex += nl[i].delimiters[1]
        elif nl[i].isNodeType(LatexMacroNode):
            latex += '\\'+nl[i].macroname
            latex += outputLatex(nl[i].nodeargd.argnlist)
            latex += nl[i].macro_post_space
        elif nl[i].isNodeType(LatexMathNode):
            latex += nl[i].delimiters[0]
            latex += outputLatex(nl[i].nodelist)
            latex += nl[i].delimiters[1]
        elif nl[i].isNodeType(LatexSpecialsNode):
            latex += nl[i].specials_chars
    return latex

In [21]:
def replaceRule(match):
    (fr, to) = (int(match.group(1)), int(match.group(2)))
    return outputLatex(dnl[fr:to+1])

In [22]:
latex_fin = re.sub('Node(\d{8})T(\d{8})', replaceRule, latex_ja)
latex_fin = latex_mod[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [23]:
(head, ext) = re.split('\.', input_file)
output_file = head+'_ja.'+ext
with open(output_file, mode='w') as f:
    f.write(latex_fin)

In [24]:
wtest = LatexWalker(latex_orig)
(nodetest, p, l_) = wtest.get_latex_nodes(pos=0)
test = outputLatex(nodelist)
with open('test.tex', mode='w') as f:
    f.write(test)