# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

First replace newly defined commands by original commands to help pylatexenc parsing well

In [60]:
import re
def replaceNewCommand(orig_command, match_command):
    return orig_command + match_command.group(1)

def preprocessing(orig):
    mod = orig
    while True:
        match = re.search(r'\\newcommand\{(.*?)\}\{(.*)\}', mod)
        if match is None:
            break
        mod = mod.replace(match.group(), '')
        mod = re.sub(match.group(1).replace('\\','\\\\')+'([\\\\ \n\{\}\(\)\^\$_\+\-\*\/=,.])', lambda wrapper: replaceNewCommand(match.group(2), wrapper), mod)
    while True:
        match = re.search(r'\\def([^\#\n]*?)\{(.*)\}', mod)
        if match is None:
            break
        mod = mod.replace(match.group(), '')
        mod = re.sub(match.group(1).replace('\\','\\\\')+'([\\\\ \n\{\}\(\)\^\$_\+\-\*\/=,.])', lambda wrapper: replaceNewCommand(match.group(2), wrapper), mod)
    return mod

In [61]:
def outputLatex(of, nl):
    latex = ''
    if type(nl) is not list:
        nl = [nl]
    for i in range(len(nl)):
        if nl[i] is None:
            continue
        if nl[i].isNodeType(LatexCharsNode):
            latex += nl[i].chars
        #elif nl[i].isNodeType(LatexCommentNode):
        #    print(nl[i])
        elif nl[i].isNodeType(LatexEnvironmentNode):
            evn = nl[i].environmentname
            latex += '\\begin{'+evn+'}'
            latex += outputLatex('', nl[i].nodelist)
            latex += '\\end{'+evn+'}'
        elif nl[i].isNodeType(LatexGroupNode):
            latex += nl[i].delimiters[0]
            latex += outputLatex('', nl[i].nodelist)
            latex += nl[i].delimiters[1]
        elif nl[i].isNodeType(LatexMacroNode):
            latex += '\\'+nl[i].macroname
            for argn in nl[i].nodeargd.argnlist:
                latex += outputLatex('', argn)
            latex += nl[i].macro_post_space
        elif nl[i].isNodeType(LatexSpecialsNode):
            latex += nl[i].specials_chars
    if of != '':
        with open(of, mode='w') as f:
            f.write(latex)
    else:
        return latex

In [62]:
# outputLatex('test.tex', nodelist)

In [63]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexCommentNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode

input_file = input()
with open(input_file) as f:
    latex_orig = f.read()
latex_mod = preprocessing(latex_orig)
w = LatexWalker(latex_mod)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

## Look for \begin{document} ... \end{document} environment

Tempolary: search for the document environment step by step

In [64]:
def searchDocument(nl):
    for i in range(len(nl)):
        if nl[i] is None:
            continue
        if nl[i].isNodeType(LatexEnvironmentNode) and nl[i].environmentname == 'document':
            return nl[i]
            # return [i]
        else:
            if nl[i].isNodeType(LatexCharsNode) or nl[i].isNodeType(LatexCommentNode) or nl[i].isNodeType(LatexSpecialsNode):
                continue
            elif nl[i].isNodeType(LatexMacroNode):
                if nl[i].nodeargd is None:
                    continue
                else:
                    next = nl[i].nodeargd.argnlist
            else:
                next = nl[i].nodelist
            res = searchDocument(next)
            if res != []:
                return res
                # return [i,res]
    return []

In [65]:
doc = searchDocument(nodelist)
# env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
# doc = [e for e in env if e.environmentname=='document']
# if len(doc)==1:
#    doc = doc[0]
#else:
#    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [66]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
#    elif node.isNodeType(LatexCommentNode):
#        s = ''
    elif node.isNodeType(LatexSpecialsNode) and node.specials_chars == '~':
        s = ' '
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [67]:
def replaceSuccessiveTags(match):
    n_space = len(match.group(3))
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(4))+n_space)

In [68]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8}) ([ \n]*) P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8}) ([ \n]*) P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [69]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [70]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

# automatic driver download
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# another choice in case the above fails
DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element(By.CSS_SELECTOR, 
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element(By.CSS_SELECTOR,
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

  driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)


In [71]:
def translateParagraph(par):
    if par == '' or re.fullmatch('[ \n]+', par) is not None:
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [72]:
paras_ja = [translateParagraph(par) for par in paras]
latex_ja = '\n\n'.join(paras_ja)

In [73]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [74]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [75]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_ja) + re.compile('p(\d{8})l(\d{8})').findall(latex_ja)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00115137L00000012',
 'P00112906L00000199',
 'P00011426L00000014',
 'P00118645L00000013',
 'P00093286L00000012',
 'P00076791L00000172',
 'P00011334L00000013',
 'P00075031L00000257',
 'P00116206L00000209',
 'P00011315L00000014',
 'P00054910L00000022',
 'P00056693L00000002',
 'P00115154L00000012',
 'P00119868L00000015',
 'P00075984L00000018',
 'P00058652L00000002',
 'P00026468L00000003',
 'P00068230L00000027',
 'P00051495L00000235',
 'P00014217L00000047',
 'P00053166L00000653',
 'P00089891L00000007',
 'P00093000L00000012',
 'P00011476L00000013',
 'P00093244L00000011',
 'P00083621L00000003',
 'P00079115L00000013',
 'P00064152L00000003',
 'P00089096L00000008',
 'P00074016L00000008',
 'P00065987L00000019',
 'P00024105L00000029',
 'P00037813L00000006',
 'P00027807L00000012',
 'P00011302L00000010',
 'P00074089L00000013',
 'P00079059L00000008',
 'P00092455L00000444',
 'P00023288L00000014',
 'P00064253L00000011',
 'P00071408L00000011',
 'P00058663L00000002',
 'P00081262L00000013',
 'P00071687

In [76]:
target = [p for p in paras if tag_miss[0] in p]
paras.index(target[0])

159

In [77]:
warning1 = '\nLatex2DeepL missing expresion warning: '
warning2 = ', '
def addMissingExpressionWarning(tag):
    target = [p for p in paras if tag in p]
    index = paras.index(target[0])
    if re.search(warning1, paras_ja[index]) is None:
        paras_ja[index] += warning1+'\"'+tag+'\", '
    else:
        paras_ja[index] += '\"'+tag+'\", '
for t in tag_miss:
    addMissingExpressionWarning(t)

In [78]:
latex_ja = '\n\n'.join(paras_ja)

## Replace P(pos)L(len) to original LaTeX formula

In [79]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex_mod[pos:pos+l]

In [80]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_ja)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex_mod[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [81]:
(head, ext) = re.split('\.', input_file)
output_en = head+'_orig.'+ext
with open(output_en, mode='w') as f:   # backup of English version with rename
    f.write(latex_orig)
with open(input_file, mode='w') as f:  # Japanese version with original file name
    f.write(latex_fin)