# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [1]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode
import re

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 main.tex


## Look for \begin{document} ... \end{document} environment

In [2]:
# nodelist = nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist ##################################################
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [3]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [4]:
def replaceSuccessiveTags(match):
    n_space = len(match.group(3))
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(4))+n_space)

In [5]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [6]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [8]:
def translateParagraph(par):
    if par == '' or re.fullmatch('[ \n]+', par) is not None:
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [9]:
paras_ja = [translateParagraph(par) for par in paras]
latex_ja = '\n\n'.join(paras_ja)

In [10]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [11]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [12]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_ja) + re.compile('p(\d{8})l(\d{8})').findall(latex_ja)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00041455L00000012',
 'P00039217L00000001',
 'P00064783L00000001',
 'P00016583L00000019',
 'P00039218L00000023',
 'P00025092L00000019',
 'P00060677L00000021',
 'P00030428L00000026',
 'P00009591L00000024',
 'P00022499L00000195',
 'P00025091L00000001',
 'P00070659L00000001',
 'P00053420L00000002',
 'P00030427L00000001',
 'P00054373L00000001',
 'P00053737L00000001',
 'P00016534L00000024',
 'P00020499L00000541',
 'P00039744L00000001',
 'P00044524L00000004',
 'P00041622L00000005',
 'P00029547L00000001',
 'P00048387L00000001',
 'P00016582L00000001',
 'P00040631L00000020',
 'P00039476L00000001',
 'P00029548L00000025',
 'P00016533L00000001',
 'P00063124L00000004',
 'P00013038L00000238',
 'P00025281L00000002',
 'P00039477L00000023',
 'P00054374L00000021',
 'P00070660L00000023',
 'P00009590L00000001',
 'P00039745L00000020',
 'P00025793L00000003',
 'P00064784L00000022',
 'P00040630L00000001',
 'P00048388L00000021',
 'P00042159L00000012',
 'P00060676L00000001',
 'P00013037L00000001']

In [13]:
target = [p for p in paras if tag_miss[3] in p]
paras.index(target[0])

7

In [21]:
paras[7]

' P00014826L00000027  The most common experimental searches for axions and ALPs rely on the electromagnetic interaction mediating the axion-photon coupling. The presence of a coherently oscillating axion field  P00015042L00000003  in free space modifies Gauss\' law and Amp P00015087L00000015 re\'s law of Maxwell\'s equations as  P00015137L00000358  where  P00015502L00000006  and  P00015513L00000003  are the ordinary charge and current.  P00015554L00000351 The additional interaction terms,  P00015939L00000042  and  P00015986L00000065 , correspond to the axion induced charge and current densities respectively. For axion dark matter, however,  P00016160L00000020  is valid and thus only the time dependent current source term is in effect. In 1983, a promising detection principle was proposed based on the  P00016324L00000002 Primakoff effect" (well known from the particle physics of the Standard Model) to utilize this remaining source term, i.e., the axions are converted into photons in the

In [22]:
paras_ja[7]



In [16]:
warning1 = '\nLatex2DeepL missing expresion warning: '
warning2 = ', '
def addMissingExpressionWarning(tag):
    target = [p for p in paras if tag in p]
    index = paras.index(target[0])
    if re.search(warning1, paras_ja[index]) is None:
        paras_ja[index] += warning1+'\"'+tag+'\", '
    else:
        paras_ja[index] += '\"'+tag+'\", '
for t in tag_miss:
    addMissingExpressionWarning(t)

In [17]:
latex_ja = '\n\n'.join(paras_ja)

## Replace P(pos)L(len) to original LaTeX formula

In [18]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [19]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_ja)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [20]:
(head, ext) = re.split('\.', input_file)
output_file = head+'_ja.'+ext
with open(output_file, mode='w') as f:
    f.write(latex_fin)