# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [65]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode
import re

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 /Users/SoChigusa/works/Latex2DeepL/test/test.tex


## Look for \begin{document} ... \end{document} environment

In [66]:
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [67]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [68]:
def replaceSuccessiveTags(match):
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(3)))

In [69]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8})  P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8})  P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [70]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [71]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/Users/SoChigusa/works/Latex2DeepL/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [72]:
def translateParagraph(par):
    if par == '' or par == '\n':
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [73]:
paras_JP = [translateParagraph(par) for par in paras]
latex_JP = '\n\n'.join(paras_JP)

In [74]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [75]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [76]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_JP) + re.compile('p(\d{8})l(\d{8})').findall(latex_JP)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00080889L00000023',
 'P00004666L00000008',
 'P00017010L00000096',
 'P00066096L00000165',
 'P00024648L00000111',
 'P00027006L00000031',
 'P00032905L00000639',
 'P00071784L00000005',
 'P00080918L00000014',
 'P00024763L00000172',
 'P00048401L00000168',
 'P00004000L00000052',
 'P00031957L00000016',
 'P00012978L00000002',
 'P00048588L00000003',
 'P00004675L00000010',
 'P00070809L00000073',
 'P00026517L00000108',
 'P00014602L00000024',
 'P00080855L00000005',
 'P00017107L00000003',
 'P00035316L00000431',
 'P00004636L00000003',
 'P00070780L00000016',
 'P00024936L00000003',
 'P00012889L00000088']

In [77]:
target = [p for p in paras if tag_miss[0] in p]
paras.index(target[0])

94

In [78]:
paras[94]

'It is well known that the general  P00079739L00000007 -dimensional quantum Hall insulator is characterized by the first Chern number  P00079825L00000018  in terms of the integration of the Berry connection over the Brillouin zone P00079919L00000023 . Its electromagnetic response is described by the action  P00080000L00000123   P00080124L00000003 Similarly, the  P00080142L00000007 -dimensional quantum Hall insulator is characterized by the second Chern number  P00080229L00000018  and described by the action  P00080276L00000172   P00080449L00000003 where  P00080458L00000155  with  P00080619L00000140   P00080760L00000003 Here we used a shorthand notation like  P00080802L00000041  and so on ( P00080855L00000005  may be rather understood as  P00080889L00000023 ) and  P00080918L00000014  denotes the Berry connection matrix in the momentum space given by  P00081000L00000131   P00081132L00000003 with  P00081140L00000020  being the Bloch state with  P00081188L00000008  representing the band in

In [79]:
paras_JP[94]

'一般的なP00079739L00000007次元の量子ホール絶縁体は、ブリルアンゾーン上のベリー接続の積分P00079919L00000023の観点から、第1のChern数P00079825L00000018によって特徴付けられることはよく知られている。その電磁応答は、アクションP00080000L00000123 P00080124L00000003によって記述されます。同様に、P00080142L00000007 - 次元の量子ホール絶縁体は 次元の量子ホール絶縁体は第2のChern数P00080229L00000018で特徴付けられ、作用P00080276L00000172 P00080449L00000003で記述されます。ここでP00080458L00000155とP00080619L00000140 P00080760L00000003 ここではP00080802L00000041のような略記法を使用していますので P00081000L00000131 P00081132L00000003で与えられる運動量空間のベリー結合行列を表し、P00081140L00000020はブロッホ状態、P00081188L00000008はバンドインデックスを表しています。であり、Eq.のトレースは P00081246L00000001 ( P00081248L00000010 ) は、占有されているバンドで取られます。なお、P00081304L00000018はP00081339L00000118で表され、P00081464L00000218 P00081683L00000002 '

## Replace P(pos)L(len) to original LaTeX formula

In [80]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [81]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_JP)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [82]:
with open('test_JP.tex', mode='w') as f:
    f.write(latex_fin)