# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [207]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 /Users/SoChigusa/works/Latex2DeepL/test/test.tex


## Look for \begin{document} ... \end{document} environment

In [177]:
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    Print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [223]:
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        f = '{:008}'
        s = ' P'+f.format(node.pos)+'L'+f.format(node.len)+' '
    return s

In [209]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)

## Preprocess the string and translate using DeepL

In [210]:
import re
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [211]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/Users/SoChigusa/works/Latex2DeepL/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [212]:
def translateParagraph(par):
    if par == '' or par == '\n':
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [213]:
paras_jap = [translateParagraph(par) for par in paras]
latex_jap = '\n\n'.join(paras_jap)

In [214]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [215]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [228]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_jap) + re.compile('p(\d{8})l(\d{8})').findall(latex_jap)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00004675L00000010',
 'P00048401L00000168',
 'P00026517L00000108',
 'P00044468L00000539',
 'P00066096L00000165',
 'P00004636L00000003',
 'P00048588L00000003',
 'P00014602L00000024',
 'P00035316L00000431',
 'P00080003L00000120',
 'P00012842L00000005',
 'P00004666L00000008',
 'P00004001L00000051',
 'P00032905L00000639',
 'P00071784L00000005']

In [241]:
target = [p for p in paras if tag_miss[10] in p]
paras.index(target[0])

16

In [240]:
paras[16]

' Let us start with the Heisenberg anti-ferromagnet model  P00010470L00000043 . P00010514L00000192  Suppose a bipartite lattice consisting of sublattices A and B, and on each lattice point  P00010796L00000009  A or  P00010811L00000010  B there is an electron spin  P00010850L00000008 . Applying an external magnetic field  P00010896L00000005  along the  P00010912L00000003  direction, the model Hamiltonian is given by  P00010961L00000003  P00010964L00000206   P00011171L00000003 where  P00011180L00000005  is the exchange interaction,  P00011215L00000005  and  P00011225L00000016  is the Bohr magneton, and  P00011268L00000005  is the anisotropy field.  The collective excitation of the spin-wave around the ground state, called magnon, is analyzed through the Holstein-Primakoff transformation,  P00011442L00000003  P00011445L00000379   P00011825L00000003 where we have defined  P00011850L00000038  and  P00011893L00000043 , and the creation-annihilation operators satisfy the commutation relation 

## Replace P(pos)L(len) to original LaTeX formula

In [219]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [225]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_jap)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [227]:
with open('test_jap.tex', mode='w') as f:
    f.write(latex_fin)