# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [3]:
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexCharsNode

# Ask for the path of the LaTeX file to translate. Surrounding whitespace is
# stripped so that a pasted path with a stray leading/trailing blank still opens.
input_file = input().strip()
# Read the source as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(input_file, encoding='utf-8') as f:
    latex = f.read()
# Parse the whole source from offset 0 into a list of top-level nodes.
# get_latex_nodes returns a 3-tuple; only `nodelist` is used further down.
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 /Users/SoChigusa/works/Latex2DeepL/test/test.tex


## Look for \begin{document} ... \end{document} environment

In [4]:
# Collect the top-level environment nodes and keep those named 'document'.
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname == 'document']
# Exactly one document environment is required. Stop here with a clear error
# otherwise: the cells below use `doc.nodelist` and `doc.pos`, which would
# fail confusingly if `doc` were still a list.
if len(doc) != 1:
    raise ValueError(
        'Unexpected format: expected exactly one document environment, '
        'found {}'.format(len(doc)))
doc = doc[0]

## Look for successive LaTeX special expressions and replace them with placeholders of the form P(pos)L(len)

In [5]:
def replaceSpecial(node):
    """Return the text that stands for ``node`` in the string sent to DeepL.

    Plain character nodes contribute their own text. Every other node is
    replaced by a placeholder ``P<pos>L<len>`` built from the node's ``pos``
    and ``len`` attributes, each zero-padded to at least eight digits.
    """
    if not node.isNodeType(LatexCharsNode):
        padded = '{:008}'
        return 'P' + padded.format(node.pos) + 'L' + padded.format(node.len)
    return node.chars

In [6]:
# Convert each child node of the document environment to its text or
# placeholder form, then concatenate the pieces into one string.
dnl = doc.nodelist
str_list = list(map(replaceSpecial, dnl))
latex_rep = ''.join(str_list)

## Preprocess the string and translate using DeepL

In [107]:
import re
# A blank line (two consecutive newlines) separates paragraphs; the remaining
# single newlines inside a paragraph are turned into spaces.
paras = [chunk.replace('\n', ' ') for chunk in latex_rep.split('\n\n')]

In [109]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

# Command-line switches handed to the Chrome instance started below.
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

# NOTE(review): absolute, machine-specific path to the chromedriver binary;
# it has to be adjusted before running on another machine.
DRIVER_PATH = '/Users/SoChigusa/works/Latex2DeepL/bin/chromedriver'
# NOTE(review): `executable_path=` and `find_element_by_css_selector` look like
# Selenium 3 era APIs -- confirm the installed Selenium version supports them.
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

# Open the DeepL translator page; the URL fragment selects en -> ja.
load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

# Remember the current clipboard contents: the translation goes through the
# clipboard, and a later cell restores this value with ppc.copy(clipboard).
clipboard = ppc.paste()
# Source (input) and target (output) textareas, located by their CSS classes.
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [103]:
def translateParagraph(par, timeout=120, poll_interval=1):
    """Translate one paragraph through the DeepL page open in the browser.

    The paragraph is pasted into the source textarea via the clipboard, the
    target textarea is polled until a translation has appeared and stopped
    changing, and the source textarea is cleared again for the next call.

    Args:
        par: Paragraph text to translate.
        timeout: Maximum number of seconds to wait for the translation.
        poll_interval: Seconds to sleep between two reads of the target
            textarea.

    Returns:
        The translated text, or ``par`` itself when it is empty or consists
        only of whitespace.

    Raises:
        TimeoutError: If no stable translation shows up within ``timeout``
            seconds (the original loop would have waited forever).
    """
    # Nothing to translate. Whitespace-only paragraphs are caught here too:
    # after newlines have been replaced by spaces a blank paragraph is ' ',
    # which would be pasted and then polled for a translation that
    # presumably never arrives.
    if not par.strip():
        return par

    # Paste / select-all use Command on macOS and Control elsewhere.
    import sys
    modifier = Keys.COMMAND if sys.platform == 'darwin' else Keys.CONTROL

    ppc.copy(par)
    stextarea.send_keys(modifier, 'v')

    # Accept the translation only once two consecutive reads agree, so that a
    # partially rendered result is not returned.
    deadline = time.monotonic() + timeout
    translated_text = ''
    while True:
        time.sleep(poll_interval)
        current = ttextarea.get_property('value')
        if current and current == translated_text:
            break
        translated_text = current
        if time.monotonic() >= deadline:
            raise TimeoutError(
                'No stable translation within {} s for paragraph starting '
                'with {!r}'.format(timeout, par[:40]))

    # Clear the source textarea for the next paragraph.
    stextarea.send_keys(modifier, 'a')
    stextarea.send_keys(Keys.BACKSPACE)

    # Best effort: give the page a few polls to empty the target textarea so
    # the next call does not pick up this paragraph's translation.
    # NOTE(review): assumes the page clears the target once the source is
    # empty -- if it does not, this only costs a few poll intervals.
    for _ in range(5):
        if not ttextarea.get_property('value'):
            break
        time.sleep(poll_interval)

    return translated_text

In [110]:
# Translate the paragraphs one after another (in order) and glue the results
# back together with blank lines between them.
paras_jap = list(map(translateParagraph, paras))
latex_jap = '\n\n'.join(paras_jap)

In [24]:
# Put back the clipboard contents that were saved before translating, and
# shut the browser down even if restoring the clipboard fails.
try:
    ppc.copy(clipboard)
finally:
    driver.quit()

In [112]:
# Debug: index of the first source paragraph containing a known phrase.
target = list(filter(lambda p: 'denote the electron creation and' in p, paras))
paras.index(target[0])

22

In [145]:
# Debug: display one preprocessed source paragraph with its placeholders.
paras[9]

' The QCD axion is a hypothetical elementary particle that solves the strong CP problemP00004000L00000001P00004001L00000051 and is a candidate of dark matter (DM) of the universeP00004107L00000001P00004108L00000048 (see Refs.P00004167L00000001P00004168L00000044 for reviews). Recently people often consider axion-like particles (ALPs) in a broad sense, partly motivated by the developments in string theoryP00004357L00000001P00004358L00000052. ALPs do not necessarily address the strong CP problem, but they are also good DM candidates and may be experimentally probed through, e.g., the axion-photon coupling of the form P00004590L00000039 where P00004636L00000003 denotes the ALP field and P00004666L00000008 P00004675L00000010 denotes the electric (magnetic) field respectively. There are many experimental ideas to search for ALPs including the QCD axion,P00004815L00000130 although still it is not discovered yetP00004985L00000001P00004986L00000410.'

In [115]:
# Debug: display the translated paragraph at the index found above.
paras_jap[22]

'ここでは、各サイトで1つの電子軌道のみを考慮し、異なる軌道間の相互作用、スピン軌道結合、電子の自己相互作用などを無視します。 P00016716L00000215 第二量子化図では 密結合のハミルトニアンは ここで、P00017116L00000021とP00017142L00000013は、サイトP00017224L00000021での電子の生成と消滅の演算子を表す。ここで、P00017116L00000021とP00017142L00000013は、スピンP00017238L00000008(P00017248L00000010またはP00017262L00000012)を持つサイトP00017224L00000003での電子の生成と消滅の演算子を示し、その合計は隣接するサイトP00017342L00000018の組み合わせで取られます。創造と消滅の演算子は、反 P00017440L00000003P00017443L00000114 P00017558L00000003フーリエ変換はP00017602L00000003P00017605L00000117 P00017723L00000003で定義されます。ハミルトニアンは対角線上に書き直され、P00017777L00000003P00017780L00000178 P00017959L00000003このP00017967L00000019は電子のエネルギーバンドを表しています。例えば、単純な立方体の格子では、P00018071L00000055となります。'

In [123]:
# Debug: index in str_list of the first piece containing a given placeholder.
target = list(filter(lambda s: 'P00017238L00000008' in s, str_list))
str_list.index(target[0])

310

In [121]:
# Debug: expand the placeholders of a single translated paragraph.
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of being an invalid string escape. Requires `replaceRule` to have
# been defined already.
re.sub(r'P(\d{8})L(\d{8})', replaceRule, paras_jap[22])

"ここでは、各サイトで1つの電子軌道のみを考慮し、異なる軌道間の相互作用、スピン軌道結合、電子の自己相互作用などを無視します。 \\footnote{\n\tEffects of the interaction among different orbitals and spin-orbit coupling are important for the topological insulator. The electron self-interaction will be taken into account in the next subsection.\n} 第二量子化図では 密結合のハミルトニアンは ここで、$c_{i\\sigma}^\\dagger$と$c_{i\\sigma}$は、サイト$i$ with spin $\\sigmaでの電子の生成と消滅の演算子を表す。ここで、$c_{i\\sigma}^\\dagger$と$c_{i\\sigma}$は、スピン$\\sigma$($\\uparrow$または$\\downarrow$)を持つサイト$i$での電子の生成と消滅の演算子を示し、その合計は隣接するサイト$\\left<i,j\\right>$の組み合わせで取られます。創造と消滅の演算子は、反 %%\n\\begin{align}\n\t\\left\\{ c_{i\\sigma}, c^\\dagger_{j\\sigma'} \\right\\} = \\delta_{ij}\\delta_{\\sigma\\sigma'}.\n\\end{align} %%\nフーリエ変換は%%\n\\begin{align}\n\tc_{i\\sigma} = \\frac{1}{\\sqrt N}\\sum_{\\vec k} e^{-i\\vec k\\cdot \\vec x_i} c_{\\vec k,\\sigma}.\n\\end{align} %%\nで定義されます。ハミルトニアンは対角線上に書き直され、%%\n\\begin{align}\n\tH = \\sum_{\\vec k,\\sigma} \\epsilon_{\\vec k} c_{\\vec k,\\sigma}^\\dagger c_{\\vec k,\\sigma},\n\t

## Debug: comparing original tags and "translated" ones

In [154]:
def posl2tag(posl):
    """Rebuild the placeholder tag ``P<pos>L<len>`` from a (pos, len) pair of strings."""
    pos_digits, len_digits = posl[0], posl[1]
    return ''.join(['P', pos_digits, 'L', len_digits])

In [155]:
# Placeholder tags look like P<8 digits>L<8 digits>; findall yields one
# (pos, len) tuple of digit strings per tag. Raw string so that \d is passed
# to the regex engine rather than being an invalid string escape.
pattern = re.compile(r'P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)   # tags before translation
tag1 = pattern.findall(latex_jap)   # tags that survived translation
# Tags present in the source but missing from the translation, sorted so the
# listing is deterministic (fixed-width digits sort by position).
tag_miss = sorted(set(tag0) - set(tag1))
tag_miss = [posl2tag(t) for t in tag_miss]
tag_miss

['P00006739L00000005',
 'P00017107L00000003',
 'P00048588L00000003',
 'P00074180L00000018',
 'P00031709L00000003',
 'P00006750L00000001',
 'P00006751L00000078',
 'P00031476L00000232',
 'P00012842L00000005',
 'P00017010L00000003',
 'P00048401L00000168',
 'P00066096L00000165',
 'P00032905L00000639',
 'P00004000L00000001',
 'P00074179L00000001',
 'P00014602L00000024',
 'P00071784L00000005',
 'P00044468L00000539',
 'P00017013L00000093',
 'P00063689L00000022',
 'P00080003L00000120',
 'P00031359L00000113',
 'P00004001L00000051',
 'P00031473L00000003']

In [149]:
# Debug: display the raw LaTeX source around one of the missing tags.
latex[17010:17120]

'%%\n\\begin{align}\n\tH=-t\\sum_{\\left<i,j\\right>,\\sigma}c_{i\\sigma}^\\dagger c_{j\\sigma},\n\\end{align}\n%%\nwhere $c_{'

## Replace the P(pos)L(len) placeholders with the original LaTeX source

In [37]:
def replaceRule(match):
    """Return the slice of the original LaTeX source that a placeholder stands for.

    Group 1 of ``match`` holds the start offset and group 2 the length, both
    as decimal digit strings; the slice is taken from the global ``latex``.
    """
    start = int(match.group(1))
    length = int(match.group(2))
    stop = start + length
    return latex[start:stop]

In [161]:
# Expand every P(pos)L(len) placeholder in the translated text back into the
# original LaTeX it replaced. Raw string so that \d is a regex escape rather
# than an invalid string escape.
latex_fin = re.sub(r'P(\d{8})L(\d{8})', replaceRule, latex_jap)
# Re-attach everything that preceded the document environment and wrap the
# translated body in a fresh \begin{document} ... \end{document} pair.
latex_fin = latex[:doc.pos] + '\\begin{document}\n' + latex_fin + '\n\\end{document}'

In [162]:
# Write the translated document. The text contains Japanese, so the encoding
# is fixed to UTF-8 instead of relying on the platform default, which may be
# unable to encode it.
with open('test_jap.tex', mode='w', encoding='utf-8') as f:
    f.write(latex_fin)