# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [2]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode
import re

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 main.tex


## Look for \begin{document} ... \end{document} environment

In [21]:
# nodelist = nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist ##################################################
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [22]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [23]:
def replaceSuccessiveTags(match):
    n_space = len(match.group(3))
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(4))+n_space)

In [24]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [25]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [27]:
def translateParagraph(par):
    if par == '' or par == '\n':
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [28]:
paras_JP = [translateParagraph(par) for par in paras]
latex_JP = '\n\n'.join(paras_JP)

In [29]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [30]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [31]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_JP) + re.compile('p(\d{8})l(\d{8})').findall(latex_JP)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00070168L00000001',
 'P00052555L00000021',
 'P00077078L00000002',
 'P00030331L00000002',
 'P00039043L00000002',
 'P00016777L00000010',
 'P00048629L00000001',
 'P00035881L00000002',
 'P00002786L00000010',
 'P00035951L00000001',
 'P00052554L00000001',
 'P00016787L00000002',
 'P00059033L00000026',
 'P00082823L00000001',
 'P00069136L00000002',
 'P00053870L00000001',
 'P00038718L00000023',
 'P00048513L00000002',
 'P00053871L00000069',
 'P00061568L00000008',
 'P00071061L00000021',
 'P00021427L00000013',
 'P00007955L00000026',
 'P00069131L00000002',
 'P00009047L00000047',
 'P00007922L00000010',
 'P00076512L00000020',
 'P00035952L00000036',
 'P00073446L00000002',
 'P00070169L00000022',
 'P00075930L00000002',
 'P00071060L00000001',
 'P00071823L00000002']

In [34]:
target = [p for p in paras if tag_miss[2] in p]
paras.index(target[0])

91

In [35]:
paras[91]

'  P00076224L00000040  P00076264L00000013 : published results P00076296L00000001  P00076297L00000039  and future projections P00076359L00000001  P00076360L00000020  for both prompt and displaced visible  P00076419L00000024  decays, along with future projections for both prompt and displaced  P00076512L00000020  decays, where the  P00076551L00000008 is produced in the decays of charm mesons P00076600L00000001  P00076601L00000020 . LHCb is a general purpose detector in the forward region, located at the LHC at CERN. It studies heavy flavor physics, including CP violation, rare decays, and new phenomena such as lepton universality violation. LHCb employs real-time calibration, alignment, and physics analysis. During long shutdown 2 (2018 P00076933L00000002 2021) LHCb is moving to a triggerless readout system, in which the full detector information is read out every LHC collision; a data rate of 40 P00077078L00000002 Tb/s. The instantaneous luminosity will increase by a factor of 5, and wh

In [36]:
paras_JP[91]

'  P00076224L00000040 P00076264L00000013 : P00076296L00000001 P00076297L00000039の結果と、P00076359L00000001 P00076360L00000020の結果と、P00076419L00000024の即発と置換された可視光の両方の崩壊についての将来の予測。P00076551L00000008はチャーム中間子P00076600L00000001 P00076601L00000020の崩壊で生成されます。LHCbは，CERNのLHCに設置された前方領域の汎用検出器です。LHCbは，CP対称性の破れ，稀な崩壊，レプトンの普遍性の破れなどの新しい現象を含む，重いフレーバーの物理を研究しています。LHCbでは、リアルタイムでの校正、アライメント、物理解析を行っています。LHCbは、第2期長期停止期間中（2018年P00076933L00000002 2021年）に、トリガーレス読み出しシステムに移行し、LHCの衝突ごとに検出器の全情報を読み出すことになります。瞬間的な輝度は5倍になり，検出器のアップグレードと合わせて，暗黒光子崩壊を含む多くの物理チャンネルの感度が大幅に向上します。 P00077288L00000013 P00077301L00000012 : prompt visible P00077352L00000025 decaysの発表結果 P00077384L00000001 P00077385L00000023 . CMSは、CERNの大型ハドロン衝突型加速器に設置された大型の汎用検出器で、4 P00077512L00000002 Tのソレノイド磁石を備えています。CMSは、高横方向運動量プロセスにおける新物理学の探索に特に重点を置いた、幅広い物理学プログラムを持っています。これまでのハイライトは、ATLAS実験とともにヒッグス粒子を発見したことです。この実験は、2026年に予定されている高輝度LHC運転に対応するために、いくつかのアップグレードを準備しています。 P00077868L00000013 P00077881L00000014 : 将来の予測 P00077915L00000001 P00077916L00000020 目に見える形でのずれ P00077959L

## Replace P(pos)L(len) to original LaTeX formula

In [37]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [38]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_JP)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [1]:

with open('combine_ja.tex', mode='w') as f:
    f.write(latex_fin)

NameError: name 're' is not defined

In [5]:
(head, )re.split('\.', input_file)

['main', 'tex']