# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [1]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode
import re

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 main.arxiv.v3.tex


## Look for \begin{document} ... \end{document} environment

In [19]:
# nodelist = nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist ##################################################
# nodelist = nodelist[-1].nodeargd.argnlist[-1].nodelist[-1].nodelist[-1].nodeargd.argnlist[-1].nodelist[-1].nodelist[-1].nodeargd.argnlist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist
nodelist = nodelist[-1].nodeargd.argnlist[-1].nodelist[-1].nodelist[-1].nodeargd.argnlist[-1].nodelist[-1].nodelist
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [20]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [21]:
def replaceSuccessiveTags(match):
    n_space = len(match.group(3))
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(4))+n_space)

In [22]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [23]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [25]:
def translateParagraph(par):
    if par == '' or re.fullmatch('[ \n]+', par) is not None:
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [26]:
paras_ja = [translateParagraph(par) for par in paras]
latex_ja = '\n\n'.join(paras_ja)

In [27]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [28]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [29]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_ja) + re.compile('p(\d{8})l(\d{8})').findall(latex_ja)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00019364L00000003',
 'P00010048L00000001',
 'P00010087L00000001',
 'P00035069L00000001',
 'P00010740L00000025',
 'P00010083L00000001',
 'P00019025L00000002',
 'P00011503L00000001',
 'P00010739L00000001',
 'P00013009L00000067',
 'P00011501L00000001',
 'P00041743L00000003',
 'P00011492L00000006',
 'P00010049L00000018',
 'P00013091L00000021',
 'P00058486L00000027',
 'P00058485L00000001',
 'P00016277L00000001',
 'P00041725L00000012',
 'P00010076L00000006',
 'P00019047L00000003',
 'P00041724L00000001',
 'P00019372L00000003',
 'P00011504L00000021',
 'P00054519L00000002',
 'P00013112L00000001',
 'P00019109L00000006',
 'P00010088L00000020']

In [33]:
target = [p for p in paras if tag_miss[0] in p]
paras.index(target[0])

7

In [34]:
paras[7]

"  P00016521L00000983  P00017504L00000078  P00017582L00000057  P00017639L00000077  P00017716L00000077  P00017793L00000077  P00017870L00000077  P00017947L00000009  P00017956L00000025  We collect blinded data for constraining DM that produces events with  P00018052L00000007  electrons.  We expose the Skipper-CCD for 20 hours, and then read each quadrant through one amplifier with 300 samples per pixel. We refer to one such exposure-and-readout as an  P00018238L00000002 image. P00018246L00000002   We took 22 images of DM science data before a mandatory shutdown. All charge on the CCD is erased before taking a new image.  The read time per sample is 42.825 P00018410L00000001  P00018411L00000005 s, while the readout time of the entire active area is 5.153 P00018476L00000001 hours.  Commissioning data, consisting of (7) 20-hour-exposure images, were used to determine the data quality cuts.   During commissioning,  P00018618L00000002 quadrant-1 P00018630L00000002  and  P00018637L00000002 quad

In [35]:
paras_ja[7]

'  P00016521L00000983 P00017504L00000078 P00017582L00000057 P00017639L00000077 P00017716L00000077 P00017793L00000077 P00017870L00000077 P00017947L00000009 P00017956L00000025 P00018052L00000007電子のイベントを生成するDMを拘束するためのブラインドデータを収集しました。 Skipper-CCDを20時間露光した後、1つのアンプで各象限を300サンプル/ピクセルで読み込みます。このような露光と読み出しを、P00018238L00000002画像と呼びます。P00018246L00000002 DMサイエンスデータを22枚撮影した後、強制的にシャットダウンしました。新しい画像を撮影する前に、CCD上の電荷はすべて消去されます。 1サンプルあたりの読み出し時間は42.825 P00018410L00000001 P00018411L00000005秒、全活動領域の読み出し時間は5.153 P00018476L00000001時間です。 コミッショニング・データは、20時間の露光画像（7枚）で構成され、データ・クオリティ・カットの判定に使用されました。  コミッショニングでは、P00018618L00000002第1象限のP00018630L00000002およびP00018637L00000002第2象限のP00018649L00000002が優れた性能を発揮し、二乗平均平方根ノイズはそれぞれ0.146 P00018720L00000003および0.139 P00018732L00000003（エラー・バーは無視できる程度）となりました。  P00018780L00000002 クワドラント-4 P00018792L00000002は、電荷転送効率が過度に高く（シリアル・レジスタ・クロックの切断と一致）、そのデータは破棄されました。また、P00018945L00000002 クワドラント-3 P00018957L00000002 (ショートCCD側のクワドラント-4の隣)は、ノイズが0. これは、周囲の暖かい容器からの黒体放射が、リーフスプリングの溝を通して冷たいCCDの一部に漏れてい

In [36]:
warning1 = '\nLatex2DeepL missing expresion warning: '
warning2 = ', '
def addMissingExpressionWarning(tag):
    target = [p for p in paras if tag in p]
    index = paras.index(target[0])
    if re.search(warning1, paras_ja[index]) is None:
        paras_ja[index] += warning1+'\"'+tag+'\", '
    else:
        paras_ja[index] += '\"'+tag+'\", '
for t in tag_miss:
    addMissingExpressionWarning(t)

In [37]:
latex_ja = '\n\n'.join(paras_ja)

## Replace P(pos)L(len) to original LaTeX formula

In [38]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [39]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_ja)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [43]:
(head, ext) = re.split('\.', input_file)
output_file = head+'_ja.'+ext
with open(output_file, mode='w') as f:
    f.write(latex_fin)