# Parse LaTeX file and convert into DeepL-friendly format

## Input LaTeX source from file and parse

In [10]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexEnvironmentNode, LatexMacroNode, LatexSpecialsNode
import re

input_file = input()
with open(input_file) as f:
    latex = f.read()
w = LatexWalker(latex)
(nodelist, pos, len_) = w.get_latex_nodes(pos=0)

 combine.tex


## Look for \begin{document} ... \end{document} environment

In [11]:
# nodelist = nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist[-1].nodelist ##################################################
env = [n for n in nodelist if n.isNodeType(LatexEnvironmentNode)]
doc = [e for e in env if e.environmentname=='document']
if len(doc)==1:
    doc = doc[0]
else:
    print('Unexpected format with more than one document environment')

## Look for successive LaTeX special expressions and replace them by the format P(pos)L(len)

In [12]:
format8 = '{:008}'
def replaceSpecial(node):
    if node.isNodeType(LatexCharsNode):
        s = node.chars
    else:
        s = ' P'+format8.format(node.pos)+'L'+format8.format(node.len)+' '
    return s

In [13]:
def replaceSuccessiveTags(match):
    n_space = len(match.group(3))
    return 'P'+match.group(1)+'L'+format8.format(int(match.group(2))+int(match.group(4))+n_space)

In [14]:
dnl = doc.nodelist
str_list = [replaceSpecial(n) for n in dnl]
latex_rep = ''.join(str_list)
while re.search('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', latex_rep) is not None:
    latex_rep = re.sub('P(\d{8})L(\d{8}) ([ \n]+) P\d{8}L(\d{8})', replaceSuccessiveTags, latex_rep)

## Preprocess the string and translate using DeepL

In [15]:
paras = re.split('\n\n', latex_rep)
paras = [p.replace('\n', ' ') for p in paras]

In [56]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [17]:
def translateParagraph(par):
    if par == '' or par == '\n':
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

In [57]:
paras_ja = [translateParagraph(par) for par in paras]

In [59]:
ppc.copy(clipboard)
driver.quit()

## Debug: comparing original tags and "translated" ones

In [60]:
def posl2tag(posl):
    return 'P'+posl[0]+'L'+posl[1]

In [61]:
pattern = re.compile('P(\d{8})L(\d{8})')
tag0 = pattern.findall(latex_rep)
tag1 = pattern.findall(latex_JP) + re.compile('p(\d{8})l(\d{8})').findall(latex_JP)
tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
tag_miss = [posl2tag(t) for t in tag_miss]
(tag_miss)

['P00009292L00000010',
 'P00076522L00000020',
 'P00033956L00000002',
 'P00016787L00000010',
 'P00063063L00000010',
 'P00059043L00000026',
 'P00030341L00000002',
 'P00009057L00000047',
 'P00061101L00000021',
 'P00069146L00000002',
 'P00070178L00000001',
 'P00060940L00000008',
 'P00034139L00000031',
 'P00034138L00000001',
 'P00071070L00000001',
 'P00053880L00000001',
 'P00053881L00000069',
 'P00007932L00000010',
 'P00075940L00000002',
 'P00061578L00000008',
 'P00048523L00000002',
 'P00002796L00000010',
 'P00052564L00000001',
 'P00016797L00000002',
 'P00073456L00000002',
 'P00060994L00000008',
 'P00082833L00000001',
 'P00077088L00000002',
 'P00009238L00000013',
 'P00007965L00000026',
 'P00021437L00000013',
 'P00039053L00000002',
 'P00052565L00000021',
 'P00035891L00000002',
 'P00070179L00000022',
 'P00071833L00000002',
 'P00061100L00000001',
 'P00069141L00000002',
 'P00071071L00000021']

In [37]:
target = [p for p in paras if tag_miss[3] in p]
paras.index(target[0])

18

In [38]:
paras[18]

'Whether dark matter experiences any forces other than gravity, known as self-interacting dark matter (SIDM), is a hotly debated topic in astrophysics. This section provides a brief overview; the reader is encouraged to see Ref. P00015694L00000001  P00015695L00000020  for a thorough review. One motivation for SIDM is that the lightest dark matter particle charged under a dark-sector interaction must be stable due to charge conservation, consistent with the fact that dark matter particles have survived for over 14 billion years to date. SIDM models also have observable implications for astrophysical structure. For example, self-interactions could explain several small-scale structure observations that appear to be in tension with collisionless dark matter predictions,  P00016225L00000010  P00016235L00000002 the so-called core-cusp problem; however, at large scales, collisionless dark matter models have been a great success. This can all be reconciled if dark matter self-interactions are

In [39]:
paras_JP[18]

'暗黒物質が重力以外の力を経験しているかどうかは、自己相互作用暗黒物質（SIDM）と呼ばれ、宇宙物理学の分野で盛んに議論されているテーマである。本節ではその概要を説明しますが、読者の皆様にはRef. P00015694L00000001 P00015695L00000020を参照してください。SIDMの動機の1つは、ダークセクター相互作用下で充電された最も軽いダークマター粒子は、電荷保存により安定していなければならないということであり、これはダークマター粒子が現在まで140億年以上も生き残っているという事実と一致します。SIDMモデルは、天体物理学的な構造に対しても観測可能な意味を持っている。例えば、自己相互作用は、衝突のない暗黒物質の予測とは相反するように見えるいくつかの小さなスケールの構造観測（P00016225L00000010 P00016235L00000002 いわゆるコア・カスプ問題）を説明することができる。しかし、大きなスケールでは、衝突のない暗黒物質モデルは大きな成功を収めている。これは、暗黒物質の自己相互作用が速度に依存している場合で、比較的軽いP00016503L00000014 MeV P00016520L00000002 GeV P00016525L00000003の媒介粒子、P00016548L00000010 P00016558L00000002の暗黒光子によって媒介される場合に予想される。実際、単純な暗黒光子モデルは、矮小銀河から銀河団まで、広い範囲の長さスケールの観測結果を説明することができます。しかし、バリオン相互作用と暗黒物質の相互作用、つまり、超新星からのフィードバックが暗黒物質の密度分布にどのように影響するかについては完全には理解されておらず、この非線形力学が小規模な構造問題に対する別の解決策になるかもしれません。いずれにしても、SIDMには十分な動機があり、ダークフォトンの探索は、素粒子物理学と天体物理学の両方のコミュニティにとって大きな関心事です。'

In [62]:
warning1 = ' Latex2DeepL missing expresion warning: '
warning2 = ', '
def addMissingExpressionWarning(tag):
    target = [p for p in paras if tag in p]
    index = paras.index(target[0])
    if re.search(warning1, paras_ja[index]) is None:
        paras_ja[index] += warning1+'\"'+tag+'\", '
    else:
        paras_ja[index] += '\"'+tag+'\", '
for t in tag_miss:
    addMissingExpressionWarning(t)

In [63]:
latex_ja = '\n\n'.join(paras_ja)

## Replace P(pos)L(len) to original LaTeX formula

In [64]:
def replaceRule(match):
    (pos, l) = (int(match.group(1)), int(match.group(2)))
    return latex[pos:pos+l]

In [65]:
latex_fin = re.sub('P(\d{8})L(\d{8})', replaceRule, latex_ja)
latex_fin = re.sub('p(\d{8})l(\d{8})', replaceRule, latex_fin) # DeepL sometimes translates P->p and L->l
latex_fin = latex[:doc.pos]+'\\begin{document}\n'+latex_fin+'\n\\end{document}'

In [66]:
(head, ext) = re.split('\.', input_file)
output_file = head+'_ja.'+ext
with open(output_file, mode='w') as f:
    f.write(latex_fin)