# Parse LaTeX file and convert into DeepL-friendly format

## Parse LaTeX and generate tree structure

In [14]:
import regex
from anytree import Node, RenderTree

env1 = regex.compile(r'\\begin\{((?>[^\{\}]+|(?R))*)\}') # take care of correspondence btw \{ and \}
env2 = regex.compile(r'\\end\{((?>[^\{\}]+|(?R))*)\}')
def latex2tree(latex, tree, texdict, pos=1, structure=[]):
    
    # initialize tree with input file name
    if structure == []:
        structure = [tree]
    
    name = 'n'+str(pos)
    match1 = env1.search(latex)
    match2 = env2.search(latex)
    
    # end of document
    if match1 is None and match2 is None:
        Node(name, parent=structure[-1], dirtype='plain')
        texdict[name] = latex
    
    # end environment
    elif match1 is None or match1.span(0)[0] > match2.span(0)[0]:
        Node(name, parent=structure[-1], dirtype='plain')
        texdict[name] = latex[:match2.span(0)[0]]
        
        latex = latex[match2.span(0)[1]:]
        structure.pop(-1)
        latex2tree(latex, tree, texdict, pos+1, structure)
        
    # begin environment
    else:
        Node(name, parent=structure[-1], dirtype='plain')
        child = Node(match1.group(1), parent=structure[-1], dirtype=match1.group(1))
        texdict[name] = latex[:match1.span(0)[0]]
        
        structure.append(child)
        latex = latex[match1.span(0)[1]:]
        latex2tree(latex, tree, texdict, pos+1, structure)

In [15]:
#input_file = input()
input_file = 'reply.tex'
with open(input_file) as f:
    latex_orig = f.read()
    
tree = Node(input_file, parent=None)
texdict = {}
latex2tree(latex_orig, tree, texdict)

## for demonstration

In [17]:
children = tree.children[1].children[1].children
tlist = [texdict[c.name] if c.dirtype == 'plain' else 'N'+'{:008}'.format(i) for i, c in enumerate(children)]
text = ''.join(tlist)
paras = regex.split('\n[\x20\t]*\n', text)
paras = [p.replace('\n', ' ') for p in paras]

In [18]:
for pre, fill, node in RenderTree(tree):
    print('%s%s' % (pre, node.name))

reply.tex
├── n1
├── document
│   ├── n2
│   ├── enumerate
│   │   ├── n3
│   │   ├── align*
│   │   │   └── n4
│   │   └── n5
│   ├── n6
│   ├── flushleft
│   │   └── n7
│   └── n8
└── n9


In [29]:
tree.children[0].dirtype

'plain'

In [110]:
texdict['n3']

'\n    \\item Do the authors have a specific material candidate in mind modelled by the FKMH model?\n    \n    The material example of Fe-doped Bismuth Selenide (Li et al, 2010), is not FKMH (to my understanding). In this case the CM axion is a purely longitudinal magnon. The longitudinal case does not have the simple Heisenberg interpretation or Kittel splitting of FKMH.\n    \n    Can the present treatment be extended to this case of a longitudinal CM axion?\n    \n    \\textbf{Reply}:\n    %%\n    We agree that the Fe-doped BiSe3 is different from the FKMH model. To our understanding, the axion as the longitudinal magnon in the Fe-doped BiSe3 is not simply expressed by the magnon creation/annihilation operator (at least at the linear level). Thus we think that some extended formalism is required to describe the CM axion in the Fe-doped BiSe3, although we have not found it.\n    %%\n    \n    \\item Eq. 5.11 is formally divergent when $m_a=m_m$. Finite linewidth of the magnon regulat

In [10]:
# tree = Node(input_file, parent=None)
# structure = [tree]
# texdict = {}
# env1 = regex.compile(r'\\begin\{((?>[^\{\}]+|(?R))*)\}') # take care of correspondence btw \{ and \}
# env2 = regex.compile(r'\\end\{((?>[^\{\}]+|(?R))*)\}')

# latex = latex_orig

# pos = 1
# name = 'n'+str(pos)
# match = env1.search(latex)
# Node(name, parent=structure[-1])
# child = Node('env', parent=tree, dirtype=match.group(1))
# structure.append(child)
# texdict[name] = latex[:match.span(0)[0]]
# latex = latex[match.span(0)[1]:]

# pos = pos + 1
# name = 'n'+str(pos)
# match = env2.search(latex)
# Node(name, parent=structure[-1])
# structure.pop(-1)
# texdict[name] = latex[:match.span(0)[0]]
# latex = latex[match.span(0)[1]:]

# pos = pos + 1
# name = 'n'+str(pos)
# match = env1.search(latex)
# if match is None:
#     Node(name, parent=structure[-1])
#     texdict[name] = latex

In [71]:
def replaceSpecial(i, node):
    format8 = '{:008}'
    if node is None:
        return ''
    elif node.isNodeType(LatexCharsNode):
        s = node.chars
#    elif node.isNodeType(LatexCommentNode):
#        s = ''
    elif node.isNodeType(LatexSpecialsNode) and node.specials_chars == '~':
        s = ' '
    else:
        s = ' N'+format8.format(i)+'T'+format8.format(i)+' '
    return s

def replaceSuccessiveTags(match):
    return 'N'+match.group(1)+'T'+match.group(2)

def replaceTags(dnl):
    str_list = [replaceSpecial(i, n) for i, n in enumerate(dnl)]
    latex_out = ''.join(str_list)
    while regex.search('N\d{8}T\d{8} [ \n]* N\d{8}T\d{8}', latex_out) is not None:
        latex_out = regex.sub('N(\d{8})T\d{8} [ \n]* N\d{8}T(\d{8})', replaceSuccessiveTags, latex_out)
    return latex_out

## Translation

In [87]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

DRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

load_url = 'https://www.deepl.com/ja/translator#en/ja'
driver.get(load_url)

clipboard = ppc.paste()
stextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
ttextarea = driver.find_element_by_css_selector(
    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

In [73]:
def translateParagraph(par):
    if par == '' or regex.fullmatch('[ \n]+', par) is not None:
        return par
    ppc.copy(par)
    stextarea.send_keys(Keys.COMMAND, 'v')
    translated_text = ''
    while not translated_text:
        time.sleep(1)
        translated_text = ttextarea.get_property('value')
    stextarea.send_keys(Keys.COMMAND, 'a')
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text

def node2tag(node):
    return 'N'+node[0]+'T'+node[1]
        
def translateOneLevel(doc):
    
    # translate paragraph by paragraph
    if doc.isNodeType(LatexCharsNode) or doc.isNodeType(LatexMacroNode):
        dnl = [doc]
    else:
        dnl = doc.nodelist
    latex_rep = replaceTags(dnl)
    paras = regex.split('\n\n', latex_rep)
    paras = [p.replace('\n', ' ') for p in paras]
    paras_ja = [translateParagraph(par) for par in paras]
    latex_tmp = '\n\n'.join(paras_ja)

    # extract missing tag information
    pattern = regex.compile('N(\d{8})T(\d{8})')
    tag0 = pattern.findall(latex_rep)
    tag1 = pattern.findall(latex_tmp) + regex.compile('N\d{8}T\d{8}').findall(latex_tmp)
    tag_miss = list((set(tag0) ^ set(tag1)) & set(tag0))
    tag_miss = [node2tag(t) for t in tag_miss]

    # add missing expression warning
    warning = '\nLatex2DeepL missing expresion warning: '
    for tag in tag_miss:
        target = [p for p in paras if tag in p]
        index = paras.index(target[0])
        if regex.search(warning, paras_ja[index]) is None:
            paras_ja[index] += warning+'\"'+tag+'\", '
        else:
            paras_ja[index] += '\"'+tag+'\", '
    
    # convert back tags into original formula
    latex_ja = '\n\n'.join(paras_ja)
    latex_ja = regex.sub('N(\d{8})T(\d{8})', lambda wrapper: replaceRule(dnl, wrapper), latex_ja)
    latex_ja = regex.sub('n(\d{8})t(\d{8})', lambda wrapper: replaceRule(dnl, wrapper), latex_ja)  # DeepL sometimes translate large characters into small ones
    return paras_ja, latex_ja

In [116]:
def translateContent(node):
#     _, content = translateOneLevel(node)
    node.nodelist = [LatexCharsNode(parsing_state=None, pos=0, len=0, chars=content)]

In [104]:
def translateAllContents(nl):
    for i in range(len(nl)):
        if nl[i] is None:
            continue
        if nl[i].isNodeType(LatexEnvironmentNode):
            en = nl[i].environmentname
            if en == 'abstract' or en == 'itemize' or en == 'enumerate':
                translateContent(nl[i])
                continue
            else:
                next = nl[i].nodelist
        elif nl[i].isNodeType(LatexMacroNode):
            if nl[i].nodeargd is None:
                continue
            else:
                mn = nl[i].macroname
                if mn == 'section' or mn == 'subsection' or mn == 'subsubsection' \
                    or mn == 'chapter' or mn == 'subchapter' or mn == 'subsubchapter' or mn == 'footnote' \
                    or mn == 'textit' or mn == 'textbf':
                    elem = nl[i].nodeargd.argnlist
                    if elem != [] and elem[0] is not None:
                        translateContent(elem[0])
                    continue
                else:
                    next = nl[i].nodeargd.argnlist
        elif nl[i].isNodeType(LatexCharsNode) or nl[i].isNodeType(LatexCommentNode) or nl[i].isNodeType(LatexSpecialsNode):
            continue
        else:
            next = nl[i].nodelist
        translateAllContents(next)

In [118]:
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexCommentNode, LatexEnvironmentNode, LatexGroupNode, LatexMacroNode, LatexMathNode, LatexSpecialsNode
def latexParse(par):
    w = LatexWalker(par)
    (nodelist, pos, len_) = w.get_latex_nodes(pos=0)
    translateAllContents(nodelist)    

In [119]:
def replaceRule(texdict, match):
    (nl, nn) = (match.group(1), match.group(2))
    return texdict['L'+nl+'N'+nn]

def translateTree(tree, texdict, layer=1, doc=False):
    children = tree.children
    llist = [texdict[c.name] if c.dirtype == 'plain' else 'L'+str(layer)+'N'+'{:008}'.format(i) for i, c in enumerate(children)]
    latex = ''.join(llist)
    
    # translate at any level inside document environment
    if doc:
        paras = regex.split('\n[\x20\t]*\n', latex)
        paras = [p.replace('\n', ' ') for p in paras]
        
        # translate each paragraph here
        paras_ja = paras
        print(paras)
        [latexParse(p) for p in paras_ja]
#         paras_ja = [translateParagraph(p) for p in paras]
        
        latex_ja = '\n\n'.join(paras_ja)
    
    # do not translate outside document
    else:
        latex_ja = latex

    # go to next level
    for i, c in enumerate(children):
        if c.children != ():
            texdict['L'+str(layer)+'N'+'{:008}'.format(i)] = translateTree(c, texdict, layer+1, True)
            
    # convert back tags into original formula
    latex_ja = regex.sub('L(\d)N(\d{8})', lambda wrapper: replaceRule(texdict, wrapper), latex_ja)
    latex_ja = regex.sub('l(\d)n(\d{8})', lambda wrapper: replaceRule(texdict, wrapper), latex_ja)  # DeepL sometimes translate tags in this form
    
    return latex_ja

In [120]:
latex_ja = translateTree(tree, texdict)

['', 'We thank the referee very much for careful reading and useful comments that help to improve our draft. Replies to the comments are listed below.', 'L2N00000001', ' We hope that the modified version is suitable for publication in JHEP.\\\\', 'Sincerely,\\\\', 'L2N00000003', ' ']
['     \\item Do the authors have a specific material candidate in mind modelled by the FKMH model?', '    The material example of Fe-doped Bismuth Selenide (Li et al, 2010), is not FKMH (to my understanding). In this case the CM axion is a purely longitudinal magnon. The longitudinal case does not have the simple Heisenberg interpretation or Kittel splitting of FKMH.', '    Can the present treatment be extended to this case of a longitudinal CM axion?', '    \\textbf{Reply}:     %%     We agree that the Fe-doped BiSe3 is different from the FKMH model. To our understanding, the axion as the longitudinal magnon in the Fe-doped BiSe3 is not simply expressed by the magnon creation/annihilation operator (at le

NameError: name 'content' is not defined

In [80]:
ppc.copy(clipboard)
driver.quit()