In [1]:
import unicodedata
# Look up the official Unicode name of a single character.
unicodedata.name('e')

'LATIN SMALL LETTER E'

In [2]:
# The same lookup works for non-ASCII characters.
unicodedata.name('Δ')

'GREEK CAPITAL LETTER DELTA'

In [3]:
# A \N{...} escape in a string literal designates a character by its Unicode name.
"\N{GREEK CAPITAL LETTER DELTA}"

'Δ'

In [4]:
# unicodedata.lookup() goes the other way: from a Unicode name to the character.
unicodedata.lookup('GREEK CAPITAL LETTER DELTA')

'Δ'

In [5]:
# {string_to_replace: replacement} dictionary
# Keys are literal LaTeX source snippets; values are the Unicode text that
# replaces them.
latex_commands = {
    r'\l{}': 'ł',  # \l is the barred l (LATIN SMALL LETTER L WITH STROKE)
    r'\o': 'ø',    # \o is the slashed o (LATIN SMALL LETTER O WITH STROKE)
    r'\textless': '<',
    r'\textgreater': '>',
    r'\euro{}': '€',
    r'\EUR{}': '€',
    r'\P': '¶',
    r'\ddag': '‡',
    r'\textbar': '|',
    r'\textendash': '–',
    r'\texttrademark': '™',
    r'\textexclamdown': '¡',
    r'\pounds': '£',
    r'\S': '§',
    r'\dag': '†',
    r'\textbackslash': '\\',  # a literal backslash, not the empty string
    r'\textemdash': '—',
    r'\textregistered': '®',
    r'\textquestiondown': '¿',
    r'\copyright': '©',
    r'\%': '%',
    r'\$': '$',
    r'\{': '{',
    r'\_': '_',
    r'\#': '#',
    r'\&': '&',
    r'\}': '}'
}

In [6]:
import sys
import unicodedata

def get_replacement(unicode_expression, matchgroup):
    """Return the Unicode character obtained by applying a modifier to a matched character.

    Parameters
    ----------
    unicode_expression : str
        Template of a Unicode character name in which the literal text '(.)'
        stands for the name of the matched character, e.g. '(.) WITH ACUTE'.
    matchgroup : re.Match
        Match object whose group 1 is the single character the LaTeX command
        applies to.

    Returns
    -------
    str
        The character whose Unicode name is the filled-in template, or the
        matched character unchanged when no such character exists (a message
        is then written to stderr).
    """
    matched_character = matchgroup.group(1)  # character captured by '(.)'
    try:
        matched_character_name = unicodedata.name(matched_character)
    except ValueError:
        # Some characters (e.g. control characters such as a tab) have no
        # Unicode name at all; there is nothing to compose, so keep them as is.
        print('Cannot find unicode name for', repr(matched_character),
              ', using it unchanged', file = sys.stderr)
        return matched_character
    replacement_character_name = unicode_expression.replace('(.)', matched_character_name)
    try:  # look up the character carrying the composed name
        return unicodedata.lookup(replacement_character_name)
    except KeyError:
        print('Cannot find unicode for', replacement_character_name,
              ', using', matched_character, 'instead', file = sys.stderr)
        return matched_character

# {regex: replacement function} dictionary
# Characters that are special in a regex must be escaped to match literally:
# the backslash itself, '.', and '^' (unescaped, '^' is the start-of-string
# anchor and the pattern could never match after a backslash).
# The '(.)' group matches any one character and lets the replacement function
# retrieve it as group 1 of the match.
# NOTE(review): the letter-named commands (e.g. r'\\v(.)') also match the
# beginning of longer control words such as '\varepsilon'.
latex_commands_whith_parametter = {
    r"\\'(.)": lambda matchgroup : get_replacement('(.) WITH ACUTE', matchgroup),
    r'\\`(.)': lambda matchgroup : get_replacement('(.) WITH GRAVE', matchgroup),
    r'\\\^(.)': lambda matchgroup : get_replacement('(.) WITH CIRCUMFLEX', matchgroup),
    r'\\"(.)': lambda matchgroup : get_replacement('(.) WITH DIAERESIS', matchgroup),
    r'\\H(.)': lambda matchgroup : get_replacement('(.) WITH DOUBLE ACUTE', matchgroup),
    r'\\~(.)': lambda matchgroup : get_replacement('(.) WITH TILDE', matchgroup),
    r'\\c(.)': lambda matchgroup : get_replacement('(.) WITH CEDILLA', matchgroup),
    r'\\k(.)': lambda matchgroup : get_replacement('(.) WITH OGONEK', matchgroup),
    r'\\=(.)': lambda matchgroup : get_replacement('(.) WITH MACRON', matchgroup),
    r'\\\.(.)': lambda matchgroup : get_replacement('(.) WITH DOT ABOVE', matchgroup),
    r'\\d(.)': lambda matchgroup : get_replacement('(.) WITH DOT BELOW', matchgroup),
    r'\\r(.)': lambda matchgroup : get_replacement('(.) WITH RING ABOVE', matchgroup),
    r'\\u(.)': lambda matchgroup : get_replacement('(.) WITH BREVE', matchgroup),
    r'\\v(.)': lambda matchgroup : get_replacement('(.) WITH CARON', matchgroup),
    r'\\textcircled(.)': lambda matchgroup : get_replacement('CIRCLED (.)', matchgroup),
}

In [7]:
# Sample BibTeX entries used as input for the conversion cells of this notebook;
# some fields contain LaTeX accent commands (\'e, \"e).
test_string = r"""
@inproceedings{backes:inria-00080498,
  TITLE = {{Computationally Sound Secrecy Proofs by Mechanized Flow Analysis}},
  AUTHOR = {Backes, Michael and Laud, Peeter},
  URL = {https://hal.inria.fr/inria-00080498},
  BOOKTITLE = {{Workshop on Formal and Computational Cryptography (FCC2006)}},
  ADDRESS = {Venice/Italy},
  ORGANIZATION = {{V{\'e}ronique Cortier, Steve Kremer}},
  YEAR = {2006},
  MONTH = Jul,
  PDF = {https://hal.inria.fr/inria-00080498/file/paper1.pdf},
  HAL_ID = {inria-00080498},
  HAL_VERSION = {v1},
}

@INPROCEEDINGS{Legeay-2011,
  author = {Matthieu Legeay},
  title = {Permutation decoding : Towards an approach using algebraic properties
	of the $\sigma$-subcode},
  editor = {Daniel Augot and Anne Canteaut},
  booktitle = {WCC 2011},
  year = {2011},
  pages = {193-202}
}

@inproceedings{blondeau:hal-01276270,
  TITLE = {{On Distinct Known Plaintext Attacks}},
  AUTHOR = {Blondeau, C{\'e}line and Nyberg, Kaisa},
  URL = {https://hal.inria.fr/hal-01276270},
  BOOKTITLE = {{WCC2015 - 9th International Workshop on Coding and Cryptography 2015}},
  ADDRESS = {Paris, France},
  ORGANIZATION = {{Anne Canteaut, Ga{\"e}tan Leurent, Maria Naya-Plasencia}},
  EDITOR = {Pascale Charpin, Nicolas Sendrier, Jean-Pierre Tillich},
  SERIES = {Proceedings of the 9th International Workshop on Coding and Cryptography 2015 WCC2015},
  YEAR = {2015},
  MONTH = Apr,
  KEYWORDS = {multidimensional linear attack ; zero-correlation linear attack ; key-difference-invariant-bias attack ; known plaintext ; distinct known plaintext ; statistical model},
  PDF = {https://hal.inria.fr/hal-01276270/file/wcc15-we2-3.pdf},
  HAL_ID = {hal-01276270},
  HAL_VERSION = {v1},
}
"""

In [8]:
import re
# Step 1: drop every grouping brace from the BibTeX source.
s = test_string.replace("{", "").replace("}", "")

# Step 2: substitute the fixed LaTeX commands by their Unicode counterpart.
# Plain substring replacement is used on purpose: passing an re.escape()'d
# string as the *replacement* of re.sub() leaves the escaping backslash in the
# output for characters such as '|', '$', '#' or '&'.
s2 = s
for latex, unicode in latex_commands.items():
    s2 = s2.replace(latex, unicode)
print(s2)


@inproceedingsbackes:inria-00080498,
  TITLE = Computationally Sound Secrecy Proofs by Mechanized Flow Analysis,
  AUTHOR = Backes, Michael and Laud, Peeter,
  URL = https://hal.inria.fr/inria-00080498,
  BOOKTITLE = Workshop on Formal and Computational Cryptography (FCC2006),
  ADDRESS = Venice/Italy,
  ORGANIZATION = V\'eronique Cortier, Steve Kremer,
  YEAR = 2006,
  MONTH = Jul,
  PDF = https://hal.inria.fr/inria-00080498/file/paper1.pdf,
  HAL_ID = inria-00080498,
  HAL_VERSION = v1,


@INPROCEEDINGSLegeay-2011,
  author = Matthieu Legeay,
  title = Permutation decoding : Towards an approach using algebraic properties
	of the $\sigma$-subcode,
  editor = Daniel Augot and Anne Canteaut,
  booktitle = WCC 2011,
  year = 2011,
  pages = 193-202


@inproceedingsblondeau:hal-01276270,
  TITLE = On Distinct Known Plaintext Attacks,
  AUTHOR = Blondeau, C\'eline and Nyberg, Kaisa,
  URL = https://hal.inria.fr/hal-01276270,
  BOOKTITLE = WCC2015 - 9th International Workshop on Coding and

In [9]:
import re
# Apply every accent-style command in turn, starting from the text produced by
# the simple replacements; each pattern is echoed before it is applied.
s3 = s2
for pattern_source, build_replacement in latex_commands_whith_parametter.items():
    print(pattern_source)
    compiled_pattern = re.compile(pattern_source)
    s3 = compiled_pattern.sub(build_replacement, s3)
print(s3)

\\'(.)
\\`(.)
\\^(.)
\\"(.)
\\H(.)
\\~(.)
\\c(.)
\\k(.)
\\=(.)
\\\.(.)
\\d(.)
\\r(.)
\\u(.)
\\v(.)
\\textcircled(.)

@inproceedingsbackes:inria-00080498,
  TITLE = Computationally Sound Secrecy Proofs by Mechanized Flow Analysis,
  AUTHOR = Backes, Michael and Laud, Peeter,
  URL = https://hal.inria.fr/inria-00080498,
  BOOKTITLE = Workshop on Formal and Computational Cryptography (FCC2006),
  ADDRESS = Venice/Italy,
  ORGANIZATION = Véronique Cortier, Steve Kremer,
  YEAR = 2006,
  MONTH = Jul,
  PDF = https://hal.inria.fr/inria-00080498/file/paper1.pdf,
  HAL_ID = inria-00080498,
  HAL_VERSION = v1,


@INPROCEEDINGSLegeay-2011,
  author = Matthieu Legeay,
  title = Permutation decoding : Towards an approach using algebraic properties
	of the $\sigma$-subcode,
  editor = Daniel Augot and Anne Canteaut,
  booktitle = WCC 2011,
  year = 2011,
  pages = 193-202


@inproceedingsblondeau:hal-01276270,
  TITLE = On Distinct Known Plaintext Attacks,
  AUTHOR = Blondeau, Céline and Nyberg, Ka

In [10]:
import re

# Private-use code points standing in for literal characters produced by a
# replacement, so that the later passes (brace stripping, accent patterns)
# do not touch them. They are turned back into the real characters at the end.
_LITERAL_PLACEHOLDERS = {'{': '\ue000', '}': '\ue001', '\\': '\ue002'}


def latex_to_unicode(s, commands=None, commands_with_parameter=None):
    """Parse a LaTeX string and replace commands with Unicode where possible.

    Parameters
    ----------
    s : str
        Text containing LaTeX markup (e.g. a BibTeX entry).
    commands : dict, optional
        {literal LaTeX snippet: replacement text}. Defaults to the module-level
        ``latex_commands`` table.
    commands_with_parameter : dict, optional
        {regex: replacement callable taking the match object}. Defaults to the
        module-level ``latex_commands_whith_parametter`` table.

    Returns
    -------
    str
        The text with known commands substituted and grouping braces removed.

    Notes
    -----
    The code points U+E000..U+E002 are used internally as placeholders; if they
    occur in the input they come out as '{', '}' and a backslash.
    """
    if commands is None:
        commands = latex_commands
    if commands_with_parameter is None:
        commands_with_parameter = latex_commands_whith_parametter

    def shield(text):
        # Hide literal braces/backslashes of a replacement behind placeholders.
        return ''.join(_LITERAL_PLACEHOLDERS.get(char, char) for char in text)

    # 1. Replace simple commands. This runs BEFORE the braces are removed,
    #    otherwise keys that contain braces (r'\l{}', r'\{', ...) never match.
    for latex, unicode in commands.items():
        pattern = re.escape(latex)
        if re.fullmatch(r'\\[A-Za-z]+', latex):
            # A control word ends at the first non-letter: r'\o' must not
            # match the beginning of r'\omega'.
            pattern += r'(?![A-Za-z])'
        shielded = shield(unicode)
        # A callable replacement is inserted verbatim; a replacement *string*
        # would be parsed as a template and mishandle backslashes.
        s = re.sub(pattern, lambda _match, text=shielded: text, s)

    # 2. Remove the remaining (grouping) braces.
    s = s.replace('{', '').replace('}', '')

    # 3. Replace commands taking one character as parameter (accents, ...).
    for latex, replacement_function in commands_with_parameter.items():
        s = re.sub(latex, replacement_function, s)

    # 4. Restore the literal characters hidden in step 1.
    for literal, placeholder in _LITERAL_PLACEHOLDERS.items():
        s = s.replace(placeholder, literal)
    return s

In [11]:
# Run the complete conversion on the sample BibTeX entries and show the result.
print(latex_to_unicode(test_string))


@inproceedingsbackes:inria-00080498,
  TITLE = Computationally Sound Secrecy Proofs by Mechanized Flow Analysis,
  AUTHOR = Backes, Michael and Laud, Peeter,
  URL = https://hal.inria.fr/inria-00080498,
  BOOKTITLE = Workshop on Formal and Computational Cryptography (FCC2006),
  ADDRESS = Venice/Italy,
  ORGANIZATION = Véronique Cortier, Steve Kremer,
  YEAR = 2006,
  MONTH = Jul,
  PDF = https://hal.inria.fr/inria-00080498/file/paper1.pdf,
  HAL_ID = inria-00080498,
  HAL_VERSION = v1,


@INPROCEEDINGSLegeay-2011,
  author = Matthieu Legeay,
  title = Permutation decoding : Towards an approach using algebraic properties
	of the $\sigma$-subcode,
  editor = Daniel Augot and Anne Canteaut,
  booktitle = WCC 2011,
  year = 2011,
  pages = 193-202


@inproceedingsblondeau:hal-01276270,
  TITLE = On Distinct Known Plaintext Attacks,
  AUTHOR = Blondeau, Céline and Nyberg, Kaisa,
  URL = https://hal.inria.fr/hal-01276270,
  BOOKTITLE = WCC2015 - 9th International Workshop on Coding and Cry