In [1]:
import json

# Read the cleaned ACL entries into memory; later cells iterate over this
# object and look up 'title' and 'author' / 'editor' on each element.
with open('../data/acl-entities-cleaned.json', encoding='utf-8') as src:
    acl_entities = json.loads(src.read())

In [2]:
from pylatexenc.latex2text import LatexNodes2Text

# Smoke test of pylatexenc: convert a small LaTeX snippet to plain text.
text = r"\textbf{Hello} \frac{1}{2} + \alpha"
plain = LatexNodes2Text().latex_to_text(text)
# The fraction is rendered as "1/2" (not the single glyph "½"), as the cell
# output below shows.
print(plain)  # "Hello 1/2 + α"


Hello 1/2 + α


In [3]:
def de_latex(s: str) -> str:
    """Return ``s`` with its LaTeX markup converted to plain text.

    A new ``LatexNodes2Text`` converter is created for every call and its
    ``latex_to_text`` result is returned unchanged.
    """
    converter = LatexNodes2Text()
    return converter.latex_to_text(s)

In [4]:
from pybtex.database import Person

def format_authors(authors: str) -> str:
    """Render a BibTeX-style name list as "First Last, First Last, and First Last".

    Args:
        authors: Names separated by the word "and", each in any form that
            ``pybtex.database.Person`` can parse (e.g. "Last, First"). The
            separator may be surrounded by any whitespace, so both
            single-line ("A and B") and line-wrapped ("A  and\\nB") fields
            are accepted.

    Returns:
        Each name rewritten as "First Middle von Last Jr", joined with ", "
        and with ", and " before the final name. A single name is returned
        on its own; an empty or whitespace-only input returns "".
    """
    # Collapse every whitespace run (spaces, tabs, LF, CRLF) to one space so
    # the separator is always the literal " and ", however the field was
    # wrapped in the source file.
    normalized = " ".join(authors.split())
    people = [Person(chunk) for chunk in normalized.split(" and ") if chunk]

    formatted = []
    for person in people:
        parts = [
            # First + middle names
            " ".join(person.first_names + person.middle_names),
            # Pre-last parts like "von", "de la"
            " ".join(person.prelast_names),
            # Last name
            " ".join(person.last_names),
            # Lineage parts like "Jr." would otherwise be silently dropped
            " ".join(person.lineage_names),
        ]
        name = " ".join(part for part in parts if part)
        if name:  # skip entries that parsed to nothing
            formatted.append(name)

    if not formatted:
        return ""
    if len(formatted) == 1:
        return formatted[0]
    # ", and " is used even for exactly two names, matching the output this
    # function has always produced.
    return ", ".join(formatted[:-1]) + ", and " + formatted[-1]

In [5]:
# Spot-check the pipeline on one entry: raw field -> LaTeX removed -> names
# reordered and joined.
original = acl_entities[3]['author']
de_latex_original = de_latex(original)
formatted = format_authors(de_latex_original)

# The first two stages are followed by a blank separator line.
for stage_report in (f"Original: {original}", f"De-latex: {de_latex_original}"):
    print(stage_report, "\n")
print(f"Formatted: {formatted}")

Original: Stepi{\v{s}}nik-Perdih, Timen  and
Pelicon, Andra{\v{z}}  and
{\v{S}}krlj, Bla{\v{z}}  and
{\v{Z}}nidar{\v{s}}i{\v{c}}, Martin  and
Lon{\v{c}}arski, Igor  and
Pollak, Senja 

De-latex: Stepišnik-Perdih, Timen  and
Pelicon, Andraž  and
Škrlj, Blaž  and
Žnidaršič, Martin  and
Lončarski, Igor  and
Pollak, Senja 

Formatted: Timen Stepišnik-Perdih, Andraž Pelicon, Blaž Škrlj, Martin Žnidaršič, Igor Lončarski, and Senja Pollak


In [11]:
# Build a cleaned copy of every entry: the title has its LaTeX removed, and
# 'author' holds the formatted name list taken from the entry's 'author'
# field or, when that key is absent, from its 'editor' field.
cleaned_entities = [
    {
        **entity,
        'title': de_latex(entity['title']),
        'author': format_authors(
            de_latex(entity['author' if 'author' in entity else 'editor'])
        ),
    }
    for entity in acl_entities
]

In [13]:
# Persist the cleaned entries. The file is opened as UTF-8, so pass
# ensure_ascii=False to write non-ASCII characters (e.g. "š", "ž") as
# themselves instead of \uXXXX escapes; the parsed JSON content is the same
# either way.
with open('../data/acl.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_entities, f, ensure_ascii=False)