In [1]:
import sys
# Add the directory two levels above the current working directory to the
# module search path (presumably the project root — confirm).
sys.path.append('../..')


In [2]:
from bs4 import BeautifulSoup
from aletk.utils import remove_extra_whitespace



In [3]:
# NOTE: absolute, machine-specific path — this cell only runs where the file exists.
example_article = "/home/alebg/philosophie-ch/mulligan-k-references.html"

# Decode explicitly instead of relying on the locale's default encoding, so the
# non-ASCII characters in the article are read the same way on every machine.
# Assumes the article is UTF-8 encoded — TODO confirm.
with open(example_article, encoding="utf-8") as f:
    example_content = f.read()

In [4]:
def load_html(file_content: str) -> str:
    """
    Parse `file_content` with BeautifulSoup's 'html.parser' backend and return
    the parsed document serialised back to a string.
    """
    return str(BeautifulSoup(file_content, 'html.parser'))

def test_prettify(file_content: str) -> str:
    """
    Parse `file_content` with BeautifulSoup's 'html.parser' backend and return
    the result of `prettify()` on the parsed document, as a string.
    """
    parsed = BeautifulSoup(file_content, 'html.parser')
    pretty = parsed.prettify()
    return str(pretty)

In [5]:
# Prettify the example article once and keep the result for inspection.
testp = test_prettify(example_content)

In [6]:
# Bare expression: the notebook displays the repr of the prettified HTML string.
testp


'<div class="csl-entry" id="ref-c1-apel_ko-etal:2000" role="listitem">\n <span class="smallcaps">\n  Apel\n </span>\n , Karl-Otto,\n <span class="smallcaps">\n  Barnes\n </span>\n , Jonathan,\n <span class="smallcaps">\n  Bellone\n </span>\n , Enrico,\n <span class="smallcaps">\n  Chevalley\n </span>\n , Catherine,\n <span class="smallcaps">\n  Cohen\n </span>\n , Gerald A.,\n <span class="smallcaps">\n  Cournut\n </span>\n , Jean,\n <span class="smallcaps">\n  Descombes\n </span>\n , Vincent, et al., eds. 2000.\n <em>\n  <span>\n   Un si\n   <span>\n    è\n   </span>\n   cle de philosophie 1900-2000\n  </span>\n </em>\n .\n <span>\n  Folio essais\n </span>\n n.\xa0369. Paris: Gallimard.\n</div>\n<div class="csl-entry" id="ref-c1-baertschi-mulligan:2001" role="listitem">\n <span class="smallcaps">\n  Baertschi\n </span>\n , Bernard and\n <span class="smallcaps">\n  Mulligan\n </span>\n , Kevin, eds. 2001a.\n <em>\n  <span>\n   Les\nnationalismes\n  </span>\n </em>\n .\n <span>\n  <span

In [7]:
# Hand-written sample: a single csl-entry <div> whose text contains newline
# characters between the authors' names.
example = '<div class="csl-entry" id="ref-c1-baertschi-mulligan:2001" role="listitem"> <span class="smallcaps">Baertschi</span>,\n Bernard and <span class="smallcaps">Mulligan</span>, Kevin,\n eds. 2001a. <em><span>Les nationalismes</span></em>. <span><span>É</span>thique et philosophie morale</span>. Paris: Presses Universitaires de France. </div>'

# Disabled inline experiment: the same expression is wrapped by preprocess_html().
#stripped = remove_extra_whitespace(example_content).replace('</div>', '</div>\n')
#print(stripped)



In [8]:
def preprocess_html(html: str) -> str:
    """
    Normalise whitespace in `html` and put every closing div on its own line end.

    The text is first passed through `remove_extra_whitespace` (intended to collapse
    new lines, tabs, runs of spaces, etc.); afterwards a newline is inserted
    after each '</div>' so that entries are separated line by line.
    """
    collapsed = remove_extra_whitespace(html)
    # Splitting on the closing tag and re-joining with the tag plus a newline
    # appends '\n' after every occurrence of '</div>'.
    pieces = collapsed.split('</div>')
    return '</div>\n'.join(pieces)


In [9]:
from typing import FrozenSet, Tuple
import csv

type TReplacementItem = Tuple[
    str, # original
    str  # replacement
]

type TReplacementTable = Tuple[TReplacementItem]


def load_replacement_table(csv_filename: str, encoding: str) -> TReplacementTable:
    """
    Load a replacement table from a csv file.

    The file is read with `csv.DictReader`, so its first row is the header. Only
    the 'original' and 'replacement' columns are used; rows are returned in file
    order.

    Args:
        csv_filename: Path of the csv file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A tuple of (original, replacement) pairs, one per data row.

    Raises:
        KeyError: If the header has no 'original' or 'replacement' column.
    """
    # newline='' is what the csv module asks for: it lets the reader handle line
    # endings itself, so newlines embedded in quoted fields are kept verbatim.
    with open(csv_filename, 'r', encoding=encoding, newline='') as f:
        reader = csv.DictReader(f)
        # Materialise inside the `with` block, while the file is still open.
        return tuple((row['original'], row['replacement']) for row in reader)
        




In [16]:
from dataclasses import dataclass


@dataclass(frozen=True, slots=True)
class TReplaceStringsResult:
    """
    Immutable result of a `replace_strings` call.
    """
    # True if at least one replacement rule altered the preprocessed HTML.
    was_changed: bool
    # The preprocessed HTML after all replacement rules have been applied.
    new_html: str
    # The `original` strings of the rules that altered the HTML.
    replacements_used: FrozenSet[str]


def replace_strings(
    html_content: str,
    replacement_table: TReplacementTable,
) -> TReplaceStringsResult:
    """
    Replace strings in an html file according to a replacement table. Re-formats the HTML to be human readable in the process.

    The HTML is first run through `preprocess_html`; the rules are then applied
    in table order, each one operating on the output of the previous one.

    Args:
        html_content: The HTML text to process.
        replacement_table: (original, replacement) pairs to apply, in order.
            Rules whose `original` is empty are ignored.

    Returns:
        A `TReplaceStringsResult` holding the resulting HTML, the set of
        `original` strings whose rule altered the text, and whether any did.
        `was_changed` is relative to the preprocessed HTML, not to the raw
        `html_content`.
    """

    current_html = preprocess_html(html_content)
    replacements_used_l = []

    for original, replacement in replacement_table:
        # An empty search string would make str.replace insert `replacement`
        # between every character of the document, so such rules (e.g. a blank
        # cell in the csv) are skipped.
        if not original:
            continue

        new_html = current_html.replace(original, replacement)
        # Compare the text rather than testing membership, so that a rule whose
        # original equals its replacement is not counted as a change.
        if new_html != current_html:
            current_html = new_html
            replacements_used_l.append(original)

    return TReplaceStringsResult(
        was_changed=bool(replacements_used_l),
        new_html=current_html,
        replacements_used=frozenset(replacements_used_l)
    )
    


In [17]:
# Load the replacement rules; this csv file is decoded as UTF-16.
rt = load_replacement_table('../../data/rtrefs-v1.csv', 'utf-16')

In [18]:
# Apply every rule in the table to the example article.
res = replace_strings(example_content, rt)

In [20]:
#print(res.new_html)

In [21]:
# Encode explicitly so the non-ASCII characters in the result are written the
# same way on every platform, instead of with the locale's default encoding.
with open('../../data/res.html', 'w', encoding='utf-8') as f:
    f.write(res.new_html)