In [31]:
DATA_DIR = '../sqapo-data/'
with open(f'{DATA_DIR}/abridgment-files.txt') as f:
    abridgment_files = [line.strip() for line in f]
abridgment_files[:3]

['EdmundBurke.htm',
 'Hayek__The_Road_to_Serfdom.htm',
 'Nietzsche__Genealogy_of_Morals.htm']

In [33]:
import codecs
from pprint import pprint
from dataclasses import dataclass, asdict
from typing import Optional
import yaml
import re
import os
import openai

openai.api_key = os.environ['OPENAI_API_KEY']

IDENT_PROMPT_TEMPLATE = """
What follows is a truncated HTML page with an abridged version of a well-known book or article,
written by a noteworthy philosopher, scientist, or other sort of intellectual.
Your job is to output the name of the author and the title of the book/article, in
the following format:
```
AUTHOR::TITLE
```

As an example, if given the following input:
```
<br>
<center><big><big>Edmund Burke<br>
<big><big><b>Reflections on the Revolution in France</font></big></big></big></font></big></big></big><br>
... </b>Squashed down to read in about <font color="red"><b>30 minutes</b></font></big></big></big><br>
<i>"I love a manly, moral, regulated liberty"</i><br>
<br>
<img src="burke.jpg" border="0"></center><br>
<br>
<a href="http://en.wikipedia.org/wiki/Edmund_Burke"><img src="wikilogo.gif" border="0"> Wikipedia</a> - <a href="completetext-burke-revolutioninfrance.htm"><img src="textlogo.gif" border="0"> Full Text</a> - <a href="http://www.amazon.co.uk/gp/product/0199539022?ie=UTF8&amp;tag=squashephilos-21&amp;linkCode=as2&amp;camp=1634&amp;creative=6738&amp;creativeASIN=0199539022"><img src="booklogo2.gif" border="0"> Print Edition: ISBN 0199539022</a><br>
<br>
<table bgcolor="#B3B3B8"><td width="<table bgcolor="#B3B3B8"><td width="100%"><font color="#000000" size="2"><b>INTRODUCTION TO <i>Reflections on the Revolution in France</i></b></font></td></table><br>
Edmund Burke was born and raised in Ireland, from a family with splendidly mixed Protestant & Catholic, Irish & Norman, credentials. He went off to study law in London, but grew sick of it and turned instead to literature and politics, becoming a Member of Parliament for the modest English town of Wendover and in 1774 for the great port city of Bristol. <br>
```

The correct output would be:
```
Edmund Burke::Reflections on the Revolution in France
```

Here is the (possibly truncated) HTML content:
```
{html}
```
"""

FULLTEXT_LINK_PATTERN = re.compile(r'href="([^"]+\.htm)".*Full Text', re.IGNORECASE)

@dataclass
class AbridgmentData:
    file: str
    full_text_file: Optional[str]
    author: Optional[str]
    title: Optional[str]

abridgments: list[AbridgmentData] = []

In [None]:
for file_name in abridgment_files[:]:
    with codecs.open(f'{DATA_DIR}/sqapo.com/{file_name}', encoding='cp1252') as f:
        print(file_name)
        content = f.read()
        match = FULLTEXT_LINK_PATTERN.search(content)
        if match is None:
            print("ERROR: No full-text link found!")
            fulltext_file = None
        else:
            fulltext_file = match.group(1)

    if False:
        author, title = None, None
    else:
        prompt = IDENT_PROMPT_TEMPLATE.format(html=content[:10000])
        response = openai.Completion.create(engine="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=20, temperature=0)
        
        response_text = response.choices[0].text.strip()
        # print(response_text)
        if '::' not in response_text:
            print(f"ERROR: Invalid author::title response: '{response_text}'")
            author, title = None, None
        else:
            author, title = response_text.split('::')
    
    abridgments.append(AbridgmentData(file_name, fulltext_file, author, title))

# pprint(abridgments)

with open(f'{DATA_DIR}/abridgment-data.v1.yaml', 'w') as f:
    yaml.dump([asdict(abr) for abr in abridgments], f, default_flow_style=False, sort_keys=False)

In [34]:
with open(f'{DATA_DIR}/abridgment-data.v2.yaml') as f:
    abridgments = [AbridgmentData(**d) for d in yaml.safe_load(f)]
pprint(abridgments[:3])

[AbridgmentData(file='EdmundBurke.htm',
                full_text_file='completetext-burke-revolutioninfrance.htm',
                author='Edmund Burke',
                title='Reflections on the Revolution in France'),
 AbridgmentData(file='Hayek__The_Road_to_Serfdom.htm',
                full_text_file=None,
                author='F.A. Hayek',
                title='The Road to Serfdom'),
 AbridgmentData(file='Nietzsche__Genealogy_of_Morals.htm',
                full_text_file='Complete_Text__Nietzsche__Genealogy_of_Morals.htm',
                author='Friedrich Nietzsche',
                title='Towards a Genealogy of Morals')]


In [None]:
from dataclasses import dataclass
from typing import TypeAlias, Optional, Generator, Iterator, Union
import re
import itertools
from pprint import pprint

from bs4 import BeautifulSoup, Tag

@dataclass
class Token:
    position: int
    text: str
    markup_id: Optional[int]

    # def __eq__(self, other):
    #     return isinstance(other, Token) and other.text == self.text

In [None]:
test_html = '''
<body>
    <b>The Title</b>
    <p>Paragraph 1.
    <blockquote>
        <b>NOTE<br>
        Here's <i>more</i> material....
    </blockquote>
</body>
'''

def tokenize(tag: Tag) -> Generator[Token, None, None]:
    yield from extract_tokens(tag, itertools.count(0), itertools.count(0))

def extract_tokens(tag: Tag, positions: Iterator[int], markup_ids: Iterator[int]) -> Generator[Token, None, None]:
    markup_id = next(markup_ids)
    # TODO: re-construct attributes of tag in token string
    yield Token(next(positions), f'<{tag.name}>', markup_id)
    
    for child in tag.contents:
        if not isinstance(child, str):
            yield from extract_tokens(child, positions, markup_ids)
        else:
            for s in re.split(r'(\w+\s*|\s+)', child):
                if s != '':
                    yield Token(next(positions), s, markup_id=None)

    yield Token(next(positions), f'</{tag.name}>', markup_id)

soup = BeautifulSoup(test_html, 'html.parser')
tokens = list(tokenize(soup.body))

pprint(tokens)

In [79]:
import numpy as np


ArrayLike: TypeAlias = Union[list, np.ndarray]

COST_TYPE = 'uint16'

def distance_array(a: ArrayLike, b: ArrayLike, del_cost: int = 1, ins_cost: int = 1, sub_cost: int = 1):
    D = np.zeros((len(a) + 1, len(b) + 1), dtype=COST_TYPE)
    D[:, 0] = np.arange(len(a) + 1, dtype=COST_TYPE) * del_cost
    D[0, :] = np.arange(len(b) + 1, dtype=COST_TYPE) * ins_cost
    for i in range(len(a)):
        for j in range(len(b)):
            if a[i] == b[j]:
                D[i + 1][j + 1] = D[i][j]
            else:
                D[i + 1][j + 1] = min(D[i + 1][j] + del_cost,
                                      D[i][j + 1] + ins_cost, 
                                      D[i][j]     + sub_cost)
    return D

# D = distance_array('abcdef', 'bcxycefg')
    
def align(a: ArrayLike, b: ArrayLike, del_cost: int = 1, ins_cost: int = 1, sub_cost: int = 1):
    """ Perform Wagner-Fischer alignment. Return array with same length as `a` of indexes of corresponding
    elements of b, or -1 for deletions.
    """
    
    D = distance_array(a, b, del_cost, ins_cost, sub_cost)
    
    index_map = -np.ones(len(a), dtype='int')
    i = len(a)
    j = len(b)
    while i > 0 and j > 0:
        if D[i][j] == D[i-1][j-1] and a[i-1] == b[j-1]:
            index_map[i-1] = j-1
            i -= 1
            j -= 1
        elif D[i][j] == D[i-1][j-1] + sub_cost:
            i -= 1
            j -= 1
        elif D[i][j] == D[i][j-1] + del_cost:
            j -= 1
        elif D[i][j] == D[i-1][j] + ins_cost:
            i -= 1
        else:
            raise Error("Bug!")

    return index_map

align('abcde', 'bcxye')
# align('abcdef', 'bcxycefg')

array([-1,  0,  1, -1,  4])

In [45]:
from IPython.display import display, HTML
display(HTML('Hello, <span style="background-color: #efef7f">world!</span>'))
display(HTML('Here is a <span style="background-color: #ffafaf">...</span> deletion.'))

In [None]:
import difflib

string1 = 'abcde'
string2 = 'bcxye'

matcher = difflib.SequenceMatcher(None, string1, string2)
matching_blocks = matcher.get_matching_blocks()
for block in matching_blocks:
    match_str1 = string1[block.a:block.a+block.size]
    match_str2 = string2[block.b:block.b + block.size]
    print(f"[{block.a}:{block.a+block.size} <-> {block.b}:{block.b + block.size}]{match_str1}")
    # print(f"Match in string2: [{block.b}]{match_str2}")

In [83]:
def align(a: ArrayLike, b: ArrayLike):
    matcher = difflib.SequenceMatcher(None, a, b)
    matching_blocks = matcher.get_matching_blocks()
    index_map = -np.ones(len(a), dtype='int')
    for block in matching_blocks:
        index_map[block.a:block.a+block.size] = np.arange(block.b, block.b+block.size)
    return index_map

align('abcde', 'bcxye')

array([-1,  0,  1, -1,  4])

In [84]:
%%time

PUNC_SPLIT_PATTERN = re.compile(r'(\w+)\W+')
def normalize_token(tok: Token) -> str:
    if tok.markup_id is not None:
        return tok.text
    else:
        m = PUNC_SPLIT_PATTERN.match(tok.text)
        if m:
            return m.group(1).lower()
        else:
            return tok.text
    
for abr in abridgments:
    if abr.full_text_file is None: continue

    if not abr.file.startswith('leibniz'): continue  # for testing, just look at Leibniz's brief Monadology

    with codecs.open(f'{DATA_DIR}/sqapo.com/{abr.full_text_file}', encoding='cp1252') as f:
        soup = BeautifulSoup(f, 'html.parser')
        fulltext_tokens = list(tokenize(soup.body))
        fulltext_strings = [normalize_token(tok) for tok in fulltext_tokens]

    with codecs.open(f'{DATA_DIR}/sqapo.com/{abr.file}', encoding='cp1252') as f:
        soup = BeautifulSoup(f, 'html.parser')
        abridgment_tokens = list(tokenize(soup.body))
        abridgment_strings = [normalize_token(tok) for tok in abridgment_tokens]

    # with open(f'{DATA_DIR}/processed/{abr.file}-abr-tokens.yaml', 'w') as f:
    #     yaml.dump([asdict(tok) for tok in fulltext_tokens], f, default_flow_style=False, sort_keys=False)
    # with open(f'{DATA_DIR}/processed/{abr.file}-full-tokens.yaml', 'w') as f:
    #     yaml.dump([asdict(tok) for tok in abridgment_tokens], f, default_flow_style=False, sort_keys=False)

    alignment = align(fulltext_strings, abridgment_strings)  # , sub_cost=9999)
    with open(f'{DATA_DIR}/processed/{abr.file}-alignment.yaml', 'w') as f:
        for i in alignment:
            f.write(f'{i}\n')

CPU times: user 1.43 s, sys: 35.6 ms, total: 1.46 s
Wall time: 1.49 s


In [None]:

def alignment_spans(alignment, abridgment_length):
    align_conditions = []
    for i, j in enumerate(alignment):
        if i > 0:
            if i == -1:
                align_conditions.append('D')
            elif 

In [85]:
HIGHLIGHT_START = '<span style="background-color: #efef7f">'
HIGHLIGHT_END = '</span>'
INSERTION_START = '<span style="background-color: #7fef7f">'
INSERTION_END = '</span>'

def highlight_matches(fulltext_tokens, abridgment_strings, alignment):
    highlighted_tokens = []
    highlighting_state = False
    last_abridged_token = -1
    # next_markup_id = max([0] + [tok.markup_id for tok in fulltext_tokens])
    for i, tok in enumerate(fulltext_tokens):
        if tok.markup_id is not None:
            if highlighting_state:
                highlighted_tokens.append(HIGHLIGHT_END)
                highlighting_state = False
        elif alignment[i] != -1:
            if alignment[i] != last_abridged_token + 1:
                highlighted_tokens.append(INSERTION_START + ' '.join(abridgment_strings[j] for j in range(last_abridged_token + 1, alignment[i])) + INSERTION_END)
            if not highlighting_state:
                highlighted_tokens.append(HIGHLIGHT_START)
                highlighting_state = True
            last_abridged_token = alignment[i]
        elif alignment[i] == -1 and highlighting_state:
            highlighted_tokens.append(HIGHLIGHT_END)
            highlighting_state = False
        highlighted_tokens.append(tok.text)
    if highlighting_state:
        highlighted_tokens.append(HIGHLIGHT_END)

    return highlighted_tokens

highlighted_tokens = highlight_matches(fulltext_tokens, abridgment_strings, alignment)
display(HTML(''.join(highlighted_tokens)))

0
- search BOX - - text START - gottfried Leibniz Monadology

0
"-SEARCH BOX-  -TEXT START- A simple ideas and truths . the final reason of things must be in a necessary substance , which we call God . god holds an infinity of ideas , and chooses the most perfect ones . each simple substance has relations which express all the others , and , consequently , that it is a perpetual living mirror of the universe ; though it represents more distinctly the body of which it is the entelechy . each portion of matter is like a pond full of fishes , where each drop of its liquid parts is also another pond . thus there is nothing fallow , nothing sterile , nothing dead in the universe . all the parts of every living body are full of other living beings , each with its dominant entelechy or soul . thus there never is absolute birth nor complete death . minds are images of the Deity , capable of knowing the system of the universe , each being like a small divinity in its own sphere . whence the totality of all spirits must compose the city of God , where no good action would be unrewarded and no bad one unpunished . if we could understand the order of the universe , we should find that it exceeds the desires of the wisest men . the squashed philosophers edition of ... Monadology gottfried Leibniz 1714 Squashed Philosophers Complete Text"

0
"MORE FROM Squashed Philosophers...  About .. ● THE COMPLETE TEXTS ●  THE ABRIDGED TEXTS ●  Aristotle - Ethics ●  Aristotle - Politics ●  Augustine - Confessions ●  Ayer - Language, Truth and Logic ●  Bacon - Advancement of Learning ●  Bentham - Morals and Legislation ●  Berkeley - Principles of Human Knowledge ●  Boethius - Consolations of Philosophy ●  Burke - Revolution in France ●  Cicero - Friendship and Old Age ●  Clausewitz - On War ●  Comte - Positive Philosophy ●  Confucius - The Analects ●  Copernicus - The Revolutions ●  Darwin - The Origin of Species ●  Descartes - Discourse on Method ●  Descartes - Meditations ●  Einstein's Relativity ●  Emerson - Nature ●  Epicurus - Sovran Maxims ●  Erasmus - Praise of Folly ●  Euclid - Elements ●  Freud - Psychoanalysis ●  Galileo - Two World Systems ●  Hayek - The Road to Serfdom ●  Hegel - Philosophy of History ●  Hegel - Philosophy of Religion ●  Hobbes - Leviathan ●  Hume - Human Understanding ●  James - Varieties of Religious Experience ●  Kant - Critiques of Reason ●  Kant - Metaphysics of Morals ●  Kierkegaard - Either Or ●  Leibniz - Monadology ●  Locke - Human Understanding ●  Machiavelli - The Prince ●  Marcus Aurelius - Meditations ●  Marx - The Communist Manifesto ●  Marx and Engels - German Ideology ●  Mill - On Liberty ●  Mill - System of Logic ●  More - Utopia ●  Newton - Principia ●  Nietzsche - Beyond Good and Evil ●  Nietzsche - Genealogy of Morals ●  Paine - Rights of Man ●  Pascal - Thoughts ●  Plato - The Apology ●  Plato - The Republic ●  Plato - The Symposium ●  Popper - Scientific Discovery ●  Rand - Selfishness ●  Rousseau - Confessions ●  Rousseau - Social Contract ●  Sade - Philosophy in the Boudoir ●  Sartre - Existentialism is a Humanism ●  Schopenhauer - World as Will and Idea ●  Smith - Wealth of Nations ●  Spinoza - Ethics ●  The Ancient Greeks ●  The Aphorisms of the Philosophers ●  Thoreau - Walden ●  Tocqueville - America ●  Turing - Computing Machinery ●  Wittgenstein - Tractatus ●  Wollstonecraft - Rights of Woman ●  Email : glyn@sqapo.com  COPYRIGHT and ALL RIGHTS RESERVED: © Glyn Hughes, Sunday 16 September 2018 friday 20 december 2019 BUILT WITH WHIMBERRY  matrixstats"


In [90]:
import tempfile
with tempfile.NamedTemporaryFile(mode='w+t') as f:
    pprint(f.__dict__)
    pprint(dir(f))

{'_closer': <tempfile._TemporaryFileCloser object at 0x1059507f0>,
 'delete': True,
 'file': <_io.TextIOWrapper name='/var/folders/f6/zgx03b9x42ngkkrpbdb60jxc0000gn/T/tmp1vxgvpal' mode='w+t' encoding='UTF-8'>,
 'name': '/var/folders/f6/zgx03b9x42ngkkrpbdb60jxc0000gn/T/tmp1vxgvpal'}
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_closer',
 'close',
 'delete',
 'file',
 'name']


In [38]:
import numpy as np
D = np.zeros(shape=(210694,7500), dtype='float16')
for i in range(D.shape[0]):
    for j in range(100):
        D[i, j] += i + j
D[:3,:3]

  D[i, j] += i + j


array([[0., 1., 2.],
       [1., 2., 3.],
       [2., 3., 4.]], dtype=float16)