This notebook creates a framework for versioned functions and allows them to be put into a pipeline for sequential string processing.

In [1]:
from abc import ABC, abstractmethod
import re

def print_io(fn):
    """Wrap a ``__call__``-style method so it prints its input and output.

    The wrapped callable is expected to be invoked as
    ``fn(self, source_text)``; ``source_text`` may be passed positionally
    or by keyword.

    Args:
        fn: The function to wrap (typically a class's ``__call__``).

    Returns:
        A wrapper that prints a header naming the class of the first
        positional argument, the input text, and the result, then returns
        the result of ``fn`` unchanged.
    """
    # Imported locally so this helper does not rely on a file-level import.
    from functools import wraps

    @wraps(fn)  # keep fn's __name__/__doc__ on the wrapper
    def _wrapper(*args, **kwargs):
        # Args for VersionedFunction are self, source_text; the text may
        # also arrive as a keyword argument.
        if len(args) > 1:
            source_text = args[1]
        else:
            source_text = kwargs.get('source_text')
        print(f'• {args[0].__class__.__name__}() '.ljust(80, '='))
        print('  IN:', source_text)
        result = fn(*args, **kwargs)
        print('  OUT:', result)
        return result
    return _wrapper

class VersionedFunction(ABC):
    """Base class for a callable text-processing step.

    Subclasses set the class attributes ``purpose`` and ``version`` and
    override ``__call__`` to transform a string. Both attributes are plain
    class attributes, so they can be read from the class or from any
    instance.
    """

    # Description of what the step does; overridden by subclasses.
    purpose = None
    # Identifier distinguishing implementations sharing the same purpose.
    version = None

    # NOTE: no properties named ``purpose``/``version`` are defined here.
    # Defining them would replace the class attributes above, so reading
    # them would return a ``property`` object instead of the value.

    def __call__(self, source_text: str) -> str:
        """Report the step being run and return ``source_text`` unchanged.

        Args:
            source_text: The text handed to this step.

        Returns:
            The same ``source_text`` that was passed in.
        """
        print(
            f'• Running {self.__class__.__name__} ({self.purpose}, '
            f'version {self.version})\n  input: "{source_text}"'
        )
        return source_text


def run_pipeline_steps(source_text: str, *steps: VersionedFunction) -> str:
    """Run ``source_text`` through each step in order and return the result.

    Args:
        source_text: The text to process.
        *steps: VersionedFunction instances, applied left to right; the
            output of each step is the input of the next.

    Returns:
        The output of the last step, or ``source_text`` unchanged when no
        steps are given.

    Raises:
        TypeError: If any step is not a VersionedFunction instance.
    """
    # Check for invalid steps. Test the list itself rather than any(...),
    # so that falsy invalid values (None, 0, '') are reported too.
    bad_steps = [s for s in steps if not isinstance(s, VersionedFunction)]
    if bad_steps:
        raise TypeError(
            f'Cannot run {bad_steps}, they are not VersionedFunction '
            'instances'
        )

    # Feed the result of each step into the next one; with zero steps the
    # text passes through untouched.
    intermediate = source_text
    for step in steps:
        intermediate = step(intermediate)

    return intermediate


class VersionedFunctionRegister:
    """Registry and factory for VersionedFunction subclasses.

    Classes are registered under the key ``(purpose, version)``. Calling an
    instance of the register with a purpose and version returns a new
    instance of the matching class.
    """

    # Maps (purpose, version) -> registered VersionedFunction subclass.
    __registered_functions = {}

    # Note that this is defined as a class-method, but could be usefully
    # called from an instance too, since the storage is tied to the class
    # by name, not by a 'cls' reference.
    @classmethod
    def register(cls, target_function: 'type[VersionedFunction]'):
        """Register a VersionedFunction subclass; usable as a class decorator.

        Args:
            target_function: The VersionedFunction subclass to register.

        Returns:
            ``target_function`` itself, so that a decorated class name stays
            bound to the class rather than to ``None``.

        Raises:
            TypeError: If ``target_function`` is not a VersionedFunction
                subclass.
            RuntimeError: If a class with the same purpose and version is
                already registered.
        """
        if not (
            isinstance(target_function, type)
            and issubclass(target_function, VersionedFunction)
        ):
            raise TypeError(f'{target_function} is not a VersionedFunction.')

        # Generate a key that will be used to keep track of the target_function
        key = (target_function.purpose, target_function.version)

        # Check for an existing item, and fail if it exists, in order to
        # preserve purpose/version distinctness.
        existing = VersionedFunctionRegister.__registered_functions.get(key)
        if existing is not None:
            raise RuntimeError(
                f'The {target_function.__name__} VersionedFunction cannot '
                'be registered in VersionedFunctionRegister: There is already '
                f'a registered item ({existing.__name__}) with the same purpose '
                f'({target_function.purpose}) and version '
                f'({target_function.version}).'
            )

        # Otherwise register it
        VersionedFunctionRegister.__registered_functions[key] = target_function
        return target_function

    def __call__(self, purpose: str, version: int, verbose: bool=False):
        """Create an instance of the class registered for purpose/version.

        Args:
            purpose: The ``purpose`` the class was registered with.
            version: The ``version`` the class was registered with.
            verbose: When true, the returned instance prints its input and
                output on every call (via ``print_io``).

        Returns:
            A new instance of the registered class. With ``verbose=True``
            it is an instance of a throwaway subclass carrying the same
            name, so the registered class itself is never modified.

        Raises:
            KeyError: If nothing is registered for ``(purpose, version)``.
        """
        # Only the lookup sits in the try block, so a KeyError raised while
        # instantiating the class is not misreported as a missing entry.
        try:
            tgt_fn = self.__class__.__registered_functions[(purpose, version)]
        except KeyError:
            raise KeyError(
                f'No VersionedFunction type with a purpose "{purpose}" and '
                f'version "{version}" is registered'
            ) from None

        if not verbose:
            return tgt_fn()

        # __call__ is looked up on the type, so verbosity has to live on a
        # class. Use a per-request subclass instead of patching tgt_fn,
        # which would leak into later non-verbose instances and stack
        # wrappers on repeated requests.
        class _VerboseStep(tgt_fn):
            __call__ = print_io(tgt_fn.__call__)

        # Keep the original name so the printed header is unchanged.
        _VerboseStep.__name__ = tgt_fn.__name__
        _VerboseStep.__qualname__ = tgt_fn.__qualname__
        return _VerboseStep()
            
# Each step below must follow the same template: the class is decorated with
# `VersionedFunctionRegister.register`, defines `purpose` and `version`, and
# implements `__call__(self, source_text: str) -> str`.
            
@VersionedFunctionRegister.register
class Lowercase(VersionedFunction):
    """Pipeline step that lower-cases the text."""

    purpose = 'make lowercase'
    version = 1

    def __call__(self, source_text: str) -> str:
        """Return ``source_text`` converted with ``str.lower``."""
        lowered = source_text.lower()
        return lowered

@VersionedFunctionRegister.register
class LowercaseBad(VersionedFunction):
    """Version 2 of the 'make lowercase' step, which upper-cases instead.

    NOTE(review): the class name suggests this is an intentionally faulty
    version kept for comparison against version 1 -- confirm.
    """

    purpose = 'make lowercase'
    version = 2

    def __call__(self, source_text: str) -> str:
        """Return ``source_text`` converted with ``str.upper``."""
        uppered = source_text.upper()
        return uppered


@VersionedFunctionRegister.register
class ReplaceHTMLLineBreaksToRegularLineBreaks(VersionedFunction):
    """Pipeline step that turns HTML ``<br>`` tags into CR+LF line breaks."""

    # Registry key; must stay identical to the string used for lookups.
    purpose = 'Replace HTML line breaks with "\r\n"'
    version = 1

    def __call__(self, source_text: str) -> str:
        """Replace every HTML line-break tag in ``source_text``.

        Matches ``<br>``, ``<br/>`` and ``<br />`` in any letter case.

        Args:
            source_text: Text that may contain HTML line-break tags.

        Returns:
            The text with each matched tag replaced by ``'\\r\\n'``.
        """
        # The slash is optional so the plain ``<br>`` form is handled too.
        return re.sub(r'<br\s*/?>', '\r\n', source_text, flags=re.IGNORECASE)

@VersionedFunctionRegister.register
class ReplaceEgToForExample(VersionedFunction):
    """Pipeline step that expands "e.g." / "E.g." to "for example"."""

    purpose = '"e.g." to "for example"'
    version = 1

    # Raw string so the ``\.`` escapes reach the regex engine untouched.
    # The character classes list only the two letter cases; a ``|`` inside
    # ``[...]`` is a literal pipe character, not an alternation.
    pat = re.compile(r"""
    (?P<E>[Ee]\.)  # E. or e. first character
    (?P<G>[Gg]\.)  # G. or g. second character
    """, re.VERBOSE)

    @staticmethod
    def match_func(match_obj):
        """Build the replacement for one match, keeping the leading case.

        Args:
            match_obj: A match produced by ``pat``.

        Returns:
            ``'For example'`` when the match starts with an upper-case
            ``E``, otherwise ``'for example'``.
        """
        first_letter = 'F' if match_obj.group('E')[0] == 'E' else 'f'
        return first_letter + 'or example'

    def __call__(self, source_text: str) -> str:
        """Return ``source_text`` with every match of ``pat`` expanded."""
        return self.pat.sub(self.match_func, source_text)
    
if __name__ == '__main__':
    # Create the factory instance. Calling it with a purpose and a version
    # returns a new instance of the matching registered VersionedFunction.
    # (The step classes are registered by their class decorators above.)
    process_factory = VersionedFunctionRegister()

We are now ready to construct a recipe from particular steps. After executing the cell below, try changing the last step to version 2 and compare the results. It is also possible to set `verbose=True` on any step; however, toggling verbosity afterwards only takes effect once the cell above has been re-executed to reset the registry.

In [2]:
# Build the recipe: an ordered tuple of step instances from the factory.
recipe_1 = (
    process_factory('Replace HTML line breaks with "\r\n"', 1, verbose=False),
    process_factory('"e.g." to "for example"', 1),
    process_factory('make lowercase', 1),
)

# Sample documents to push through the recipe.
documents = [
    "This IS my String. E.g., it's an incredible work.",
    "Another<BR/>String.",
]

# Run every document through all steps of the recipe, in order.
results = [run_pipeline_steps(document, *recipe_1) for document in documents]

# Show each processed document next to its position.
for index, processed in enumerate(results):
    print(index, processed)

0 this is my string. for example, it's an incredible work.
1 another
string.
