In [1]:
# The database object is responsible for maintaining the
# snapshots (versions) of datasets that are manipulated using
# openclean operators in this notebook.

from openclean_notebook import DB

# Base directory on the local file system under which all data is
# stored. Passing create=True erases any data that already exists
# in that directory.
db = DB(
    basedir='.openclean',
    create=True,
)

In [2]:
from openclean.data.load import dataset

# Load the full 'VDH-COVID-19-PublicUseDataset-Cases' dataset (published on
# the data.virginia.gov portal) from the local file ./data/bre9-aqqr.tsv.gz
# under the name 'covid-cases'.

df = db.load_dataset(
    source='./data/bre9-aqqr.tsv.gz',
    name='covid-cases',
)

In [3]:
# Add a user-defined function that operates on two columns and
# that takes an additional parameter as input.
#
# When registering the function we can specify the number of
# input columns the function operates on. The convention is that
# the first n arguments of the registered function will receive
# their values from the n input columns that the user selects.
# Additional parameters will be passed as keyword arguments.
#
# If the collabels argument is not given when the function is
# registered, the names of the first n function arguments are
# used as the default column labels.

from openclean.engine.object.function import String

@db.register.eval(
    name='concat',
    label='Concat Columns',
    description='Concatenate values from two columns',
    columns=2,
    collabels=[
        'Left Column',
        'Right Column',
    ],
    parameters=[
        String(name='delim', label='Delimiter', default=':'),
    ],
)
def concat_columns(value1, value2, delim):
    """Join two values into one string, separated by a delimiter.

    Registered with the database object under the name 'concat' as a
    function over two input columns with one additional 'delim' parameter
    (registered default ':').

    Parameters
    ----------
    value1:
        Value placed on the left-hand side of the result.
    value2:
        Value placed on the right-hand side of the result.
    delim:
        Text inserted between the two values.

    Returns
    -------
    str
        The formatted values in the order value1, delim, value2.
    """
    # An f-string applies the same default formatting to each value as
    # '{}'.format() does, so non-string values are rendered identically.
    return f'{value1}{delim}{value2}'

In [4]:
# Print the serialization of the function library that will be
# available to the spreadsheet view.

import json

print(
    json.dumps(
        db.library_dict(),
        indent=4,
    )
)

{
    "functions": [
        {
            "name": "lower",
            "namespace": "string",
            "columns": 1,
            "columnLabels": null,
            "outputs": 1,
            "parameters": []
        },
        {
            "name": "upper",
            "namespace": "string",
            "columns": 1,
            "columnLabels": null,
            "outputs": 1,
            "parameters": []
        },
        {
            "name": "capitalize",
            "namespace": "string",
            "columns": 1,
            "columnLabels": null,
            "outputs": 1,
            "parameters": []
        },
        {
            "name": "concat",
            "namespace": null,
            "label": "Concat Columns",
            "description": "Concatenate values from two columns",
            "columns": 2,
            "columnLabels": [
                "Left Column",
                "Right Column"
            ],
            "outputs": 1,
            "parameters": [
           