In [1]:
# The database object is responsible for maintaining the
# snapshots (versions) of datasets that are manipulated using
# openclean operators in this notebook.
#
# NOTE(review): the output saved with this cell is an ImportError
# (cannot import name 'op' from 'openclean'); re-run from a fresh
# kernel to confirm that the import succeeds in the current environment.

from openclean_jupyter import DB

# Define a base directory on the local file system where all
# data is stored. The create=True flag will erase any data
# that already exists in the base directory.
db = DB(basedir='.openclean', create=True)

ImportError: cannot import name 'op' from 'openclean' (/home/heiko/projects/openclean/openclean-core/openclean/__init__.py)

In [2]:
from openclean.data.load import dataset

# Load a small subset of the dataset 'VDH-COVID-19-PublicUseDataset-Cases'
# from the data.virginia.gov portal. Here we use the openclean dataset()
# function, but pd.read_csv works equally well.

df = dataset('./data/covid.tsv')
# Hand the data frame to the database under the name 'covid-cases',
# passing 'Locality' and 'Report Date' as the primary key columns.
# The value returned by load_dataset replaces df.
df = db.load_dataset(df=df, name='covid-cases', primary_key=['Locality', 'Report Date'])
df.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,03/17/2020,51001,Accomack,Eastern Shore,0,0,0
1,03/17/2020,51003,Albemarle,Thomas Jefferson,0,0,0
2,03/17/2020,51005,Alleghany,Alleghany,0,0,0
3,03/17/2020,51007,Amelia,Piedmont,0,0,0
4,03/17/2020,51009,Amherst,Central Virginia,0,0,0


In [3]:
# We can update the dataset using pre-registered functions.
# Here we convert the values in column 'Locality' to upper case.
# The dataset is addressed by the name it was loaded under.

df = db.apply('covid-cases').to_upper('Locality')
df.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,03/17/2020,51001,ACCOMACK,Eastern Shore,0,0,0
1,03/17/2020,51003,ALBEMARLE,Thomas Jefferson,0,0,0
2,03/17/2020,51005,ALLEGHANY,Alleghany,0,0,0
3,03/17/2020,51007,AMELIA,Piedmont,0,0,0
4,03/17/2020,51009,AMHERST,Central Virginia,0,0,0


In [4]:
# List the names of all registered functions that are available
# via apply(). Note that this is mainly intended for the UI; there
# should be a better way for the user to inspect the list of
# available commands in a notebook.

db.register.serialize()

['capitalize', 'to_lower', 'to_upper']

In [5]:
# Add a user-defined function as a simple column operator.
# register.eval adds a method to the object returned by db.apply.
# That method currently expects a single argument named 'columns',
# which specifies the column(s) on which the registered function
# is applied (evaluated) to create a modified dataset.

@db.register.eval('zigzag')
def zigzag_case(value):
    """Return the string form of ``value`` with alternating letter case.

    Characters at even positions (0, 2, 4, ...) are upper-cased and
    characters at odd positions are lower-cased. Every character counts
    towards the position, including blanks and digits.
    """
    text = str(value)
    # Even offsets are upper-cased, odd offsets are lower-cased.
    return ''.join(
        ch.lower() if pos % 2 else ch.upper()
        for pos, ch in enumerate(text)
    )

In [6]:
# Although decorated with db.register.eval, zigzag_case can still be
# called directly like a normal function.

zigzag_case('abcdefgh')

'AbCdEfGh'

In [7]:
# Even without registering it, we could use zigzag_case as an evaluation
# function in a dataset update.
df = db.apply('covid-cases').update(columns='Locality', func=zigzag_case)
df.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,03/17/2020,51001,AcCoMaCk,Eastern Shore,0,0,0
1,03/17/2020,51003,AlBeMaRlE,Thomas Jefferson,0,0,0
2,03/17/2020,51005,AlLeGhAnY,Alleghany,0,0,0
3,03/17/2020,51007,AmElIa,Piedmont,0,0,0
4,03/17/2020,51009,AmHeRsT,Central Virginia,0,0,0


In [8]:
# Because we registered zigzag_case under the name 'zigzag', it is
# also available directly on apply().

df = db.apply('covid-cases').zigzag(columns='VDH Health District')
df.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,03/17/2020,51001,AcCoMaCk,EaStErN ShOrE,0,0,0
1,03/17/2020,51003,AlBeMaRlE,ThOmAs jEfFeRsOn,0,0,0
2,03/17/2020,51005,AlLeGhAnY,AlLeGhAnY,0,0,0
3,03/17/2020,51007,AmElIa,PiEdMoNt,0,0,0
4,03/17/2020,51009,AmHeRsT,CeNtRaL ViRgInIa,0,0,0


In [9]:
# We can also edit the dataset in the UI.
db.edit('covid-cases')

In [10]:
# Make sure to check out the latest version of the dataset before
# performing any operations on it.

df = db.checkout('covid-cases')

In [11]:
# Inspect the first rows of the checked-out snapshot.
df.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,03/17/2020,51001,Accomack,EASTERN SHORE,0,0,0
1,03/17/2020,51003,Albemarle,THOMAS JEFFERSON,0,0,0
2,03/17/2020,51005,Alleghany,ALLEGHANY,0,0,0
3,03/17/2020,51007,Amelia,PIEDMONT,0,0,0
4,03/17/2020,51009,Amherst,CENTRAL VIRGINIA,0,0,0
