# Engine - Datastore

This is an example notebook to demonstrate how to update and maintain dataset versions using the openclean engine.

In [1]:
# Create a persistent instance of the openclean engine. All the data
# files will be maintained in a sub-folder ('./archive') of the current
# working directory.
# NOTE(review): create=True presumably initializes a fresh archive in
# basedir -- confirm against the openclean documentation.

from openclean.engine.base import DB

db = DB(basedir='./archive', create=True)

In [2]:
# Download a test dataset from the Socrata API:
# 'Bidders List Master' from domain data.vermont.gov
# (Socrata dataset identifier 'y343-ur4c').

from openclean.data.source.socrata import Socrata

df = Socrata().dataset('y343-ur4c').load()
# Display the downloaded data frame as the cell output.
df

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
0,07/26/2017,HE - NARCAN 4MG 7.25,Adapt Pharma,RADNOR,PA,"RADNOR, PA\n(40.038043, -75.344449)"
1,08/22/2017,Lab Bend Fixture,,,,
2,09/19/2017,FTA Cards for DPS,GE Healthcare,Marborough,MA,"Marborough, MA"
3,09/26/2017,02140-785 - Rescue Equipment,Reynolds & Son,Barre,VT,"Barre, VT\n(44.200603, -72.505569)"
4,09/20/2017,BGS - 170078 Trush Parking Lot Paving Repairs ...,"S T Paving , Inc",Waterbury,VT,"Waterbury, VT\n(44.334602, -72.753189)"
...,...,...,...,...,...,...
190,08/09/2017,"BGS - AHS Janitorial Services - St Albans, VT",Loso's Professional J.S.,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
191,09/06/2017,BGS - A & E Window Restoration - 133 State St,VT Architects Collaborative,Randolph,VT,"Randolph, VT\n(43.925266, -72.665754)"
192,09/07/2017,VDH - Car Seats,Even Flo,Cullman,AL,"Cullman, AL\n(34.173753, -86.843115)"
193,07/24/2017,RFP DMV Registration Renewal Forms,RR Donnelley,Derry,NH,"Derry, NH\n(42.881978, -71.324171)"


In [3]:
# Create a new persistent dataset archive from the downloaded
# data frame in the openclean engine database. The archive is
# registered under the name 'bidders', which all later cells use
# to refer to the dataset.
# Note: `df` is rebound to the value returned by db.create().

df = db.create(df, name='bidders', primary_key='Title or Project')

In [4]:
# Drop every row whose 'Bidders Name' value is empty and commit the
# filtered data frame to the 'bidders' dataset.

from openclean.function.eval.null import IsEmpty
from openclean.operator.transform.filter import delete

has_no_bidder = IsEmpty('Bidders Name')
bidders_named = delete(df, has_no_bidder)
db.commit(name='bidders', df=bidders_named)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
143,07/26/2017,HE - NARCAN 4MG 7.25,Adapt Pharma,RADNOR,PA,"RADNOR, PA\n(40.038043, -75.344449)"
138,09/19/2017,FTA Cards for DPS,GE Healthcare,Marborough,MA,"Marborough, MA"
24,09/26/2017,02140-785 - Rescue Equipment,Reynolds & Son,Barre,VT,"Barre, VT\n(44.200603, -72.505569)"
59,09/20/2017,BGS - 170078 Trush Parking Lot Paving Repairs ...,"S T Paving , Inc",Waterbury,VT,"Waterbury, VT\n(44.334602, -72.753189)"
43,08/25/2017,AGR - Laboratory Epuipment,Daigger Scientific,Vernon Hills,IL,"Vernon Hills, IL\n(42.238635, -87.959562)"
...,...,...,...,...,...,...
64,08/09/2017,"BGS - AHS Janitorial Services - St Albans, VT",Loso's Professional J.S.,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
62,09/06/2017,BGS - A & E Window Restoration - 133 State St,VT Architects Collaborative,Randolph,VT,"Randolph, VT\n(43.925266, -72.665754)"
179,09/07/2017,VDH - Car Seats,Even Flo,Cullman,AL,"Cullman, AL\n(34.173753, -86.843115)"
159,07/24/2017,RFP DMV Registration Renewal Forms,RR Donnelley,Derry,NH,"Derry, NH\n(42.881978, -71.324171)"


In [5]:
# Register a few built-in string methods as 'user defined functions'.
# The registration order (lower, upper, capitalize) is kept.

for string_method in (str.lower, str.upper, str.capitalize):
    db.register.eval()(string_method)

# Show the content of the function registry as the cell output.
db.library.functions().to_listing()

[{'name': 'lower',
  'namespace': None,
  'description': Type help() for interactive help, or help(object) for help about object.,
  'columns': 1,
  'outputs': 1,
  'parameters': []},
 {'name': 'upper',
  'namespace': None,
  'description': Type help() for interactive help, or help(object) for help about object.,
  'columns': 1,
  'outputs': 1,
  'parameters': []},
 {'name': 'capitalize',
  'namespace': None,
  'description': Type help() for interactive help, or help(object) for help about object.,
  'columns': 1,
  'outputs': 1,
  'parameters': []}]

In [6]:
# Lower-case all values in the 'Bidders Name' column using the
# registered 'lower' function.

bidders = db.dataset(name='bidders')
to_lower = db.library.functions().get('lower')
bidders.update(columns='Bidders Name', func=to_lower)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
143,07/26/2017,HE - NARCAN 4MG 7.25,adapt pharma,RADNOR,PA,"RADNOR, PA\n(40.038043, -75.344449)"
138,09/19/2017,FTA Cards for DPS,ge healthcare,Marborough,MA,"Marborough, MA"
24,09/26/2017,02140-785 - Rescue Equipment,reynolds & son,Barre,VT,"Barre, VT\n(44.200603, -72.505569)"
59,09/20/2017,BGS - 170078 Trush Parking Lot Paving Repairs ...,"s t paving , inc",Waterbury,VT,"Waterbury, VT\n(44.334602, -72.753189)"
43,08/25/2017,AGR - Laboratory Epuipment,daigger scientific,Vernon Hills,IL,"Vernon Hills, IL\n(42.238635, -87.959562)"
...,...,...,...,...,...,...
64,08/09/2017,"BGS - AHS Janitorial Services - St Albans, VT",loso's professional j.s.,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
62,09/06/2017,BGS - A & E Window Restoration - 133 State St,vt architects collaborative,Randolph,VT,"Randolph, VT\n(43.925266, -72.665754)"
179,09/07/2017,VDH - Car Seats,even flo,Cullman,AL,"Cullman, AL\n(34.173753, -86.843115)"
159,07/24/2017,RFP DMV Registration Renewal Forms,rr donnelley,Derry,NH,"Derry, NH\n(42.881978, -71.324171)"


In [7]:
# Print the descriptor of every operation recorded in the
# log of the 'bidders' dataset.

bidders_log = db.dataset('bidders').log()
for entry in bidders_log:
    print(entry.descriptor)

{'optype': 'load', 'columns': None}
{'optype': 'commit', 'columns': None}
{'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}


In [8]:
# Add a user-defined function as a simple column operator.
# register.eval adds a method to the object that is returned
# by db.apply. That method currently expects a single argument
# named 'columns', which specifies the column(s) on which the
# registered function is applied (evaluated) to create a
# modified dataset.

@db.register.eval('zigzag')
def zigzag_case(value):
    """Return the string form of ``value`` with alternating letter case.

    Characters at even positions (counting from 0) are converted to
    upper case and characters at odd positions to lower case. Every
    character advances the position, including characters that have
    no case such as blanks or punctuation.
    """
    return ''.join(
        char.upper() if position % 2 == 0 else char.lower()
        for position, char in enumerate(str(value))
    )

## Notebook Spreadsheet UI

The following steps simulate some of the interactions that a user has with a dataset sample via the spreadsheet UI for Jupyter Notebooks.

In [9]:
# Take a dataset sample of 10 rows.
# NOTE(review): random_state=43 presumably seeds the sampler so that
# the same rows are drawn on every run -- confirm against the
# openclean documentation.

db.sample(name='bidders', n=10, random_state=43)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
66,08/09/2017,"BGS - AHS Janitorial Services - St. Johnsbury, VT",top carpet cleaning llc,St. Johnsbury Center,VT,"St. Johnsbury Center, VT\n(44.456699, -72.016098)"
13,08/14/2017,02140-779 - Firefighter Helmets,bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
124,08/04/2017,Ballistic Panels,executive wood products,Sullivan,MO,"Sullivan, MO\n(38.213599, -91.16411)"
32,09/21/2017,02150-91 - Electric ATV,aagean,Stoughton,MA,"Stoughton, MA\n(42.125465, -71.102172)"
37,08/01/2017,"02170-776 - 14"" structural boot",bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
101,08/02/2017,BGS - Retainer - General Enviromental Consulting,kd associates inc,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
80,09/06/2017,BGS - Curatorial Assistance Services,allyson evans,Calais,VT,"Calais, VT\n(44.389374, -72.512604)"
136,08/18/2017,Division of Emergency Management Mass Notifica...,everbridge,Burlington,MA,"Burlington, MA\n(42.504986, -71.195832)"
154,09/08/2017,Print Shop Digital Printing Workflow Solution,canon,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
163,07/27/2017,Reconstruction & Improvements Roxbury Fish Cu...,t buck construction,Turner,ME,"Turner, ME\n(44.256483, -70.25625)"


In [10]:
# Capitalize all values in the 'Bidders Name' column using the
# registered 'capitalize' function.

bidders = db.dataset(name='bidders')
to_capitalized = db.library.functions().get('capitalize')
bidders.update(columns='Bidders Name', func=to_capitalized)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
13,08/09/2017,"BGS - AHS Janitorial Services - St. Johnsbury, VT",Top carpet cleaning llc,St. Johnsbury Center,VT,"St. Johnsbury Center, VT\n(44.456699, -72.016098)"
10,08/14/2017,02140-779 - Firefighter Helmets,Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
16,08/04/2017,Ballistic Panels,Executive wood products,Sullivan,MO,"Sullivan, MO\n(38.213599, -91.16411)"
11,09/21/2017,02150-91 - Electric ATV,Aagean,Stoughton,MA,"Stoughton, MA\n(42.125465, -71.102172)"
12,08/01/2017,"02170-776 - 14"" structural boot",Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
15,08/02/2017,BGS - Retainer - General Enviromental Consulting,Kd associates inc,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
14,09/06/2017,BGS - Curatorial Assistance Services,Allyson evans,Calais,VT,"Calais, VT\n(44.389374, -72.512604)"
17,08/18/2017,Division of Emergency Management Mass Notifica...,Everbridge,Burlington,MA,"Burlington, MA\n(42.504986, -71.195832)"
18,09/08/2017,Print Shop Digital Printing Workflow Solution,Canon,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
19,07/27/2017,Reconstruction & Improvements Roxbury Fish Cu...,T buck construction,Turner,ME,"Turner, ME\n(44.256483, -70.25625)"


In [11]:
# Apply the zig-zag function to column 'City'. The decorated
# zigzag_case function object is passed directly instead of being
# looked up in the function library. The operation shows up in the
# dataset log under its registered name 'zigzag'.

db.dataset(name='bidders').update(columns='City', func=zigzag_case)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
0,08/09/2017,"BGS - AHS Janitorial Services - St. Johnsbury, VT",Top carpet cleaning llc,St. JoHnSbUrY CeNtEr,VT,"St. Johnsbury Center, VT\n(44.456699, -72.016098)"
20,08/14/2017,02140-779 - Firefighter Helmets,Bergeron protective clothing,EpSoM,NH,"Epsom, NH\n(43.222832, -71.332095)"
25,08/04/2017,Ballistic Panels,Executive wood products,SuLlIvAn,MO,"Sullivan, MO\n(38.213599, -91.16411)"
21,09/21/2017,02150-91 - Electric ATV,Aagean,StOuGhToN,MA,"Stoughton, MA\n(42.125465, -71.102172)"
22,08/01/2017,"02170-776 - 14"" structural boot",Bergeron protective clothing,EpSoM,NH,"Epsom, NH\n(43.222832, -71.332095)"
24,08/02/2017,BGS - Retainer - General Enviromental Consulting,Kd associates inc,SoUtH BuRlInGtOn,VT,"South Burlington, VT\n(44.468286, -73.171594)"
23,09/06/2017,BGS - Curatorial Assistance Services,Allyson evans,CaLaIs,VT,"Calais, VT\n(44.389374, -72.512604)"
26,08/18/2017,Division of Emergency Management Mass Notifica...,Everbridge,BuRlInGtOn,MA,"Burlington, MA\n(42.504986, -71.195832)"
27,09/08/2017,Print Shop Digital Printing Workflow Solution,Canon,SoUtH BuRlInGtOn,VT,"South Burlington, VT\n(44.468286, -73.171594)"
28,07/27/2017,Reconstruction & Improvements Roxbury Fish Cu...,T buck construction,TuRnEr,ME,"Turner, ME\n(44.256483, -70.25625)"


In [12]:
# List the operations in the current dataset log (recipe) and
# remember their identifiers. Every log entry carries a unique
# identifier that references the corresponding dataset snapshot
# in checkout() and rollback() operations.

snapshots = []
bidders_log = db.dataset('bidders').log()
for entry in bidders_log:
    print('{} {}'.format(entry.identifier, entry.descriptor))
    snapshots.append(entry.identifier)

d863328a3da645f5b304e51414e8897d {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
a0440be6e67641639342cdbdf4ea7e78 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
3c9336090c444c86a4a28b51855456e3 {'optype': 'update', 'columns': ['City'], 'name': 'zigzag'}


In [13]:
# Print snapshot that resulted from the second operation
# in the recipe, i.e., the capitalize operation on the
# 'Bidders Name' column. City names should not be zig-zagged
# in this snapshot.
# snapshots[1] is the identifier of the second log entry that
# was collected when the log was printed.

db.dataset('bidders').checkout(snapshots[1])

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
13,08/09/2017,"BGS - AHS Janitorial Services - St. Johnsbury, VT",Top carpet cleaning llc,St. Johnsbury Center,VT,"St. Johnsbury Center, VT\n(44.456699, -72.016098)"
10,08/14/2017,02140-779 - Firefighter Helmets,Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
16,08/04/2017,Ballistic Panels,Executive wood products,Sullivan,MO,"Sullivan, MO\n(38.213599, -91.16411)"
11,09/21/2017,02150-91 - Electric ATV,Aagean,Stoughton,MA,"Stoughton, MA\n(42.125465, -71.102172)"
12,08/01/2017,"02170-776 - 14"" structural boot",Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
15,08/02/2017,BGS - Retainer - General Enviromental Consulting,Kd associates inc,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
14,09/06/2017,BGS - Curatorial Assistance Services,Allyson evans,Calais,VT,"Calais, VT\n(44.389374, -72.512604)"
17,08/18/2017,Division of Emergency Management Mass Notifica...,Everbridge,Burlington,MA,"Burlington, MA\n(42.504986, -71.195832)"
18,09/08/2017,Print Shop Digital Printing Workflow Solution,Canon,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
19,07/27/2017,Reconstruction & Improvements Roxbury Fish Cu...,T buck construction,Turner,ME,"Turner, ME\n(44.256483, -70.25625)"


In [14]:
# Remove the zig-zag operation from the dataset history
# by rolling back to the previous operation, i.e., the
# capitalize operation whose identifier is snapshots[1].

db.dataset('bidders').rollback(snapshots[1])

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
13,08/09/2017,"BGS - AHS Janitorial Services - St. Johnsbury, VT",Top carpet cleaning llc,St. Johnsbury Center,VT,"St. Johnsbury Center, VT\n(44.456699, -72.016098)"
10,08/14/2017,02140-779 - Firefighter Helmets,Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
16,08/04/2017,Ballistic Panels,Executive wood products,Sullivan,MO,"Sullivan, MO\n(38.213599, -91.16411)"
11,09/21/2017,02150-91 - Electric ATV,Aagean,Stoughton,MA,"Stoughton, MA\n(42.125465, -71.102172)"
12,08/01/2017,"02170-776 - 14"" structural boot",Bergeron protective clothing,Epsom,NH,"Epsom, NH\n(43.222832, -71.332095)"
15,08/02/2017,BGS - Retainer - General Enviromental Consulting,Kd associates inc,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
14,09/06/2017,BGS - Curatorial Assistance Services,Allyson evans,Calais,VT,"Calais, VT\n(44.389374, -72.512604)"
17,08/18/2017,Division of Emergency Management Mass Notifica...,Everbridge,Burlington,MA,"Burlington, MA\n(42.504986, -71.195832)"
18,09/08/2017,Print Shop Digital Printing Workflow Solution,Canon,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
19,07/27/2017,Reconstruction & Improvements Roxbury Fish Cu...,T buck construction,Turner,ME,"Turner, ME\n(44.256483, -70.25625)"


In [15]:
# Show remaining operations in the current dataset log.
#
# This cell only prints the log. It does not append to `snapshots`:
# that list already holds the identifiers collected when the log was
# first printed, so appending here would add duplicate entries every
# time the cell is run.

for op in db.dataset('bidders').log():
    print('{} {}'.format(op.identifier, op.descriptor))

d863328a3da645f5b304e51414e8897d {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
a0440be6e67641639342cdbdf4ea7e78 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}


In [16]:
# Apply changes to the full dataset.
# The log printed in the next cell lists the 'capitalize' update
# after the earlier full-dataset operations, while the rolled-back
# 'zigzag' update does not appear.

db.checkout('bidders', commit=True)

Unnamed: 0,Date,Title or Project,Bidders Name,City,State,Location 1
143,07/26/2017,HE - NARCAN 4MG 7.25,Adapt pharma,RADNOR,PA,"RADNOR, PA\n(40.038043, -75.344449)"
138,09/19/2017,FTA Cards for DPS,Ge healthcare,Marborough,MA,"Marborough, MA"
24,09/26/2017,02140-785 - Rescue Equipment,Reynolds & son,Barre,VT,"Barre, VT\n(44.200603, -72.505569)"
59,09/20/2017,BGS - 170078 Trush Parking Lot Paving Repairs ...,"S t paving , inc",Waterbury,VT,"Waterbury, VT\n(44.334602, -72.753189)"
43,08/25/2017,AGR - Laboratory Epuipment,Daigger scientific,Vernon Hills,IL,"Vernon Hills, IL\n(42.238635, -87.959562)"
...,...,...,...,...,...,...
64,08/09/2017,"BGS - AHS Janitorial Services - St Albans, VT",Loso's professional j.s.,South Burlington,VT,"South Burlington, VT\n(44.468286, -73.171594)"
62,09/06/2017,BGS - A & E Window Restoration - 133 State St,Vt architects collaborative,Randolph,VT,"Randolph, VT\n(43.925266, -72.665754)"
179,09/07/2017,VDH - Car Seats,Even flo,Cullman,AL,"Cullman, AL\n(34.173753, -86.843115)"
159,07/24/2017,RFP DMV Registration Renewal Forms,Rr donnelley,Derry,NH,"Derry, NH\n(42.881978, -71.324171)"


In [17]:
# Print the descriptor of every operation in the log of the
# 'bidders' dataset after the changes have been applied.

resulting_log = db.dataset('bidders').log()
for entry in resulting_log:
    print(entry.descriptor)

{'optype': 'load', 'columns': None}
{'optype': 'commit', 'columns': None}
{'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}
{'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
