In [1]:
import os

from openclean.data.load import dataset

# Input file: gzipped TSV for Socrata dataset bre9-aqqr from data.virginia.gov
# (the path suggests a snapshot taken on 2020-07-03).
# The default is the original author's local path. Set the environment
# variable OPENCLEAN_DATA_FILE to point at your own copy so the notebook can
# run on another machine without editing this cell.
DATA_FILE = os.environ.get(
    'OPENCLEAN_DATA_FILE',
    '/home/heiko/data/socrata/data.virginia.gov/20200703/tsv/bre9-aqqr.tsv.gz'
)

df = dataset(DATA_FILE)

In [2]:
# Show the loaded data frame; the rendered output below is truncated to the
# first and last few rows.
df

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,07/02/2020,51001,Accomack,Eastern Shore,1042,72,14
1,07/02/2020,51003,Albemarle,Thomas Jefferson,412,33,9
2,07/02/2020,51005,Alleghany,Alleghany,33,5,2
3,07/02/2020,51007,Amelia,Piedmont,44,11,1
4,07/02/2020,51009,Amherst,Central Virginia,36,3,0
...,...,...,...,...,...,...,...
14359,03/17/2020,51800,Suffolk,Western Tidewater,0,0,0
14360,03/17/2020,51810,Virginia Beach,Virginia Beach,4,0,0
14361,03/17/2020,51820,Waynesboro,Central Shenandoah,0,0,0
14362,03/17/2020,51830,Williamsburg,Peninsula,1,0,0


## Impact of calling an eval function on every data frame row

In [3]:
import time

# Baseline: walk over all rows ten times and lower-case the value at
# position 2 of each row directly, without going through an eval function.
# NOTE(review): position 2 appears to be the 'Locality' column (the one the
# Col-based benchmark reads) -- confirm against the column order.
t_begin = time.perf_counter()

for run in range(10):
    for idx, row in df.iterrows():
        row[2].lower()

elapsed = time.perf_counter() - t_begin

print('Runtime {} s'.format(elapsed))

Runtime 7.702656302019022 s


In [4]:
import time
from openclean.function.eval.base import Col

# Same row loop as the direct-access benchmark, but the 'Locality' value is
# fetched through an openclean eval function that was prepared once for the
# data frame before the clock starts.
col = Col('Locality').prepare(df)

t_begin = time.perf_counter()

for run in range(10):
    for idx, row in df.iterrows():
        col.eval(row).lower()

elapsed = time.perf_counter() - t_begin

print('Runtime {} s'.format(elapsed))

Runtime 7.897803692962043 s


## Set Lookup vs. List Lookup vs. Dictionary Lookup

In [5]:
from openclean.function.distinct import distinct

# Distinct values of the 'Locality' column. The result is used as a mapping
# (it exposes .keys() and supports `in`); the lookup benchmark labels it
# "Counter/dict". Presumably it maps each value to its frequency -- confirm
# against the openclean documentation.
vocabulary = distinct(df, 'Locality')

In [6]:
# Probe values for the lookup benchmarks: the union of all distinct
# localities and all distinct health districts.
values = set(vocabulary.keys()) | set(distinct(df, 'VDH Health District').keys())

In [7]:
import time

# Membership tests against the object returned by distinct() (labelled
# Counter/dict in the output): 100000 passes over all probe values, counting
# how many of them are found in each pass.
t_begin = time.perf_counter()

for run in range(100000):
    found = 0
    for v in values:
        if v in vocabulary:
            found += 1

elapsed = time.perf_counter() - t_begin

print('Runtime  for Counter/dict {} s'.format(elapsed))

Runtime  for Counter/dict 1.8382479379652068 s


In [8]:
# Membership tests against a plain list of the vocabulary keys.
# The list is built BEFORE the clock starts so that only the lookups are
# timed, matching the Counter/dict benchmark where the container already
# exists when timing begins.
vocablist = list(vocabulary.keys())

start = time.perf_counter()

for i in range(100000):
    found = 0
    for v in values:
        # `in` on a list is a linear scan over its elements.
        if v in vocablist:
            found += 1

end = time.perf_counter()

print('Runtime  for list {} s'.format(end - start))

Runtime  for list 14.772881913930178 s


In [9]:
# Membership tests against a set of the vocabulary keys.
# The set is built BEFORE the clock starts so that only the lookups are
# timed, matching the Counter/dict benchmark where the container already
# exists when timing begins.
vocabset = set(vocabulary.keys())

start = time.perf_counter()

for i in range(100000):
    found = 0
    for v in values:
        # `in` on a set is a hash lookup.
        if v in vocabset:
            found += 1

end = time.perf_counter()

print('Runtime  for set {} s'.format(end - start))

Runtime  for set 1.3019300369778648 s
