In [1]:
import pandas as pd
import numpy as np
import sys
import timeit
import altair as alt

In [2]:
# Expose the parent directory to the import system (presumably where the local
# `dfs` / `SE` packages live -- confirm). The membership guard keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
import dfs
from SE.qa.dataframe import validate_df

In [4]:
def _sample(length)->pd.DataFrame:
    d = {'x': np.random.random(length),
         'y': np.random.randint(0, 1000, size=length),
         'z': [chr(i) for i in np.random.randint(ord('a'), ord('z') + 1, length)]
    }
    return pd.DataFrame(d)

In [5]:
# Schema dictionary (declares protocol_version 1.0) handed to the validators
# benchmarked in this notebook. It describes the three columns produced by
# `_sample`; the numeric limits are presumably upper bounds -- confirm against
# the validator's documentation.
schema_p1 = dict(
    metadata=dict(protocol_version=1.0),
    strict_cols=True,
    columns=dict(
        x=dict(dtype="float", max_value=1.2),
        y=dict(dtype="int", max_value=1200),
        z=dict(dtype='string', na_limit=0.1),
    ),
)

In [6]:
# Row counts to benchmark: 10 thousand, then 1, 5, 10, 25 and 50 million.
sample_ns = (10_000,) + tuple(millions * 1_000_000 for millions in (1, 5, 10, 25, 50))

In [7]:
# Setup snippet for `timeit`: it re-creates the schema dict, the DfSchema object
# `S`, its dict form `schema_p2`, and the `_sample` factory for the timed
# statements. The string ends with `sample = _sample` (no call) on purpose:
# the benchmark loop appends "(<n_rows>)" so the final line becomes
# `sample = _sample(<n_rows>)`.
# NOTE(review): this snippet imports `dataframe_schema as dfs`, while the
# notebook's own import cell uses `import dfs` -- confirm which module name is
# current so both refer to the same library.
setup = '''
import pandas as pd
import numpy as np

import dataframe_schema as dfs
from SE.qa.dataframe import validate_df

schema_p1 = {
    "metadata": {"protocol_version":1.0},
    "strict_cols":True,
    "columns": {
        "x": {"dtype":"float", "max_value": 1.2},
        "y": {"dtype": "int", "max_value":1200},
        "z": {"dtype":'string', "na_limit": 0.1}
    }
}

S = dfs.DfSchema.from_dict(schema_p1)
schema_p2 = S.dict()

def _sample(length)->pd.DataFrame:
    d = {'x': np.random.random(length),
         'y': np.random.randint(0, 1000, size=length),
         'z': [chr(i) for i in np.random.randint(ord('a'), ord('z') + 1, length)]
    }
    return pd.DataFrame(d)

sample = _sample'''

In [8]:
# Statements to time, keyed by a short label used as the column name in the
# results table. Each statement is source text evaluated by `timeit`, so the
# names it uses (`sample`, `schema_p1`, `schema_p2`, `S`, ...) must be created
# by the setup snippet.
tests = dict(
    SE='validate_df(sample, schema_p1)',
    dfs_s_p1='dfs.validate_df(sample, schema_p1, summary=True)',
    dfs_f_p1='dfs.validate_df(sample, schema_p1, summary=False)',
    dfs_s_p2='dfs.validate_df(sample, schema_p2, summary=True)',
    dfs_f_p2='dfs.validate_df(sample, schema_p2, summary=False)',
    dfs_S_s='S.validate_df(sample, summary=True)',
    dfs_S_f='S.validate_df(sample, summary=False)',
)

In [9]:
# Time every statement in `tests` at every row count in `sample_ns`.
# Layout of the output: results[label][n_rows] -> seconds.
results = {}
for k, test in tests.items():
    print(k)  # progress marker: one line per benchmarked statement
    results[k] = {}

    # Fix: iterate `sample_ns`, the tuple of row counts defined earlier in the
    # notebook. The previous name `samples_ns` is not defined in any visible
    # cell and raises NameError on a fresh kernel.
    for s in sample_ns:
        # `setup` ends with "sample = _sample"; appending "(<n>)" turns that
        # last line into a call that builds the frame once, outside the timing.
        setup_ = setup + f'({s})'
        # timeit.timeit returns the TOTAL time in seconds for all `number`
        # executions (here 4), not the per-run average.
        mmnts = timeit.timeit(stmt=test, setup=setup_, number=4)
        results[k][s] = mmnts


SE
dfs_s_p1
dfs_f_p1
dfs_s_p2
dfs_f_p2
dfs_S_s
dfs_S_f


In [11]:
# Tabulate the nested timing dict: outer keys (method labels) become columns,
# inner keys (row counts) become the index.
df = pd.DataFrame(data=results)
df

Unnamed: 0,SE,dfs_s_p1,dfs_f_p1,dfs_s_p2,dfs_f_p2,dfs_S_s,dfs_S_f
10000,0.005895,0.004896,0.005577,0.006282,0.004488,0.003945,0.003983
1000000,0.174699,0.174635,0.173829,0.198207,0.166253,0.181899,0.173226
5000000,0.823992,0.811855,0.773859,0.765175,0.782001,1.07326,0.777115
10000000,1.658748,1.648245,1.665948,1.554624,1.630465,1.508329,1.726881
25000000,4.233288,3.951165,3.679488,4.556807,4.664113,4.299214,4.309913
50000000,8.109915,7.355722,7.614542,7.208424,8.166298,8.384956,8.289013


In [13]:
# Reshape the wide timing table (index = sample size, columns = method) into
# long form: one row per (sample size, method) pair, which is the layout the
# chart encodings below expect.
dfshort = df.stack().reset_index(drop=False)
dfshort.columns = ['sample_size', 'method', 'value']

# One line per method. Axis and legend titles are set explicitly so the figure
# is readable on its own; `value` is the number returned by timeit.timeit in
# the benchmark loop, i.e. the total seconds for its 4 executions.
alt.Chart(dfshort, title='validate_df timing by sample size').mark_line(point=True).encode(
    x=alt.X('sample_size:Q', title='Sample size (rows)'),
    y=alt.Y('value:Q', title='Total time for 4 runs (s)'),
    color=alt.Color('method:N', title='Method'),
    tooltip=['method:N', 'sample_size:Q', 'value:Q']
).interactive()

In [7]:
%%timeit
validate_df(samples[100], schema_p1)

614 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# Build a DfSchema object from the protocol-1 dict.
# NOTE(review): no cell visible in this notebook reads `schema` afterwards --
# confirm whether it is still needed.
schema = dfs.DfSchema.from_dict(schema_p1)

In [11]:
%%timeit
dfs.validate_df(samples[100], schema_p1, summary=True)

570 µs ± 23.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
%%timeit
dfs.validate_df(samples[100], schema_p1, summary=False)

559 µs ± 23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
