In [1]:

import os
import datetime
import sqlite3

from google.cloud import bigquery

import data_algebra
import data_algebra.test_util
from data_algebra.data_ops import *
import data_algebra.BigQuery
import data_algebra.SQLite

import pytest


In [2]:
# set up big query client

# Honor a GOOGLE_APPLICATION_CREDENTIALS value that is already exported in the
# environment; fall back to the original developer-local key file only when the
# variable is unset, so running the notebook does not clobber another setup.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/johnmount/big_query/big_query_jm.json",
)
bq_client = bigquery.Client()

bq_handle = data_algebra.BigQuery.BigQueryModel().db_handle(bq_client)
# the tests below build fully qualified table names as catalog.schema.table
data_catalog = 'data-algebra-test'
data_schema = 'test_1'

# set up sqlite client
conn_sqlite = sqlite3.connect(":memory:")
db_model_sqlite = data_algebra.SQLite.SQLiteModel()
db_model_sqlite.prepare_connection(conn_sqlite)
db_handle_sqlite = db_model_sqlite.db_handle(conn_sqlite)

# handles used by the tests that are checked on both databases
db_handles = [bq_handle, db_handle_sqlite]


In [3]:
# test prefixing table ops
# A second BigQuery handle is built with table_prefix set to "catalog.schema",
# so the short table name used below is rendered fully qualified in the SQL.

bq_handle_prefixed = data_algebra.BigQuery.BigQueryModel(
    table_prefix=f'{data_catalog}.{data_schema}'
    ).db_handle(bq_client)  # don't close, sharing connection

# small example frame to upload
d = data_algebra.default_data_model.pd.DataFrame({
    'group': ['a', 'a', 'b', 'b'],
    'val': [1, 2, 3, 4],
})
table_name = 'pytest_temp_z'  # short name; no catalog/schema given here

# upload the frame (replacing any existing table) and keep the returned description
descr = bq_handle_prefixed.insert_table(d, table_name=table_name, allow_overwrite=True)

# remove the scratch table right away; the description is still used for SQL rendering below
bq_handle_prefixed.drop_table(table_name)

# show the generated SQL for the description
print(bq_handle_prefixed.to_sql(descr, pretty=True))

SELECT `group`,
       `val`
FROM `data-algebra-test.test_1.pytest_temp_z`


In [4]:
def test_bigquery_1():
    """Grouped median on BigQuery: a window median followed by a collapsing mean."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # Pattern BigQuery needs for a grouped median: compute the median as a
    # window function, then collapse each group with a pseudo-aggregation.
    # NOTE: SQLite doesn't allow median as a window function, so it can not
    # run this query and is not among the handles checked here.
    windowed = describe_table(source, table_name=qualified_name).extend(
        {'med_val': 'median(val)'},
        partition_by=['group'],
    )
    ops = windowed.project(
        {'med_val': 'mean(med_val)'},  # pseudo-aggregator
        group_by=['group'],
    )

    expect = pd.DataFrame({
        'group': ['a', 'b'],
        'med_val': [1.5, 3.5],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_bigquery_1()


def test_bigquery_2():
    """Two window medians plus ordinary aggregates in one grouped projection on BigQuery."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'a', 'b', 'b'],
        'v1': [1, 2, 2, 0, 0],
        'v2': [1, 2, 3, 4, 5],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # Pattern BigQuery needs to compute a median: window function first, then
    # a pseudo-aggregation to collapse each group to a single row.
    # Background reading on BigQuery window-function limitations:
    #  https://iamhectorotero.github.io/median-and-group-by/
    #  https://chartio.com/resources/tutorials/how-countdistinct-field-works-in-google-bigquery/
    window_medians = {
        'med_1': 'v1.median()',  # median is only a window fn in Big Query
        'med_2': 'v2.median()',  # median is only a window fn in Big Query
    }
    group_summaries = {
        'med_1': 'med_1.mean()',  # pseudo aggregator
        'med_2': 'med_2.mean()',  # pseudo aggregator
        'mean_1': 'v1.mean()',
        'mean_2': 'v2.mean()',
        'nu_1': 'v1.nunique()',
        'nu_2': 'v2.nunique()',
    }
    ops = (
        describe_table(source, table_name=qualified_name)
        .extend(window_medians, partition_by=['group'])
        .project(group_summaries, group_by=['group'])
    )

    expect = pd.DataFrame({
        'group': ['a', 'b'],
        'med_1': [2, 0],
        'med_2': [2.0, 4.5],
        'mean_1': [1.66666666667, 0.0],
        'mean_2': [2.0, 4.5],
        'nu_1': [2, 1],
        'nu_2': [3, 2],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_bigquery_2()


def test_bigquery_insert_raise():
    """Inserting over an existing BigQuery table with allow_overwrite=False must raise ValueError.

    The scratch table is removed in a ``finally`` clause so that a failed
    expectation (or a failed insert) does not leave it behind in the test schema.
    """
    d = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'a', 'a', 'b', 'b'],
        'v1': [1, 2, 2, 0, 0],
        'v2': [1, 2, 3, 4, 5]})
    table_name = f'{data_catalog}.{data_schema}.pytest_temp_dx'
    # start from a clean slate in case an earlier run left the table in place
    bq_handle.drop_table(table_name)
    try:
        bq_handle.insert_table(d, table_name=table_name, allow_overwrite=True)
        # a second insert without overwrite permission is expected to be refused
        with pytest.raises(ValueError):
            bq_handle.insert_table(d, table_name=table_name, allow_overwrite=False)
    finally:
        # always clean up, even when the expectation above is not met
        bq_handle.drop_table(table_name)


test_bigquery_insert_raise()


def test_bigquery_date_1():
    """Date extraction and string trimming followed by per-group window statistics on BigQuery."""
    pd = data_algebra.default_data_model.pd
    epoch_seconds = [1490195805, 1490195815, 1490295805, 1490196805, 1490195835]
    source = pd.DataFrame({
        'group': ['a', 'a', 'a', 'b', 'b'],
        'v1': [1, 2, 2, 0, 0],
        'v2': [1, 2, 3, 4, 5],
        'dt': pd.to_datetime(epoch_seconds, unit='s')
    })
    source['dt_str'] = source['dt'].astype(str)

    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    derived_columns = {
        'date': bq_handle.fns.datetime_to_date('dt'),
        'date_str': bq_handle.fns.trimstr('dt_str', start=0, stop=10),
    }
    group_stats = {
        'mean_v1': 'v1.mean()',
        'count': '_size()',
    }
    ops = (
        table
        .extend(derived_columns)
        .extend(group_stats, partition_by='group')
        .drop_columns(['dt'])  # date will come back with UTC and other alterations
    )

    # expected result is derived from the input frame itself
    expect = source.copy()
    expect['date'] = expect['dt'].dt.date.copy()
    expect['date_str'] = expect['dt_str'].str.slice(start=0, stop=10)
    expect['mean_v1'] = [5/3, 5/3, 5/3, 0, 0]
    expect['count'] = [3, 3, 3, 2, 2]
    expect = expect.drop(columns=['dt'])

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_bigquery_date_1()


def test_big_query_and():
    """Check that '&' in a row filter is rendered as SQL AND and selects the expected rows.

    The rendered BigQuery SQL is inspected as text, then the pipeline is
    checked on every handle in ``db_handles``.
    """
    d = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'a', 'a', 'b', 'b'],
        'v1': [1, 2, 2, 0, 2],
        'v2': [1, 2, 3, 4, 5],
    })
    table_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # build a description that looks like the BigQuery db handle built it.
    ops = describe_table(d, table_name=table_name) .\
        select_rows("(group == 'a') & (v1 == 2)")

    # see & gets translated to AND
    # Membership tests are used rather than str.find() comparisons: find()
    # returns 0 for a match at the start of the string, which "> 0" would
    # misreport as "not found".
    sql = bq_handle.to_sql(ops)
    assert '&' not in sql
    assert 'AND' in sql

    expect = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'a'],
        'v1': [2, 2],
        'v2': [2, 3],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=d,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_big_query_and()


def test_big_query_notor():
    """Check that 'not (... or ...)' in a row filter is rendered with SQL OR and selects the expected rows.

    The rendered BigQuery SQL is inspected as text, then the pipeline is
    checked on every handle in ``db_handles``.
    """
    d = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'a', 'a', 'b', 'b'],
        'v1': [1, 2, 2, 0, 2],
        'v2': [1, 2, 3, 4, 5],
    })
    table_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # build a description that looks like the BigQuery db handle built it.
    ops = describe_table(d, table_name=table_name) .\
        select_rows("not ((group == 'a') or (v1 == 2))")

    # see "or" gets translated to OR, and no "|" appears in the SQL
    # Membership tests are used rather than str.find() comparisons: find()
    # returns 0 for a match at the start of the string, which "> 0" would
    # misreport as "not found".
    sql = bq_handle.to_sql(ops)
    assert '|' not in sql
    assert 'OR' in sql

    expect = data_algebra.default_data_model.pd.DataFrame({
        'group': ['b'],
        'v1': [0],
        'v2': [4],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=d,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_big_query_notor()


def test_TRIMSTR():
    """trimstr with start=0, stop=5 keeps the first five characters of a string column."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': ['0123456', 'abcdefghijk'],
        'y': ['012345', 'abcdefghij'],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    ops = table.extend({
        'nx': bq_handle.fns.trimstr('x', start=0, stop=5),
    })

    expect = pd.DataFrame({
        'x': ['0123456', 'abcdefghijk'],
        'y': ['012345', 'abcdefghij'],
        'nx': ['01234', 'abcde'],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_TRIMSTR()


def test_AS_INT64():
    """as_int64 turns digit strings (including one with a leading zero) into integers."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': ['0123456', '66'],
        'y': ['012345', '77'],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    ops = table.extend({
        'nx': bq_handle.fns.as_int64('x'),
    })

    expect = pd.DataFrame({
        'x': ['0123456', '66'],
        'y': ['012345', '77'],
        'nx': [123456, 66]
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_AS_INT64()


def test_DATE():
    """datetime_to_date followed by as_str yields ISO-style date strings."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': pd.to_datetime([1490196805, 1490195835], unit='s'),
        'y': ['012345', '77'],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # datetime -> date -> string, keeping only the string column
    table = describe_table(source, table_name=qualified_name)
    with_date = table.extend({
        'nx': bq_handle.fns.datetime_to_date('x'),
    })
    with_text = with_date.extend({
        'nxs': bq_handle.fns.as_str('nx'),
    })
    ops = with_text.select_columns(['nxs'])

    expect = pd.DataFrame({
        'nxs': ['2017-03-22', '2017-03-22']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_DATE()


def test_COALESCE_0():
    """coalesce_0 replaces missing values in a numeric column with 0."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': [1, None, 3]
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    ops = table.extend({
        'nx': bq_handle.fns.coalesce_0('x'),
    })

    expect = pd.DataFrame({
        'x': [1, None, 3],
        'nx': [1, 0, 3]
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_COALESCE_0()

def test_PARSE_DATE():
    """parse_date converts date strings; checked locally first, then on BigQuery."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': ['2001-01-01', '2020-04-02']
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    ops = table.extend({
        'nx': bq_handle.fns.parse_date('x'),
    })

    # local (in-memory) evaluation should already produce date objects
    local_result = ops.transform(source)
    assert isinstance(local_result['nx'][0], datetime.date)

    expect = pd.DataFrame({
        'x': ['2001-01-01', '2020-04-02']
    })
    expect['nx'] = pd.to_datetime(source['x'], format="%Y-%m-%d")
    assert data_algebra.test_util.equivalent_frames(local_result, expect)

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_PARSE_DATE()


def test_DATE_PARTS():
    """Parse date/datetime strings, then extract calendar parts, differences and formatted text."""
    pd = data_algebra.default_data_model.pd
    date_fmt = "%Y-%m-%d"
    datetime_fmt = "%Y-%m-%d %H:%M:%S"
    source = pd.DataFrame({
        'x': ['2001-01-01', '2020-04-02'],
        't': ['2001-01-01 01:33:22', '2020-04-02 13:11:10'],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # step 1: parse the text columns; step 2: derive everything else from them
    table = describe_table(source, table_name=qualified_name)
    parsed = table.extend({
        'nx': bq_handle.fns.parse_date('x', format=date_fmt),
        'nt': bq_handle.fns.parse_datetime('t', format=datetime_fmt),
        'nd': bq_handle.fns.parse_datetime('x', format=date_fmt),
    })
    ops = parsed.extend({
        'date2': bq_handle.fns.datetime_to_date('nt'),
        'day_of_week': bq_handle.fns.dayofweek('nx'),
        'day_of_year': bq_handle.fns.dayofyear('nx'),
        'month': bq_handle.fns.month('nx'),
        'day_of_month': bq_handle.fns.dayofmonth('nx'),
        'quarter': bq_handle.fns.quarter('nx'),
        'year': bq_handle.fns.year('nx'),
        'diff': bq_handle.fns.timestamp_diff('nt', 'nd'),
        'sdt': bq_handle.fns.format_datetime('nt', format=datetime_fmt),
        'sd': bq_handle.fns.format_date('nx', format=date_fmt),
        'dd': bq_handle.fns.date_diff('nx', 'nx'),
    })

    # spot-check result types from local (in-memory) evaluation
    local_result = ops.transform(source)
    assert isinstance(local_result['nx'][0], datetime.date)
    assert isinstance(local_result['sdt'][0], str)
    assert isinstance(local_result['sd'][0], str)

    expect = pd.DataFrame({
        'x': ['2001-01-01', '2020-04-02'],
        't': ['2001-01-01 01:33:22', '2020-04-02 13:11:10'],
        'day_of_week': [2, 5],
        'day_of_year': [1, 93],
        'month': [1, 4],
        'day_of_month': [1, 2],
        'quarter': [1, 2],
        'year': [2001, 2020],
        'dd': [0, 0],
    })
    expect['nx'] = pd.to_datetime(expect['x'], format=date_fmt).dt.date.copy()
    expect['nt'] = pd.to_datetime(expect['t'], format=datetime_fmt)
    expect['nd'] = pd.to_datetime(expect['x'], format=date_fmt)
    expect['date2'] = expect['nt'].dt.date.copy()
    # difference between the parsed datetime and the parsed date, in seconds
    expect['diff'] = [
        pd.Timedelta(stamp - day_start).total_seconds()
        for stamp, day_start in zip(expect['nt'], expect['nd'])
    ]
    # formatting the parsed values should reproduce the original text
    expect['sdt'] = expect['t']
    expect['sd'] = expect['x']

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_DATE_PARTS()


def test_coalesce():
    """coalesce over four columns picks the first non-missing value in each row."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'a': [1, None, None, None, None, 6, 7, None],
        'b': [10, 20, None, None, None, 60, None, None],
        'c': [None, 200, 300, None, 500, 600, 700, None],
        'd': [1000, None, 3000, 4000, None, 6000, None, None],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    table = describe_table(source, table_name=qualified_name)
    ops = table.extend({
        'fixed': bq_handle.fns.coalesce(['a', 'b', 'c', 'd']),
    })

    # the last row is missing everywhere, so it stays missing
    expect = pd.DataFrame({
        'a': [1, None, None, None, None, 6, 7, None],
        'b': [10, 20, None, None, None, 60, None, None],
        'c': [None, 200, 300, None, 500, 600, 700, None],
        'd': [1000, None, 3000, 4000, None, 6000, None, None],
        'fixed': [1, 20, 300, 4000, 500, 6, 7, None],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )

test_coalesce()


def test_base_Sunday():
    """base_Sunday maps 2021-04-25 and 2021-04-27 both to 2021-04-25."""
    pd = data_algebra.default_data_model.pd
    date_fmt = "%Y-%m-%d"
    source = pd.DataFrame({
        'date_str': ['2021-04-25', '2021-04-27']
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # parse -> base_Sunday -> drop the helper column -> format back to text
    table = describe_table(source, table_name=qualified_name)
    with_date = table.extend({
        'dt': bq_handle.fns.parse_date('date_str', format=date_fmt),
    })
    with_sunday = with_date.extend({
        's': bq_handle.fns.base_Sunday('dt'),
    })
    without_helper = with_sunday.drop_columns(['dt'])
    ops = without_helper.extend({
        's': bq_handle.fns.format_date('s', format=date_fmt),
    })

    expect = pd.DataFrame({
        'date_str': ['2021-04-25', '2021-04-27'],
        's': ['2021-04-25', '2021-04-25']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_base_Sunday()


def test_bq_concat_rows():
    """concat_rows stacks a modified copy of a table on top of the original table."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'd': [1, 2]
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    incremented = describe_table(source, table_name=qualified_name).extend({'d': 'd + 1'})
    ops = incremented.concat_rows(
        b=describe_table(source, table_name=qualified_name),
    )

    # rows from the pipeline side are tagged 'a', rows from the b= side 'b'
    expect = pd.DataFrame({
        'd': [2, 3, 1, 2],
        'source_name': ['a', 'a', 'b', 'b']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=[bq_handle],
        check_parse=False,
    )


test_bq_concat_rows()


def test_bq_join_rows():
    """Inner natural_join of two keyed tables, checked on every configured handle."""
    pd = data_algebra.default_data_model.pd
    left = pd.DataFrame({
        'k': ['a', 'b'],
        'd': [1, 2]
    })
    left_name = f'{data_catalog}.{data_schema}.pytest_temp_d1'
    right = pd.DataFrame({
        'k': ['a', 'b'],
        'e': [4, 5]
    })
    right_name = f'{data_catalog}.{data_schema}.pytest_temp_d2'

    left_incremented = describe_table(left, table_name=left_name).extend({'d': 'd + 1'})
    ops = left_incremented.natural_join(
        b=describe_table(right, table_name=right_name),
        by=['k'],
        jointype='inner',
    )

    expect = pd.DataFrame({
        'k': ['a', 'b'],
        'd': [2, 3],
        'e': [4, 5],
    })

    # two source tables, so the data is passed as a name -> frame mapping
    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data={left_name: left, right_name: right},
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_bq_join_rows()



In [5]:

def test_ideom_extend_one_count():
    """Row count idiom: add a constant column of ones, then sum it."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    ops = (
        describe_table(source, table_name=qualified_name)
        .extend({'one': 1})
        .project({'count': 'one.sum()'})
    )

    expect = pd.DataFrame({
        'count': [4]
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_extend_one_count()


def test_ideom_extend_special_count():
    """Row count idiom using the zero-argument '_count()' aggregator in a projection."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    ops = (
        describe_table(source, table_name=qualified_name)
        .project({'count': '_count()'})
    )

    expect = pd.DataFrame({
        'count': [4]
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_extend_special_count()


# previously forbidden
def test_ideom_forbidden_extend_test_trinary():
    """if_else applied directly to a comparison expression in a single extend step."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # single-step form: the comparison and the if_else live in one expression
    ops = (
        describe_table(source, table_name=qualified_name)
        .extend({'select': '(val > 2.5).if_else("high", "low")'})
    )

    expect = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
        'select': ['low', 'low', 'high', 'high']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_forbidden_extend_test_trinary()


def test_ideom_extend_test_trinary():
    """if_else in two extend steps: materialize the comparison, then branch on it."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # Two-step form. The one-step spelling
    # {'select': '(val > 2.5).if_else("high", "low")' } was noted as not
    # working in Pandas, so the comparison is stored in a column first.
    ops = (
        describe_table(source, table_name=qualified_name)
        .extend({'select': '(val > 2.5)'})
        .extend({'select': 'select.if_else("high", "low")'})
    )

    expect = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b'],
        'val': [1, 2, 3, 4],
        'select': ['low', 'low', 'high', 'high']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_extend_test_trinary()


def test_ideom_simulate_cross_join():
    """Emulate a cross join by left-joining on a constant key, then dropping the key."""
    pd = data_algebra.default_data_model.pd
    left = pd.DataFrame({
        'x': [1, 2, 3, 4],
    })
    left_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    right = pd.DataFrame({
        'y': ['a', 'b', 'c'],
    })
    right_name = f'{data_catalog}.{data_schema}.pytest_temp_e'

    # give both sides the same constant column so every row pairs with every row
    left_keyed = describe_table(left, table_name=left_name).extend({'one': 1})
    right_keyed = describe_table(right, table_name=right_name).extend({'one': 1})
    ops = (
        left_keyed
        .natural_join(b=right_keyed, by=['one'], jointype='left')
        .drop_columns(['one'])
    )

    expect = pd.DataFrame({
        'x': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        'y': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data={left_name: left, right_name: right},
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_simulate_cross_join()


def test_ideom_simulate_cross_join_select():
    """Emulated cross join on a constant key, keeping the payload columns via select_columns."""
    pd = data_algebra.default_data_model.pd
    left = pd.DataFrame({
        'x': [1, 2, 3, 4],
    })
    left_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    right = pd.DataFrame({
        'y': ['a', 'b', 'c'],
    })
    right_name = f'{data_catalog}.{data_schema}.pytest_temp_e'

    # constant join key on both sides; here the key is removed by selecting
    # the wanted columns rather than by dropping the key column
    left_keyed = describe_table(left, table_name=left_name).extend({'one': 1})
    right_keyed = describe_table(right, table_name=right_name).extend({'one': 1})
    ops = (
        left_keyed
        .natural_join(b=right_keyed, by=['one'], jointype='left')
        .select_columns(['x', 'y'])
    )

    expect = pd.DataFrame({
        'x': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        'y': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data={left_name: left, right_name: right},
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_simulate_cross_join_select()


def test_ideom_cross_join():
    """Direct cross join: natural_join with an empty key list and jointype='cross'."""
    pd = data_algebra.default_data_model.pd
    left = pd.DataFrame({
        'x': [1, 2, 3, 4],
    })
    left_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    right = pd.DataFrame({
        'y': ['a', 'b', 'c'],
    })
    right_name = f'{data_catalog}.{data_schema}.pytest_temp_e'

    left_table = describe_table(left, table_name=left_name)
    ops = left_table.natural_join(
        b=describe_table(right, table_name=right_name),
        by=[],
        jointype='cross',
    )

    # every x paired with every y
    expect = pd.DataFrame({
        'x': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        'y': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data={left_name: left, right_name: right},
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_cross_join()


# Note: switching from _row_number to _count
def test_ideom_row_number():
    """Per-group row numbering via a cumulative sum over a constant column of ones."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'i': [1, 3, 2, 4, 5],
        'g': [1, 2, 2, 1, 1],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    with_ones = describe_table(source, table_name=qualified_name).extend({'one': 1})
    # running total of the ones, within each g and ordered by i
    numbered = with_ones.extend(
        {'n': 'one.cumsum()'},
        partition_by=['g'],
        order_by=['i'],
    )
    ops = numbered.drop_columns(['one']).order_rows(['i'])

    expect = pd.DataFrame({
        'i': [1, 2, 3, 4, 5],
        'g': [1, 2, 2, 1, 1],
        'n': [1, 1, 2, 2, 3],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_row_number()


def test_ideom_sum_cumsum():
    """sum/cumsum windowing: two misuses must raise ValueError, the valid forms give group totals."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'i': [1, 2, 3, 4, 5],
        'o': [1, 1, 1, 1, 1],
        'g': [1, 2, 2, 1, 1],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    # sum() together with an order_by is expected to be refused
    with pytest.raises(ValueError):
        describe_table(source, table_name=qualified_name).extend(
            {'s2': 'o.sum()'},
            partition_by=['g'],
            order_by=['i'],
        )

    # cumsum() without an order_by is expected to be refused
    with pytest.raises(ValueError):
        describe_table(source, table_name=qualified_name).extend(
            {'s2': 'o.cumsum()'},
            partition_by=['g'],
        )

    running = describe_table(source, table_name=qualified_name).extend(
        {'s': '(1).cumsum()'},
        partition_by=['g'],
        order_by=['i'],
    )
    totals = running.extend(
        {
            'n': 's.max()',  # max over cumsum to get sum!
            'n2': '(1).sum()',  # no order present, so meaning is non-cumulative.
        },
        partition_by=['g'],
    )
    ops = totals.order_rows(['i'])

    expect = pd.DataFrame({
        'i':  [1, 2, 3, 4, 5],
        'o':  [1, 1, 1, 1, 1],
        'g':  [1, 2, 2, 1, 1],
        'n':  [3, 2, 2, 3, 3],
        'n2': [3, 2, 2, 3, 3],
        's':  [1, 1, 2, 2, 3],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_sum_cumsum()


def test_ideom_project_sum():
    """Group sizes via '(1).sum()' in a grouped projection, ordered by the group key."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'i': [1, 2, 3, 4, 5],
        'g': [1, 2, 2, 1, 1],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    ops = (
        describe_table(source, table_name=qualified_name)
        .project({'s': '(1).sum()'}, group_by=['g'])
        .order_rows(['g'])
    )

    expect = pd.DataFrame({
        'g':  [1, 2],
        's':  [3, 2],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_project_sum()


def test_ideom_concat_op():
    """The '%+%' operator concatenates string columns."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': ['a', 'b', 'c'],
        'y': ['1', '2', '3'],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    ops = (
        describe_table(source, table_name=qualified_name)
        .extend({'z': 'x %+% y %+% + x'})
    )

    expect = pd.DataFrame({
        'x': ['a', 'b', 'c'],
        'y': ['1', '2', '3'],
        'z': ['a1a', 'b2b', 'c3c']
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_concat_op()


def test_ideom_coalesce_op():
    """The '%?%' operator takes the left value when present, otherwise the right one."""
    pd = data_algebra.default_data_model.pd
    source = pd.DataFrame({
        'x': ['a', 'b', None, None],
        'y': ['1', None, '3', None],
    })
    qualified_name = f'{data_catalog}.{data_schema}.pytest_temp_d'

    ops = (
        describe_table(source, table_name=qualified_name)
        .extend({'z': 'x %?% y'})
    )

    # the last row is missing on both sides, so the result stays missing
    expect = pd.DataFrame({
        'x': ['a', 'b', None, None],
        'y': ['1', None, '3', None],
        'z': ['a', 'b', '3', None],
    })

    data_algebra.test_util.check_transform_on_handles(
        ops=ops,
        data=source,
        expect=expect,
        db_handles=db_handles,
        check_parse=False,
    )


test_ideom_coalesce_op()



In [6]:
# clean up: close the BigQuery and SQLite handles collected in db_handles
for open_handle in db_handles:
    open_handle.close()

In [7]:
# NOTE(review): trivial expression unrelated to the tests above; presumably a
# check that the kernel still responds after cleanup — confirm or remove.
1 + 1

2