## A catalog of [data algebra expression](https://github.com/WinVector/data_algebra) methods.


In [1]:
import datetime
import os
import warnings
import numpy as np
import pandas as pd
import pickle
import gzip

from data_algebra.data_ops import *
import data_algebra.SQLite
import data_algebra.BigQuery
import data_algebra.PostgreSQL
import data_algebra.SparkSQL
import data_algebra.MySQL
from data_algebra.parse_by_lark import parse_by_lark
import data_algebra.test_util
import data_algebra.util


def mk_example():
    """
    Build the small example data frame used by every cell in this notebook.

    The frame has four rows and fifteen columns: an integer row id, boolean,
    integer, float (one column with missing values), and string columns, plus
    date/datetime values stored both as strings and as parsed values.

    :return: pandas.DataFrame with the example data
    """
    datetime_format = "%Y-%m-%d %H:%M:%S"
    date_format = "%Y-%m-%d"
    d = pd.DataFrame({
        'row_id': [0, 1, 2, 3],
        'a': [False, False, True, True],
        'b': [False, True, False, True],
        'q': [1, 1, 2, 2],
        'x': [.1, .2, .3, .4],
        'y': [2.4, 1.33, 1.2, 1.1],
        # use the explicitly imported `np` alias; a bare `numpy` name is only
        # available if some star import happens to leak it
        'z': [1.6, None, -2.1, np.nan],
        'g': ['a', 'a', 'b', 'ccc'],
        's2': ['z', 'q', '11', 'b'],
        "str_datetime_col": ["2000-01-01 12:13:21", "2020-04-05 14:03:00", "2000-01-01 12:13:21", "2020-04-05 14:03:00"],
        "str_date_col": ["2000-03-01", "2020-04-05", "2000-03-01", "2020-04-05"],
        "datetime_col_0": pd.to_datetime(
            pd.Series(["2010-01-01 12:13:21", "2030-04-05 14:03:00", "2010-01-01 12:13:21", "2030-04-05 14:03:00"]),
            format=datetime_format,
        ),
        # these strings carry a time component, so they need the datetime format;
        # the date-only format does not describe them and is rejected by strict parsing
        "datetime_col_1": pd.to_datetime(
            pd.Series(["2010-01-01 12:11:21", "2030-04-06 14:03:00", "2010-01-01 12:11:21", "2030-04-06 14:03:00"]),
            format=datetime_format,
        ),
        "date_col_0": pd.to_datetime(
            pd.Series(["2000-01-02", "2035-04-05", "2000-01-02", "2035-04-05"]),
            format=date_format
        ).dt.date,
        "date_col_1": pd.to_datetime(
            pd.Series(["2000-01-02", "2035-05-05", "2000-01-02", "2035-05-05"]),
            format=date_format
        ).dt.date,
    })
    return d


def f(expression):
    """
    Build the plain (no partition, no ordering) extend pipeline for `expression`.

    Starts from a description of the notebook-level frame `d`, adds the
    expression as 'new_column', keeps 'row_id' and 'new_column', and orders
    rows by 'row_id'.

    :param expression: expression to assign to 'new_column'
    :return: the composed operator pipeline
    """
    table = descr(d=d)
    extended = table.extend({'new_column': expression})
    narrowed = extended.select_columns(['row_id', 'new_column'])
    return narrowed.order_rows(['row_id'])


def fg(expression):
    """
    Build the grouped extend pipeline for `expression`.

    Same shape as a plain extend, but the extend step is partitioned by 'g';
    the result keeps 'g', 'row_id', 'new_column' and is ordered by 'g', 'row_id'.

    :param expression: expression to assign to 'new_column'
    :return: the composed operator pipeline
    """
    new_columns = {'new_column': expression}
    pipeline = descr(d=d).extend(new_columns, partition_by=['g'])
    pipeline = pipeline.select_columns(['g', 'row_id', 'new_column'])
    return pipeline.order_rows(['g', 'row_id'])


def fp(expression):
    """
    Build the project (aggregation) pipeline for `expression`.

    Projects the notebook-level frame `d` to 'new_column' grouped by 'g',
    then orders rows by 'g'.

    :param expression: expression to assign to 'new_column'
    :return: the composed operator pipeline
    """
    new_columns = {'new_column': expression}
    projected = descr(d=d).project(new_columns, group_by=['g'])
    return projected.order_rows(['g'])


def fw(expression):
    """
    Build the windowed extend pipeline for `expression`.

    The extend step is partitioned by 'g' and ordered by 'row_id'; the result
    keeps 'g', 'row_id', 'new_column' and is ordered by 'g', 'row_id'.

    :param expression: expression to assign to 'new_column'
    :return: the composed operator pipeline
    """
    new_columns = {'new_column': expression}
    windowed = descr(d=d).extend(
        new_columns,
        partition_by=['g'],
        order_by=['row_id'],
    )
    narrowed = windowed.select_columns(['g', 'row_id', 'new_column'])
    return narrowed.order_rows(['g', 'row_id'])


In [2]:
# Materialize the shared example frame; the pipeline builders f, fg, fp and fw
# all read this module-level `d`.
d = mk_example()

# Show the frame as the cell output.
d


Unnamed: 0,row_id,a,b,q,x,y,z,g,s2,str_datetime_col,str_date_col,datetime_col_0,datetime_col_1,date_col_0,date_col_1
0,0,False,False,1,0.1,2.4,1.6,a,z,2000-01-01 12:13:21,2000-03-01,2010-01-01 12:13:21,2010-01-01 12:11:21,2000-01-02,2000-01-02
1,1,False,True,1,0.2,1.33,,a,q,2020-04-05 14:03:00,2020-04-05,2030-04-05 14:03:00,2030-04-06 14:03:00,2035-04-05,2035-05-05
2,2,True,False,2,0.3,1.2,-2.1,b,11,2000-01-01 12:13:21,2000-03-01,2010-01-01 12:13:21,2010-01-01 12:11:21,2000-01-02,2000-01-02
3,3,True,True,2,0.4,1.1,,ccc,b,2020-04-05 14:03:00,2020-04-05,2030-04-05 14:03:00,2030-04-06 14:03:00,2035-04-05,2035-05-05


In [3]:
# Plain-dict copy of the column map of the description of `d`; handed to the parser below.
data_def = dict(descr(d=d).column_map().items())

def parse(exp):
    """
    Parse expression source text against the columns of the example frame.

    :param exp: expression source text
    :return: whatever parse_by_lark returns for `exp` with the notebook-level `data_def`
    """
    parsed = parse_by_lark(exp, data_def=data_def)
    return parsed


In [4]:
# Expressions exercised through f(): a plain extend with no partition_by or
# order_by, recorded with op_class 'e'.
expressions = [
    # arithmetic and comparison operators
    'x + y',
    'x - y',
    'row_id // q',
    'row_id % q',
    'x / y',
    'x * y',
    'x ** y',
    'x == y',
    'x > y',
    'x >= y',
    'x < y',
    'x <= y',
    'x != y',
    '-x',
    # boolean operators
    'not a',
    'a & b',
    'a and b',
    'a | b',
    'a or b',
    # numeric methods ('x.sum()' here also runs as an un-partitioned extend)
    'z.sign()',
    'x.sin()',
    'x.cos()',
    'x.arcsin()',
    'x.arccos()',
    'x.arctan()',
    'x.arctan2(y)',
    'x.sinh()',
    'x.cosh()',
    'x.tanh()',
    'x.arcsinh()',
    'x.arccosh()',
    'x.arctanh()',
    'y.floor()',
    'z.floor()',
    'y.ceil()',
    'z.ceil()',
    'x.sum()',
    'x.exp()',
    'y.expm1()',
    'x.log()',
    'x.log10()',
    'x.log1p()',
    'y.mod(0.5)',
    'y.remainder(0.5)',
    'x.sqrt()',
    'z.abs()',
    'row_id.maximum(x)',
    'row_id.minimum(x)',
    'row_id.fmax(x)',
    'row_id.fmin(x)',
    'y.round()',
    'y.around(2)',
    # tests on column z, the only numeric column with missing values
    'z.is_null()',
    'z.is_bad()',
    'z.count()',
    # conditionals, membership, strings, coalescing, mapping and casts
    'a.if_else(x, y)',
    'row_id.is_in({1, 3})',
    'g.concat(s2)',
    'g %+% "_" %+% s2',
    'z.coalesce(2)',
    'z %?% 2',
    'z.coalesce_0()',
    'g.mapv({"a": 1, "b": 2, "z": 26}, 0)',
    'y.as_int64()',
    'y.as_str()',
    'g.trimstr(0, 2)',
    # methods on the date / datetime columns (and their string forms)
    'datetime_col_0.datetime_to_date()',
    'str_date_col.parse_date()',
    'str_datetime_col.parse_datetime()',
    'datetime_col_0.format_datetime()',
    'date_col_0.format_date()',
    'date_col_0.dayofweek()',
    'date_col_0.dayofyear()',
    'date_col_0.dayofmonth()',
    'date_col_0.weekofyear()',
    'date_col_0.month()',
    'date_col_0.quarter()',
    'date_col_0.year()',
    'datetime_col_0.timestamp_diff(datetime_col_1)',
    'date_col_0.date_diff(date_col_1)',
    'date_col_1.base_Sunday()',
]

# Expressions exercised through fg(): an extend partitioned by 'g' (no ordering),
# recorded with op_class 'g'.
grouped_expressions = [
    'x.sum()',
    '(1).sum()',
    '_ngroup()',
    'a.all()',
    'a.any()',
    'x.max()',
    'x.mean()',
    'x.median()',
    'x.min()',
    'x.nunique()',
    'x.size()',
    '_size()',
    '_count()',
]

# Expressions exercised through fp(): a project grouped by 'g',
# recorded with op_class 'p'.
project_expressions = [
    'x.sum()',
    '(1).sum()',
    'a.all()',
    'a.any()',
    'x.max()',
    'x.mean()',
    'x.median()',
    'x.min()',
    'x.nunique()',
    'x.size()',
    'x.std()',
    'x.var()',
]

# Expressions exercised through fw(): an extend partitioned by 'g' and ordered
# by 'row_id', recorded with op_class 'w'.
windowed_expressions = [
    'z.bfill()',
    'z.ffill()',
    'x.last()',
    'x.rank()',
    'x.cumprod()',
    'x.cumsum()',
    'z.cumcount()',
    'x.cummax()',
    'x.cummin()',
    'x.shift()',
    '_row_number()',
]

# Expressions run through f() and recorded with op_class 'u'; test_on_db() marks
# these 'y' when the query runs, without comparing the result to Pandas.
u_expressions = [  # not simply checkable as output varies
    '_uniform()',
]

In [5]:
def _mk_expectations(op_class, mk_ops, exps):
    """
    Build one catalog record per expression.

    Each record is the 5-tuple (op, op_class, expression text, pipeline,
    Pandas result of the pipeline on `d`). The pipeline is built once per
    expression and that same object is both stored and transformed.

    :param op_class: one-letter class tag stored in each record
    :param mk_ops: pipeline builder taking the expression text (f, fg, fp or fw)
    :param exps: list of expression source strings
    :return: list of record tuples, in the order of `exps`
    """
    records = []
    for exp in exps:
        op = parse(exp).op
        ops = mk_ops(exp)
        records.append((op, op_class, exp, ops, ops.transform(d)))
    return records


e_expectations = _mk_expectations('e', f, expressions)
g_expectations = _mk_expectations('g', fg, grouped_expressions)
p_expectations = _mk_expectations('p', fp, project_expressions)
w_expectations = _mk_expectations('w', fw, windowed_expressions)
u_results = _mk_expectations('u', f, u_expressions)

In [6]:
# Persist the example frame and every expectation list as one gzip-compressed pickle.
with gzip.open('expr_expectations.pkl.gz', 'wb') as out_f:
    expectations_payload = {
        'd': d,
        'e_expectations': e_expectations,
        'g_expectations': g_expectations,
        'p_expectations': p_expectations,
        'w_expectations': w_expectations,
        'u_results': u_results,
    }
    pickle.dump(expectations_payload, out_f)

In [7]:
# All records in one list; each record is (op, op_class, expression, pipeline, expectation).
ops_list = [
    *e_expectations,
    *g_expectations,
    *p_expectations,
    *w_expectations,
    *u_results,
]

# One catalog row per record; the expectations were computed with Pandas,
# so the 'Pandas' column is 'y' for every row.
op_catalog = pd.DataFrame({
    'op': [record[0] for record in ops_list],
    'expression': [record[2] for record in ops_list],
    'op_class': [record[1] for record in ops_list],
})
op_catalog['Pandas'] = 'y'


def test_on_db(db_handle):
    # test on db
    res_vector = ['?'] * len(ops_list)
    db_handle.insert_table(d, table_name='d', allow_overwrite=True)
    for i in range(len(ops_list)):
        op = ops_list[i][0]
        op_class = ops_list[i][1]
        exp = ops_list[i][2]
        ops = ops_list[i][3]
        expect = ops_list[i][4]
        try:
            res = db_handle.read_query(ops)
            if op_class != 'u':
                if data_algebra.test_util.equivalent_frames(res, expect):
                    res_vector[i] = 'y'
                else:
                    res_vector[i] = 'w'
                    print()
                    print("difference (w)")
                    print(f'op: {op}, op_class: {op_class}, example expression: {exp}, db: {db_handle.db_model}')
                    print("Pandas result (expectation):")
                    print(expect)
                    print("DB result:")
                    print(res)
                    print("query")
                    print(db_handle.to_sql(ops))
                    print()
            else:
                res_vector[i] = 'y'
        except Exception as ex:
            res_vector[i] = 'n'
            print()
            print("error (n)")
            print(f'op: {op}, op_class: {op_class}, example expression: {exp}, db: {db_handle.db_model}')
            print(f"caught: {ex}")
            print()
    db_handle.drop_table('d')
    return res_vector


# One example handle per database module, in the order the catalog columns are added.
db_handles = [
    db_module.example_handle()
    for db_module in (
        data_algebra.SQLite,
        data_algebra.BigQuery,
        data_algebra.PostgreSQL,
        data_algebra.SparkSQL,
        data_algebra.MySQL,
    )
]

22/01/14 14:16:58 WARN Utils: Your hostname, JAMiMac.local resolves to a loopback address: 127.0.0.1; using 192.168.0.155 instead (on interface en1)
22/01/14 14:16:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/14 14:16:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# Ignore all warnings from here on so the per-database reports stay readable.
warnings.filterwarnings("ignore")
for db_handle in db_handles:
    try:
        db_test_res = test_on_db(db_handle)
        # one status column per database, named after the handle's db_model
        op_catalog[str(db_handle.db_model)] = db_test_res
    finally:
        # close the handle even if the test run or the column assignment raises
        db_handle.close()



error (n)
op: arctan2, op_class: e, example expression: x.arctan2(y), db: SQLiteModel
caught: Execution failed on sql '-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SQLiteModel
--       string quote: '
--   identifier quote: "
WITH
 "table_reference_0" AS (
  SELECT
   "row_id" ,
   "x" ,
   "y"
  FROM
   "d"
 ) ,
 "extend_1" AS (
  SELECT  -- .extend({ 'new_column': 'x.arctan2(y)'})
   "row_id" ,
   ARCTAN2("x", "y") AS "new_column"
  FROM
   "table_reference_0"
 )
SELECT  -- .order_rows(['row_id'])
 *
FROM
 "extend_1"
ORDER BY
 "row_id"
': no such function: ARCTAN2


difference (w)
op: remainder, op_class: e, example expression: y.remainder(0.5), db: SQLiteModel
Pandas result (expectation):
   row_id  new_column
0       0        0.40
1       1        0.33
2       2        0.20
3       3        0.10
DB result:
   row_id new_column
0       0       None
1       1       None
2       2       None
3       3       None
query
-- data_algebra SQL https://github.c

                                                                                


error (n)
op: arcsin, op_class: e, example expression: x.arcsin(), db: SparkSQLModel
caught: Undefined function: 'ARCSIN'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 16 pos 3


error (n)
op: arccos, op_class: e, example expression: x.arccos(), db: SparkSQLModel
caught: Undefined function: 'ARCCOS'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 16 pos 3


error (n)
op: arctan, op_class: e, example expression: x.arctan(), db: SparkSQLModel
caught: Undefined function: 'ARCTAN'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 16 pos 3


error (n)
op: arctan2, op_class: e, example expression: x.arctan2(y), db: SparkSQLModel
caught: Undefined function: 'ARCTAN2'. This function is neither a registered temporary function nor a permanent function registered

22/01/14 14:19:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/01/14 14:19:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.



error (n)
op: remainder, op_class: e, example expression: y.remainder(0.5), db: SparkSQLModel
caught: Undefined function: 'REMAINDER'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 16 pos 3


difference (w)
op: is_null, op_class: e, example expression: z.is_null(), db: SparkSQLModel
Pandas result (expectation):
   row_id  new_column
0       0       False
1       1        True
2       2       False
3       3        True
DB result:
   row_id  new_column
0       0       False
1       1       False
2       2       False
3       3       False
query
-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SparkSQLModel
--       string quote: "
--   identifier quote: `
WITH
 `table_reference_0` AS (
  SELECT
   `row_id` ,
   `z`
  FROM
   `d`
 ) ,
 `extend_1` AS (
  SELECT  -- .extend({ 'new_column': 'z.is_null()'})
   `row_id` ,
   (`z` IS NULL) AS `new_column`
  FROM
   `table_reference_0`
 )


22/01/14 14:19:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/01/14 14:19:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.



difference (w)
op: count, op_class: e, example expression: z.count(), db: SparkSQLModel
Pandas result (expectation):
   row_id  new_column
0       0           2
1       1           2
2       2           2
3       3           2
DB result:
   row_id  new_column
0       0           4
1       1           4
2       2           4
3       3           4
query
-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SparkSQLModel
--       string quote: "
--   identifier quote: `
WITH
 `table_reference_0` AS (
  SELECT
   `row_id` ,
   `z`
  FROM
   `d`
 ) ,
 `extend_1` AS (
  SELECT  -- .extend({ 'new_column': 'z.count()'}, partition_by=1)
   `row_id` ,
   SUM(CASE WHEN `z` IS NOT NULL THEN 1 ELSE 0 END) OVER (  )  AS `new_column`
  FROM
   `table_reference_0`
 )
SELECT  -- .order_rows(['row_id'])
 *
FROM
 `extend_1`
ORDER BY
 `row_id`



error (n)
op: as_int64, op_class: e, example expression: y.as_int64(), db: SparkSQLModel
caught: 
DataType int64 is not supported.(line 16,

In [9]:
# Sort the catalog, renumber the rows, and stamp each row with the library version.
op_catalog = (
    op_catalog
        .sort_values(by=['op_class', 'op', 'expression'])
        .reset_index(drop=True)
)
op_catalog['version'] = data_algebra.__version__
op_catalog

Unnamed: 0,op,expression,op_class,Pandas,SQLiteModel,BigQueryModel,PostgreSQLModel,SparkSQLModel,MySQLModel,version
0,!=,x != y,e,y,y,y,y,y,y,1.3.0
1,%,row_id % q,e,y,y,n,n,y,n,1.3.0
2,*,x * y,e,y,y,y,y,y,y,1.3.0
3,**,x ** y,e,y,y,y,y,y,y,1.3.0
4,+,x + y,e,y,y,y,y,y,y,1.3.0
...,...,...,...,...,...,...,...,...,...,...
113,cumsum,x.cumsum(),w,y,y,y,y,y,y,1.3.0
114,ffill,z.ffill(),w,y,n,n,n,n,n,1.3.0
115,last,x.last(),w,y,n,n,n,w,n,1.3.0
116,rank,x.rank(),w,y,n,n,n,y,n,1.3.0


In [10]:
# Save the catalog as CSV next to the notebook, without the row index.
op_catalog.to_csv('op_catalog.csv', index=False)

In [11]:
# Render the catalog as Python source text and pretty-format it.
table_as_python = pretty_format_python(
    data_algebra.util.pandas_to_example_str(op_catalog)
)

# Preamble printed at the top of the generated module.
module_preamble = """

import data_algebra

pd = data_algebra.default_data_model.pd

    """

# Write the generated module: preamble first, then the catalog assigned to methods_table.
with open('op_catalog.py', 'w') as f_out:
    print(module_preamble, file=f_out)
    print("methods_table = " + table_as_python, file=f_out)
