## A catalog of [data algebra expression](https://github.com/WinVector/data_algebra) methods.


In [1]:
import datetime
import pandas as pd
import pickle
import gzip

from data_algebra.data_ops import *

# strptime-style patterns used to parse the example timestamp/date strings.
datetime_format = "%Y-%m-%d %H:%M:%S"
date_format = "%Y-%m-%d"

# Example frame that every expression in this catalog is evaluated against.
# It mixes boolean, integer, float (with one missing value in 'z'), string,
# string-encoded date/datetime, parsed datetime, and Python date columns.
d = pd.DataFrame({
    'row_id': [0, 1, 2, 3],
    'a': [False, False, True, True],
    'b': [False, True, False, True],
    'q': [1, 1, 2, 2],
    'x': [.1, .2, .3, .4],
    'y': [2.4, 1.33, 1.2, 1.1],
    'z': [1.6, None, -2.1, 0],
    'g': ['a', 'a', 'b', 'ccc'],
    "str_datetime_col": ["2000-01-01 12:13:21", "2020-04-05 14:03:00", "2000-01-01 12:13:21", "2020-04-05 14:03:00"],
    "str_date_col": ["2000-03-01", "2020-04-05", "2000-03-01", "2020-04-05"],
    "datetime_col_0": pd.to_datetime(
        pd.Series(["2010-01-01 12:13:21", "2030-04-05 14:03:00", "2010-01-01 12:13:21", "2030-04-05 14:03:00"]),
        format=datetime_format,
    ),
    "datetime_col_1": pd.to_datetime(
        pd.Series(["2010-01-01 12:11:21", "2030-04-06 14:03:00", "2010-01-01 12:11:21", "2030-04-06 14:03:00"]),
        # These strings include a time of day, so they must be parsed with the
        # full datetime pattern; the date-only pattern leaves " 12:11:21"
        # unconverted, which strict format matching rejects with a ValueError.
        format=datetime_format,
    ),
    # `.dt.date` turns the parsed timestamps into plain Python date objects.
    "date_col_0": pd.to_datetime(
        pd.Series(["2000-01-02", "2035-04-05", "2000-01-02", "2035-04-05"]),
        format=date_format
    ).dt.date,
    "date_col_1": pd.to_datetime(
        pd.Series(["2000-01-02", "2035-05-05", "2000-01-02", "2035-05-05"]),
        format=date_format
    ).dt.date,
})

d

Unnamed: 0,row_id,a,b,q,x,y,z,g,str_datetime_col,str_date_col,datetime_col_0,datetime_col_1,date_col_0,date_col_1
0,0,False,False,1,0.1,2.4,1.6,a,2000-01-01 12:13:21,2000-03-01,2010-01-01 12:13:21,2010-01-01 12:11:21,2000-01-02,2000-01-02
1,1,False,True,1,0.2,1.33,,a,2020-04-05 14:03:00,2020-04-05,2030-04-05 14:03:00,2030-04-06 14:03:00,2035-04-05,2035-05-05
2,2,True,False,2,0.3,1.2,-2.1,b,2000-01-01 12:13:21,2000-03-01,2010-01-01 12:13:21,2010-01-01 12:11:21,2000-01-02,2000-01-02
3,3,True,True,2,0.4,1.1,0.0,ccc,2020-04-05 14:03:00,2020-04-05,2030-04-05 14:03:00,2030-04-06 14:03:00,2035-04-05,2035-05-05


In [2]:

def f(expression):
    """Evaluate `expression` as a new column on the example frame `d`.

    Builds a data_algebra pipeline over `d` that adds the expression as
    `new_column`, keeps only `row_id` and `new_column`, orders the rows by
    `row_id`, and returns whatever `ex` produces for that pipeline.
    """
    extended = data(d=d).extend({'new_column': expression})
    narrowed = extended.select_columns(['row_id', 'new_column'])
    return ex(narrowed.order_rows(['row_id']))


# Quick demonstration of the helper on a single expression.
f('z.sign()')


Unnamed: 0,row_id,new_column
0,0,1.0
1,1,
2,2,-1.0
3,3,0.0


In [3]:
# Row-wise expressions evaluated with `f`. Each entry appears exactly once so
# the printed count equals the number of distinct expressions exercised.
expressions = [
    # arithmetic operators
    'x + y',
    'x - y',
    'row_id // q',
    'x / y',
    'x * y',
    'x ** y',
    # comparisons
    'x == y',
    'x > y',
    'x >= y',
    'x < y',
    'x <= y',
    'x != y',
    # column reference, constant, and boolean logic
    'a',
    '1',
    'not a',
    'a & b',
    'a | b',
    # numeric methods
    'z.sign()',
    'x.sum()',
    '(1).sum()',
    'x.sin()',
    'x.cos()',
    'x.arcsin()',
    'x.arccos()',
    'x.arctan()',
    'x.arctan2(y)',
    'x.sinh()',
    'x.cosh()',
    'x.tanh()',
    'x.arcsinh()',
    'x.arccosh()',
    'x.arctanh()',
    'z.floor()',
    'z.ceil()',
    'x.exp()',
    'y.expm1()',
    'x.log()',
    'x.log10()',
    'x.log1p()',
    'y.mod(0.5)',
    'y.remainder(0.5)',
    'x.sqrt()',
    'z.abs()',
    'row_id.maximum(x)',
    'row_id.minimum(x)',
    'row_id.fmax(x)',
    'row_id.fmin(x)',
    'y.round()',
    'y.around(2)',
    # missing-value checks, conditionals, and membership
    'z.is_null()',
    'z.is_bad()',
    'z.count()',
    'z.if_else(x, y)',
    'row_id.is_in({1, 3})',
    # concatenation and coalescing; the %op% forms are presumably operator
    # spellings of the neighbouring methods (confirm against data_algebra docs)
    'g.concat(a)',
    'g %+% "_" %+% a',
    'z.coalesce(2)',
    'z %?% 2',
    'z.coalesce_0()',
    # value mapping, type conversion, and string trimming
    'g.mapv({"a": 1, "b": 2, "z": 26}, 0)',
    'y.as_int64()',
    'y.as_str()',
    'g.trimstr(0, 2)',
    # date and datetime methods
    'datetime_col_0.datetime_to_date()',
    'str_date_col.parse_date()',
    'str_datetime_col.parse_datetime()',
    'datetime_col_0.format_datetime()',
    'date_col_0.format_date()',
    'date_col_0.dayofweek()',
    'date_col_0.dayofyear()',
    'date_col_0.dayofmonth()',
    'date_col_0.weekofyear()',
    'date_col_0.month()',
    'date_col_0.quarter()',
    'date_col_0.year()',
    'datetime_col_0.timestamp_diff(datetime_col_1)',
    'date_col_0.date_diff(date_col_1)',
    'date_col_1.base_Sunday()',
]

print(len(expressions))

78


In [4]:
# For each row-wise expression: print its text, the Python type of the first
# value in the resulting `new_column`, and the full result table.
for exp in expressions:
    print(f"\n{exp}")
    res = f(exp)
    first_value = res['new_column'].values[0]
    print(type(first_value))
    print(res, end="\n\n")


x + y
<class 'numpy.float64'>
   row_id  new_column
0       0        2.50
1       1        1.53
2       2        1.50
3       3        1.50


x - y
<class 'numpy.float64'>
   row_id  new_column
0       0       -2.30
1       1       -1.13
2       2       -0.90
3       3       -0.70


row_id // q
<class 'numpy.int64'>
   row_id  new_column
0       0           0
1       1           1
2       2           1
3       3           1


x / y
<class 'numpy.float64'>
   row_id  new_column
0       0    0.041667
1       1    0.150376
2       2    0.250000
3       3    0.363636


x * y
<class 'numpy.float64'>
   row_id  new_column
0       0       0.240
1       1       0.266
2       2       0.360
3       3       0.440


x ** y
<class 'numpy.float64'>
   row_id  new_column
0       0    0.003981
1       1    0.117590
2       2    0.235801
3       3    0.364977


x == y
<class 'numpy.bool_'>
   row_id  new_column
0       0       False
1       1       False
2       2       False
3       3       False


x

In [5]:
def fg(expression):
    """Evaluate `expression` as a new column on `d`, partitioned by `g`.

    The pipeline extends `d` with `new_column` using `partition_by=['g']`,
    keeps `g`, `row_id` and `new_column`, orders the rows by `g` then
    `row_id`, and returns whatever `ex` produces for it.
    """
    partitioned = data(d=d).extend(
        {'new_column': expression},
        partition_by=['g'],
    )
    narrowed = partitioned.select_columns(['g', 'row_id', 'new_column'])
    return ex(narrowed.order_rows(['g', 'row_id']))


# Quick demonstration of the helper on a single expression.
fg('x.max()')


Unnamed: 0,g,row_id,new_column
0,a,0,0.2
1,a,1,0.2
2,b,2,0.3
3,ccc,3,0.4


In [6]:
# Expressions evaluated with `fg`, i.e. with the frame partitioned by `g`.
grouped_expressions = [
    # reductions over the boolean column
    "a.all()",
    "a.any()",
    # reductions over the float column
    "x.max()",
    "x.mean()",
    "x.median()",
    "x.min()",
    "x.ngroup()",
    "x.nunique()",
    "x.size()",
    "x.std()",
    "x.var()",
]

print(len(grouped_expressions))

11


In [7]:
# Print each grouped expression followed by its partitioned result table.
for exp in grouped_expressions:
    print(f"\n{exp}")
    print(fg(exp), end="\n\n")


a.all()
     g  row_id  new_column
0    a       0       False
1    a       1       False
2    b       2        True
3  ccc       3        True


a.any()
     g  row_id  new_column
0    a       0       False
1    a       1       False
2    b       2        True
3  ccc       3        True


x.max()
     g  row_id  new_column
0    a       0         0.2
1    a       1         0.2
2    b       2         0.3
3  ccc       3         0.4


x.mean()
     g  row_id  new_column
0    a       0        0.15
1    a       1        0.15
2    b       2        0.30
3  ccc       3        0.40


x.median()
     g  row_id  new_column
0    a       0        0.15
1    a       1        0.15
2    b       2        0.30
3  ccc       3        0.40


x.min()
     g  row_id  new_column
0    a       0         0.1
1    a       1         0.1
2    b       2         0.3
3  ccc       3         0.4


x.ngroup()
     g  row_id  new_column
0    a       0         NaN
1    a       1         NaN
2    b       2         NaN
3  ccc

In [8]:
def fw(expression):
    """Evaluate `expression` as a new column on `d` with partition and order.

    The pipeline extends `d` with `new_column` using `partition_by=['g']`
    and `order_by=['row_id']`, keeps `g`, `row_id` and `new_column`, orders
    the rows by `g` then `row_id`, and returns whatever `ex` produces for it.
    """
    windowed = data(d=d).extend(
        {'new_column': expression},
        partition_by=['g'],
        order_by=['row_id'],
    )
    narrowed = windowed.select_columns(['g', 'row_id', 'new_column'])
    return ex(narrowed.order_rows(['g', 'row_id']))


# Quick demonstration of the helper on a single expression.
fw('x.cumprod()')

Unnamed: 0,g,row_id,new_column
0,a,0,0.1
1,a,1,0.02
2,b,2,0.3
3,ccc,3,0.4


In [9]:

# Expressions evaluated with `fw`, i.e. partitioned by `g` and ordered by
# `row_id`.
windowed_expressions = [
    # fills on the column that contains a missing value
    "z.bfill()",
    "z.ffill()",
    # position-dependent picks and ranks
    "x.last()",
    "x.rank()",
    # cumulative calculations
    "x.cumprod()",
    "x.cumsum()",
    "z.cumcount()",
    "x.cummax()",
    "x.cummin()",
    # offset within the ordered partition
    "x.shift()",
]

print(len(windowed_expressions))

10


In [10]:
# Print each windowed expression followed by its ordered, partitioned result.
for exp in windowed_expressions:
    print(f"\n{exp}")
    print(fw(exp), end="\n\n")


z.bfill()
     g  row_id  new_column
0    a       0         1.6
1    a       1         NaN
2    b       2        -2.1
3  ccc       3         0.0


z.ffill()
     g  row_id  new_column
0    a       0         1.6
1    a       1         1.6
2    b       2        -2.1
3  ccc       3         0.0


x.last()
     g  row_id  new_column
0    a       0         0.2
1    a       1         0.2
2    b       2         0.3
3  ccc       3         0.4


x.rank()
     g  row_id  new_column
0    a       0         1.0
1    a       1         2.0
2    b       2         1.0
3  ccc       3         1.0


x.cumprod()
     g  row_id  new_column
0    a       0        0.10
1    a       1        0.02
2    b       2        0.30
3  ccc       3        0.40


x.cumsum()
     g  row_id  new_column
0    a       0         0.1
1    a       1         0.3
2    b       2         0.3
3  ccc       3         0.4


z.cumcount()
     g  row_id  new_column
0    a       0           0
1    a       1           1
2    b       2        

In [11]:
# Evaluate every catalogued expression with its matching helper, keyed by the
# expression text.
f_expectations = {expr: f(expr) for expr in expressions}
g_expectations = {expr: fg(expr) for expr in grouped_expressions}
w_expectations = {expr: fw(expr) for expr in windowed_expressions}

# Bundle the input frame together with the computed results and write the
# bundle as a gzip-compressed pickle file.
payload = {
    'd': d,
    'f_expectations': f_expectations,
    'g_expectations': g_expectations,
    'w_expectations': w_expectations,
}
with gzip.open('expr_expectations.pkl.gz', 'wb') as out_f:
    pickle.dump(payload, out_f)