Example of data transforms as categorical arrows ([`R` version](https://github.com/WinVector/rquery/blob/master/Examples/Arrow/Arrow.md), [`Python` version](https://github.com/WinVector/data_algebra/blob/master/Examples/Arrow/Arrow.ipynb)).

In [1]:
import pandas
import graphviz

import data_algebra.diagram
from data_algebra.data_ops import *  # https://github.com/WinVector/data_algebra
import data_algebra.util
import data_algebra.arrow

# Small example frame: a grouping key 'g', two numeric columns and a boolean flag.
d = pandas.DataFrame(
    {
        'g': ['a', 'b', 'b', 'c', 'c', 'c'],
        'x': [1, 4, 5, 7, 8, 9],
        'v': [10.0, 40.0, 50.0, 70.0, 80.0, 90.0],
        'i': [True, True, False, False, False, False],
    }
)

table_description = describe_table(d)


# Project down to the grouping column 'g', then add a row number ordered by 'g'.
id_ops_a = (
    table_description
    .project(group_by=['g'])
    .extend(
        {
            'ngroup': '_row_number()',
        },
        order_by=['g'],
    )
)

# Left-join the per-group numbering back onto the original table by 'g'.
id_ops_b = table_description.natural_join(
    id_ops_a,
    by=['g'],
    jointype='LEFT',
)

# Wrap the pipeline as an arrow; the bare name displays its repr as the cell result.
a1 = data_algebra.arrow.DataOpArrow(id_ops_b)
a1

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT'))

In [2]:
# Print the str() form of a1 (the previous cell displayed its repr).
print(a1)

[
  [ g: <class 'str'>, x: <class 'numpy.int64'>, v: <class 'numpy.float64'>,
    i: <class 'numpy.bool_'> ]
   ->
  [ g, x, v, i, ngroup ]
]



In [3]:
# Fit the arrow on the example frame d; the value returned by fit() is shown as the cell result.
a1.fit(d)

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT'))

In [4]:
# Print a1 again, now that a1.fit(d) has been called, for comparison with the earlier print.
print(a1)

[
  [ x: <class 'numpy.int64'>, i: <class 'numpy.bool_'>,
    v: <class 'numpy.float64'>, g: <class 'str'> ]
   ->
  [ g: <class 'str'>, x: <class 'numpy.int64'>, v: <class 'numpy.float64'>,
    i: <class 'numpy.bool_'>, ngroup: <class 'numpy.int64'> ]
]



In [5]:
# Deliberately leave column 'i' out of the incoming column list of the next stage.
# NOTE: the list is built from a set, so its order is whatever the set iteration yields.
cols2_too_small = list(set(id_ops_b.column_names) - {'i'})

ordered_ops = (
    TableDescription('d2', cols2_too_small)
    .extend(
        {
            'row_number': '_row_number()',
            'shift_v': 'v.shift()',
        },
        order_by=['x'],
        partition_by=['g'],
    )
)

a2 = data_algebra.arrow.DataOpArrow(ordered_ops)
print(a2)


[
  [ ngroup, x, v, g ]
   ->
  [ ngroup, x, v, g, row_number, shift_v ]
]



In [6]:
# Attempt the composition and show the ValueError message instead of a traceback.
try:
    a1 >> a2
except ValueError as compose_error:
    print(str(compose_error))
    

extra incoming columns: {'i'}


In [7]:
# Deliberately add a column 'q' that is not among id_ops_b's columns.
cols2_too_large = id_ops_b.column_names + ['q']

ordered_ops = (
    TableDescription('d2', cols2_too_large)
    .extend(
        {
            'row_number': '_row_number()',
            'shift_v': 'v.shift()',
        },
        order_by=['x'],
        partition_by=['g'],
    )
)

a2 = data_algebra.arrow.DataOpArrow(ordered_ops)
print(a2)

[
  [ i, v, g, ngroup, x, q ]
   ->
  [ g, x, v, i, ngroup, q, row_number, shift_v ]
]



In [8]:
# Attempt the composition and show the ValueError message instead of a traceback.
try:
    a1 >> a2
except ValueError as compose_error:
    print(str(compose_error))


missing required columns: {'q'}


In [9]:
# Describe the incoming table with exactly id_ops_b's columns, then add the
# two new columns partitioned by 'g' and ordered by 'x'.
ordered_ops = (
    TableDescription('d2', id_ops_b.column_names)
    .extend(
        {
            'row_number': '_row_number()',
            'shift_v': 'v.shift()',
        },
        order_by=['x'],
        partition_by=['g'],
    )
)

# Wrap as an arrow; the bare name displays its repr as the cell result.
a2 = data_algebra.arrow.DataOpArrow(ordered_ops)
a2

DataOpArrow(TableDescription(
 table_name='d2',
 column_names=[
   'g', 'x', 'v', 'i', 'ngroup']) .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x']))

In [10]:
# Print the str() form of a2 (the previous cell displayed its repr).
print(a2)

[
  [ i, v, g, ngroup, x ]
   ->
  [ g, x, v, i, ngroup, row_number, shift_v ]
]



In [11]:
# Fit a2 on the result of running d through a1, then print a2's str() form.
a2.fit(a1.transform(d))
print(a2)

[
  [ i: <class 'numpy.bool_'>, v: <class 'numpy.float64'>, g: <class 'str'>,
    ngroup: <class 'numpy.int64'>, x: <class 'numpy.int64'> ]
   ->
  [ g: <class 'str'>, x: <class 'numpy.int64'>, v: <class 'numpy.float64'>,
    i: <class 'numpy.bool_'>, ngroup: <class 'numpy.int64'>,
    row_number: <class 'numpy.int64'>, shift_v: <class 'numpy.float64'> ]
]



In [12]:
# Compose the two arrows with >> and print the str() form of the combined arrow.
print(a1 >> a2)

[
  [ x: <class 'numpy.int64'>, i: <class 'numpy.bool_'>,
    v: <class 'numpy.float64'>, g: <class 'str'> ]
   ->
  [ g: <class 'str'>, x: <class 'numpy.int64'>, v: <class 'numpy.float64'>,
    i: <class 'numpy.bool_'>, ngroup: <class 'numpy.int64'>,
    row_number: <class 'numpy.int64'>, shift_v: <class 'numpy.float64'> ]
]



In [13]:
# Display the .pipeline attribute of the composed arrow.
(a1 >> a2).pipeline

TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT') .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x'])

In [14]:
# Third stage: takes ordered_ops' columns as input and adds summary columns
# partitioned by 'g' (no order_by is given for this extend).
unordered_ops = (
    TableDescription('d3', ordered_ops.column_names)
    .extend(
        {
            'size': '_size()',
            'max_v': 'v.max()',
            'min_v': 'v.min()',
            'sum_v': 'v.sum()',
            'mean_v': 'v.mean()',
            'count_v': 'v.count()',
            'size_v': 'v.size()',
        },
        partition_by=['g'],
    )
)

# Wrap as an arrow; the bare name displays its repr as the cell result.
a3 = data_algebra.arrow.DataOpArrow(unordered_ops)
a3

DataOpArrow(TableDescription(
 table_name='d3',
 column_names=[
   'g', 'x', 'v', 'i', 'ngroup', 'row_number', 'shift_v']) .\
   extend({
    'size': '_size()',
    'max_v': 'v.max()',
    'min_v': 'v.min()',
    'sum_v': 'v.sum()',
    'mean_v': 'v.mean()',
    'count_v': 'v.count()',
    'size_v': 'v.size()'},
   partition_by=['g']))

In [15]:
# Print the str() form of a3 (the previous cell displayed its repr).
print(a3)

[
  [ shift_v, i, v, g, ngroup, x, row_number ]
   ->
  [ g, x, v, i, ngroup, row_number, shift_v, size, max_v, min_v, sum_v,
    mean_v, count_v, size_v ]
]



In [16]:
# Pass the arrow a1 itself (not data) to a2.transform, then pass that result to a3.transform.
a3.transform(a2.transform(a1))

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT') .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x']) .\
   extend({
    'size': '_size()',
    'max_v': 'v.max()',
    'min_v': 'v.min()',
    'sum_v': 'v.sum()',
    'mean_v': 'v.mean()',
    'count_v': 'v.count()',
    'size_v': 'v.size()'},
   partition_by=['g']))

In [17]:
# Chain all three arrows with >>, with no explicit grouping.
a1 >> a2 >> a3

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT') .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x']) .\
   extend({
    'size': '_size()',
    'max_v': 'v.max()',
    'min_v': 'v.min()',
    'sum_v': 'v.sum()',
    'mean_v': 'v.mean()',
    'count_v': 'v.count()',
    'size_v': 'v.size()'},
   partition_by=['g']))

In [18]:
# Same chain with explicit left grouping: a1 and a2 are composed first.
(a1 >> a2) >> a3

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT') .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x']) .\
   extend({
    'size': '_size()',
    'max_v': 'v.max()',
    'min_v': 'v.min()',
    'sum_v': 'v.sum()',
    'mean_v': 'v.mean()',
    'count_v': 'v.count()',
    'size_v': 'v.size()'},
   partition_by=['g']))

In [19]:
# Same chain with explicit right grouping: a2 and a3 are composed first.
a1 >> (a2 >> a3)

DataOpArrow(TableDescription(
 table_name='data_frame',
 column_names=[
   'g', 'x', 'v', 'i']) .\
   natural_join(b=
      TableDescription(
       table_name='data_frame',
       column_names=[
         'g', 'x', 'v', 'i']) .\
         project({
          },
         group_by=['g']) .\
         extend({
          'ngroup': '_row_number()'},
         order_by=['g']),
      by=['g'], jointype='LEFT') .\
   extend({
    'row_number': '_row_number()',
    'shift_v': 'v.shift()'},
   partition_by=['g'],
   order_by=['x']) .\
   extend({
    'size': '_size()',
    'max_v': 'v.max()',
    'min_v': 'v.min()',
    'sum_v': 'v.sum()',
    'mean_v': 'v.mean()',
    'count_v': 'v.count()',
    'size_v': 'v.size()'},
   partition_by=['g']))

In [20]:
# Take the .pipeline of the three-arrow composite and call its transform() on d.
ops = (a1 >> a2 >> a3).pipeline

ops.transform(d)

Unnamed: 0,g,x,v,i,ngroup,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,a,1,10.0,True,1,1,,1,10.0,10.0,10.0,10.0,1,1
1,b,4,40.0,True,2,1,,2,50.0,40.0,90.0,45.0,2,2
2,b,5,50.0,False,2,2,40.0,2,50.0,40.0,90.0,45.0,2,2
3,c,7,70.0,False,3,1,,3,90.0,70.0,240.0,80.0,3,3
4,c,8,80.0,False,3,2,70.0,3,90.0,70.0,240.0,80.0,3,3
5,c,9,90.0,False,3,3,80.0,3,90.0,70.0,240.0,80.0,3,3


In [21]:
# Call transform() on the composite arrow directly, without going through .pipeline.
(a1 >> a2 >> a3).transform(d)

Unnamed: 0,g,x,v,i,ngroup,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,a,1,10.0,True,1,1,,1,10.0,10.0,10.0,10.0,1,1
1,b,4,40.0,True,2,1,,2,50.0,40.0,90.0,45.0,2,2
2,b,5,50.0,False,2,2,40.0,2,50.0,40.0,90.0,45.0,2,2
3,c,7,70.0,False,3,1,,3,90.0,70.0,240.0,80.0,3,3
4,c,8,80.0,False,3,2,70.0,3,90.0,70.0,240.0,80.0,3,3
5,c,9,90.0,False,3,3,80.0,3,90.0,70.0,240.0,80.0,3,3


In [22]:
# Start the >> chain with the data frame; without parentheses, d >> a1 is evaluated first.
d >> a1 >> a2 >> a3

Unnamed: 0,g,x,v,i,ngroup,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,a,1,10.0,True,1,1,,1,10.0,10.0,10.0,10.0,1,1
1,b,4,40.0,True,2,1,,2,50.0,40.0,90.0,45.0,2,2
2,b,5,50.0,False,2,2,40.0,2,50.0,40.0,90.0,45.0,2,2
3,c,7,70.0,False,3,1,,3,90.0,70.0,240.0,80.0,3,3
4,c,8,80.0,False,3,2,70.0,3,90.0,70.0,240.0,80.0,3,3
5,c,9,90.0,False,3,3,80.0,3,90.0,70.0,240.0,80.0,3,3


In [23]:
# Compose the three arrows first (parentheses), then apply >> with d on the left.
d >> (a1 >> a2 >> a3)


Unnamed: 0,g,x,v,i,ngroup,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,a,1,10.0,True,1,1,,1,10.0,10.0,10.0,10.0,1,1
1,b,4,40.0,True,2,1,,2,50.0,40.0,90.0,45.0,2,2
2,b,5,50.0,False,2,2,40.0,2,50.0,40.0,90.0,45.0,2,2
3,c,7,70.0,False,3,1,,3,90.0,70.0,240.0,80.0,3,3
4,c,8,80.0,False,3,2,70.0,3,90.0,70.0,240.0,80.0,3,3
5,c,9,90.0,False,3,3,80.0,3,90.0,70.0,240.0,80.0,3,3
