Example of data transforms as categorical arrows ([`R` version](https://github.com/WinVector/rquery/blob/master/Examples/Arrow/Arrow.md) [`Python` version](https://github.com/WinVector/data_algebra/blob/master/Examples/Arrow/Arrow.ipynb)).

In [1]:
import pandas
import graphviz

import data_algebra.diagram
from data_algebra.data_ops import *  # https://github.com/WinVector/data_algebra
import data_algebra.util
import data_algebra.arrow

d = pandas.DataFrame({
    'g': [1, 2, 2, 3, 3, 3],
    'x': [1, 4, 5, 7, 8, 9],
    'v': [10, 40, 50, 70, 80, 90],
})

table_description = describe_table(d)


id_ops_a = table_description. \
    project(group_by=['g']). \
    extend({
        'ngroup': '_row_number()',
    },
    order_by=['g']). \
    extend({'irrelevant_col': 1})

id_ops_b = table_description. \
    natural_join(id_ops_a, by=['g'], jointype='LEFT')
a1 = data_algebra.arrow.DataOpArrow(id_ops_b)
a1

DataOpArrow(TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT'))

In [2]:
print(a1)

[{'x', 'g', 'v'} -> ['g', 'x', 'v', 'ngroup', 'irrelevant_col']]


In [3]:
cols2 = [c for c in (set(id_ops_b.column_names) - set(['irrelevant_col']))]
ordered_ops = TableDescription('d2', cols2). \
    extend({
        'row_number': '_row_number()',
        'shift_v': 'v.shift()',
    },
    order_by=['x'],
    partition_by=['g'])
a2 = data_algebra.arrow.DataOpArrow(ordered_ops)
a2

DataOpArrow(TableDescription(table_name='d2', column_names=['x', 'g', 'ngroup', 'v']) .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']))

In [4]:
print(a2)

[{'x', 'g', 'v', 'ngroup'} -> ['x', 'g', 'ngroup', 'v', 'row_number', 'shift_v']]


In [5]:
try:
    a1 >> a2
except ValueError as e:
    print(str(e))
    

extra incoming columns: {'irrelevant_col'}


In [6]:
print(a2.transform(a1, strict=False))

[{'x', 'g', 'v'} -> ['x', 'g', 'ngroup', 'v', 'row_number', 'shift_v']]


In [7]:
ordered_ops = TableDescription('d2', id_ops_b.column_names). \
    extend({
        'row_number': '_row_number()',
        'shift_v': 'v.shift()',
    },
    order_by=['x'],
    partition_by=['g'])
a2 = data_algebra.arrow.DataOpArrow(ordered_ops)
a2

DataOpArrow(TableDescription(table_name='d2', column_names=['g', 'x', 'v', 'ngroup', 'irrelevant_col']) .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']))

In [8]:
print(a1 >> a2)

[{'x', 'g', 'v'} -> ['g', 'x', 'v', 'ngroup', 'irrelevant_col', 'row_number', 'shift_v']]


In [9]:
(a1 >> a2).pipeline

TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT') .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x'])

In [10]:
unordered_ops = TableDescription('d3', ordered_ops.column_names). \
    extend({
        'size': '_size()',
        'max_v': 'v.max()',
        'min_v': 'v.min()',
        'sum_v': 'v.sum()',
        'mean_v': 'v.mean()',
        'count_v': 'v.count()',
        'size_v': 'v.size()',
    },
    partition_by=['g'])
a3 = data_algebra.arrow.DataOpArrow(unordered_ops)
a3

DataOpArrow(TableDescription(table_name='d3', column_names=['g', 'x', 'v', 'ngroup', 'irrelevant_col', 'row_number', 'shift_v']) .\
   extend({'size': '_size()', 'max_v': 'v.max()', 'min_v': 'v.min()', 'sum_v': 'v.sum()', 'mean_v': 'v.mean()', 'count_v': 'v.count()', 'size_v': 'v.size()'}, partition_by=['g']))

In [11]:
print(a3)

[{'x', 'row_number', 'ngroup', 'shift_v', 'irrelevant_col', 'g', 'v'} -> ['g', 'x', 'v', 'ngroup', 'irrelevant_col', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]


In [12]:
a3.transform(a2.transform(a1))

DataOpArrow(TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT') .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']) .\
   extend({'size': '_size()', 'max_v': 'v.max()', 'min_v': 'v.min()', 'sum_v': 'v.sum()', 'mean_v': 'v.mean()', 'count_v': 'v.count()', 'size_v': 'v.size()'}, partition_by=['g']))

In [13]:
a1 >> a2 >> a3

DataOpArrow(TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT') .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']) .\
   extend({'size': '_size()', 'max_v': 'v.max()', 'min_v': 'v.min()', 'sum_v': 'v.sum()', 'mean_v': 'v.mean()', 'count_v': 'v.count()', 'size_v': 'v.size()'}, partition_by=['g']))

In [14]:
(a1 >> a2) >> a3

DataOpArrow(TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT') .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']) .\
   extend({'size': '_size()', 'max_v': 'v.max()', 'min_v': 'v.min()', 'sum_v': 'v.sum()', 'mean_v': 'v.mean()', 'count_v': 'v.count()', 'size_v': 'v.size()'}, partition_by=['g']))

In [15]:
a1 >> (a2 >> a3)

DataOpArrow(TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
   natural_join(b=
      TableDescription(table_name='data_frame', column_names=['g', 'x', 'v']) .\
         project({}, group_by=['g']) .\
         extend({'ngroup': '_row_number()'}, order_by=['g']) .\
         extend({'irrelevant_col': '1'}),
      by=['g'], jointype='LEFT') .\
   extend({'row_number': '_row_number()', 'shift_v': 'v.shift()'}, partition_by=['g'], order_by=['x']) .\
   extend({'size': '_size()', 'max_v': 'v.max()', 'min_v': 'v.min()', 'sum_v': 'v.sum()', 'mean_v': 'v.mean()', 'count_v': 'v.count()', 'size_v': 'v.size()'}, partition_by=['g']))

In [16]:
ops = (a1 >> a2 >> a3).pipeline

ops.transform(d)

Unnamed: 0,g,x,v,ngroup,irrelevant_col,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,1,1,10,1,1,1,,1,10,10,10,10,1,1
1,2,4,40,2,1,1,,2,50,40,90,45,2,2
2,2,5,50,2,1,2,40.0,2,50,40,90,45,2,2
3,3,7,70,3,1,1,,3,90,70,240,80,3,3
4,3,8,80,3,1,2,70.0,3,90,70,240,80,3,3
5,3,9,90,3,1,3,80.0,3,90,70,240,80,3,3


In [17]:
(a1 >> a2 >> a3).transform(d)

Unnamed: 0,g,x,v,ngroup,irrelevant_col,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,1,1,10,1,1,1,,1,10,10,10,10,1,1
1,2,4,40,2,1,1,,2,50,40,90,45,2,2
2,2,5,50,2,1,2,40.0,2,50,40,90,45,2,2
3,3,7,70,3,1,1,,3,90,70,240,80,3,3
4,3,8,80,3,1,2,70.0,3,90,70,240,80,3,3
5,3,9,90,3,1,3,80.0,3,90,70,240,80,3,3


In [18]:
d >> a1 >> a2 >> a3

Unnamed: 0,g,x,v,ngroup,irrelevant_col,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,1,1,10,1,1,1,,1,10,10,10,10,1,1
1,2,4,40,2,1,1,,2,50,40,90,45,2,2
2,2,5,50,2,1,2,40.0,2,50,40,90,45,2,2
3,3,7,70,3,1,1,,3,90,70,240,80,3,3
4,3,8,80,3,1,2,70.0,3,90,70,240,80,3,3
5,3,9,90,3,1,3,80.0,3,90,70,240,80,3,3


In [19]:
d >> (a1 >> a2) >> a3

Unnamed: 0,g,x,v,ngroup,irrelevant_col,row_number,shift_v,size,max_v,min_v,sum_v,mean_v,count_v,size_v
0,1,1,10,1,1,1,,1,10,10,10,10,1,1
1,2,4,40,2,1,1,,2,50,40,90,45,2,2
2,2,5,50,2,1,2,40.0,2,50,40,90,45,2,2
3,3,7,70,3,1,1,,3,90,70,240,80,3,3
4,3,8,80,3,1,2,70.0,3,90,70,240,80,3,3
5,3,9,90,3,1,3,80.0,3,90,70,240,80,3,3
