In [1]:

import pandas
from data_algebra.data_ops import *


# Two small example tables sharing the key column 'ID'.
# Some IDs occur in only one table, and some occur several times.
d1 = pandas.DataFrame(dict(
    ID=[2, 3, 7, 7],
    OP=['A', 'B', 'B', 'D'],
))

d2 = pandas.DataFrame(dict(
    ID=[1, 1, 2, 3, 4, 2, 4, 4, 5, 5, 6],
    OP=['A', 'B', 'A', 'D', 'C', 'A', 'D', 'B', 'A', 'B', 'B'],
))


def merge_in_counts(pipeline, id_cols, new_table_descr):
    """
    Full-join per-key row counts of another table onto a pipeline.

    The new table is grouped by ``id_cols`` and aggregated with ``(1).sum()``
    into a column named ``count_<table_name>``. That result is natural-joined
    (``jointype='full'``) to ``pipeline`` on ``id_cols``. Finally, every
    column of ``pipeline`` whose name starts with ``'count_'``, plus the new
    count column, is rewritten as ``<col>.coalesce_0()`` -- presumably so that
    keys present on only one side of the join read as 0 instead of missing.

    :param pipeline: operator pipeline providing ``column_names`` and
        ``natural_join``; the join result must provide ``extend``.
    :param id_cols: key column names used both for grouping and for the join.
    :param new_table_descr: table description providing ``table_name`` and
        ``project``.
    :return: whatever ``extend`` returns on the joined pipeline.
    """
    added_col = f'count_{new_table_descr.table_name}'

    # Count columns already carried by the pipeline, then the one being added.
    # A dict keeps first-seen order and collapses a repeated name.
    zero_fill = {}
    for name in pipeline.column_names:
        if name.startswith('count_'):
            zero_fill[name] = f'{name}.coalesce_0()'
    zero_fill[added_col] = f'{added_col}.coalesce_0()'

    # One row per key combination of the incoming table, with its row count.
    counts_of_new = new_table_descr.project(
        {added_col: '(1).sum()'},
        group_by=id_cols,
    )

    joined = pipeline.natural_join(
        b=counts_of_new,
        by=id_cols,
        jointype='full',
    )
    return joined.extend(zero_fill)


# First stage: number of d1 rows per ID.
d1_counts = data(d1=d1).project({'count_d1': '(1).sum()'}, group_by=['ID'])

# Second stage: pass the pipeline through merge_in_counts via .use(),
# bringing in the per-ID row counts of d2.
ops = d1_counts.use(merge_in_counts, ['ID'], data(d2=d2))

# Print the composed pipeline so its structure can be inspected.
print(ops)

(
    TableDescription(table_name="d1", column_names=["ID", "OP"])
    .project({"count_d1": "(1).sum()"}, group_by=["ID"])
    .natural_join(
        b=TableDescription(table_name="d2", column_names=["ID", "OP"]).project(
            {"count_d2": "(1).sum()"}, group_by=["ID"]
        ),
        by=["ID"],
        jointype="FULL",
    )
    .extend({"count_d1": "count_d1.coalesce(0)", "count_d2": "count_d2.coalesce(0)"})
)



In [2]:
# Evaluate the pipeline; the resulting table (ID, count_d1, count_d2)
# is shown as this cell's output.
ops.ex()


Unnamed: 0,ID,count_d1,count_d2
0,2,1.0,2.0
1,3,1.0,1.0
2,7,2.0,0.0
3,1,0.0,2.0
4,4,0.0,3.0
5,5,0.0,2.0
6,6,0.0,1.0
