This is a quick note showing how to compare two data frames using
the [data algebra](https://github.com/WinVector/data_algebra).  The question is: which rows
are in one data frame and not the other?

First, let's set up our example.

In [1]:

# import packages
import string
import sqlite3
import numpy
import numpy.random
import pandas

from data_algebra.data_ops import *
from data_algebra.cdata import *
import data_algebra.SQLite

In [2]:
# build synthetic example data

# fix the pseudo-random seed so every run draws the same sample
numpy.random.seed(1999)

# simulated number of observations per data frame, and the pool of key values
n_obs = 100
symbols = list(string.ascii_lowercase)


def _draw_group_frame():
    """Return a one-column frame of n_obs keys sampled with replacement from symbols."""
    sampled_keys = numpy.random.choice(symbols, size=n_obs, replace=True)
    return pandas.DataFrame({'group': sampled_keys})


# d1 is drawn first, then d2, both from the same seeded global generator
d1 = _draw_group_frame()
d2 = _draw_group_frame()

Our example question is: which rows are unique to `d1` and which are unique to `d2`.

Let's define our grouping columns and proceed.

In [3]:
# columns treated as the row key (may list more than one column)
grouping_columns = ['group']

# per-key row counts, one pipeline per data frame
d1_count_ops = descr(d1=d1).project(
    {'d1_count': '(1).sum()'},
    group_by=grouping_columns)
d2_count_ops = descr(d2=d2).project(
    {'d2_count': '(1).sum()'},
    group_by=grouping_columns)

# full join on the key columns, then replace the count missing on either side with 0
summary_ops = (
    d1_count_ops
        .natural_join(
            b=d2_count_ops,
            by=grouping_columns,
            jointype='full')
        .extend({
            'd1_count': 'd1_count.coalesce(0)',
            'd2_count': 'd2_count.coalesce(0)',
            })
)
summary_table = summary_ops.eval({'d1': d1, 'd2': d2})

summary_table

Unnamed: 0,group,d1_count,d2_count
0,a,4.0,4.0
1,b,2.0,2.0
2,c,2.0,4.0
3,d,3.0,4.0
4,e,8.0,1.0
5,f,1.0,2.0
6,g,5.0,5.0
7,h,5.0,4.0
8,i,4.0,3.0
9,j,3.0,7.0


From this summary it is easy to see which rows (group keys) are unique to one table or the other: they are the ones where one of the two counts is zero.

In [4]:
# keep only the keys whose count is zero in one of the two frames, sorted by key
unique_rows_ops = (
    data(summary_table)
        .select_rows('(d1_count <= 0) | (d2_count <= 0)')
        .order_rows(grouping_columns)
)
ex(unique_rows_ops)

Unnamed: 0,group,d1_count,d2_count
0,u,0.0,3.0
1,w,4.0,0.0
