One of the reasons we added [common table expression elimination](https://github.com/WinVector/data_algebra/blob/main/Examples/with/common_table_expression_elimination.ipynb) to the [data algebra](https://github.com/WinVector/data_algebra) SQL query generator is that we have been building larger and larger queries using the composability/[macro](https://github.com/WinVector/data_algebra/blob/main/Examples/Macros/use.ipynb) features. In particular, we have started collecting solutions to complex data processing problems [here](https://github.com/WinVector/data_algebra/blob/main/data_algebra/solutions.py).

For instance: the "last observed carried forward" solution re-uses calculations in a manner that benefits from common calculation elimination.


In [1]:

import pandas as pd
import data_algebra
from data_algebra.data_ops import *
from data_algebra.sql_format_options import SQLFormatOptions
import data_algebra.test_util
import data_algebra.BigQuery
import data_algebra.solutions

In [2]:
# Example data: 'v' holds the values (with missing entries), 'g' is the group
# label and 'o' is the ordering column.
# Missing values are written as float('nan') rather than numpy.nan so the cell
# does not rely on the name `numpy` being in scope; the notebook never imports
# it explicitly. The resulting column is the same float column with NaNs.
d = pd.DataFrame({
    'v': [1., float('nan'), 3., float('nan'), 2., float('nan')],
    'g': ['a', 'a', 'a', 'b', 'b', 'b'],
    'o': [1, 2, 3, 4, 5, 6],
})

# Display the frame as the cell output.
d

Unnamed: 0,v,g,o
0,1.0,a,1
1,,a,2
2,3.0,a,3
3,,b,4
4,2.0,b,5
5,,b,6


In [3]:
# Settings for the "last observed carried forward" solution: work within each
# 'g' group, ordered by 'o', filling column 'v'. The 'is_bad()' predicate
# presumably marks the entries that should be replaced (confirm against the
# data_algebra.solutions documentation).
locf_settings = {
    'order_by': ['o'],
    'partition_by': ['g'],
    'value_column_name': 'v',
    'selection_predicate': 'is_bad()',
}
ops = data_algebra.solutions.last_observed_carried_forward(
    descr(d=d),
    **locf_settings,
)

In [4]:
# Run the operator pipeline directly on the in-memory pandas frame `d`.
res = ops.transform(d)

# Display the result: in the output below, missing 'v' entries take the previous
# observed value within their 'g' group (ordered by 'o'); a missing value
# with no earlier observation in its group stays missing.
res

Unnamed: 0,v,g,o
0,1.0,a,1
1,1.0,a,2
2,3.0,a,3
3,,b,4
4,2.0,b,5
5,2.0,b,6


In [5]:
# Expected result of the carry-forward: the first row of group 'b' has no
# earlier observation, so its 'v' stays missing.
expected_columns = {
    'v': [1.0, 1.0, 3.0, None, 2.0, 2.0],
    'g': ['a', 'a', 'a', 'b', 'b', 'b'],
    'o': [1, 2, 3, 4, 5, 6],
}
expect = pd.DataFrame(expected_columns)

# Check the computed frame against the expectation using the package's
# frame-comparison helper.
frames_match = data_algebra.test_util.equivalent_frames(res, expect)
assert frames_match

In [6]:
# Render the same operator pipeline as BigQuery SQL, using WITH clauses
# (use_with) and common table expression elimination (use_cte_elim).
bigquery_model = data_algebra.BigQuery.BigQueryModel()
cte_format = SQLFormatOptions(use_with=True, use_cte_elim=True)
sql = bigquery_model.to_sql(ops, sql_format_options=cte_format)

print(sql)


-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: BigQueryModel 1.3.4
--       string quote: "
--   identifier quote: `
WITH
 `extend_1` AS (
  SELECT  -- .extend({ 'locf_to_use': '(v.is_bad()).where(0, 1)'})..extend({ 'locf_tiebreaker': '_row_number()'}, partition_by=1, order_by=['g', 'o'])
   `v` ,
   `g` ,
   `o` ,
   CASE WHEN (`v` IS NULL OR IS_INF(`v`) OR (`v` != 0 AND `v` = -`v`)) THEN 0 ELSE 1 END AS `locf_to_use` ,
   ROW_NUMBER() OVER ( ORDER BY `g`, `o`  )  AS `locf_tiebreaker`
  FROM
   `d`
 ) ,
 `extend_2` AS (
  SELECT  -- .extend({ 'locf_non_null_rank': 'locf_to_use.cumsum()'}, partition_by=['g'], order_by=['o', 'locf_tiebreaker'])
   `v` ,
   `g` ,
   `o` ,
   SUM(`locf_to_use`) OVER ( PARTITION BY `g` ORDER BY `o`, `locf_tiebreaker`  )  AS `locf_non_null_rank`
  FROM
   `extend_1`
 ) ,
 `extend_4` AS (
  SELECT  -- .extend({ 'locf_non_null_rank': 'locf_to_use.cumsum()'}, partition_by=['g'], order_by=['o', 'locf_tiebreaker'])
   `v` ,
   `g` ,
 