The data algebra supports a number of ways for the
user to specify their own custom Pandas and SQL functions.

Let's look at a few examples.

First we import our packages and build some example data.

In [1]:

import sqlite3

import numpy
import pandas

from data_algebra.data_ops import *
import data_algebra.SQLite

# SQLite model: passed to ops.to_sql() below to render the pipeline as SQL.
db_model = data_algebra.SQLite.SQLiteModel()

# Example data, written one record per row: (ID, OP, DATE).
# DATE is kept as a plain 'YYYY-MM-DD HH:MM:SS' string.
d = pandas.DataFrame(
    data=[
        (1, 'A', '2001-01-02 00:00:00'),
        (1, 'B', '2015-04-25 00:00:00'),
        (2, 'A', '2000-04-01 00:00:00'),
        (3, 'D', '2014-04-07 00:00:00'),
        (4, 'C', '2012-12-01 00:00:00'),
        (4, 'A', '2005-06-16 00:00:00'),
        (4, 'D', '2009-01-20 00:00:00'),
        (4, 'B', '2009-01-20 00:00:00'),
        (5, 'A', '2010-10-10 00:00:00'),
        (5, 'B', '2003-11-09 00:00:00'),
        (6, 'B', '2004-01-09 00:00:00'),
    ],
    columns=['ID', 'OP', 'DATE'],
)

d


Unnamed: 0,ID,OP,DATE
0,1,A,2001-01-02 00:00:00
1,1,B,2015-04-25 00:00:00
2,2,A,2000-04-01 00:00:00
3,3,D,2014-04-07 00:00:00
4,4,C,2012-12-01 00:00:00
5,4,A,2005-06-16 00:00:00
6,4,D,2009-01-20 00:00:00
7,4,B,2009-01-20 00:00:00
8,5,A,2010-10-10 00:00:00
9,5,B,2003-11-09 00:00:00


In [2]:
# Describe table `d`, then add a `date_trimmed` column through a user-defined
# function: the lambda is the Pandas implementation, while `sql_name` and
# `sql_suffix` supply the SQL rendering, SUBSTR("DATE", 1, 10).
ops = (
    describe_table(d, table_name='d')
    .extend({
        'date_trimmed': user_fn(
            lambda x: x.str.slice(start=0, stop=10),
            args='DATE',
            name='trim_date_1_10',
            sql_name='SUBSTR',
            sql_suffix=', 1, 10',
        ),
    })
)

print(ops)

TableDescription(
 table_name='d',
 column_names=[
   'ID', 'OP', 'DATE']) .\
   extend({
    'date_trimmed': trim_date_1_10(DATE)})


In [3]:
# Run the pipeline directly on the Pandas DataFrame.
ops.transform(d)

Unnamed: 0,ID,OP,DATE,date_trimmed
0,1,A,2001-01-02 00:00:00,2001-01-02
1,1,B,2015-04-25 00:00:00,2015-04-25
2,2,A,2000-04-01 00:00:00,2000-04-01
3,3,D,2014-04-07 00:00:00,2014-04-07
4,4,C,2012-12-01 00:00:00,2012-12-01
5,4,A,2005-06-16 00:00:00,2005-06-16
6,4,D,2009-01-20 00:00:00,2009-01-20
7,4,B,2009-01-20 00:00:00,2009-01-20
8,5,A,2010-10-10 00:00:00,2010-10-10
9,5,B,2003-11-09 00:00:00,2003-11-09


In [4]:
# Translate the same pipeline into a SQL query string using the SQLite model.
q = ops.to_sql(db_model)
q

'SELECT SUBSTR("DATE", 1, 10) AS "date_trimmed", "DATE", "ID", "OP" FROM "d"'

In [5]:
# Open a throwaway in-memory SQLite database.
con = sqlite3.connect(':memory:')

In [6]:
# Load the example data into SQLite as table "d".
# index=False keeps the stored columns identical to those described to the
# data algebra (no extra "index" column); if_exists='replace' lets this cell be
# re-run without a "table already exists" error.
d.to_sql(name='d', con=con, index=False, if_exists='replace')

In [7]:
# Execute the generated SQL in the database and read the result back into Pandas.
res_db = pandas.read_sql(q, con=con)
res_db

Unnamed: 0,date_trimmed,DATE,ID,OP
0,2001-01-02,2001-01-02 00:00:00,1,A
1,2015-04-25,2015-04-25 00:00:00,1,B
2,2000-04-01,2000-04-01 00:00:00,2,A
3,2014-04-07,2014-04-07 00:00:00,3,D
4,2012-12-01,2012-12-01 00:00:00,4,C
5,2005-06-16,2005-06-16 00:00:00,4,A
6,2009-01-20,2009-01-20 00:00:00,4,D
7,2009-01-20,2009-01-20 00:00:00,4,B
8,2010-10-10,2010-10-10 00:00:00,5,A
9,2003-11-09,2003-11-09 00:00:00,5,B


In [8]:
# Release the database connection now that the example is finished.
con.close()
