# Thinking about the data frame class

# Main motivation

* This module will provide `dfply` functionality for a csv file read in with the `DictReader`

In [1]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    # Show which attributes the underlying csv reader object exposes.
    print(dir(reader.reader))
    columns = reader.fieldnames
    # Materialise the rows while the file is still open.
    l = list(reader)
l, columns

['__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'dialect', 'line_num']


([OrderedDict([('Salesperson', 'Ann'),
               ('Compact', '22'),
               ('Sedan', '18'),
               ('SUV', '15'),
               ('Truck', '12')]),
  OrderedDict([('Salesperson', 'Bob'),
               ('Compact', '19'),
               ('Sedan', '12'),
               ('SUV', '17'),
               ('Truck', '20')]),
  OrderedDict([('Salesperson', 'Yolanda'),
               ('Compact', '19'),
               ('Sedan', '8'),
               ('SUV', '32'),
               ('Truck', '15')]),
  OrderedDict([('Salesperson', 'Xerxes'),
               ('Compact', '12'),
               ('Sedan', '23'),
               ('SUV', '18'),
               ('Truck', '9')])],
 ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])

## Options

* Replace `dict` rows a la `csv.DictReader` with `PMap` or `Records` from `pyrsistent`
    * Advantages
        * Immutable and persistent
        * We can leverage some of the built-in features
            * Types and Invariants
            * Transformations
    * Disadvantages
        * Performance hit
        * Rowwise
* Create column generators + operations
    * Advantages
        * Makes column operations very easy
        * We can avoid Intention overhead
    * Disadvantages
        * Need to `itertools.tee` the original iterator

## Use a `PRecord` to store the Column index 

Each dataframe will have a `column` attribute with the following properties

* `all_columns`: an ordered vector of all column names
    * Used to perform set complement with the invert operator `~`
    * Used to refer to columns by index 
    * **TODO: do we want to store indexes in a dictionary? (probably premature optimization)**
* `current_set` a set of column labels
    * Used to create subsets
    * Will be returned by column fields in a dynamically created subclass

Note that this class acts like a set with the contents of `current_set`, and most `pset` methods/operators are available.

In [10]:
# Things I decided to remove/ignore
_set_methods_return_set = ['remove', 
                           'copy', 
                           'discard', 
                           'add']

In [2]:
from columns import Columns, make_columns
from pyrsistent import pset

In [5]:
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
cols = make_columns(columns)
cols

Column3(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), Compact=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Compact'])), Sedan=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Sedan'])), current_set=StrPSet(['Sedan', 'Truck', 'SUV', 'Compact', 'Salesperson']), SUV=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['SUV'])), Truck=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck'])), Salesperson=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Salesperson'])))

In [47]:
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
c1 = Columns(all_columns=columns, current_set=columns)
c2 = Columns(all_columns=columns, current_set=columns)
c3 = Columns(all_columns=columns, current_set=pset(['Truck','SUV']))

def check_membership(items, cols):
    """Return True when every element of `cols` is contained in `items`.

    An empty `cols` yields True, since there is nothing to reject.
    """
    for candidate in cols:
        if candidate not in items:
            return False
    return True
def check_equality(items, cols):
    """Return True when `items` and `cols` hold the same elements.

    Both arguments are converted to a `pset` first, so ordering and
    duplicates are ignored in the comparison.
    """
    expected = pset(items)
    actual = pset(cols)
    return expected == actual

# c1 and c2 are built with every column as their current set; c3 only with Truck and SUV.
# `~` is the complement with respect to `all_columns`.
assert check_equality(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV'], ~(c1 ^ c1))

# Inequality
assert not c1 != c1
assert not c1 != c2
assert c1 != c3

# Symmetric difference (^)
# NOTE(review): check_membership iterates over its second argument, so if
# c1 ^ c1 is empty (as the first assert implies) this check passes vacuously.
assert check_membership(['Compact', 'Salesperson', 'Sedan'], c1 ^ c1)
assert check_equality([],  c1 ^ c2)
assert check_equality(['Compact', 'Salesperson', 'Sedan'], c1 ^ c3)
assert check_equality(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV'], ~(c1 ^ c2))
assert check_equality(['Truck', 'SUV'], ~(c1 ^ c3))

# Intersection (&)
assert check_equality(columns, c1 & c1)
assert check_equality(columns, c1 & c2)
assert check_equality(['Truck', 'SUV'], c1 & c3)

# Difference (-)
assert check_equality([], c1 - c1)
assert check_equality([], c1 - c2)
assert check_equality(['Compact', 'Salesperson', 'Sedan'], c1 - c3)
assert check_equality(['Truck', 'SUV'], ~(c1 - c3))

In [19]:
# NOTE: this cell fails with the NameError recorded below because `column_fields`
# is not a defined name; the cells that follow import the private helper
# `_column_fields` from `columns` and repeat this check.
# NOTE(review): the visible cells only do `from pyrsistent import pset`, so the
# `pyrsistent._field_common._PField` lookup would presumably fail as well - confirm.
col_fields = column_fields(columns)
assert check_membership(columns, list(col_fields.keys()))
assert all(isinstance(v, pyrsistent._field_common._PField) for v in col_fields.values())

NameError: name 'column_fields' is not defined

In [33]:
from columns import _column_fields
col_fields = _column_fields(columns)
col_fields

{'Salesperson': <pyrsistent._field_common._PField at 0x1126b9780>,
 'Compact': <pyrsistent._field_common._PField at 0x1126b9fc0>,
 'Sedan': <pyrsistent._field_common._PField at 0x1126b9f10>,
 'SUV': <pyrsistent._field_common._PField at 0x1126b9e60>,
 'Truck': <pyrsistent._field_common._PField at 0x1126b9e08>}

In [34]:
from columns import _column_fields
from pyrsistent._field_common import _PField
def test_col_fields():
    """Check that `_column_fields` returns one `_PField` per known column name."""
    fields = _column_fields(columns)
    field_names = list(fields.keys())
    # Every generated field name must be one of the original column names.
    assert check_membership(columns, field_names)
    is_pfield = [isinstance(field, _PField) for field in fields.values()]
    assert all(is_pfield)
test_col_fields()

In [35]:
from columns import _make_columns_input
col_input = _make_columns_input(columns)
col_input

pmap({'all_columns': ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'], 'Sedan': 'Sedan', 'Compact': 'Compact', 'current_set': ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'], 'Truck': 'Truck', 'SUV': 'SUV', 'Salesperson': 'Salesperson'})

In [36]:
from columns import _make_columns_input
def test_col_input():
    """Check the mapping built by `_make_columns_input` for the module-level `columns`."""
    built = _make_columns_input(columns)
    # Both bookkeeping entries start out as the full column list.
    assert built['all_columns'] == columns
    assert built['current_set'] == columns
    # Each column name maps to itself.
    self_mapped = [built[name] == name for name in columns]
    assert all(self_mapped)
test_col_input()

In [42]:
from columns import make_columns
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
cols = make_columns(columns)
cols

Column5(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), Compact=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Compact'])), Sedan=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Sedan'])), current_set=StrPSet(['Sedan', 'Truck', 'SUV', 'Compact', 'Salesperson']), SUV=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['SUV'])), Truck=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck'])), Salesperson=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Salesperson'])))

In [40]:
cols.Salesperson 

Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Salesperson']))

In [50]:
from columns import make_columns
def test_columns_field():
    """Check that a single column field keeps all columns but selects only itself."""
    record = make_columns(columns)
    salesperson_all = record.Salesperson.all_columns
    assert check_membership(columns, salesperson_all)
    salesperson_current = record.Salesperson.current_set
    assert check_equality(['Salesperson'], salesperson_current)
test_columns_field()

In [41]:
cols.Truck 

Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck']))

In [37]:
cols.Salesperson + cols.Truck

Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck', 'Salesperson']))

In [38]:
~(cols.Salesperson + cols.Truck)

Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Compact', 'Sedan', 'SUV']))

In [57]:
def test_field_operations():
    """Check `+` and `~` on column fields: all_columns is kept, current_set changes."""
    record = make_columns(columns)
    combined = record.Salesperson + record.Truck
    complement = ~(record.Salesperson + record.Truck)
    # Neither operation may change the full column list.
    kept_all = [check_equality(columns, result.all_columns)
                for result in (combined, complement)]
    assert all(kept_all)
    assert check_equality(['Truck', 'Salesperson'], combined.current_set)
    assert check_equality(['Compact', 'Sedan', 'SUV'], complement.current_set)
test_field_operations()

## Intentions on column precord

Applying an `Intention` to a col record gives col names

In [58]:
from dfply import Intention
from dfply.base import _set_magic_method

# To make the invert operator, ~, work for complements
setattr(Intention, '__invert__', _set_magic_method('__invert__'))
X = Intention()

columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
cols = make_columns(columns)
cols

Column14(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), Compact=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Compact'])), Sedan=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Sedan'])), current_set=StrPSet(['Sedan', 'Truck', 'SUV', 'Compact', 'Salesperson']), SUV=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['SUV'])), Truck=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck'])), Salesperson=Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Salesperson'])))

In [59]:
e1 = X.SUV
e2 = X.Truck
e3 = e1 + e2
e4 = ~e3

In [60]:
a1 = e1.evaluate(cols)
a2 = e2.evaluate(cols)
a3 = e3.evaluate(cols)
a4 = e4.evaluate(cols)
a1, a2, a3, a4

(Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['SUV'])),
 Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck'])),
 Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Truck', 'SUV'])),
 Columns(all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']), current_set=StrPSet(['Compact', 'Sedan', 'Salesperson'])))

## Making a `Row` with type

In [2]:
from row import *

In [3]:
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
my_col_types = {'Salesperson':str, 
             'SUV':int, 
             'Truck':int}
my_col_types2 = freeze(my_col_types).update({'Sedan':int, 'Compact':int})
my_col_types2

pmap({'Compact': <class 'int'>, 'Truck': <class 'int'>, 'SUV': <class 'int'>, 'Sedan': <class 'int'>, 'Salesperson': <class 'str'>})

In [4]:
Row = make_row_class(columns, my_col_types)
rows1 = [Row(**row) for row in l]
Row = make_row_class(columns, my_col_types2)
rows2 = [Row(**row) for row in l]
rows2

[Row2(SUV=15, Compact=22, Salesperson='Ann', Truck=12, Sedan=18),
 Row2(SUV=17, Compact=19, Salesperson='Bob', Truck=20, Sedan=12),
 Row2(SUV=32, Compact=19, Salesperson='Yolanda', Truck=15, Sedan=8),
 Row2(SUV=18, Compact=12, Salesperson='Xerxes', Truck=9, Sedan=23)]

In [5]:
r1 = rows2[0]
r1._precord_fields['Salesperson'].type

{NoneType, str}

### This approach makes column arithmetic easy

In [6]:
[r.Salesperson.lower() for r in rows2]

['ann', 'bob', 'yolanda', 'xerxes']

## The adjust method gives us the ability to map functions onto a column

In [7]:
[r.adjust('Salesperson', lambda s: s.lower()) for r in rows2]

[Row2(SUV=15, Compact=22, Salesperson='ann', Truck=12, Sedan=18),
 Row2(SUV=17, Compact=19, Salesperson='bob', Truck=20, Sedan=12),
 Row2(SUV=32, Compact=19, Salesperson='yolanda', Truck=15, Sedan=8),
 Row2(SUV=18, Compact=12, Salesperson='xerxes', Truck=9, Sedan=23)]

## Embedding `r.adjust` in a lambda allows us to map over the data frame

In [8]:
row_func = lambda r: r.adjust('Salesperson', lambda s: s.lower())
list(map(row_func, rows2))

[Row2(SUV=15, Compact=22, Salesperson='ann', Truck=12, Sedan=18),
 Row2(SUV=17, Compact=19, Salesperson='bob', Truck=20, Sedan=12),
 Row2(SUV=32, Compact=19, Salesperson='yolanda', Truck=15, Sedan=8),
 Row2(SUV=18, Compact=12, Salesperson='xerxes', Truck=9, Sedan=23)]

## We can use `r.adjust_with` to apply many functions at once

In [9]:
funcs = {col:lambda x: 2*x for col in columns if col != 'Salesperson'}
[r.adjust_with(**funcs) for r in rows2]

[Row2(SUV=30, Compact=44, Salesperson='Ann', Truck=24, Sedan=36),
 Row2(SUV=34, Compact=38, Salesperson='Bob', Truck=40, Sedan=24),
 Row2(SUV=64, Compact=38, Salesperson='Yolanda', Truck=30, Sedan=16),
 Row2(SUV=36, Compact=24, Salesperson='Xerxes', Truck=18, Sedan=46)]

## Embedding `r.adjust_with` in a lambda allows us to map over the data frame

In [10]:
funcs = {col:lambda x: 2*x for col in columns if col != 'Salesperson'}
row_func2 = lambda r: r.adjust_with(**funcs)
list(map(row_func2, rows2))

[Row2(SUV=30, Compact=44, Salesperson='Ann', Truck=24, Sedan=36),
 Row2(SUV=34, Compact=38, Salesperson='Bob', Truck=40, Sedan=24),
 Row2(SUV=64, Compact=38, Salesperson='Yolanda', Truck=30, Sedan=16),
 Row2(SUV=36, Compact=24, Salesperson='Xerxes', Truck=18, Sedan=46)]

## Rows content can also be accessed using the standard dict indexing

In [11]:
[r['Truck'] + r['SUV'] for r in rows2]

[27, 37, 47, 27]

## The set method is another method of altering the content of a row

In [13]:
[r.set(Salesperson = r.Salesperson.lower(), SUV = r.SUV*2) for r in rows2]

[Row2(SUV=30, Compact=22, Salesperson='ann', Truck=12, Sedan=18),
 Row2(SUV=34, Compact=19, Salesperson='bob', Truck=20, Sedan=12),
 Row2(SUV=64, Compact=19, Salesperson='yolanda', Truck=15, Sedan=8),
 Row2(SUV=36, Compact=12, Salesperson='xerxes', Truck=9, Sedan=23)]

## Using this approach makes it harder to make new columns -- We will need to make a new row class when mutating or selecting

In [14]:
[r.set(Utility = r.Truck + r.SUV) for r in rows2]

AttributeError: 'Utility' is not among the specified fields for Row2

## Use `alter` and `alter_with` to create new columns

**Note that we lose the Row structure, which will need to be applied in the data frame.**

In [15]:
r2 = r1.alter('Utility', lambda r: r.Truck + r.SUV, new_type = int)
r2

pmap({'Utility': 27, 'SUV': 15, 'Salesperson': 'Ann', 'Compact': 22, 'Truck': 12, 'Sedan': 18})

In [18]:
row_func3 = lambda row: row.alter('Utility', lambda r: r.Truck + r.SUV, new_type = int)
list(map(row_func3, rows2))

[pmap({'Utility': 27, 'SUV': 15, 'Salesperson': 'Ann', 'Compact': 22, 'Truck': 12, 'Sedan': 18}),
 pmap({'Utility': 37, 'SUV': 17, 'Salesperson': 'Bob', 'Compact': 19, 'Truck': 20, 'Sedan': 12}),
 pmap({'Utility': 47, 'SUV': 32, 'Salesperson': 'Yolanda', 'Compact': 19, 'Truck': 15, 'Sedan': 8}),
 pmap({'Utility': 27, 'SUV': 18, 'Salesperson': 'Xerxes', 'Compact': 12, 'Truck': 9, 'Sedan': 23})]

In [16]:
r3 = r1.alter_with(Utility = lambda r: r.Truck + r.SUV, Car = lambda r: r.Sedan + r.Compact)
r3

pmap({'Salesperson': 'Ann', 'Sedan': 18, 'SUV': 15, 'Utility': 27, 'Compact': 22, 'Car': 40, 'Truck': 12})

In [20]:
row_func4 = lambda row: row.alter_with(Utility = lambda r: r.Truck + r.SUV, Car = lambda r: r.Sedan + r.Compact)
list(map(row_func4, rows2))

[pmap({'Salesperson': 'Ann', 'Sedan': 18, 'SUV': 15, 'Utility': 27, 'Compact': 22, 'Car': 40, 'Truck': 12}),
 pmap({'Salesperson': 'Bob', 'Sedan': 12, 'SUV': 17, 'Utility': 37, 'Compact': 19, 'Car': 31, 'Truck': 20}),
 pmap({'Salesperson': 'Yolanda', 'Sedan': 8, 'SUV': 32, 'Utility': 47, 'Compact': 19, 'Car': 27, 'Truck': 15}),
 pmap({'Salesperson': 'Xerxes', 'Sedan': 23, 'SUV': 18, 'Utility': 27, 'Compact': 12, 'Car': 35, 'Truck': 9})]

In [22]:
funcs2 = {'Utility': lambda r: r.Truck + r.SUV, 
          'Car':lambda r: r.Sedan + r.Compact}
row_func4 = lambda row: row.alter_with(**funcs2)
list(map(row_func4, rows2))

[pmap({'Salesperson': 'Ann', 'Sedan': 18, 'SUV': 15, 'Utility': 27, 'Compact': 22, 'Car': 40, 'Truck': 12}),
 pmap({'Salesperson': 'Bob', 'Sedan': 12, 'SUV': 17, 'Utility': 37, 'Compact': 19, 'Car': 31, 'Truck': 20}),
 pmap({'Salesperson': 'Yolanda', 'Sedan': 8, 'SUV': 32, 'Utility': 47, 'Compact': 19, 'Car': 27, 'Truck': 15}),
 pmap({'Salesperson': 'Xerxes', 'Sedan': 23, 'SUV': 18, 'Utility': 27, 'Compact': 12, 'Car': 35, 'Truck': 9})]

## Making generators for each row

In [821]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    # Materialise the raw (all-string) rows while the file is still open.
    out_raw = list(reader)
out_raw

[OrderedDict([('Salesperson', 'Ann'),
              ('Compact', '22'),
              ('Sedan', '18'),
              ('SUV', '15'),
              ('Truck', '12')]),
 OrderedDict([('Salesperson', 'Bob'),
              ('Compact', '19'),
              ('Sedan', '12'),
              ('SUV', '17'),
              ('Truck', '20')]),
 OrderedDict([('Salesperson', 'Yolanda'),
              ('Compact', '19'),
              ('Sedan', '8'),
              ('SUV', '32'),
              ('Truck', '15')]),
 OrderedDict([('Salesperson', 'Xerxes'),
              ('Compact', '12'),
              ('Sedan', '23'),
              ('SUV', '18'),
              ('Truck', '9')])]

In [852]:
from toolz import peek
from collections import deque
from functools import reduce

class DataFrame(object):
    """Single-pass stream of typed rows built lazily from an iterator of dicts.

    Raw rows are pulled from the source on demand, run through the mutator
    chain and converted to `self.Row`. The frame is its own iterator
    (`__iter__` returns `self`), so it can be consumed only once.

    NOTE(review): relies on `make_column_record` and `makeRow`, which are not
    defined in this cell - presumably they come from the project's column/row
    helpers; confirm they are in scope before running.
    """

    def __init__(self,
                 iter_of_dict,
                 col_type_dict=None,
                 cache=None,
                 mutators=(lambda x: x,)):
        """Set up the stream and derive the column record and Row class.

        Parameters
        ----------
        iter_of_dict : iterator of dict
            Source of raw rows; must support `next()` (e.g. a `csv.DictReader`).
        col_type_dict : dict, optional
            Column types handed to `makeRow`. Defaults to a new empty dict.
        cache : iterable of dict, optional
            Raw rows to serve before anything is pulled from `iter_of_dict`.
        mutators : iterable of callables, optional
            Functions applied to each raw row in order (oldest first).

        Raises
        ------
        ValueError
            If neither `cache` nor `iter_of_dict` can supply a first row.
        """
        self.rows = iter_of_dict
        # Build the fallback here: a `{}` default in the signature would be a
        # single object shared by every DataFrame created without the argument.
        self.col_type_dict = {} if col_type_dict is None else col_type_dict
        # Cache holds raw unprocessed rows
        self._cache = deque(() if cache is None else cache)
        if not self._cache:
            # Nothing cached yet: look ahead one raw row and keep it for iteration.
            try:
                self._cache.append(next(self.rows))
            except StopIteration:
                raise ValueError("There needs to be at least one row (in either iter_of_dict or cache)")
        first = self._cache[0]
        # Normalise to a tuple so the concatenations below work for any iterable.
        self._mutators = tuple(mutators)
        # Apply the mutators then determine the Row type
        # DANGER!!! This is too dangerous, need to default to string.  See Row method
        mut_first = self._apply_mutators(first)
        fieldnames = list(mut_first.keys())
        self.col_record = make_column_record(fieldnames)
        self.Row = makeRow(fieldnames, self.col_type_dict)
        # Make sure that the last action is to convert to a Row
        self._mutators = self._mutators + (lambda r: self.Row(**r), )

    def _apply_mutators(self, row):
        """Thread `row` through every mutator in order and return the result."""
        # Note the newest mutator needs to be last
        return reduce(lambda acc, f: f(acc), self._mutators, row)

    def __iter__(self):
        """Return `self`; the frame is a one-shot iterator."""
        return self

    def __next__(self):
        """Return the next processed row, serving cached raw rows first.

        StopIteration from the underlying source propagates and ends iteration.
        """
        raw = self._cache.popleft() if self._cache else next(self.rows)
        # The mutator chain already ends with a Row conversion; the outer
        # conversion is kept exactly as before.
        return self.Row(**self._apply_mutators(raw))

    def __contains__(self, key):
        """Membership test, delegated to the column record."""
        return key in self.col_record

    def _mutated_stream(self, row_mutator):
        """Return a new DataFrame that also applies `row_mutator` to each row.

        The new column types are inferred by running `row_mutator` on the
        first pending row. The returned frame receives the same underlying
        row iterator and a copy of the cached raw rows, so only one of the
        two frames should be consumed.

        Raises
        ------
        ValueError
            If no row is left to infer the output types from.
        """
        if not self._cache:
            # The cache was drained by iteration; look ahead one raw row so
            # the example below has something to work with (it stays cached).
            try:
                self._cache.append(next(self.rows))
            except StopIteration:
                raise ValueError("Cannot mutate an exhausted DataFrame: no rows left to infer column types from")
        example_current_row = self.Row(**self._apply_mutators(self._cache[0]))
        example_output = row_mutator(example_current_row)
        new_col_types = {col: type(val) for col, val in example_output.items()}
        # Newest mutator needs to be last to be applied last
        new_mutators = self._mutators + (row_mutator, )
        return DataFrame(self.rows, col_type_dict=new_col_types, cache=self._cache, mutators=new_mutators)

In [860]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    # The frame is lazy, so it has to be consumed while the file is still open.
    out1 = list(df)
out1

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

## Example mutate all ints

In [854]:
def example_row_mutator(row):
    """Return a dict copy of `row` in which every `int` value is doubled.

    Values of any other type are carried over unchanged.
    """
    doubled = {}
    for name, value in row.items():
        if isinstance(value, int):
            value = 2*value
        doubled[name] = value
    return doubled

In [861]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    # Derive a second frame that doubles every int column.
    df2 = df._mutated_stream(example_row_mutator)
    # The frame is lazy, so it has to be consumed while the file is still open.
    out = list(df2)
out

[Row(Salesperson='Ann', SUV=30, Truck=24, Sedan=36, Compact=44),
 Row(Salesperson='Bob', SUV=34, Truck=40, Sedan=24, Compact=38),
 Row(Salesperson='Yolanda', SUV=64, Truck=30, Sedan=16, Compact=38),
 Row(Salesperson='Xerxes', SUV=36, Truck=18, Sedan=46, Compact=24)]

## Example - types change automatically  

In [856]:
def example_row_mutator2(row):
    """Return a dict copy of `row` in which every `int` value is multiplied by 2.0.

    The multiplication turns those values into floats; values of any other
    type are carried over unchanged.
    """
    scaled = {}
    for name, value in row.items():
        if isinstance(value, int):
            value = 2.0*value
        scaled[name] = value
    return scaled

In [862]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    # Derive a second frame whose int columns become floats.
    df2 = df._mutated_stream(example_row_mutator2)
    # The frame is lazy, so it has to be consumed while the file is still open.
    for row in df2:
        print(row)

Row(Salesperson='Ann', SUV=30.0, Truck=24.0, Sedan=36.0, Compact=44.0)
Row(Salesperson='Bob', SUV=34.0, Truck=40.0, Sedan=24.0, Compact=38.0)
Row(Salesperson='Yolanda', SUV=64.0, Truck=30.0, Sedan=16.0, Compact=38.0)
Row(Salesperson='Xerxes', SUV=36.0, Truck=18.0, Sedan=46.0, Compact=24.0)


## Example select

In [858]:
def example_row_mutator3(row):
    """Return a dict copy of `row` without the 'Truck' and 'SUV' columns."""
    excluded = ['Truck', 'SUV']
    kept = {}
    for col, val in row.items():
        if col in excluded:
            continue
        kept[col] = val
    return kept

In [866]:
from csv import DictReader, Sniffer
# The csv module documentation asks for files to be opened with newline=''
# so that the reader, not the text layer, decides how line endings are handled.
with open('./data/auto_sales.csv', newline='') as csvfile:
    # Sniff the dialect from the first 1024 characters, then rewind before parsing.
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    # Derive a frame without the Truck and SUV columns.
    df3 = df._mutated_stream(example_row_mutator3)
    # The column record reflects the reduced column set; the cache still holds the raw row.
    print(df3.col_record)
    print(df3._cache)
    # The frame is lazy, so it has to be consumed while the file is still open.
    for row in df3:
        print(row)

Columns(Salesperson='Salesperson', Sedan='Sedan', columns=StrPSet(['Salesperson', 'Sedan', 'Compact']), Compact='Compact')
deque([OrderedDict([('Salesperson', 'Ann'), ('Compact', '22'), ('Sedan', '18'), ('SUV', '15'), ('Truck', '12')])])
Row(Salesperson='Ann', Sedan=18, Compact=22)
Row(Salesperson='Bob', Sedan=12, Compact=19)
Row(Salesperson='Yolanda', Sedan=8, Compact=19)
Row(Salesperson='Xerxes', Sedan=23, Compact=12)


## Intention Exploration

Applying to rows gives the values and evaluates expressions

In [881]:
from dfply import X

i = X.Truck + X.Salesperson

In [882]:
rows

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

In [883]:
r1 = rows[0]
r1

Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22)

In [884]:
i.evaluate(r1)

NotImplemented

In [885]:
i(r1)

<dfply.base.Intention at 0x119348470>

In [874]:
i

<dfply.base.Intention at 0x119348be0>

In [886]:
e1 = X.SUV
e2 = X.Truck
e3 = e1 + e2

In [887]:
a1 = e1.evaluate(r1)
a2 = e2.evaluate(r1)
a3 = e3.evaluate(r1)
a1, a2, a3

(15, 12, 27)

In [888]:
e4 = X.Salesperson.lower()
a4 = e4.evaluate(r1)
a4

'ann'

In [889]:
e6 = X.Truck > X.SUV
a6 = e6.evaluate(r1)
a6

False

In [890]:
[e6.evaluate(r) for r in rows]

[False, True, False, False]

In [891]:
e7 = abs(X.Truck)
e7.evaluate(rows2[0])

12

## Monad stuff

In [424]:
!pip install oslash



### Cont

In [411]:
from oslash.cont import Cont
from oslash.util import identity, compose


# pure = Cont.pure
unit = Cont.unit
call_cc = Cont.call_cc

In [416]:
add = lambda x, y: unit(x + y)
square = lambda x: unit(x * x)

pythagoras = lambda x, y: square(x) | (lambda xx: (
                          square(y) | (lambda yy: 
                          add(xx, yy))))

In [417]:
pythagoras(3,4)(identity)

25

In [418]:
pythagoras(4,4)(identity)

32

### Maybe

In [425]:
from oslash.maybe import Maybe, Just, Nothing
from oslash.util import identity, compose, fmap

pure = Just.pure
unit = Just.unit

In [427]:
f = lambda x: x * 2
x = Just(21)

x.map(f), Just(42), x.map(f) == Just(42)

(Just 42, Just 42, True)

In [102]:
str(None)

'None'