# Thinking about the data frame class

## Options

* Replace `dict` rows a la `csv.DictReader` with `PMap` or `Records` from `pyrsistent`
    * Advantages
        * Immutable and persistent
        * We can leverage some of the built-in features
            * Types and Invariants
            * Transformations
    * Disadvantages
        * Performance hit
        * Rowwise
* Create column generators + operations
    * Advantages
        * Makes columns operations very easy
        * We can avoid Intention overhead
    * Disadvantages
        * Need to `intertools.tee` the original iterator

# Examples

In [5]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    columns = reader.fieldnames
    l = list(reader)
l, columns

([OrderedDict([('Salesperson', 'Ann'),
               ('Compact', '22'),
               ('Sedan', '18'),
               ('SUV', '15'),
               ('Truck', '12')]),
  OrderedDict([('Salesperson', 'Bob'),
               ('Compact', '19'),
               ('Sedan', '12'),
               ('SUV', '17'),
               ('Truck', '20')]),
  OrderedDict([('Salesperson', 'Yolanda'),
               ('Compact', '19'),
               ('Sedan', '8'),
               ('SUV', '32'),
               ('Truck', '15')]),
  OrderedDict([('Salesperson', 'Xerxes'),
               ('Compact', '12'),
               ('Sedan', '23'),
               ('SUV', '18'),
               ('Truck', '9')])],
 ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])

In [15]:
from pyrsistent import freeze, pmap

In [16]:
list(map(pmap, l))

[pmap({'Compact': '22', 'Sedan': '18', 'Salesperson': 'Ann', 'Truck': '12', 'SUV': '15'}),
 pmap({'Compact': '19', 'Sedan': '12', 'Salesperson': 'Bob', 'Truck': '20', 'SUV': '17'}),
 pmap({'Compact': '19', 'Sedan': '8', 'Salesperson': 'Yolanda', 'Truck': '15', 'SUV': '32'}),
 pmap({'Compact': '12', 'Sedan': '23', 'Salesperson': 'Xerxes', 'Truck': '9', 'SUV': '18'})]

In [17]:
r = pmap(l[0])
r2 = r.discard('Truck')

In [18]:
'class'.isidentifier()

True

In [19]:
from pyrsistent import PRecord, field

In [20]:
from pyrsistent import PRecord, field
class ARecord(PRecord):
    x = field()

r = ARecord(x=3)
r

ARecord(x=3)

In [21]:
from pyrsistent import PRecord, field

ARecord = type('ARecord', (PRecord,), {'x':field()})
r = ARecord(x=3)
r

ARecord(x=3)

## Testing basic functionality of rows as `PRecords`

In [9]:
from pyrsistent import ny
col_types = [str, int, int, int]
Row = type('Row', (PRecord,), {name:field(initial=None) for name in columns if name.isidentifier()})

?Row.transform(['Salesperson', ny], lambda s: s.lower())

In [10]:
r1 = Row(**l[0])
rows = list(map(lambda r: Row(**r), l))
rows

[Row(Compact='22', Truck='12', SUV='15', Salesperson='Ann', Sedan='18'),
 Row(Compact='19', Truck='20', SUV='17', Salesperson='Bob', Sedan='12'),
 Row(Compact='19', Truck='15', SUV='32', Salesperson='Yolanda', Sedan='8'),
 Row(Compact='12', Truck='9', SUV='18', Salesperson='Xerxes', Sedan='23')]

In [11]:
Row(**{'a':5, 'b':2, 'Salesperson':'Bob'})

AttributeError: 'a' is not among the specified fields for Row

In [12]:
int('a')

ValueError: invalid literal for int() with base 10: 'a'

In [97]:
str.__name__

'str'

## Making a `Row` with type

In [52]:
from pyrsistent import PRecord, field, freeze, pmap



def col_factory(type_constructor):
    if type_constructor.__name__ == 'str':
        return lambda val: val if len(val) > 0 else None
    else:
        def factory(val):
            try:
                return type_constructor(val)
            except ValueError:
                return None
        return factory

def get_col_types(names, col_type_dict):
    return [col_type_dict.get(name, str) for name in names]

def get_field(col_type, **kwargs):
    return field(type=(col_type, type(None)),
                 factory = col_factory(col_type),
                 initial = None,
                 **kwargs)
    
def columns_and_types(names, col_type_dict):
    return zip(names, get_col_types(names, col_type_dict))

def row_fields(names, col_type_dict, kwarg_dict={}):
    return {name:get_field(col_type, **kwarg_dict.get(name, {})) 
            for name, col_type in columns_and_types(names, col_type_dict) 
            if name.isidentifier()}

# Filter by keys, useful for select
def filter_method(self, pred):
    out = self
    for key in self:
        if not pred(key):
            out =  out.discard(key)
    return out
            
# Method for mapping a function to type A to a Maybe A (i.e. deal with None's)
def alter(self, f, k):
    out = self.get(k, None)
    if out is None:
        return self.set(**{k:None})
    else:
        try:
            return self.set(**{k:f(out)})
        except:
            return self.set(**{k:None})

def alter_with_dict(self, dict_of_func):
    
    return self.set()
        
def map(self, f):
    out = self
    for key in self:
        out = out.set(**{k:f(self[k])})
    return out

def makeRow(names, col_type_dict={}, field_kwargs = {'mandatory':True}):
    return type('Row', (PRecord,), row_fields(names, col_type_dict, kwarg_dict=field_kwargs))

def makeRowFromDict(d, col_types={}, **kwargs):
    names = list(d.keys())
    if hasattr
    col_type_dict = get_col_types
    fields = row_fields(names, col_type_dict, **kwargs)
    return type('Row', (PRecord, ), fields)(**d)

In [53]:
makeRowFromDict({'Truck': 12, 'Utility': 27, 'Sedan': 18, 'SUV': 15, 'Compact': 22, 'Salesperson': 'Ann'})

TypeError: object of type 'int' has no len()

In [30]:
my_col_types = {'Salesperson':str, 
             'SUV':int, 
             'Truck':int}
my_col_types2 = freeze(my_col_types).update({'Sedan':int, 'Compact':int})
my_col_types2

pmap({'Compact': <class 'int'>, 'Sedan': <class 'int'>, 'Truck': <class 'int'>, 'SUV': <class 'int'>, 'Salesperson': <class 'str'>})

In [16]:
Row = makeRow(columns, my_col_types)
rows1 = [Row(**row) for row in l]
rows1

[Row(Truck=12, Compact='22', SUV=15, Salesperson='Ann', Sedan='18'),
 Row(Truck=20, Compact='19', SUV=17, Salesperson='Bob', Sedan='12'),
 Row(Truck=15, Compact='19', SUV=32, Salesperson='Yolanda', Sedan='8'),
 Row(Truck=9, Compact='12', SUV=18, Salesperson='Xerxes', Sedan='23')]

In [19]:
Row = makeRow(columns, my_col_types2)
rows2 = [Row(**row) for row in l]
rows2

[Row(Truck=12, Compact=22, SUV=15, Salesperson='Ann', Sedan=18),
 Row(Truck=20, Compact=19, SUV=17, Salesperson='Bob', Sedan=12),
 Row(Truck=15, Compact=19, SUV=32, Salesperson='Yolanda', Sedan=8),
 Row(Truck=9, Compact=12, SUV=18, Salesperson='Xerxes', Sedan=23)]

In [55]:
r1 = rows2[0]
r1

Row(Truck=12, Compact=22, SUV=15, Salesperson='Ann', Sedan=18)

In [58]:
pmap(r1._precord_fields)

pmap({'Truck': <pyrsistent._field_common._PField object at 0x11767f410>, 'Sedan': <pyrsistent._field_common._PField object at 0x11767f150>, 'SUV': <pyrsistent._field_common._PField object at 0x11767f518>, 'Compact': <pyrsistent._field_common._PField object at 0x11767f468>, 'Salesperson': <pyrsistent._field_common._PField object at 0x11063b0f8>})

In [59]:
pmap(r1._precord_initial_values)

pmap({'Truck': None, 'Sedan': None, 'SUV': None, 'Compact': None, 'Salesperson': None})

In [61]:
pmap(r1._precord_invariants)

pmap({})

In [62]:
pmap(r1._precord_mandatory_fields)

pmap({})

## Note Map.mapWithKey is just set with a dict comp for kwargs 

In [36]:
r1.set(**{k:2*v for k, v in r1.items() if k != 'Salesperson'})

Row(Truck=24, Compact=44, SUV=30, Salesperson='Ann', Sedan=36)

In [32]:
?r1.update

In [49]:
pmap(r1).update({'Utility':r1.Truck + r1.SUV})

pmap({'Truck': 12, 'Utility': 27, 'Sedan': 18, 'SUV': 15, 'Compact': 22, 'Salesperson': 'Ann'})

In [50]:
r1.update({'Truck': 12, 'Utility': 27, 'Sedan': 18, 'SUV': 15, 'Compact': 22, 'Salesperson': 'Ann'})

AttributeError: 'Utility' is not among the specified fields for Row

['_Evolver',
 '__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_buckets',
 '_cached_hash',
 '_contains',
 '_get_bucket',
 '_getitem',
 '_precord_fields',
 '_precord_initial_values',
 '_precord_invariants',
 '_precord_mandatory_fields',
 '_size',
 'copy',
 'create',
 'discard',
 'evolver',
 'get',
 'items',
 'iteritems',
 'iterkeys',
 'itervalues',
 'keys',
 'remove',
 'serialize',
 'set',
 'transform',
 'update',
 'update_with',
 'values']

In [41]:
e1 = r1.evolver()

In [43]:
e1.set(Utility = r1.Truck + r1.Utility)

AttributeError: Row has no attribute 'Utility'

## Use a `PRecord` to store the Column index 

In [None]:
Columns = type('Columns', (PRecord,) {'columns': })

## Initial Testing with `column` (universe) and `set` (current collection)

In [260]:
from pyrsistent import pset, pset_field, pvector_field, PRecord

_COMPARISON_METHODS = ['__ne__',
                       '__ge__', 
                       '__lt__', 
                       '__eq__', 
                       '__gt__', 
                       'issuperset', 
                       'issubset',
                       '__contains__', 
                       'isdisjoint',
                       '__le__']

_UNARY_OPERATIONS = ['__len__', 
                     '__sizeof__'] 

_SET_OPERATIONS = ['__and__', 
                   '__sub__', 
                   '__or__',  
                   '__xor__', 
                   'union', 
                   'intersection', 
                   'difference', 
                   'symmetric_difference', 
                   'update']


def wrap_comparison_method(method_name):
    def magic_method(self, other):
        if isinstance(other, PRecord) and hasattr(other, 'current_set'): 
            return getattr(self.current_set, method_name)(other.current_set)
        else:
            return getattr(self.current_set, method_name)(other)
    return magic_method


def wrap_unary_method(method_name):
    def magic_method(self):
        return getattr(self.current_set, method_name)()
    return magic_method


def wrap_set_operations(method_name):
    def magic_method(self, other):
        if isinstance(other, PRecord) and hasattr(other, 'current_set'): 
            current_set = getattr(self.current_set, method_name)(other.current_set)
        else:
            current_set = getattr(self.current_set, method_name)(other)
        return self.__class__(all_columns = self.all_columns, current_set=current_set)
    return magic_method


def column_invert(self):
    current_set = pset(self.all_columns) - self.current_set
    return self.__class__(all_columns=self.all_columns, current_set=current_set)

def column_iter(self):
    for item in self.current_set:
        yield item

class Columns(PRecord):
    all_columns = pvector_field(str)
    current_set = pset_field(str)
    
    
for method in _COMPARISON_METHODS:
    setattr(Columns, method, wrap_comparison_method(method))

for method in _UNARY_OPERATIONS:
    setattr(Columns, method, wrap_unary_method(method))
    
for method in _SET_OPERATIONS:
    setattr(Columns, method, wrap_set_operations(method))
    
setattr(Columns, '__invert__', column_invert)
setattr(Columns, '__iter__', lambda self: self.current_set.__iter__())

In [261]:
# Things I decided to remove
_set_methods_return_set = ['remove', 
                           'copy', 
                           'discard', 
                           'add']

In [288]:
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
c1 = Columns(all_columns=columns, current_set=columns)
c2 = Columns(all_columns=columns, current_set=columns)
c3 = Columns(all_columns=columns, current_set=pset(['Truck','SUV']))

def check_membership(items, cols):
    return all(i in items for i in cols)
def check_equality(items, cols):
    return pset(items) == cols

assert check_equality(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV'], ~(c1 ^ c1))
assert not c1 != c1
assert not c1 != c2
assert c1 != c3
assert check_membership(['Compact', 'Salesperson', 'Sedan'], c1 ^ c1)
assert check_equality([],  c1 ^ c2)
assert check_equality(['Compact', 'Salesperson', 'Sedan'], c1 ^ c3)
assert check_equality(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV'], ~(c1 ^ c2))
assert check_equality(['Truck', 'SUV'], ~(c1 ^ c3))
assert check_equality(columns, c1 & c1)
assert check_equality(columns, c1 & c2)
assert check_equality(['Truck', 'SUV'], c1 & c3)
assert check_equality([], c1 - c1)
assert check_equality([], c1 - c2)
assert check_equality(['Compact', 'Salesperson', 'Sedan'], c1 - c3)
assert check_equality(['Truck', 'SUV'], ~(c1 - c3))

In [291]:
def make_unique_column_name():
    n = 0
    def col_maker():
        nonlocal n
        n += 1
        return 'Column' + str(n)
    return col_maker

In [293]:
col_name = make_unique_column_name()
for i in range(5):
    print(col_name())


Column1
Column2
Column3
Column4
Column5


In [294]:

def make_current_column_class():
    col_names = make_unique_column_name()
    def column_class_maker(columns):
        cls_name = col_names()
        return 
    

In [336]:
def column_fields(columns):
    return {n:field(factory=lambda s: Columns(all_columns=columns, current_set=pset([s]))) for n in columns if n.isidentifier()}


def make_column_type(columns):
    return type(col_name(), (Columns, ), column_fields(columns))

In [337]:
cols = column_fields(columns)
cols

{'Salesperson': <pyrsistent._field_common._PField at 0x117bdeaf0>,
 'Compact': <pyrsistent._field_common._PField at 0x117bdeeb8>,
 'Sedan': <pyrsistent._field_common._PField at 0x117bac258>,
 'SUV': <pyrsistent._field_common._PField at 0x117bac200>,
 'Truck': <pyrsistent._field_common._PField at 0x117bac1a8>}

In [338]:
?pmap({'a':1, 'b':2}).update_with

In [339]:
def make_columns_input(columns):
    output = pmap({'all_columns':columns,
                   'current_set':columns})
    return output.update({n:n for n in columns})
make_columns_input(columns)

pmap({'Compact': 'Compact', 'Truck': 'Truck', 'SUV': 'SUV', 'current_set': ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'], 'Salesperson': 'Salesperson', 'Sedan': 'Sedan', 'all_columns': ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']})

In [342]:
def make_columns(columns):
    return make_column_type(columns)(**make_columns_input(columns))
cols = make_columns(columns)
cols

Column18(Compact=Columns(current_set=StrPSet(['Compact']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), Truck=Columns(current_set=StrPSet(['Truck']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), SUV=Columns(current_set=StrPSet(['SUV']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), current_set=StrPSet(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV']), Salesperson=Columns(current_set=StrPSet(['Salesperson']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), Sedan=Columns(current_set=StrPSet(['Sedan']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']))

In [345]:
cols.Salesperson + cols.Truck

Columns(current_set=StrPSet(['Truck', 'Salesperson']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']))

In [346]:
~(cols.Salesperson + cols.Truck)

Columns(current_set=StrPSet(['Compact', 'SUV', 'Sedan']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']))

## Intensions on column precord

Applying an col record gives col names

In [431]:
from dfply import Intention
from dfply.base import _set_magic_method

setattr(Intention, '__invert__', _set_magic_method('__invert__'))
columns = ['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']
cols = make_columns(columns)
cols

Column23(Compact=Columns(current_set=StrPSet(['Compact']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), Truck=Columns(current_set=StrPSet(['Truck']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), SUV=Columns(current_set=StrPSet(['SUV']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), current_set=StrPSet(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV']), Salesperson=Columns(current_set=StrPSet(['Salesperson']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), Sedan=Columns(current_set=StrPSet(['Sedan']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck']))

In [434]:
e1 = X.SUV
e2 = X.Truck
e3 = e1 + e2
e4 = ~e3

## <font color=red>TODO: Need to add `__invert__` to Intension</font>

In [435]:
a1 = e1.evaluate(cols)
a2 = e2.evaluate(cols)
a3 = e3.evaluate(cols)
a4 = e4.evaluate(cols)
a1, a2, a3, a4

(Columns(current_set=StrPSet(['SUV']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])),
 Columns(current_set=StrPSet(['Truck']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])),
 Columns(current_set=StrPSet(['Truck', 'SUV']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])),
 Columns(current_set=StrPSet(['Compact', 'Salesperson', 'Sedan']), all_columns=StrPVector(['Salesperson', 'Compact', 'Sedan', 'SUV', 'Truck'])))

## Old version that returned strings

In [30]:
def wrapped_tuples(method_names, wrapper):
    return tuple((name, wrapper(name)) for name in method_names)

    
def wrap_method(method_name):
    """Makes a magic method that defers to the columns field"""
    def magic_method(self, *args, **kwargs):
        return getattr(self.columns, method_name)(*args, **kwargs)
    return magic_method


_other_set_methods = ['__len__', 
                      '__contains__', 
                      '__sizeof__', 
                      'issuperset', 
                      'isdisjoint', 
                      'issubset',
                      'union', 
                      'remove', 
                      'intersection', 
                      'difference', 
                      'copy', 
                      'discard', 
                      'symmetric_difference', 
                      'update',
                      'add']

_set_methods = (wrapped_tuples(_comparison_methods, wrap_comparison_method) + 
                wrapped_tuples(_other_set_methods, wrap_method) +
               (('negate', lambda self, *args: self.columns - pset(args)), ))

In [36]:
def make_Columns_class(columns):
    return type('Columns', (PRecord,), {'all_columns':pset_field(str), 'current_set':pset_field(str)})

In [46]:
import pyrsistent
Cols = make_Columns_class(columns)
cols = Cols(all_columns = columns, current_set = columns)
cols

Columns(current_set=StrPSet(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV']), all_columns=StrPSet(['Compact', 'Truck', 'Salesperson', 'Sedan', 'SUV']))

In [None]:
def col_field(cls, name):
    def method(self):

In [28]:
fields = col_fields(columns)
fields

(('Salesperson', <pyrsistent._field_common._PField at 0x117abb410>),
 ('Compact', <pyrsistent._field_common._PField at 0x117abb048>),
 ('Sedan', <pyrsistent._field_common._PField at 0x117abb0f8>),
 ('SUV', <pyrsistent._field_common._PField at 0x117abb150>),
 ('Truck', <pyrsistent._field_common._PField at 0x117abb1a8>),
 ('columns', <pyrsistent._field_common._PField at 0x117abb468>),
 ('set', <pyrsistent._field_common._PField at 0x117abb258>))

## Initial Testing

In [22]:
from pyrsistent import pset, pset_field

def col_fields(columns):
    return tuple((name,field(str)) for name in columns) + (('columns', pset_field(str)),)



# TODO: Think about how to return a new columns object


In [577]:
_other_set_methods = ['__len__', 
                      '__ne__',
                      '__ge__', 
                      '__lt__', 
                      '__eq__', 
                      'issuperset', 
                      '__or__', 
                      '__contains__', 
                      '__sizeof__', 
                      'isdisjoint', 
                      'issubset',
                      '__gt__', 
                      '__le__']

_set_methods_return_set = ['__and__', 
                           '__sub__', 
                           'union', 
                           'remove', 
                           '__or__',  
                           '__xor__', 
                           'intersection', 
                           'difference', 
                           'copy', 
                           'discard', 
                           'symmetric_difference', 
                           'update',
                           'add']


_other_set_methods_wrapped = wrapped_tuples(_other_set_methods) 
_set_methods = tuple((name, wrap_method(name)) for name in _set_methods_return_set + _other_set_methods)
_set_methods = _set_methods + (('negate', lambda self, *args: self.columns - pset(args)), )

TypeError: wrapped_tuples() missing 1 required positional argument: 'wrapper'

In [473]:
_comparison_methods = ['__ne__',
                      '__ge__', 
                      '__lt__', 
                      '__eq__', 
                      '__or__', 
                      '__gt__', 
                      '__le__',
                      '__and__', 
                      '__sub__', 
                      '__or__',  
                      '__xor__']

def wrap_comparison_method(method_name):
    msg = 'Set comparison require both object be Column Record'
    def magic_method(self, other):
        if isinstance(other, PRecord) and hasattr(other, 'columns'): 
            return getattr(self.columns, method_name)(other.columns)
        else:
            return getattr(self.columns, method_name)(other)
    return magic_method

def wrapped_tuples(method_names, wrapper):
    return tuple((name, wrapper(name)) for name in method_names)

    
def wrap_method(method_name):
    """Makes a magic method that defers to the columns field"""
    def magic_method(self, *args, **kwargs):
        return getattr(self.columns, method_name)(*args, **kwargs)
    return magic_method


_other_set_methods = ['__len__', 
                      '__contains__', 
                      '__sizeof__', 
                      'issuperset', 
                      'isdisjoint', 
                      'issubset',
                      'union', 
                      'remove', 
                      'intersection', 
                      'difference', 
                      'copy', 
                      'discard', 
                      'symmetric_difference', 
                      'update',
                      'add']

_set_methods = (wrapped_tuples(_comparison_methods, wrap_comparison_method) + 
                wrapped_tuples(_other_set_methods, wrap_method) +
               (('negate', lambda self, *args: self.columns - pset(args)), ))

In [474]:
def wrap_in_class(cls, method_name):
    def magic_method(self, *args, **kwargs):
        output = getattr(self.columns, method_name)(*args, **kwargs)
        return cls(**make_field_values(output)) 
    return magic_method

def decorate_method_returns_column(method):
    def magic_method(self, *args, **kwargs):
        output = method(self, *args, **kwargs)
        Cols = type('Columns', (PRecord,), dict(fields(output) + _set_methods))
        for name in _set_methods_return_set:
            setattr(Cols, name, wrap_in_class(Cols, name))
        return Cols(**make_field_values(output)) 
    return magic_method

In [475]:
def make_field_values(columns):
    return dict(tuple(zip(columns, columns)) + (('columns', pset(columns)),))
make_field_values(columns)

{'Salesperson': 'Salesperson',
 'Compact': 'Compact',
 'Sedan': 'Sedan',
 'SUV': 'SUV',
 'Truck': 'Truck',
 'columns': pset(['Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact'])}

In [505]:
def make_column_record(columns):
    if isinstance(columns, PRecord) and hasattr(columns, 'columns'):
        columns = columns.columns
    Col =  type('Columns', (PRecord,), dict(col_fields(columns) + _set_methods))
    return Col(**make_field_values(columns))
    

In [506]:
cols = make_column_record(columns)
cols

Columns(Salesperson='Salesperson', SUV='SUV', Truck='Truck', Sedan='Sedan', columns=StrPSet(['Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact']), Compact='Compact')

In [507]:
isinstance(cols, PRecord)

True

## Column fields return columns names (strings)

In [1]:
cols.Truck

NameError: name 'cols' is not defined

In [514]:
make_column_record(cols)

Columns(Salesperson='Salesperson', SUV='SUV', Truck='Truck', Sedan='Sedan', columns=StrPSet(['Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact']), Compact='Compact')

## Use the `negate` method to negate a column list

In [509]:
cols.negate('Salesperson')

pset(['SUV', 'Truck', 'Sedan', 'Compact'])

In [510]:
cols.negate(cols.Salesperson, cols.Truck)

pset(['SUV', 'Sedan', 'Compact'])

## Use set operations

**Useful for select and select helpers**

Note: These will then need to be recast as a new Columns class

In [511]:
'Salesperson' in cols, 'Todd' in cols

(True, False)

In [512]:
len(cols)

5

In [484]:
names = make_column_record(['Salesperson', 'Truck'])
cols - names 

pset(['SUV', 'Sedan', 'Compact'])

In [485]:
names = ['Salesperson', 'Truck']
cols - names 

pset(['SUV', 'Sedan', 'Compact'])

In [486]:
cols.difference(names)

pset(['SUV', 'Sedan', 'Compact'])

In [487]:
cols & names

pset(['Salesperson', 'Truck'])

In [490]:
a = make_column_record(cols & names) 
b = make_column_record(names)
a == b

True

In [492]:
a = make_column_record(cols & names) 
b = make_column_record(names)
a == b

True

In [493]:
cols.intersection(names)

pset(['Salesperson', 'Truck'])

In [494]:
new_names = names + ["Utility"]
cols | new_names

pset(['Utility', 'Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact'])

In [497]:
new_names = names + ["Utility"]
cols | make_column_record(new_names)

pset(['Utility', 'Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact'])

In [498]:
cols.union(new_names)

pset(['Utility', 'Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact'])

In [499]:
cols.union(new_names) > cols

True

In [500]:
cols ^ new_names

pset(['SUV', 'Utility', 'Sedan', 'Compact'])

In [501]:
cols ^ new_names > cols

False

In [502]:
cols == cols

True

In [504]:
cols == make_column_record(cols)

False

In [412]:
cols.symmetric_difference(new_names)

pset(['SUV', 'Utility', 'Sedan', 'Compact'])

In [413]:
cols.columns.discard('Salesperson')

StrPSet(['SUV', 'Truck', 'Sedan', 'Compact'])

In [414]:
dir(cols.columns)

['Evolver',
 '_Evolver',
 '__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__serializer__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__type__',
 '__weakref__',
 '__xor__',
 '_checked_invariants',
 '_checked_types',
 '_from_iterable',
 '_map',
 'add',
 'copy',
 'create',
 'difference',
 'discard',
 'evolver',
 'intersection',
 'isdisjoint',
 'issubset',
 'issuperset',
 'remove',
 'serialize',
 'symmetric_difference',
 'union',
 'update']

In [204]:
from pyrsistent import pset
pset(cols.keys())

pset(['Salesperson', 'SUV', 'Truck', 'Sedan', 'Compact'])

In [161]:
[m for m in dir(Row) if not m.startswith('__')]

['_Evolver',
 '_buckets',
 '_cached_hash',
 '_contains',
 '_get_bucket',
 '_getitem',
 '_precord_fields',
 '_precord_initial_values',
 '_precord_invariants',
 '_precord_mandatory_fields',
 '_size',
 'copy',
 'create',
 'discard',
 'evolver',
 'get',
 'items',
 'iteritems',
 'iterkeys',
 'itervalues',
 'keys',
 'remove',
 'serialize',
 'set',
 'transform',
 'update',
 'update_with',
 'values']

In [168]:
row1 = rows[0]
row1.serialize()

{'Salesperson': 'Ann', 'SUV': 15, 'Truck': 12, 'Sedan': '18', 'Compact': '22'}

In [None]:
def mutate_row()

In [142]:
Row2 = makeRow(columns, my_col_types2)
rows2 = [Row2(**row) for row in l]
rows2

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

In [69]:
Row = type('Row', (PRecord,), {name:field(factory=fact) for name, fact in zip(columns, col_types) if name.isidentifier()})
r1 = Row(**l[0])
rows = list(map(lambda r: Row(**r), l))
rows

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

### This approach makes column arithmetic easy

In [385]:
[r.Salesperson.lower() for r in rows]

['ann', 'bob', 'yolanda', 'xerxes']

In [386]:
[r['Truck'] + r['SUV'] for r in rows]

[27, 37, 47, 27]

In [387]:
[r.set(Salesperson = r.Salesperson.lower(), SUV = r.SUV*2) for r in rows]

[Row(Salesperson='ann', SUV=30, Truck=12, Sedan='18', Compact='22'),
 Row(Salesperson='bob', SUV=34, Truck=20, Sedan='12', Compact='19'),
 Row(Salesperson='yolanda', SUV=64, Truck=15, Sedan='8', Compact='19'),
 Row(Salesperson='xerxes', SUV=36, Truck=9, Sedan='23', Compact='12')]

## Using this approach makes it harder to make new columns -- We will need to make a new row class when mutating or selecting

In [388]:
[r.set(Utility = r.Truck + r.SUV) for r in rows]

AttributeError: 'Utility' is not among the specified fields for Row

In [389]:
[n for n in dir('a') if not n.startswith('__')]

['capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

## Making generators for each row

In [821]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    out_raw = [row for row in reader]
out_raw

[OrderedDict([('Salesperson', 'Ann'),
              ('Compact', '22'),
              ('Sedan', '18'),
              ('SUV', '15'),
              ('Truck', '12')]),
 OrderedDict([('Salesperson', 'Bob'),
              ('Compact', '19'),
              ('Sedan', '12'),
              ('SUV', '17'),
              ('Truck', '20')]),
 OrderedDict([('Salesperson', 'Yolanda'),
              ('Compact', '19'),
              ('Sedan', '8'),
              ('SUV', '32'),
              ('Truck', '15')]),
 OrderedDict([('Salesperson', 'Xerxes'),
              ('Compact', '12'),
              ('Sedan', '23'),
              ('SUV', '18'),
              ('Truck', '9')])]

In [852]:
from toolz import peek
from collections import deque
from functools import reduce

class DataFrame(object):
    def __init__(self, 
                 iter_of_dict, 
                 col_type_dict = {}, 
                 cache = [], 
                 mutators = (lambda x: x,)):
        self.rows = iter_of_dict
        self.col_type_dict = col_type_dict
        # Cache holds raw unprocessed rows
        self._cache = deque(cache)
        try:
            if len(self._cache) > 0:
                first = self._cache[0]
            else:
                first = next(self.rows)
        except StopIteration:
            raise ValueError("There needs to be at least one row (in either iter_of_dict or cache)")
        self._mutators = mutators
        # Apply the mutators then determine the Row type
        mut_first = self._apply_mutators(first)
        fieldnames = list(mut_first.keys())
        self.col_record = make_column_record(fieldnames)
        self.Row = makeRow(fieldnames, self.col_type_dict)
        if len(self._cache) == 0:
            # Cache is raw unprocessed rows
            self._cache.append(first)
        # Make sure that the last action is to convert to a Row
        self._mutators = self._mutators + (lambda r: self.Row(**r), )
            
    def _apply_mutators(self, row):
        # Note the newest mutator needs to be last
        return reduce(lambda acc, f: f(acc), self._mutators, row)
    
    def __iter__(self):
        return self
        
    def __next__(self):
        if len(self._cache) > 0:
            new_row = self.Row(**self._apply_mutators(self._cache.popleft()))
            return new_row
        new_row =  self.Row(**self._apply_mutators(next(self.rows)))
        return new_row
    
    def __contains__(self, key):
        return key in self.col_record
    
    def _mutated_stream(self, row_mutator):
        example_current_row = self.Row(**self._apply_mutators(self._cache[0]))
        example_output = row_mutator(example_current_row)
        new_col_types = {col:type(val) for col, val in example_output.items()}
        # Newest mutator needs to be last to be applied last
        new_mutators = self._mutators + (row_mutator, ) 
        return DataFrame(self.rows, col_type_dict=new_col_types, cache=self._cache, mutators=new_mutators)

In [860]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    out1 = [row for row in df]
out1

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

## Example mutate all ints

In [854]:
def example_row_mutator(row):
    return {col:2*i if isinstance(i, int) else i for col, i in row.items()}

In [861]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    df2 = df._mutated_stream(example_row_mutator)
    out = [row for row in df2]
out

[Row(Salesperson='Ann', SUV=30, Truck=24, Sedan=36, Compact=44),
 Row(Salesperson='Bob', SUV=34, Truck=40, Sedan=24, Compact=38),
 Row(Salesperson='Yolanda', SUV=64, Truck=30, Sedan=16, Compact=38),
 Row(Salesperson='Xerxes', SUV=36, Truck=18, Sedan=46, Compact=24)]

## Example - types change automatically  

In [856]:
def example_row_mutator2(row):
    return {col:2.0*i if isinstance(i, int) else i for col, i in row.items()}

In [862]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    df2 = df._mutated_stream(example_row_mutator2)
    for row in df2:
        print(row)

Row(Salesperson='Ann', SUV=30.0, Truck=24.0, Sedan=36.0, Compact=44.0)
Row(Salesperson='Bob', SUV=34.0, Truck=40.0, Sedan=24.0, Compact=38.0)
Row(Salesperson='Yolanda', SUV=64.0, Truck=30.0, Sedan=16.0, Compact=38.0)
Row(Salesperson='Xerxes', SUV=36.0, Truck=18.0, Sedan=46.0, Compact=24.0)


## Example select

In [858]:
def example_row_mutator3(row):
    return {col:val for col, val in row.items() if col not in ['Truck', 'SUV']}

In [866]:
from csv import DictReader, Sniffer
with open('./data/auto_sales.csv', newline='\n') as csvfile:
    dialect = Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = DictReader(csvfile, dialect=dialect)
    df = DataFrame(reader, col_type_dict=my_col_types2)
    df3 = df._mutated_stream(example_row_mutator3)
    print(df3.col_record)
    print(df3._cache)
    for row in df3:
        print(row)

Columns(Salesperson='Salesperson', Sedan='Sedan', columns=StrPSet(['Salesperson', 'Sedan', 'Compact']), Compact='Compact')
deque([OrderedDict([('Salesperson', 'Ann'), ('Compact', '22'), ('Sedan', '18'), ('SUV', '15'), ('Truck', '12')])])
Row(Salesperson='Ann', Sedan=18, Compact=22)
Row(Salesperson='Bob', Sedan=12, Compact=19)
Row(Salesperson='Yolanda', Sedan=8, Compact=19)
Row(Salesperson='Xerxes', Sedan=23, Compact=12)


## Intension Exploration

Applying to rows gives the values and evaluates expressions

In [881]:
from dfply import X

i = X.Truck + X.Salesperson

In [882]:
rows

[Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22),
 Row(Salesperson='Bob', SUV=17, Truck=20, Sedan=12, Compact=19),
 Row(Salesperson='Yolanda', SUV=32, Truck=15, Sedan=8, Compact=19),
 Row(Salesperson='Xerxes', SUV=18, Truck=9, Sedan=23, Compact=12)]

In [883]:
r1 = rows[0]
r1

Row(Salesperson='Ann', SUV=15, Truck=12, Sedan=18, Compact=22)

In [884]:
i.evaluate(r1)

NotImplemented

In [885]:
i(r1)

<dfply.base.Intention at 0x119348470>

In [874]:
i

<dfply.base.Intention at 0x119348be0>

In [886]:
e1 = X.SUV
e2 = X.Truck
e3 = e1 + e2

In [887]:
a1 = e1.evaluate(r1)
a2 = e2.evaluate(r1)
a3 = e3.evaluate(r1)
a1, a2, a3

(15, 12, 27)

In [888]:
e4 = X.Salesperson.lower()
a4 = e4.evaluate(r1)
a4

'ann'

In [889]:
e6 = X.Truck > X.SUV
a6 = e6.evaluate(r1)
a6

False

In [890]:
[e6.evaluate(r) for r in rows]

[False, True, False, False]

In [891]:
e7 = abs(X.Truck)
e7.evaluate(rows2[0])

12

## Monad stuff

In [424]:
!pip install oslash



### Cont

In [411]:
from oslash.cont import Cont
from oslash.util import identity, compose


# pure = Cont.pure
unit = Cont.unit
call_cc = Cont.call_cc

In [416]:
add = lambda x, y: unit(x + y)
square = lambda x: unit(x * x)

pythagoras = lambda x, y: square(x) | (lambda xx: (
                          square(y) | (lambda yy: 
                          add(xx, yy))))

In [417]:
pythagoras(3,4)(identity)

25

In [418]:
pythagoras(4,4)(identity)

32

### Maybe

In [425]:
from oslash.maybe import Maybe, Just, Nothing
from oslash.util import identity, compose, fmap

pure = Just.pure
unit = Just.unit

In [427]:
f = lambda x: x * 2
x = Just(21)

x.map(f), Just(42), x.map(f) == Just(42)

(Just 42, Just 42, True)