In [None]:
%run talktools

# Processing Lists and Tables with Dictionaries

- Counting frequencies
- Grouping data by some key/value

In [None]:
from more_itertools import with_iter
from csv import reader
from toolz import compose, first, do, pipe, assoc
from toolz.curried import get, curry, map, drop, filter, take
from functools import reduce

## Data setup

In [40]:
from functools import partial

# The csv module documentation asks for files to be opened with newline=''
# so that line breaks inside quoted fields are handled by the reader itself.
# read_csv composes, right to left: open the file, wrap it with with_iter,
# parse it with csv.reader, and collect the rows into a list.
read_csv = compose(list, reader, with_iter, partial(open, newline=''))
batting = read_csv('Batting.csv')
# Preview the header row and the first two records.
pipe(batting,
    take(3),
    list)

[['playerID',
  'yearID',
  'stint',
  'teamID',
  'lgID',
  'G',
  'AB',
  'R',
  'H',
  '2B',
  '3B',
  'HR',
  'RBI',
  'SB',
  'CS',
  'BB',
  'SO',
  'IBB',
  'HBP',
  'SH',
  'SF',
  'GIDP'],
 ['abercda01',
  '1871',
  '1',
  'TRO',
  'NA',
  '1',
  '4',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '',
  '',
  '',
  '',
  ''],
 ['addybo01',
  '1871',
  '1',
  'RC1',
  'NA',
  '25',
  '118',
  '30',
  '32',
  '6',
  '0',
  '0',
  '13',
  '8',
  '1',
  '4',
  '0',
  '',
  '',
  '',
  '',
  '']]

In [None]:
# Row 0 of batting is the header, so row 1 is the first data record.
row = batting[1]
# Column 1 is 'yearID' in the header printed above; get(1) builds a function
# that extracts that field from a row.
get_year = get(1)
get_year(row)

In [None]:
# Sequence-level helper: extract the year from every row and return a list.
get_years = compose(list, map(get_year))
# take(10) keeps the header plus nine records; drop(1) then removes the header.
# get_years already returns a list, so no trailing list step is needed.
pipe(batting,
    take(10),
    drop(1),
    get_years)

# Task 1: Count each unique item in a list

- we have a list of hashable items
- we want a dictionary of frequencies for each item

## Imperative solution

- Start with an empty dictionary
- Add 1 to the value for each key we observe

In [None]:
# Accumulator: maps each year to the number of rows seen so far.
acc = {}
# drop(1, ...) skips the header row before the years are extracted.
for year in get_years(drop(1, batting)):
    # Current count for this year, defaulting to 0 when the year is new.
    old_count = get(year, acc, 0)
    new_count = old_count + 1
    # Rebind acc to the dictionary returned by assoc with the updated count.
    acc = assoc(acc, year, new_count)
acc


In [None]:
# Same counting loop, with the lookup and the increment inlined into one
# expression.
acc = {}
for year in get_years(drop(1, batting)):
    acc = assoc(acc, year, get(year, acc, 0) + 1)
acc


## Watch the progress with `print`

In [None]:
acc = {}
# take(120, ...) limits the trace to the header plus 119 records so the
# printed output stays short.
for year in get_years(drop(1, take(120, batting))):
    # Show the incoming item and the accumulator before it is updated.
    print(year, acc)
    acc = assoc(acc, year, get(year, acc, 0) + 1)
acc

## Solution with `reduce`

In [None]:
# The loop repeated for reference: acc starts as {} and is replaced once per
# item by a function of (acc, item) -- the shape that reduce expects.
acc = {}
for year in get_years(drop(1, batting)):
    acc = assoc(acc, year, get(year, acc, 0) + 1)
acc

In [None]:
# update_count(acc, item) returns the dictionary produced by assoc in which
# item's count is one higher than in acc (items not yet in acc start at 0).
update_count = lambda acc, item: assoc(acc, item, get(item, acc, 0) + 1)
# Fold update_count over seq. The {} literal is the initial accumulator, so
# every call starts from a fresh empty dictionary.
my_freq = lambda seq: reduce(update_count, seq, {})
my_freq(get_years(drop(1, batting)))

## Using `frequencies` from `toolz`

- `toolz` already has this function

In [None]:
from toolz import frequencies
# Library replacement for the hand-written counter: drop the header, extract
# the years, then count the occurrences of each year.
pipe(batting,
    drop(1),
    get_years,
    frequencies)

# Task 2: Collect all items of the same type

- Solution with dictionary
    - keys are groups
    - values are list of group members
- Use a key_function to classify

In [None]:
row = batting[1]
# Passing a list of indices to get extracts several fields at once: column 1
# is 'yearID' and column 6 is 'AB' in the header printed earlier.
get_year_ab = get([1, 6])
get_year_ab(row)

In [None]:
# Sequence-level helper: apply get_year_ab to every row and return a list.
get_years_abs = compose(list, map(get_year_ab))
# Try it on rows 1 through 9; the slice skips the header at index 0.
get_years_abs(batting[1:10])

In [None]:
# Build the (year, AB) pairs for every data row; drop(1) removes the header.
years_abs = pipe(batting,
                drop(1),
                get_years_abs)
# Preview the first ten pairs.
years_abs[:10]

### Example: Group all rows by year

In [None]:
# NOTE(review): this rebinds get_year. Earlier it was get(1), the year column
# of a full batting row; from here on it is get(0), the year position inside
# the (year, AB) pairs of years_abs. Cells that call get_year behave
# differently depending on which definition is current.
get_year = get(0)
# key_func is the classifier used by the grouping cells that follow.
key_func = get_year
key_func(years_abs[0])

In [None]:
# acc maps each key (here the year) to the list of rows seen with that key.
acc = {}
for row in years_abs:
    # Rows collected so far under this row's key; [] when the key is new.
    old_collection = get(key_func(row), acc, [])
    # Concatenation builds a new list instead of appending in place.
    new_collection = old_collection + [row]
    acc = assoc(acc, key_func(row), new_collection)
acc

In [None]:
# NOTE(review): add is imported here but not used in any cell shown.
from operator import add
# Each step of the loop body becomes a small function of (acc, row).
# All three read key_func from the notebook's global scope.
old_collection = lambda acc, row: get(key_func(row), acc, [])
new_collection = lambda acc, row: old_collection(acc, row) + [row]
update_collection = lambda acc, row: assoc(acc, key_func(row), new_collection(acc, row))

# The loop body is now a single call that maps (acc, row) to the next acc.
acc = {}
for row in years_abs:
    acc = update_collection(acc, row)
acc

## Solution with `reduce`

In [None]:
# update_collection(key_func, acc, row) returns the dictionary produced by
# assoc in which row has been added to the group stored under key_func(row).
# The existing group is looked up with the key_func *argument*: delegating to
# new_collection would classify rows with the global key_func instead, so
# my_groupby would give wrong groups for any other key function.
# curry lets my_groupby fix key_func first and pass the remaining
# (acc, row) function to reduce.
update_collection = curry(lambda key_func,
                                 acc,
                                 row: assoc(acc,
                                            key_func(row),
                                            get(key_func(row), acc, []) + [row]))
# Group the items of seq into {key_func(item): [items, ...]}, starting from {}.
my_groupby = lambda key_func, seq: reduce(update_collection(key_func), seq, {})
my_groupby(get_year, years_abs)

## Using `groupby` from `toolz`

In [None]:
from toolz import groupby
# Called with the key function first and the sequence second, the same
# argument order as my_groupby.
groupby(get_year, years_abs)

# What's next?

- next we will process the result of `groupby` with
    - `valmap`
    - `valfilter`
- See section 7.6

# Aggregating Data

- Important process
- Step 1: Group the data
- Step 2: Compute summary statistics

## Using `valmap` and `valfilter` to compute statistics

- Both `valmap` and `valfilter`
    - Argument: func, dict
    - Returns: dict
- `valmap` *maps* a function to each *value*
- `valfilter` *filters* each *value* 

### Example

In [57]:
from toolz.curried import valmap, valfilter, map, filter, keyfilter, keymap
# Small example dictionary: string keys mapped to lists of numbers.
d = {'a':[1,2,3,4,5],
     'b':[11,12,13],
     'c':[111,112,113,114]}
# Keep only the entries whose value list has more than three elements.
valfilter(lambda val: len(val) > 3, d)

{'a': [1, 2, 3, 4, 5], 'c': [111, 112, 113, 114]}

In [58]:
# Replace each value list with its sum; the keys are unchanged.
valmap(sum, d)

{'a': 15, 'b': 36, 'c': 450}

## Using `keymap` and `keyfilter` to transform keys

- Both `keymap` and `keyfilter`
    - Argument: func, dict
    - Returns: dict
- `keymap` *maps* a function to each *key*
- `keyfilter` *filters* each *key* 

In [200]:
# Keep only the entries whose key occurs in the string 'aeiou'.
keyfilter(lambda key: key in 'aeiou', d)

{'a': [1, 2, 3, 4, 5]}

In [201]:
# Upper-case every key; the values are left as they are.
keymap(lambda key: key.upper(), d)

{'A': [1, 2, 3, 4, 5], 'B': [11, 12, 13], 'C': [111, 112, 113, 114]}

## Working with Nested Data

### Combining `valmap`, `valfilter`, `map`, and `filter` 

- Use `map` and/or `filter` inside `valmap` or `valfilter`
- Obeys the layers of abstraction rule
    - `map` and `filter` describe the inner action
    - `valmap` and `valfilter` describe the outer action

### Example - Keep only the odd numbers in each value

**Layers:**

1. The dictionary consists of string keys and list values
2. Each string is atomic
3. Each list consists of numbers
4. Each number is atomic

In [202]:
# Show d again as the starting point for the nested example.
d

{'a': [1, 2, 3, 4, 5], 'b': [11, 12, 13], 'c': [111, 112, 113, 114]}

<img src="https://github.com/yardsale8/STAT489/blob/master/img/level_abstraction_dict_of_list.png?raw=true" width="500">

#### Step 1 - Inner most function - Numbers

In [203]:
def is_odd(n):
    """Return True when n leaves a remainder of 1 after division by 2."""
    remainder = n % 2
    return remainder == 1

#### Step 2 - value/list function

In [204]:
from toolz import compose
# Value-level function: filter(is_odd) keeps the odd numbers of one list and
# list materializes the result.
keep_odd = compose(list, filter(is_odd))

#### Step 3 - dictionary function

- apply the value function to each value $\rightarrow$ VALMAP!

In [205]:
# Dictionary-level step: apply keep_odd to every value of d.
valmap(keep_odd, d)

{'a': [1, 3, 5], 'b': [11, 13], 'c': [111, 113]}

## Example 2 - Back to the batting example.

- Compute the average number of hits for each year
- Only keep players with at least 20 at bats

#### Step 1 - Data setup

In [206]:
from more_itertools import with_iter
from csv import reader
from toolz import compose, first, do, pipe, assoc, groupby
from toolz.curried import get, curry, map, drop, filter, take, valmap, valfilter, keyfilter
from functools import reduce

In [207]:
from functools import partial

# Open the file with newline='' as the csv module documentation requires, so
# line breaks inside quoted fields are left for the reader to interpret.
# Right to left: open, wrap with with_iter, parse with csv.reader, collect
# the rows into a list.
read_csv = compose(list, reader, with_iter, partial(open, newline=''))
batting = read_csv('Batting.csv')

# Columns 1, 6 and 8 are 'yearID', 'AB' and 'H' in the header.
# The pipe ends with map, so year_ab_h is a lazy one-shot iterator of
# (year, AB, H) tuples rather than a list.
year_ab_h = pipe( batting,
                 drop(1),
                 map(get([1, 6, 8])))
# NOTE: the list step here consumes year_ab_h completely. This is kept on
# purpose; the "What went wrong?" cells below examine the consequence.
pipe(year_ab_h,
     list,
    take(10),
    list)

[('1871', '4', '0'),
 ('1871', '118', '32'),
 ('1871', '137', '40'),
 ('1871', '133', '44'),
 ('1871', '120', '39'),
 ('1871', '49', '11'),
 ('1871', '4', '1'),
 ('1871', '157', '63'),
 ('1871', '5', '1'),
 ('1871', '86', '13')]

#### Step 2 - Group the data

In [208]:
# In this section each record is a (year, AB, H) tuple, so the year is item 0.
get_year = get(0)
# year_ab_h was already consumed by the preview cell, so groupby has nothing
# left to group and the result shown below is {}.
batting_by_year = groupby(get_year, year_ab_h)
batting_by_year

{}

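#### What went wrong?

- `map` returns a lazy, one-shot iterator; the preview cell consumed `year_ab_h` with `list`, so `groupby` received an exhausted iterator and returned `{}`.
- Fix: end the pipe with `list` so the rows are materialized before they are reused.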

In [209]:
# Ending the pipe with list materializes the rows, so year_ab_h can be
# traversed more than once.
year_ab_h = pipe( batting,
                 drop(1),
                 map(get([1, 6, 8])),
                 list)
batting_by_year = groupby(get_year, year_ab_h)
# Preview the first ten records grouped under '1970'.
pipe(batting_by_year,
    get('1970'),
    take(10),
    list)

[('1970', '516', '154'),
 ('1970', '63', '13'),
 ('1970', '0', '0'),
 ('1970', '3', '0'),
 ('1970', '14', '3'),
 ('1970', '0', '0'),
 ('1970', '27', '4'),
 ('1970', '636', '182'),
 ('1970', '2', '0'),
 ('1970', '16', '1')]

In [210]:
# NOTE(review): this cell repeats the previous cell verbatim; it rebuilds
# year_ab_h and batting_by_year and shows the same '1970' preview.
year_ab_h = pipe( batting,
                 drop(1),
                 map(get([1, 6, 8])),
                 list)
batting_by_year = groupby(get_year, year_ab_h)
pipe(batting_by_year,
    get('1970'),
    take(10),
    list)

[('1970', '516', '154'),
 ('1970', '63', '13'),
 ('1970', '0', '0'),
 ('1970', '3', '0'),
 ('1970', '14', '3'),
 ('1970', '0', '0'),
 ('1970', '27', '4'),
 ('1970', '636', '182'),
 ('1970', '2', '0'),
 ('1970', '16', '1')]

In [211]:
# Dictionary-level filter: keep only the '1970' and '1980' entries.
get_70_80_dict = keyfilter(lambda key: key in ('1970', '1980'))
# Preview helper: select the two years, then keep the first ten items of each
# year's value as a list.
subset_dict = lambda by_year: pipe(by_year,
                                   get_70_80_dict,
                                   valmap(compose(list, take(10))))
subset_dict(batting_by_year)

{'1970': [('1970', '516', '154'),
  ('1970', '63', '13'),
  ('1970', '0', '0'),
  ('1970', '3', '0'),
  ('1970', '14', '3'),
  ('1970', '0', '0'),
  ('1970', '27', '4'),
  ('1970', '636', '182'),
  ('1970', '2', '0'),
  ('1970', '16', '1')],
 '1980': [('1980', '', ''),
  ('1980', '', ''),
  ('1980', '262', '75'),
  ('1980', '47', '13'),
  ('1980', '543', '151'),
  ('1980', '111', '27'),
  ('1980', '83', '15'),
  ('1980', '178', '40'),
  ('1980', '3', '1'),
  ('1980', '', '')]}

#### Step 3 - Convert entries of values

- item functions are str, int, int
- now make a value function
- then map the value function to all values

In [212]:
def maybe_int(val):
    """Convert a text field to int, treating a blank field as 0.

    Parameters:
        val: a string such as '118' or ''.

    Returns:
        int(val), or 0 when val is empty or contains only whitespace.

    Raises:
        ValueError: if val contains text that int cannot parse.
    """
    # strip() makes whitespace-only fields count as blank; int() alone would
    # raise ValueError for them.
    return int(val) if val.strip() else 0
assert maybe_int('1') == 1
assert maybe_int('') == 0
assert maybe_int('   ') == 0

In [213]:
def convert_tuple(tup):
    """Apply str, maybe_int, maybe_int to the fields of tup, in order.

    Returns a list with one converted item per (converter, field) pair.
    """
    converters = (str, maybe_int, maybe_int)
    return [convert(field) for convert, field in zip(converters, tup)]
assert convert_tuple(('a', '1', '2')) == ['a', 1, 2]
assert convert_tuple(('a', '', '2')) == ['a', 0, 2]

In [214]:
# Value-level function: convert every record of one year's list and return
# the converted records as a list.
convert_value = compose(list, map(convert_tuple))
# Preview on the first ten '1970' records. convert_value already returns a
# list, so no trailing list step is needed.
pipe(batting_by_year,
    get('1970'),
    take(10),
    convert_value)

[['1970', 516, 154],
 ['1970', 63, 13],
 ['1970', 0, 0],
 ['1970', 3, 0],
 ['1970', 14, 3],
 ['1970', 0, 0],
 ['1970', 27, 4],
 ['1970', 636, 182],
 ['1970', 2, 0],
 ['1970', 16, 1]]

In [215]:
# Dictionary-level step: convert the records of every year.
# NOTE(review): batting_by_year is rebound to its converted form, so this cell
# is not re-runnable on its own -- a second run would pass already-converted
# ints to maybe_int, which calls len on its argument.
batting_by_year = pipe(batting_by_year, 
                       valmap(convert_value))
subset_dict(batting_by_year)

{'1970': [['1970', 516, 154],
  ['1970', 63, 13],
  ['1970', 0, 0],
  ['1970', 3, 0],
  ['1970', 14, 3],
  ['1970', 0, 0],
  ['1970', 27, 4],
  ['1970', 636, 182],
  ['1970', 2, 0],
  ['1970', 16, 1]],
 '1980': [['1980', 0, 0],
  ['1980', 0, 0],
  ['1980', 262, 75],
  ['1980', 47, 13],
  ['1980', 543, 151],
  ['1980', 111, 27],
  ['1980', 83, 15],
  ['1980', 178, 40],
  ['1980', 3, 1],
  ['1980', 0, 0]]}

#### Step 4 - Keep only players with at least 20 AB

In [216]:
# Record-level predicate: True when item 1 of the record (the AB field) is
# 20 or more.
at_least_20_ab_tup = lambda tup: get(1, tup) >= 20
assert at_least_20_ab_tup(['1970', 33, 11])
assert not at_least_20_ab_tup(['1970', 15, 7])

In [217]:
# Value-level filter: keep only the records of one year's list that satisfy
# at_least_20_ab_tup, returned as a list.
at_least_20_AB_value = compose(list, filter(at_least_20_ab_tup))
# Both fixture records use a string year, matching the converted records.
assert at_least_20_AB_value([['1970', 516, 154], ['1970', 13, 3]]) == [['1970', 516, 154]]

In [218]:
# Dictionary-level step: filter the records of every year.
# NOTE(review): batting_by_year is rebound to its filtered form; the
# unfiltered records are no longer reachable under this name.
batting_by_year = pipe(batting_by_year,
                      valmap(at_least_20_AB_value))
subset_dict(batting_by_year)

{'1970': [['1970', 516, 154],
  ['1970', 63, 13],
  ['1970', 27, 4],
  ['1970', 636, 182],
  ['1970', 120, 20],
  ['1970', 261, 61],
  ['1970', 459, 128],
  ['1970', 38, 8],
  ['1970', 61, 14],
  ['1970', 426, 104]],
 '1980': [['1980', 262, 75],
  ['1980', 47, 13],
  ['1980', 543, 151],
  ['1980', 111, 27],
  ['1980', 83, 15],
  ['1980', 178, 40],
  ['1980', 70, 25],
  ['1980', 51, 12],
  ['1980', 38, 10],
  ['1980', 112, 19]]}

#### Step 5 - get the hits

In [219]:
# Record-level accessor: hits are the last item of each [year, AB, H] record.
get_hits_tup = get(-1)
assert get_hits_tup(['1980', 262, 75]) == 75

In [220]:
# Value-level function: the list of hits for every record in one year's list.
get_hits_val = compose(list, map(get_hits_tup))
assert get_hits_val([['1980', 262, 75],['1980', 47, 13]]) == [75, 13]

In [221]:
# Dictionary-level step: replace each year's records with just their hits.
# NOTE(review): batting_by_year is rebound here; after this cell its values
# are lists of ints rather than lists of records, so the cell cannot be run
# a second time on its own output.
batting_by_year = pipe(batting_by_year,
                      valmap(get_hits_val))
subset_dict(batting_by_year)

{'1970': [154, 13, 4, 182, 20, 61, 128, 8, 14, 104],
 '1980': [75, 13, 151, 27, 15, 40, 25, 12, 10, 19]}

#### Step 6 - Find the mean of each value

In [223]:
def mean(L):
    """Return the arithmetic mean of L: the sum of its items over its length.

    Raises ZeroDivisionError when L is empty.
    """
    total = sum(L)
    count = len(L)
    return total / count
assert mean([1,2,3]) == 2

In [225]:
# Try the dictionary-level step on the ten-item 1970/1980 preview first.
pipe(subset_dict(batting_by_year),
     valmap(mean))

{'1970': 68.8, '1980': 38.7}

In [226]:
# Apply mean to every year's full list of hits.
# NOTE(review): batting_by_year is rebound again; from here on it maps each
# year to a single number instead of a list.
batting_by_year =  pipe(batting_by_year,
                       valmap(mean))

In [228]:
# These means use every remaining record of the year, not just the first ten
# kept by subset_dict, so they differ from the preview above.
get_70_80_dict(batting_by_year)

{'1970': 56.654173764906304, '1980': 66.58070175438597}

In [None]:
# Display the mean hits for every year.
batting_by_year

#### Exercise: Put this all together in one pipe