In [None]:
%run talktools

# Processing Lists and Tables with Dictionaries

- Counting frequencies
- Grouping data by some key/value

In [None]:
from more_itertools import with_iter
from csv import reader
from toolz import compose, first, do, pipe, assoc
from toolz.curried import get, curry, map, drop, filter, take
from functools import reduce

## Data setup

In [None]:
def read_csv(*args, **kwargs):
    """Open a CSV file and return every parsed row as a list.

    All arguments are forwarded unchanged to `open`. The resulting file
    object is passed through `with_iter`, then `csv.reader`, and the rows
    are finally collected with `list`.
    """
    return pipe(open(*args, **kwargs), with_iter, reader, list)


batting = read_csv('Batting.csv')
# Preview the first three rows.
list(take(3, batting))

In [None]:
# Curried `get` with only the index supplied: the result waits for a row.
get_year = get(1)
# NOTE(review): row 0 looks like the header (it is dropped everywhere else) — confirm.
row = batting[1]
get_year(row)

In [None]:
# `get_year` is captured here, when the pipeline is composed, so rebinding the
# name `get_year` later does not change what `get_years` extracts.
get_years = compose(list, map(get_year))
# Rows 1 through 9: skip row 0 and stay within the first ten rows.
get_years(batting[1:10])

# Task 1: Count each unique item in a list

- We have a list of hashable items
- We want a dictionary that maps each item to its frequency

## Imperative solution

- Start with an empty dictionary
- Add 1 to the value for each key we observe

In [None]:
# Count how many times each year occurs, one explicit step at a time.
acc = {}
for year in get_years(drop(1, batting)):
    # Running count for this year; 0 the first time the year is seen.
    old_count = get(year, acc, 0)
    # Increment exactly once per observation.
    new_count = old_count + 1
    # Rebind `acc` to the dict returned by `assoc` with the updated count.
    acc = assoc(acc, year, new_count)
acc


In [None]:
# Same counting loop, collapsed into a single update per row.
acc = {}
for year in get_years(batting[1:]):
    acc = assoc(acc, year, acc.get(year, 0) + 1)
acc


## Watch the progress with `print`

In [None]:
# Trace the accumulator on a small sample: rows 1 through 119.
acc = {}
for year in get_years(batting[1:120]):
    # Printed before the update, so this shows the state *prior* to counting `year`.
    print(year, acc)
    acc = assoc(acc, year, get(year, acc, 0) + 1)
acc

## Solution with `reduce`

In [None]:
# The imperative loop once more: an accumulator, a sequence, and an update
# applied once per item.
acc = {}
for year in pipe(batting, drop(1), get_years):
    acc = assoc(acc, year, get(year, acc, 0) + 1)
acc

In [None]:
init = {}  # NOTE(review): not referenced below — the fold is seeded with its own `{}`


def update_count(acc, item):
    """Return `acc` with the count stored under `item` raised by one.

    An item that is not yet a key of `acc` is treated as having count 0.
    """
    return assoc(acc, item, get(item, acc, 0) + 1)


def my_freq(seq):
    """Fold `update_count` over `seq`, starting from an empty dict."""
    return reduce(update_count, seq, {})


my_freq(get_years(drop(1, batting)))

## Using `frequencies` from `toolz`

- `toolz` already has this function

In [None]:
from toolz import frequencies
# Library counterpart of the hand-written fold: count the years of every row
# after row 0.
frequencies(get_years(batting[1:]))

# Task 2: Collect all items of the same type

- Solution with dictionary
    - keys are groups
    - values are lists of group members
- Use a key function (`key_func`) to decide which group each item belongs to

In [None]:
# Passing a list of indices makes `get` pull several fields from a row at once.
# NOTE(review): the name suggests column 6 holds at-bats (AB) — confirm against the header.
get_year_ab = get([1, 6])
row = batting[1]
get_year_ab(row)

In [None]:
# Apply `get_year_ab` to every row and collect the results in a list.
get_years_abs = compose(list, map(get_year_ab))
# Try it on rows 1 through 9.
pipe(batting, drop(1), take(9), get_years_abs)

In [None]:
# Extract the selected fields from every row after row 0.
years_abs = get_years_abs(batting[1:])
years_abs[:10]

### Example: Group all rows by year

In [None]:
# Rebinds `get_year`: the entries of `years_abs` were built with `get([1, 6])`,
# so the year now sits at position 0 instead of position 1.
# `get_years` is unaffected, because it captured the earlier `get(1)` object
# when it was composed.
get_year = get(0)
# `key_func` decides which group a row belongs to; here, its year.
key_func = get_year
key_func(years_abs[0])

In [None]:
# Group rows by key: each key maps to the list of rows seen so far for it.
acc = {}
for row in years_abs:
    group = key_func(row)
    # Rows already stored for this group, or an empty list for a new group.
    members = get(group, acc, [])
    # `+` builds a new list, so the list held by the previous `acc` is left untouched.
    acc = assoc(acc, group, members + [row])
acc

In [None]:
from operator import add
def old_collection(acc, row):
    """Rows already stored under this row's key, or `[]` if the key is new.

    The key comes from the notebook-level `key_func`.
    """
    return get(key_func(row), acc, [])


def new_collection(acc, row):
    """The stored rows for this row's key, with `row` appended in a new list."""
    return old_collection(acc, row) + [row]


def update_collection(acc, row):
    """Return `acc` with `row` added to the group chosen by `key_func`."""
    return assoc(acc, key_func(row), new_collection(acc, row))


# Same loop shape as the counting task; only the update step differs.
acc = {}
for row in years_abs:
    acc = update_collection(acc, row)
acc

## Solution with `reduce`

In [None]:
@curry
def update_collection(key_func, acc, row):
    """Return `acc` with `row` appended to the group selected by `key_func`.

    The key is computed once from the `key_func` argument and used both to
    look up the existing group and to store the extended one, so the result
    depends only on the arguments and not on any notebook-level `key_func`.
    Curried so that `update_collection(key_func)` yields the two-argument
    reducer that `reduce` expects.
    """
    key = key_func(row)
    # Start from an empty list the first time a key is seen; `+` builds a new list.
    return assoc(acc, key, get(key, acc, []) + [row])


def my_groupby(key_func, seq):
    """Group the items of `seq` into a dict of lists keyed by `key_func(item)`."""
    return reduce(update_collection(key_func), seq, {})


my_groupby(get_year, years_abs)

## Using `groupby` from `toolz`

In [None]:
from toolz import groupby
# Library version of the grouping built by hand above: the key function comes
# first, then the sequence to group.
groupby(get_year, years_abs)

# What's next?

- Next, we will process the result of `groupby` with:
    - `valmap`
    - `valfilter`
- See section 7.6