# Map and Filter Group Work

In this exercise, we will revisit the group work assignment on reading and writing tables.  Below you will find the answer key for the earlier assignment.  Your task will be to convert all code to use `map` and `filter` whereever possible.

**Here are the rules**

1. Use `with_iter` from `more_itertools` combined with `csv.reader` to read tables.
2. Combine steps using `compose` and/or `pipe`
    a. Use curried functions when possible (`get`, `map`, `filter` at the least)
3. Rewrite the helper functions curried function and functions from `operator`



<font color="red">**Question 1:**</font>  While rare in the modern era, it used to be common practice for teams to have player-managers, i.e. using a current player as a manager.  Compute the total number of runs scored by all player-managers (all time).

**Relevant Files:**  Managers.csv, Batting.csv

**Hint:** You will need to join the tables on `playerID` and `year`

In [34]:
from more_itertools import with_iter
from csv import reader
from toolz.curried import get, map, filter, curry, drop
from operator import add, contains, add, gt

read_csv = lambda filename: list(reader(with_iter(open(filename))))
managers = read_csv("Managers.csv")
batting = read_csv("Batting.csv")

In [19]:
player_managers = filter(lambda r: get(-1, r) == 'Y', managers)
player_managers = pipe(managers, filter(get(-1)), list)
player_managers[:3]

[['playerID',
  'yearID',
  'teamID',
  'lgID',
  'inseason',
  'G',
  'W',
  'L',
  'rank',
  'plyrMgr'],
 ['wrighha01', '1871', 'BS1', 'NA', '1', '31', '20', '10', '3', 'Y'],
 ['woodji01', '1871', 'CH1', 'NA', '1', '28', '19', '9', '2', 'Y']]

In [20]:
# Note: We will be using this to check membership --> use a set for efficiency
player_managers_IDs = set(map(lambda r: get(0, r), player_managers))
player_managers_IDs = pipe(player_managers, 
                           map(get(0)), 
                           set)

player_managers_IDs

{'actama99',
 'adairbi99',
 'adcocjo01',
 'addybo01',
 'allenbo01',
 'allisdo01',
 'alomasa02',
 'aloufe01',
 'alstowa01',
 'altobjo01',
 'amalfjo01',
 'andersp01',
 'ansonca01',
 'applilu01',
 'armoubi99',
 'asproke01',
 'ausmubr01',
 'austiji01',
 'bakerde01',
 'bakerdu01',
 'bambege01',
 'bancrda01',
 'bancrfr99',
 'banisje01',
 'barklsa01',
 'barnibi01',
 'barroed99',
 'barryja01',
 'battijo01',
 'bauerha01',
 'baylodo01',
 'bellbu01',
 'bensove01',
 'berrayo01',
 'bevinte99',
 'bezdehu99',
 'bicke99',
 'birmijo01',
 'bissode01',
 'blackbu02',
 'blackle01',
 'bladera01',
 'blairwa01',
 'bluegos01',
 'bochybr01',
 'bogarti01',
 'bolesjo99',
 'bondto01',
 'boonebo01',
 'borosst01',
 'bottoji01',
 'boudrlo01',
 'bowala01',
 'bowerfr01',
 'boydbi01',
 'boyerke01',
 'bradlbi01',
 'bragabo01',
 'brenlbo01',
 'bresnro01',
 'bristda99',
 'brownda99',
 'brownfr99',
 'brownmo01',
 'brownto01',
 'bruckea01',
 'buckeal99',
 'buffich01',
 'burdoja01',
 'burkeji01',
 'burnhwa99',
 'burnsto01',
 

In [28]:
contains = curry(contains)
run_filt_help = compose(contains(player_managers_IDs), get(0))
run_filt_help(player_managers[0])

True

In [36]:
runs_filtered = filter(run_filt_help, drop(1, batting))
runs = map(lambda r: len(get(7, r)) > 0 and int(get(7, r)), runs_filtered)

sum(runs)

274877

In [None]:
runs = pipe(batting,
           drop(1),
           filter(runs_filtered),
           map(compose(len,))

In [None]:
runs_map_help = lambda r: len(get(7, r)) > 0 and int(get(7, r))

runs_map_help 

In [8]:
from toolz import pipe, compose
from toolz.curried import get, map, filter
from operator import eq

runs = pipe(managers,
           filter(compose(lambda mgr: eq(mgr, 'Y'),
                         get(-1))),
           map(get(0)),
           sum)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

<font color="red">**Question 2:**</font> Create a table of statistics on pitching that contains the following information:

* players_name, 
* year, 
* team_name,
* wins(W), 
* losses(L), 
* complete games (CG), 
* innings pitched (IP)

Write this information to a file named **PitchingNames.csv** and include this file with your submission.

**Edit:** Do this for the last 10 years.

**Relevant Files:** Pitching.csv, Master.csv, Teams.csv

In [15]:
pitching = read_csv('Pitching.csv')
master = read_csv('Master.csv')
teams = read_csv('Teams.csv')

In [16]:
# Step 1: get the info from pitching.
# We need playerID (0), yearId(1), teamID (3),wins (5), loses(6), complete games (9) and IPouts (12) 
# See enumeration in Question 3
from toolz import drop
filter_row = lambda r: get([0, 1, 3, 5, 6, 9, 12], r)
pitching2006_2010_filt = filter(lambda r: 2006 <= int(get(1, r)) <= 2010, drop(1, pitching))
pitching2006_2010 = list(map(filter_row, pitching2006_2010_filt))

pitchersIDs = set(map(lambda r: get(0, r), pitching2006_2010))

In [17]:
raw_names_filt = filter(lambda r: get(0, r) in pitchersIDs, master)
raw_names = map(lambda r: get([0, 13, 14], r), raw_names_filt)

add_names = lambda row: (row[0], row[1] + " " + row[2])
names = list(map(add_names, raw_names))

In [18]:
from itertools import product
name_row = lambda tup: get(0, tup)
info_row = lambda tup: get(1, tup)
combined_row = lambda tup: get([1], name_row(tup)) + get([1, 2, 3, 4, 5, 6], info_row(tup))
# The rows will be joined by mathcing playerIDs.  Here is a helper to do this.
match_on_playerID = lambda tup: get(0, name_row(tup)) == get(0, info_row(tup))
info_with_names_filt = filter(match_on_playerID, product(names, pitching2006_2010))
info_with_names = list(map(combined_row, info_with_names_filt))

In [19]:
team_names_filt = filter(lambda r: get([0,2], r), teams)
team_names = list(map(lambda r: get([0,2,40], r), team_names_filt))
team_names[:10]

[('yearID', 'teamID', 'name'),
 ('1871', 'BS1', 'Boston Red Stockings'),
 ('1871', 'CH1', 'Chicago White Stockings'),
 ('1871', 'CL1', 'Cleveland Forest Citys'),
 ('1871', 'FW1', 'Fort Wayne Kekiongas'),
 ('1871', 'NY2', 'New York Mutuals'),
 ('1871', 'PH1', 'Philadelphia Athletics'),
 ('1871', 'RC1', 'Rockford Forest Citys'),
 ('1871', 'TRO', 'Troy Haymakers'),
 ('1871', 'WS3', 'Washington Olympics')]

In [20]:
teamIDs = set(map(lambda r: get([1, 2], r), info_with_names))

In [21]:
info_row = lambda tup: get(1, tup)
team_row = lambda tup: get(0, tup)
combined_row = lambda tup: get([0,1], info_row(tup)) + get([2], team_row(tup)) + get([-4, -3, -2, -1], info_row(tup))

join_on_teamID_year  = lambda tup: get(0, team_row(tup)) == get(1, info_row(tup)) and get(1, team_row(tup)) == get(2, info_row(tup))
info_with_name_team_filt = filter(join_on_teamID_year, product(team_names, info_with_names))
info_with_name_team = list(map(combined_row, info_with_name_team_filt))
info_with_name_team[:3]

[('Greg Aquino', '2006', 'Arizona Diamondbacks', '2', '0', '0', '145'),
 ('Jeff Bajenaru', '2006', 'Arizona Diamondbacks', '0', '1', '0', '3'),
 ('Miguel Batista', '2006', 'Arizona Diamondbacks', '11', '8', '3', '619')]

In [22]:
from more_itertools import consume
with open('PitchingNames.csv', 'w') as out:
    joined_rows = map(lambda r: ','.join(r),info_with_name_team)
    consume(map(lambda r: print(r, file=out), joined_rows))
! cat PitchingNames.csv | head -n 5

Greg Aquino,2006,Arizona Diamondbacks,2,0,0,145
Jeff Bajenaru,2006,Arizona Diamondbacks,0,1,0,3
Miguel Batista,2006,Arizona Diamondbacks,11,8,3,619
Randy Choate,2006,Arizona Diamondbacks,0,1,0,48
Juan Cruz,2006,Arizona Diamondbacks,5,6,0,284
cat: stdout: Broken pipe


<font color="red">**Question 3:**</font> Find the name of the pitcher and the associated team name for the pitcher with the lowest ERA (earned runs average) out of all pitchers between the years 1970 and 2010.  Limit your search to pitchers with at least 5 games pitched in a given year.

** Relevant Files:** Pitching.csv, Master.csv, Teams.csv

**Step 1:** Read in all three files.

In [40]:
pitching = read_csv('Pitching.csv')
master = read_csv('Master.csv')
teams = read_csv('Teams.csv')

**Step 2:** Filter the pitching table to only include the years 1970 to 2010.

In [41]:
# Look at the pitching header to determine the location of year
pitching_header = pitching[0]
list(enumerate(pitching_header))

[(0, 'playerID'),
 (1, 'yearID'),
 (2, 'stint'),
 (3, 'teamID'),
 (4, 'lgID'),
 (5, 'W'),
 (6, 'L'),
 (7, 'G'),
 (8, 'GS'),
 (9, 'CG'),
 (10, 'SHO'),
 (11, 'SV'),
 (12, 'IPouts'),
 (13, 'H'),
 (14, 'ER'),
 (15, 'HR'),
 (16, 'BB'),
 (17, 'SO'),
 (18, 'BAOpp'),
 (19, 'ERA'),
 (20, 'IBB'),
 (21, 'WP'),
 (22, 'HBP'),
 (23, 'BK'),
 (24, 'BFP'),
 (25, 'GF'),
 (26, 'R'),
 (27, 'SH'),
 (28, 'SF'),
 (29, 'GIDP')]

In [42]:
pitching1970_2010 = list(filter(lambda r: 1970 <= int(get(1, r)) <= 2010, drop(1, pitching)))
pitching1970_2010[:1]

[['abernte02',
  '1970',
  '1',
  'CHN',
  'NL',
  '0',
  '0',
  '11',
  '0',
  '0',
  '0',
  '1',
  '27',
  '9',
  '2',
  '0',
  '5',
  '2',
  '0.28',
  '2',
  '1',
  '0',
  '1',
  '0',
  '40',
  '3',
  '2',
  '',
  '',
  '']]

**Step 3:** Make a new table that contains the ERA (int) and playerID of each pitcher in the filtered list

**Note** We also need to hang onto the year and teamID for later joins.

In [43]:
# Pull out the raw string entries using get
eras = list(map(lambda r: get([0,1,3,19], r), pitching1970_2010))
# Map float to era, making sure we skip empty rows
process_row = lambda r: get([0,1,2], r) + tuple(map(float, get([-1], r)))
assert process_row(['a', 'b', 'c', '2.5']) == ('a', 'b', 'c', 2.5)

In [44]:
eras = list(filter(lambda r: len(get(-1, r)) > 0, eras))
eras = list(map(process_row, eras))
# NOTE we will use an abstraction to allow us to do this in one step in a later exercise
eras[:3]

[('abernte02', '1970', 'CHN', 2.0),
 ('abernte02', '1970', 'KCA', 2.59),
 ('abernte02', '1970', 'SLN', 2.95)]

**Step 4:** Use `min` with a key function (see **Section 4.3.4** of the book) to identify the player with the lowest ERA

In [45]:
# We are sorting by era, which has index 2
key_func = lambda r: get(3, r)
min_era = min(eras, key=key_func)
min_era

('bolinbo01', '1970', 'BOS', 0.0)

In [46]:
min_player, min_year, min_team, min_era = min_era
min_player

'bolinbo01'

In [47]:
# BONUS - For fun lets find all players with that tie with this era
ties = [r for r in eras if get(3, r) == min_era]
list(filter(lambda r: get(3, r) == min_era, eras))
len(ties)

434

In [48]:
ties[:4]

[('bolinbo01', '1970', 'BOS', 0.0),
 ('jonesga01', '1970', 'NYA', 0.0),
 ('mcbeaal01', '1970', 'LAN', 0.0),
 ('molonri01', '1970', 'CHA', 0.0)]

**Step 5:** Use filters to get the name and the team of the player identified in the last step.

In [49]:
# Use enumerate on the header to find the indexes for nameGiven
master_header = master[0]
list(enumerate(master_header))

[(0, 'playerID'),
 (1, 'birthYear'),
 (2, 'birthMonth'),
 (3, 'birthDay'),
 (4, 'birthCountry'),
 (5, 'birthState'),
 (6, 'birthCity'),
 (7, 'deathYear'),
 (8, 'deathMonth'),
 (9, 'deathDay'),
 (10, 'deathCountry'),
 (11, 'deathState'),
 (12, 'deathCity'),
 (13, 'nameFirst'),
 (14, 'nameLast'),
 (15, 'nameGiven'),
 (16, 'weight'),
 (17, 'height'),
 (18, 'bats'),
 (19, 'throws'),
 (20, 'debut'),
 (21, 'finalGame'),
 (22, 'retroID'),
 (23, 'bbrefID')]

In [51]:
# Get the  name by matching with id
# Note that the name sill be first + last
from toolz import first
add_name = lambda row: row[0] + " " + row[1]
player_name_filt = filter(lambda r: get(0, r) == min_player, master)
player_name_map = map(lambda r: add_name(get([13, 14], r)), player_name_filt)
player_name = first(player_name_map)
player_name

'Bobby Bolin'

In [52]:
team_header = teams[0]
list(enumerate(team_header))

[(0, 'yearID'),
 (1, 'lgID'),
 (2, 'teamID'),
 (3, 'franchID'),
 (4, 'divID'),
 (5, 'Rank'),
 (6, 'G'),
 (7, 'Ghome'),
 (8, 'W'),
 (9, 'L'),
 (10, 'DivWin'),
 (11, 'WCWin'),
 (12, 'LgWin'),
 (13, 'WSWin'),
 (14, 'R'),
 (15, 'AB'),
 (16, 'H'),
 (17, '2B'),
 (18, '3B'),
 (19, 'HR'),
 (20, 'BB'),
 (21, 'SO'),
 (22, 'SB'),
 (23, 'CS'),
 (24, 'HBP'),
 (25, 'SF'),
 (26, 'RA'),
 (27, 'ER'),
 (28, 'ERA'),
 (29, 'CG'),
 (30, 'SHO'),
 (31, 'SV'),
 (32, 'IPouts'),
 (33, 'HA'),
 (34, 'HRA'),
 (35, 'BBA'),
 (36, 'SOA'),
 (37, 'E'),
 (38, 'DP'),
 (39, 'FP'),
 (40, 'name'),
 (41, 'park'),
 (42, 'attendance'),
 (43, 'BPF'),
 (44, 'PPF'),
 (45, 'teamIDBR'),
 (46, 'teamIDlahman45'),
 (47, 'teamIDretro')]

In [53]:
from toolz import get_in
team_name_filt = list(filter(lambda r: get(0, r) == min_year and get(2, r) == min_team, teams))
team_name = get_in([0,-8], team_name_filt)
team_name

'Boston Red Sox'

In [54]:
player_name, team_name, min_year, min_era

('Bobby Bolin', 'Boston Red Sox', '1970', 0.0)