### 以前的课程中我们分享过基本的分组聚合操作，然而用的并不是太多，因为分组聚合非常常用，所以在今天的课程中，我们将多次使用pandas的该功能。

### 另外，你想通过映射关系函数来快速完成数据转换或者数据添加吗，在这节课程中你将学习到。

### 你想使用除了pandas自带的(mean,count,value_count,sum)聚合函数之外的自定义函数完成聚合操作吗，通过这个示例中，你也可以迅速掌握此技能。

- 这是一个基于美国大选数据的示例，在该示例中我们将完成以下几方面的分析工作：
	- 读取数据
	- 新建一列，保存各个候选人所在党派（party）
	- 查看party 这一列中有哪些党派
	- 统计party列中各个党派出现的次数
	- 查看各个党派分别收到的政治献金总额
	- 查看每天，每个党派收到政治献金的总额
	- 将表中日期格式转换为'yyyy-mm-dd'的格式
	- 查看DISABLED VETERAN主要支持哪位参选人(给谁捐钱最多)
	- 查看每个候选人分别收到的政治献金总额
	- 找到候选人的捐赠者中，捐赠金额最大的人的职业以及捐献额

In [1]:
# 导入必要的包
import numpy as np
import pandas as pd

In [2]:
# 准备基础数据，以便后面使用
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
          'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}

of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick', 
               'Paul, Ron', 'Gingrich, Newt']

parties = {
  'Bachmann, Michelle': 'Republican',
  'Romney, Mitt': 'Republican',
  'Obama, Barack': 'Democrat',
  "Roemer, Charles E. 'Buddy' III": 'Reform',
  'Pawlenty, Timothy': 'Republican',
  'Johnson, Gary Earl': 'Libertarian',
  'Paul, Ron': 'Republican',
  'Santorum, Rick': 'Republican',
  'Cain, Herman': 'Republican',
  'Gingrich, Newt': 'Republican',
  'McCotter, Thaddeus G': 'Republican',
  'Huntsman, Jon': 'Republican',
  'Perry, Rick': 'Republican'           
 }

#### 读取原始表格数据

In [3]:
df = pd.read_csv('./usa_election.txt')
df.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633000.0,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073


#### 新建一列，保存各个候选人所在党派（party）

In [4]:
df['party'] = df['cand_nm'].map(parties) # 基于字典对应关系，对每一个数据进行转换
df.head(3)

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166,Republican
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166,Republican
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633000.0,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073,Republican


#### 查看party 这一列中有哪些党派

In [5]:
df['party'].unique()

array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)

#### 统计party列中各个党派出现的次数

In [6]:
df['party'].value_counts()

Democrat       292400
Republican     237575
Reform           5364
Libertarian       702
Name: party, dtype: int64

#### 查看各个党派分别收到的政治献金总额

In [7]:
df.groupby(by='party')['contb_receipt_amt'].sum()

party
Democrat       8.105758e+07
Libertarian    4.132769e+05
Reform         3.390338e+05
Republican     1.192255e+08
Name: contb_receipt_amt, dtype: float64

#### 查看每天，每个党派收到政治献金的总额

In [8]:
df.groupby(by=['contb_receipt_dt','party'],axis=0)['contb_receipt_amt'].sum()

contb_receipt_dt  party      
01-APR-11         Reform             50.00
                  Republican      12635.00
01-AUG-11         Democrat       175281.00
                  Libertarian      1000.00
                  Reform           1847.00
                                   ...    
31-MAY-11         Republican     301339.80
31-OCT-11         Democrat       204996.87
                  Libertarian      4250.00
                  Reform           3105.00
                  Republican     734601.83
Name: contb_receipt_amt, Length: 1183, dtype: float64

#### 将表中日期格式转换为'yyyy-mm-dd'的格式

In [9]:
# 构建应用到每一行数据中，日期列的转换函数
def change_date(ori_date):
    # 20-JUN-11
    year,month,day = ori_date.split('-')
    year = str(20)+year
    month = str(months[month])
    month = '0' + month if len(month) == 1 else month
    return '-'.join([year,month,day])

In [10]:
# df['contb_receipt_dt'].apply(change_date) # 也能完成同样的功能
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(change_date)
df.head(3)

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,250.0,2020-06-11,,,,SA17A,736166,Republican
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,50.0,2023-06-11,,,,SA17A,736166,Republican
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633000.0,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,2005-07-11,,,,SA17A,749073,Republican


#### 查看DISABLED VETERAN主要支持哪位参选人(给谁捐钱最多)

In [11]:
# 获取捐款人的职业是老兵的行
df_veteran = df.loc[df['contbr_occupation']=='DISABLED VETERAN']
df_veteran.head(3)

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
149790,C00431445,P80003338,"Obama, Barack","MAHURIN, DAVID",TAMPA,FL,33616.0,VETERANS ADMINISTRATION,DISABLED VETERAN,10.0,2017-01-12,,,,SA17A,766535,Democrat
150910,C00431445,P80003338,"Obama, Barack","MAHURIN, DAVID",TAMPA,FL,33616.0,VETERANS ADMINISTRATION,DISABLED VETERAN,20.0,2001-01-12,,,,SA17A,766535,Democrat
174041,C00431445,P80003338,"Obama, Barack","KRUCHTEN, MICHAEL",ROCKFORD,IL,611147000.0,DISABLED,DISABLED VETERAN,50.0,2002-12-11,,,,SA17A,763233,Democrat


In [12]:
# 分组后，使用聚合函数完成统计，aggregate支持使用自定义的聚合函数
df_veteran.groupby(by='cand_nm',axis=0)['contb_receipt_amt'].aggregate('sum').sort_values(ascending=False)

cand_nm
Obama, Barack     4205.00
Paul, Ron         2425.49
Cain, Herman       300.00
Santorum, Rick     250.00
Name: contb_receipt_amt, dtype: float64

#### 查看每个候选人分别收到的政治献金总额

In [13]:
df.groupby(by='cand_nm')['contb_receipt_amt'].sum()

cand_nm
Bachmann, Michelle                2.607916e+06
Cain, Herman                      7.010446e+06
Gingrich, Newt                    9.271751e+06
Huntsman, Jon                     3.200693e+06
Johnson, Gary Earl                4.132769e+05
McCotter, Thaddeus G              3.703000e+04
Obama, Barack                     8.105758e+07
Paul, Ron                         1.543576e+07
Pawlenty, Timothy                 4.238859e+06
Perry, Rick                       1.864425e+07
Roemer, Charles E. 'Buddy' III    3.390338e+05
Romney, Mitt                      5.542734e+07
Santorum, Rick                    3.351440e+06
Name: contb_receipt_amt, dtype: float64

#### 找到候选人的捐赠者中，捐赠金额最大的人的职业以及捐献额

In [14]:
# 找到捐赠的最大金额
max_account = df['contb_receipt_amt'].max()
max_account

1944042.43

In [15]:
# 找到捐赠金额最大的那一行数据
df_max = df.query('contb_receipt_amt == %s'%max_account)
df_max

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
176127,C00431445,P80003338,"Obama, Barack",OBAMA VICTORY FUND 2012 - UNITEMIZED,CHICAGO,IL,60680,,,1944042.43,2031-12-11,,X,*,SA18,763233,Democrat


In [16]:
df_max['contbr_occupation'] # 职业

176127    NaN
Name: contbr_occupation, dtype: object

In [17]:
df_max['contb_receipt_amt'] # 捐献额

176127    1944042.43
Name: contb_receipt_amt, dtype: float64