In [1]:
import numpy as np
import pandas as pd

## groupby

In [3]:
# Demo frame used throughout: two label columns (key1, key2) and two columns
# of standard-normal noise. No seed is set, so the numbers differ per run.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

Unnamed: 0,data1,data2,key1,key2
0,-1.304072,0.602548,a,one
1,0.89217,-0.386406,a,two
2,0.457713,-1.045518,b,one
3,1.123721,-0.057502,b,two
4,-1.136373,1.305967,a,one


In [20]:
# Group the rows by their key1 label and average each group.
# numeric_only=True keeps the string column key2 out of the mean: since
# pandas 2.0 a non-numeric column is no longer dropped silently and mean()
# raises TypeError instead.
grouped = df.groupby(df['key1'])
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.516091,0.50737
b,0.790717,-0.55151


In [21]:
# Same aggregation for a single column: group the data1 Series by the key1
# labels, giving one mean per label.
grouped = df['data1'].groupby(by=df['key1'])
grouped.mean()

key1
a   -0.516091
b    0.790717
Name: data1, dtype: float64

In [23]:
# Group by several keys at once: a list of key Series produces one group per
# (key1, key2) combination.
group_keys = [df['key1'], df['key2']]
grouped = df.groupby(by=group_keys)
grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.220222,0.954258
a,two,0.89217,-0.386406
b,one,0.457713,-1.045518
b,two,1.123721,-0.057502


In [25]:
# The grouping keys above were all Series, but any arrays of the appropriate
# length can serve as grouping keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby(by=[states, years]).mean()

California  2005    0.892170
            2006    0.457713
Ohio        2005   -0.090175
            2006   -1.136373
Name: data1, dtype: float64

In [27]:
# key2 does not appear in the result because df['key2'] is not numeric data.
# numeric_only=True requests that exclusion explicitly: since pandas 2.0 a
# non-numeric column is no longer dropped silently and mean() raises
# TypeError instead.
# All numeric columns are aggregated unless the selection is narrowed to a
# subset of them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.516091,0.50737
b,0.790717,-0.55151


## 对分组进行迭代

In [33]:
# Iterating over a GroupBy yields (group key, rows of that group) pairs.
for name, group in df.groupby(by='key1'):
    print(name, group, sep='\n')

a
      data1     data2 key1 key2
0 -1.304072  0.602548    a  one
1  0.892170 -0.386406    a  two
4 -1.136373  1.305967    a  one
b
      data1     data2 key1 key2
2  0.457713 -1.045518    b  one
3  1.123721 -0.057502    b  two


In [34]:
# The first element of each pair is the group key (the key1 value when
# grouping by key1 alone) and the second is the data belonging to that group.
# With two grouping keys the key is a tuple, unpacked here into k1 and k2.
for keys, group in df.groupby(by=['key1', 'key2']):
    k1, k2 = keys
    print('===k1,k2:')
    print(k1, k2)
    print('===k3:')
    print(group)

===k1,k2:
a one
===k3:
      data1     data2 key1 key2
0 -1.304072  0.602548    a  one
4 -1.136373  1.305967    a  one
===k1,k2:
a two
===k3:
     data1     data2 key1 key2
1  0.89217 -0.386406    a  two
===k1,k2:
b one
===k3:
      data1     data2 key1 key2
2  0.457713 -1.045518    b  one
===k1,k2:
b two
===k3:
      data1     data2 key1 key2
3  1.123721 -0.057502    b  two


In [35]:
# Collect the groups into a dictionary keyed by the key1 value.
piece = {key: rows for key, rows in df.groupby('key1')}
piece

{'a':       data1     data2 key1 key2
 0 -1.304072  0.602548    a  one
 1  0.892170 -0.386406    a  two
 4 -1.136373  1.305967    a  one, 'b':       data1     data2 key1 key2
 2  0.457713 -1.045518    b  one
 3  1.123721 -0.057502    b  two}

In [38]:
# groupby splits along the rows (axis=0) by default, but the columns can be
# grouped as well -- here by their dtype. The column labels are grouped via
# the transposed frame because groupby(..., axis=1) is deprecated since
# pandas 2.1.

grouped = df.T.groupby(df.dtypes)
# Take each group's columns from df itself rather than from the transposed
# frame, so every column keeps its original dtype.
{dtype: df[list(labels)] for dtype, labels in grouped.groups.items()}

{dtype('float64'):       data1     data2
 0 -1.304072  0.602548
 1  0.892170 -0.386406
 2  0.457713 -1.045518
 3  1.123721 -0.057502
 4 -1.136373  1.305967, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

## 选取一个或者一组列

In [46]:
# Display the full frame again as a reference for the column selections below.
df

Unnamed: 0,data1,data2,key1,key2
0,-1.304072,0.602548,a,one
1,0.89217,-0.386406,a,two
2,0.457713,-1.045518,b,one
3,1.123721,-0.057502,b,two
4,-1.136373,1.305967,a,one


In [52]:
# Mean of every non-key column for each (key1, key2) pair.
print(df.groupby(['key1', 'key2']).mean())

# With large data sets it is common to aggregate only some of the columns.
# Selecting a single column name yields a Series result.
print(df.groupby(['key1', 'key2'])['data2'].mean())
# Several columns must be selected with a list: the bare
# ['data2', 'data1'] form (an implicit tuple) was removed in pandas 2.0.
df.groupby(['key1', 'key2'])[['data2', 'data1']].mean()

              data1     data2
key1 key2                    
a    one  -1.220222  0.954258
     two   0.892170 -0.386406
b    one   0.457713 -1.045518
     two   1.123721 -0.057502
key1  key2
a     one     0.954258
      two    -0.386406
b     one    -1.045518
      two    -0.057502
Name: data2, dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,data2,data1
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.954258,-1.220222
a,two,-0.386406,0.89217
b,one,-1.045518,0.457713
b,two,-0.057502,1.123721


## 通过字典或者series进行分组

In [70]:
# 5x5 frame of standard-normal noise, indexed by name with columns a..e.
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=list('abcde'),
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

# Set a few values to NaN: columns b and c of the row at position 2 (Wes).
# .ix was removed in pandas 1.0, so the positional row slice is turned into
# labels and handed to .loc together with the column labels.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.836659,-0.155809,-0.260933,-0.479236,-0.747557
Steve,-0.750791,-0.672957,-0.830508,0.835848,0.327191
Wes,0.935452,,,0.26025,1.839458
Jim,0.97729,-0.223177,1.648929,-0.895358,-1.537907
Travis,0.838842,0.643993,0.130966,0.822127,0.882899


In [72]:
# Assign each column label to a colour group.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# Sum the columns of each colour group for every row. The column labels are
# grouped by transposing, grouping and transposing the result back, because
# groupby(..., axis=1) is deprecated since pandas 2.1.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-0.740169,-0.066708
Steve,0.00534,-1.096557
Wes,0.26025,2.77491
Jim,0.753572,-0.783794
Travis,0.953092,2.365734


In [78]:
# When the mapping is applied along the default row axis instead of the
# columns, the result only shows the column headers a b c d e.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
by_column = people.groupby(by=mapping)
by_column.sum()

Unnamed: 0,a,b,c,d,e


In [80]:
# A Series works as the grouping key in the same way as the dict.
map_series = pd.Series(mapping)
print(map_series)
# Count the non-NA values of each colour group for every row. The column
# labels are grouped via the transposed frame because groupby(..., axis=1)
# is deprecated since pandas 2.1.
people.T.groupby(map_series).count().T

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object


Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## 通过函数进行分组

In [86]:
# Compared with a dict or a Series, a Python function is a more flexible way
# to define the group mapping.
# A function passed as the grouping key is called once per index value and
# its return value is used as the group name.
# To group by the length of each person's name, simply pass len.
print(people)
people.groupby(by=len).sum()

               a         b         c         d         e
Joe     0.836659 -0.155809 -0.260933 -0.479236 -0.747557
Steve  -0.750791 -0.672957 -0.830508  0.835848  0.327191
Wes     0.935452       NaN       NaN  0.260250  1.839458
Jim     0.977290 -0.223177  1.648929 -0.895358 -1.537907
Travis  0.838842  0.643993  0.130966  0.822127  0.882899


Unnamed: 0,a,b,c,d,e
3,2.749401,-0.378987,1.387996,-1.114343,-0.446006
5,-0.750791,-0.672957,-0.830508,0.835848,0.327191
6,0.838842,0.643993,0.130966,0.822127,0.882899


In [87]:
# Functions can be mixed freely with arrays, lists, dicts and Series as
# grouping keys, because everything is converted to arrays in the end.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby(by=[len, key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1.772111,-0.155809,-0.260933,-0.218986,1.091901
3,two,0.97729,-0.223177,1.648929,-0.895358,-1.537907
5,one,-0.750791,-0.672957,-0.830508,0.835848,0.327191
6,two,0.838842,0.643993,0.130966,0.822127,0.882899


## 根据索引级别进行分组

In [88]:
# The most convenient aspect of hierarchical indexing is that data can be
# aggregated by index level.
# To do so, pass the level number or level name through the level keyword.
columns = pd.MultiIndex.from_arrays(
    [
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.841506,-0.318683,-0.727548,-0.102212,1.135224
1,0.656744,-0.371614,-0.186148,-0.991809,0.9449
2,-0.499534,0.083669,-0.464316,-0.131412,-1.253453
3,-0.496822,1.911844,-1.335649,0.436272,1.962672


In [89]:
# Count the non-NA values per country (column level 'cty') in every row.
# The column level is grouped via the transposed frame because
# groupby(..., axis=1) is deprecated since pandas 2.1.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# 数据聚合

### 调用自定义的聚合函数

In [110]:
# Show the frame, then walk through the data1 values of each key1 group.
print(df)
grouped = df['data1'].groupby(by=df['key1'])
for name, group in grouped:
    print('===name:', name, '===group:', group, sep='\n')

# Custom aggregation function
def peak_to_peak(arr):
    """Return the range of ``arr``: its maximum minus its minimum.

    ``arr`` may be any object that provides ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the custom function to every group; agg is an alias of aggregate.
grouped.aggregate(peak_to_peak)

      data1     data2 key1 key2
0 -1.304072  0.602548    a  one
1  0.892170 -0.386406    a  two
2  0.457713 -1.045518    b  one
3  1.123721 -0.057502    b  two
4 -1.136373  1.305967    a  one
===name:
a
===group:
0   -1.304072
1    0.892170
4   -1.136373
Name: data1, dtype: float64
===name:
b
===group:
2    0.457713
3    1.123721
Name: data1, dtype: float64


key1
a    2.196242
b    0.666009
Name: data1, dtype: float64

### 面向列的多函数应用

In [None]:
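# Common aggregation functions: count, sum, mean, median, std (standard deviation), var (variance),
# min, max, prod (product of non-NA values), first (first non-NA value), last (last non-NA value)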

In [112]:
# 对Series或者DataFrame列的聚合运算实际是使用aggregate或者调用mean，std等方法。
# 下面对不同的列使用不同的聚合函数，或者一次应用多个函数