In [35]:
import pandas as pd
import numpy as np

In [36]:
# pandas provides a groupby interface for slicing, dicing, and summarizing datasets.

# 10.1 GroupBy mechanics

In [37]:
# Demo frame: two label columns (key1, key2) to group on and two columns of
# random floats (data1, data2) to aggregate.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.578945,-0.422153
1,a,two,-0.304687,-0.483221
2,b,one,-0.121286,1.184114
3,b,two,-1.60414,0.835268
4,a,one,1.044308,0.481501


In [38]:
# Goal: compute the mean of the data1 column for each key1 label.
# Approach: select data1 and call groupby on it with the key1 column as the
# grouping key; aggregating the grouped object yields a Series.
grouped = df['data1'].groupby(by=df['key1'])
grouped.mean()

key1
a   -0.279775
b   -0.862713
Name: data1, dtype: float64

In [39]:
# Group by two keys at once: the resulting Series carries a hierarchical
# index made of the unique key pairs.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.267318
      two    -0.304687
b     one    -0.121286
      two    -1.604140
Name: data1, dtype: float64

In [40]:
# Average every numeric column within each key1 group.
# df['key2'] holds strings, not numeric data, so it is a nuisance column and
# is left out of the result. Since pandas 2.0 such columns are no longer
# dropped silently (mean() raises TypeError on them), so request the numeric
# columns explicitly.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.279775,-0.141291
b,-0.862713,1.009691


In [41]:
# size() returns a Series holding the number of rows in each group.
# Missing values in the grouping are excluded from the result.
df.groupby(by='key1').size()

key1
a    3
b    2
dtype: int64

### 10.1.1 遍历各分组
GroupBy对象支持迭代，每次迭代会生成一个由组名和对应数据块组成的二元组.

In [42]:
# Each iteration yields a (group name, chunk of data) pair; print both,
# one per line.
for name, group in df.groupby(by='key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -1.578945 -0.422153
1    a  two -0.304687 -0.483221
4    a  one  1.044308  0.481501
b
  key1 key2     data1     data2
2    b  one -0.121286  1.184114
3    b  two -1.604140  0.835268


In [43]:
# 多个分组键情况下，元组的第一个元素是键值的元组.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -1.578945 -0.422153
4    a  one  1.044308  0.481501
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.304687 -0.483221
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.121286  1.184114
('b', 'two')
  key1 key2    data1     data2
3    b  two -1.60414  0.835268


In [44]:
# Grouping can be done along either axis; here the columns are grouped by dtype.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of
# the transposed frame instead and transpose each chunk back for display.
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    print(dtype)
    # Transposing a mixed-dtype frame produces object columns; infer_objects()
    # restores the numeric dtypes so each chunk prints like the original columns.
    print(group.T.infer_objects())

float64
      data1     data2
0 -1.578945 -0.422153
1 -0.304687 -0.483221
2 -0.121286  1.184114
3 -1.604140  0.835268
4  1.044308  0.481501
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 10.1.2 选择一列或所有列的子集
对从DataFrame创建的GroupBy对象使用列名或列名数组进行索引，相当于选取要参与聚合的列子集.

In [45]:
# Explicit form: group the data1 Series by the key1 column.
df['data1'].groupby(by=df['key1'])
# Syntactic sugar 🔽: index the frame's GroupBy object with the column name.
df.groupby(by='key1')['data1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x120a63350>

In [46]:
# Mean of the data2 column only, for every (key1, key2) combination.
df.groupby(by=['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.029674
      two    -0.483221
b     one     1.184114
      two     0.835268
Name: data2, dtype: float64

### 10.1.3 使用字典和Series分组

In [47]:
# Grouping information may exist in a form other than an array.

In [48]:
# 5x5 frame of random floats: one row per person, columns 'a' through 'e'.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Introduce missing values: blank out columns 'b' and 'c' of the 'Wes' row.
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.157354,0.876774,-0.60372,-1.481119,-0.434218
Steve,-1.466986,0.658311,0.618929,-0.307087,-0.655534
Wes,0.373463,,,-0.386314,-0.735979
Jim,-2.745109,0.180282,-0.582819,-0.441969,0.740627
Travis,0.870106,0.633136,0.128452,1.91885,-1.008746


In [49]:
# Given a mapping from column name to group name, sum the columns group by group.
# The key 'f' matches no column of people; unused grouping keys are fine.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

# groupby(..., axis=1) is deprecated since pandas 2.1: group the rows of the
# transposed frame instead, then transpose the aggregate back so the result
# keeps one row per person and one column per group.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-2.08484,-0.714798
Steve,0.311843,-1.464209
Wes,-0.386314,-0.362516
Jim,-1.024788,-1.824199
Travis,2.047302,0.494496


In [50]:
# A Series offers the same functionality: it is treated as a fixed-size mapping.
map_series = pd.Series(mapping)
# groupby(..., axis=1) is deprecated since pandas 2.1: group the rows of the
# transposed frame, count the non-null values, and transpose back so the
# result keeps one row per person and one column per group.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 10.1.4 使用函数分组