In [2]:
import pandas as pd
import numpy as np

In [3]:
# Seed NumPy's global random generator so the random columns below (and every
# result computed from them) are the same on each Restart & Run All.
np.random.seed(12345)

# Demo frame: two label columns (key1, key2) to group on and two columns of
# standard-normal draws (data1, data2) to aggregate.
df = pd.DataFrame({'key1':['a','a','b','b','a'],
                   'key2':['one','two','one','two','one'],
                   'data1':np.random.randn(5),
                   'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.709919,-0.331395
1,a,two,-0.263051,-0.133493
2,b,one,-2.145169,-0.086369
3,b,two,-0.9457,0.449262
4,a,one,1.514196,0.797068


In [4]:
# Split the data1 column into groups using the labels held in key1.
# Nothing is computed yet: the result is a lazy SeriesGroupBy object.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C0B782B940>

In [5]:
# Average of data1 within each key1 group; the result is indexed by key1.
grouped.mean()

key1
a    0.180408
b   -1.545434
Name: data1, dtype: float64

In [6]:
# Passing a list of key Series groups on both at once; the aggregated
# Series is indexed by the (key1, key2) pairs that occur in the data.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one     0.402138
      two    -0.263051
b     one    -2.145169
      two    -0.945700
Name: data1, dtype: float64

In [7]:
# Pivot the inner index level (key2) of `means` into columns, giving a
# key1 x key2 table of group means.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.402138,-0.263051
b,-2.145169,-0.9457


In [8]:
# Group keys do not have to be columns of the frame: here two standalone
# arrays, one label per row of df, are used as the keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
(
    df['data1']
    .groupby(by=[states, years])
    .mean()
)

California  2005   -0.263051
            2006   -2.145169
Ohio        2005   -0.827809
            2006    1.514196
Name: data1, dtype: float64

In [9]:
# Per-key1 mean of the numeric columns only. The non-numeric key2 column must
# be excluded explicitly: older pandas dropped it silently (with a warning),
# while pandas 2.0+ raises TypeError when asked to average strings.
df.groupby('key1').mean(numeric_only=True)

  df.groupby('key1').mean()


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.180408,0.110727
b,-1.545434,0.181446


In [10]:
# Number of rows that fall into each (key1, key2) group.
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 遍历各分组

In [11]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print the key and
# then its rows, each on its own line.
for name, group in df.groupby(by='key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.709919 -0.331395
1    a  two -0.263051 -0.133493
4    a  one  1.514196  0.797068
b
  key1 key2     data1     data2
2    b  one -2.145169 -0.086369
3    b  two -0.945700  0.449262


In [12]:
# With several keys the group key is a tuple, unpacked here into k1 and k2
# before being printed ahead of the rows belonging to that pair.
for (k1, k2), group in df.groupby(by=['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.709919 -0.331395
4    a  one  1.514196  0.797068
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.263051 -0.133493
('b', 'one')
  key1 key2     data1     data2
2    b  one -2.145169 -0.086369
('b', 'two')
  key1 key2   data1     data2
3    b  two -0.9457  0.449262


In [13]:
# Materialise the GroupBy iterator as a list of (key, sub-frame) tuples.
[*df.groupby(by='key1')]

[('a',
    key1 key2     data1     data2
  0    a  one -0.709919 -0.331395
  1    a  two -0.263051 -0.133493
  4    a  one  1.514196  0.797068),
 ('b',
    key1 key2     data1     data2
  2    b  one -2.145169 -0.086369
  3    b  two -0.945700  0.449262)]

In [14]:
# Map each key1 value to the sub-frame holding its rows, then look up the
# rows for key1 == 'b'.
pieces = {key: frame for key, frame in df.groupby(by='key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-2.145169,-0.086369
3,b,two,-0.9457,0.449262


In [15]:
# Dtype of every column of df, as a Series indexed by column name.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [16]:
# Group the *columns* of df by their dtype. `groupby(..., axis=1)` is
# deprecated (pandas 2.1) and removed in later releases, so group the rows of
# the transposed frame instead: its index is df's column labels, which aligns
# with the index of df.dtypes.
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    print(dtype)
    # Transpose back to the original orientation. Transposing a mixed-dtype
    # frame yields object columns, so infer_objects() restores float64 where
    # the values allow it.
    print(group.T.infer_objects())

float64
      data1     data2
0 -0.709919 -0.331395
1 -0.263051 -0.133493
2 -2.145169 -0.086369
3 -0.945700  0.449262
4  1.514196  0.797068
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


#### 选择一列或所有列的子集

In [17]:
# Selecting with a list of column names keeps the result a DataFrame
# (a single data2 column) indexed by the (key1, key2) pairs.
(
    df.groupby(by=['key1', 'key2'])[['data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.232836
a,two,-0.133493
b,one,-0.086369
b,two,0.449262


In [18]:
# Selecting with a single column name gives a SeriesGroupBy over data2,
# still grouped by key1 and key2.
s_grouped = df.groupby(by=['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C0B8A4D7C0>

In [19]:
# Mean of data2 for every (key1, key2) pair, returned as a Series.
s_grouped.mean()

key1  key2
a     one     0.232836
      two    -0.133493
b     one    -0.086369
      two     0.449262
Name: data2, dtype: float64

#### 使用字典和Series分组

In [21]:
# 5 x 5 frame of standard-normal draws: one row per person, columns a-e.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,-0.818818,-1.192103,-0.285353,1.274489,0.391745
Steve,0.412308,-0.019038,-0.132553,-1.107406,0.027489
Wes,0.46283,0.127425,0.163989,-0.914873,-0.412965
Jim,-0.680904,-0.096289,-1.434905,1.460374,-0.274097
Travis,1.048609,-0.034832,0.07032,-0.46528,0.271174
