In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Seeded generator so that Restart & Run All reproduces the same random columns.
rng = np.random.default_rng(seed=42)

# Toy frame: two categorical key columns and two columns of standard-normal noise.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})

In [3]:
# Show the whole 5-row frame: two key columns plus two random numeric columns.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.026159,-0.198051
1,a,two,-1.475368,-0.092221
2,b,one,-1.200452,1.59032
3,b,two,-0.336121,0.620681
4,a,one,-0.581891,-1.008933


In [4]:
# Group the data1 values by the labels in key1; this only builds the grouping object.
grouped = df['data1'].groupby(df['key1'])

In [5]:
# Echoing it shows a SeriesGroupBy object, not aggregated values.
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016DEC3ED1C0>

In [6]:
# Mean of data1 within each key1 group; the result is indexed by the group keys.
grouped.mean()

key1
a   -0.694473
b   -0.768286
Name: data1, dtype: float64

In [7]:
# Passing a list of key Series groups by every (key1, key2) combination that occurs.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()

In [8]:
# The result is a Series with a two-level (key1, key2) index.
means

key1  key2
a     one    -0.304025
      two    -1.475368
b     one    -1.200452
      two    -0.336121
Name: data1, dtype: float64

In [9]:
# Pivot the inner index level (key2) into columns: rows are key1, columns are key2.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.304025,-1.475368
b,-1.200452,-0.336121


In [10]:
# Group keys need not come from the frame: this external array has one label per row of df.
states = np.array(['ohio','california','california','ohio','ohio'])

In [11]:
# Second external key array, also one entry per row of df.
years = np.array([2005,2005,2006,2005,2006])

In [12]:
# Group data1 by the two external arrays (state, year) instead of by columns of df.
df['data1'].groupby([states,years]).mean()

california  2005   -1.475368
            2006   -1.200452
ohio        2005   -0.181140
            2006   -0.581891
Name: data1, dtype: float64

In [13]:
# key2 is a non-numeric column that is not a grouping key. pandas >= 2.0 raises a
# TypeError for it instead of silently dropping it, so ask for numeric columns explicitly.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.694473,-0.433068
b,-0.768286,1.105501


In [14]:
# Group by both key columns; only the numeric data1/data2 columns are left to average.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.304025,-0.603492
a,two,-1.475368,-0.092221
b,one,-1.200452,1.59032
b,two,-0.336121,0.620681


In [15]:
# size() gives the number of rows in each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 遍历各分组

In [16]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.026159 -0.198051
1    a  two -1.475368 -0.092221
4    a  one -0.581891 -1.008933
b
  key1 key2     data1     data2
2    b  one -1.200452  1.590320
3    b  two -0.336121  0.620681


In [18]:
# With several grouping keys, each group key is a tuple that can be unpacked in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.026159 -0.198051
4    a  one -0.581891 -1.008933
('a', 'two')
  key1 key2     data1     data2
1    a  two -1.475368 -0.092221
('b', 'one')
  key1 key2     data1    data2
2    b  one -1.200452  1.59032
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.336121  0.620681


In [19]:
# Materialise the groups as a {group key: sub-frame} dictionary.
pieces = {name: group for name, group in df.groupby('key1')}

In [20]:
# Pull out the sub-frame holding the rows where key1 == 'b'.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.200452,1.59032
3,b,two,-0.336121,0.620681


In [21]:
# Column dtypes: the two key columns are object, the two data columns float64.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [22]:
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so split the
# columns by dtype explicitly instead of grouping along the column axis.
# `grouped` is a list of (dtype, sub-frame) pairs ordered by dtype name, so a
# `for dtype, group in grouped` loop behaves as it did with the GroupBy object.
grouped = [
    (dtype, df.loc[:, [col_dtype == dtype for col_dtype in df.dtypes]])
    for dtype in sorted(df.dtypes.unique(), key=str)
]

In [23]:
# Each item pairs a dtype with the sub-frame of columns that have that dtype.
for dtype, group in grouped:
    print(dtype, group, sep='\n')

float64
      data1     data2
0 -0.026159 -0.198051
1 -1.475368 -0.092221
2 -1.200452  1.590320
3 -0.336121  0.620681
4 -0.581891 -1.008933
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 选择一列或所有列的子集

In [24]:
# Indexing a GroupBy with a column name selects that column for aggregation;
# df.groupby('key1')['data1'] is shorthand for df['data1'].groupby(df['key1']).
# NOTE: the result of the first line is discarded; a cell only echoes its last expression.
df.groupby('key1')['data1']
df.groupby('key2')['data2']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016DEFFEA730>

In [27]:
# Indexing with a list ([['data2']]) keeps the aggregated result a DataFrame.
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.603492
a,two,-0.092221
b,one,1.59032
b,two,0.620681


In [28]:
# Indexing with a single column name gives a SeriesGroupBy instead.
s_grouped = df.groupby(['key1','key2'])['data2']

In [29]:
# Echo to confirm the type: SeriesGroupBy.
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016DEC48C040>

In [30]:
# Mean of data2 per (key1, key2) group, returned as a Series named data2.
s_grouped.mean()

key1  key2
a     one    -0.603492
      two    -0.092221
b     one     1.590320
      two     0.620681
Name: data2, dtype: float64

### 使用字典和Series分组

In [31]:
# Five people x five columns of standard-normal noise, drawn from a seeded
# generator so the values are reproducible across kernel restarts.
people = pd.DataFrame(
    np.random.default_rng(seed=7).standard_normal((5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['joe', 'steve', 'wes', 'jim', 'travis'],
)

In [32]:
# Blank out columns b and c for 'wes' (row position 2) to introduce missing values.
people.loc['wes', ['b', 'c']] = np.nan

In [33]:
# Inspect the frame; row 'wes' now has NaN in columns b and c.
people

Unnamed: 0,a,b,c,d,e
joe,1.116883,-0.648266,0.458952,-1.740406,-1.422877
steve,-0.63329,0.851748,0.204577,0.987399,-1.952328
wes,0.591222,,,-0.462847,1.606959
jim,-0.351454,-0.044942,0.569967,-0.054435,-0.052792
travis,0.992347,1.951524,1.40569,0.531022,-0.353634


In [34]:
# Column name -> colour group. 'f' has no matching column in `people`.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [36]:
by_column = people.groupby(mapping,axis=1)

In [37]:
by_column.sum()

Unnamed: 0,blue,red
joe,-1.281454,-0.95426
steve,1.191975,-1.73387
wes,-0.462847,2.198181
jim,0.515532,-0.449187
travis,1.936712,2.590237


In [38]:
# The same column -> colour mapping as a Series (index = column names).
map_series = pd.Series(mapping)

In [39]:
# Echo the Series; it has one entry per mapping key, including 'f'.
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [41]:
# A Series can serve as the group mapping just like a dict. groupby(axis=1) is
# deprecated since pandas 2.1, so group the transposed frame and flip the result back.
# count() tallies the non-NaN values per colour group for each person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
joe,2,3
steve,2,3
wes,1,2
jim,2,3
travis,2,3


### 使用函数分组