In [1]:
import numpy as np
import pandas as pd


In [2]:
# Draw the random columns from a locally seeded generator so the frame (and
# every aggregate computed from it below) is identical on each
# Restart & Run All. The outputs saved in this notebook came from an unseeded
# run and will differ until the notebook is re-executed.
rng = np.random.RandomState(42)

# Two categorical key columns plus two numeric columns to aggregate.
df = pd.DataFrame({
    'key': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': rng.randn(5),
    'data2': rng.randn(5),
})

In [3]:
# Show the whole frame (only 5 rows) so the group results below can be checked by eye.
df

Unnamed: 0,data1,data2,key,key2
0,-1.25234,-0.08941,a,one
1,-1.591322,-0.432497,a,two
2,0.680342,1.19238,b,one
3,0.360603,-0.163459,b,two
4,1.397283,-0.070584,a,one


In [4]:
# Split the 'data1' column into groups, using the values of the 'key' column
# as group labels. This only builds the GroupBy object; an aggregation such as
# .mean() or .sum() is applied in the following cells.
grouped = df['data1'].groupby(df['key'])

In [5]:
# Displaying the GroupBy object shows only its repr, not any grouped data.
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f4c21169a20>

In [6]:
# Mean of data1 within each 'key' group; the result is a Series indexed by key.
grouped.mean()

key
a   -0.482126
b    0.520472
Name: data1, dtype: float64

In [7]:
# Sum of data1 within each 'key' group; same index as the mean above.
grouped.sum()

key
a   -1.446379
b    1.040945
Name: data1, dtype: float64

In [8]:
# Group data1 by two key Series at once: passing a list of keys produces one
# group per (key, key2) combination, so the mean is indexed by both levels.

means = df['data1'].groupby([df['key'], df['key2']]).mean()

In [9]:
# Series with a two-level (key, key2) index, one value per observed combination.
means

key  key2
a    one     0.072471
     two    -1.591322
b    one     0.680342
     two     0.360603
Name: data1, dtype: float64

In [10]:
# Pivot the inner index level (key2) into columns: rows are key, columns are key2.
means.unstack()

key2,one,two
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.072471,-1.591322
b,0.680342,0.360603


In [14]:
# One state label per row of df (aligned by position), used as an external group key.
states = np.array(['ohio', 'california', 'california', 'ohio', 'ohio'])

In [15]:
# One year per row of df (aligned by position), used as a second external group key.
years = np.array([2005, 2005, 2006, 2005, 2006])

In [16]:
# The group keys here are the standalone NumPy arrays `states` and `years`
# rather than columns of df; each row of data1 is assigned to the
# (state, year) pair found at the same position in those arrays.
df['data1'].groupby([states, years]).mean()

california  2005   -1.591322
            2006    0.680342
ohio        2005   -0.445869
            2006    1.397283
Name: data1, dtype: float64

In [17]:
# Mean of every numeric column per 'key' group, grouping by column name.
# numeric_only=True excludes the non-key string column 'key2' explicitly:
# pandas >= 2.0 no longer drops such columns silently and raises TypeError on
# a bare .mean(). The result (data1 and data2 only) is the same on older pandas.
df.groupby('key').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.482126,-0.197497
b,0.520472,0.514461


In [18]:
# Group by both key columns by name; both string columns become index levels,
# so only the numeric columns data1 and data2 are averaged.
df.groupby(['key', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.072471,-0.079997
a,two,-1.591322,-0.432497
b,one,0.680342,1.19238
b,two,0.360603,-0.163459


In [19]:
# Number of rows in each (key, key2) group.
df.groupby(['key', 'key2']).size()

key  key2
a    one     2
     two     1
b    one     1
     two     1
dtype: int64

### Iterating Over Groups

In [20]:
# Iterating a GroupBy yields (group label, sub-DataFrame) pairs.
# Print the label on its own line, followed by the rows of that group.
for name, group in df.groupby('key'):
    print(name, group, sep='\n')

a
      data1     data2 key key2
0 -1.252340 -0.089410   a  one
1 -1.591322 -0.432497   a  two
4  1.397283 -0.070584   a  one
b
      data1     data2 key key2
2  0.680342  1.192380   b  one
3  0.360603 -0.163459   b  two


In [21]:
# With two grouping columns the group label is a (key, key2) tuple,
# which is unpacked directly in the loop header.

for (k1, k2), group in df.groupby(['key', 'key2']):
    print('{} {}'.format(k1, k2), group, sep='\n')

a one
      data1     data2 key key2
0 -1.252340 -0.089410   a  one
4  1.397283 -0.070584   a  one
a two
      data1     data2 key key2
1 -1.591322 -0.432497   a  two
b one
      data1    data2 key key2
2  0.680342  1.19238   b  one
b two
      data1     data2 key key2
3  0.360603 -0.163459   b  two


In [22]:
# Materialise the groups as a plain dict mapping group label -> sub-DataFrame.

pieces = {label: rows for label, rows in df.groupby('key')}

In [23]:
# The dict has one entry per 'key' value ('a' and 'b'), each holding that group's rows.
pieces

{'a':       data1     data2 key key2
 0 -1.252340 -0.089410   a  one
 1 -1.591322 -0.432497   a  two
 4  1.397283 -0.070584   a  one, 'b':       data1     data2 key key2
 2  0.680342  1.192380   b  one
 3  0.360603 -0.163459   b  two}

In [24]:
# Look up a single group by its label; the value is an ordinary DataFrame.
pieces['a']

Unnamed: 0,data1,data2,key,key2
0,-1.25234,-0.08941,a,one
1,-1.591322,-0.432497,a,two
4,1.397283,-0.070584,a,one


### Selecting a Column or Subset of Columns

In [25]:
# Index the GroupBy with a list of column names to aggregate only those columns.
# The double brackets pass a one-element list, so the result is a DataFrame
# with a single 'data2' column.
df.groupby(['key', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key,key2,Unnamed: 2_level_1
a,one,-0.079997
a,two,-0.432497
b,one,1.19238
b,two,-0.163459


In [26]:
# Without a column selection every numeric column (data1 and data2) is aggregated.
# NOTE(review): this is the same expression as an earlier cell; presumably it is
# repeated here for contrast with the data2-only selection — confirm it is intentional.
df.groupby(['key', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.072471,-0.079997
a,two,-1.591322,-0.432497
b,one,0.680342,1.19238
b,two,0.360603,-0.163459
