In [1]:
import numpy as np
import pandas as pd


In [2]:
# Toy frame: two label columns (key, key2) plus two columns of random normals.
# No seed is set in the cells above, so data1/data2 differ on every run.
df = pd.DataFrame({
    'key': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

In [3]:
df

Unnamed: 0,data1,data2,key,key2
0,-1.25234,-0.08941,a,one
1,-1.591322,-0.432497,a,two
2,0.680342,1.19238,b,one
3,0.360603,-0.163459,b,two
4,1.397283,-0.070584,a,one


In [4]:
# Group the data1 column by the values of the key column. The result is a
# SeriesGroupBy object; aggregations such as .mean() are applied to it afterwards.
grouped = df['data1'].groupby(df['key'])

In [5]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f4c21169a20>

In [6]:
grouped.mean()

key
a   -0.482126
b    0.520472
Name: data1, dtype: float64

In [7]:
grouped.sum()

key
a   -1.446379
b    1.040945
Name: data1, dtype: float64

In [8]:
# Group data1 by two key Series at once; the result carries a two-level
# (key, key2) index.
means = (
    df['data1']
    .groupby([df['key'], df['key2']])
    .mean()
)

In [9]:
means

key  key2
a    one     0.072471
     two    -1.591322
b    one     0.680342
     two     0.360603
Name: data1, dtype: float64

In [10]:
means.unstack()

key2,one,two
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.072471,-1.591322
b,0.680342,0.360603


In [14]:
# One state label per row of df (five entries).
states = np.array(['ohio', 'california', 'california', 'ohio', 'ohio'])

In [15]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [16]:
# Group keys need not be columns of the frame: here two standalone arrays
# (states, years), each with one entry per row, are used as the keys.
df['data1'].groupby([states, years]).mean()

california  2005   -1.591322
            2006    0.680342
ohio        2005   -0.445869
            2006    1.397283
Name: data1, dtype: float64

In [17]:
# Restrict the mean to numeric columns: since pandas 2.0 a non-numeric column
# such as key2 makes a plain .mean() raise TypeError instead of being dropped.
df.groupby('key').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.482126,-0.197497
b,0.520472,0.514461


In [18]:
df.groupby(['key', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.072471,-0.079997
a,two,-1.591322,-0.432497
b,one,0.680342,1.19238
b,two,0.360603,-0.163459


In [19]:
df.groupby(['key', 'key2']).size()

key  key2
a    one     2
     two     1
b    one     1
     two     1
dtype: int64

### Iterating Over Groups

In [20]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key'):
    print(name)
    print(group)

a
      data1     data2 key key2
0 -1.252340 -0.089410   a  one
1 -1.591322 -0.432497   a  two
4  1.397283 -0.070584   a  one
b
      data1     data2 key key2
2  0.680342  1.192380   b  one
3  0.360603 -0.163459   b  two


In [21]:
# With several grouping keys the group key is a tuple, unpacked here as (k1, k2).

for (k1, k2), group in df.groupby(['key', 'key2']):
    print(k1, k2)
    print(group)

a one
      data1     data2 key key2
0 -1.252340 -0.089410   a  one
4  1.397283 -0.070584   a  one
a two
      data1     data2 key key2
1 -1.591322 -0.432497   a  two
b one
      data1    data2 key key2
2  0.680342  1.19238   b  one
b two
      data1     data2 key key2
3  0.360603 -0.163459   b  two


In [22]:
# Materialise the groups as a plain dict: group key -> sub-DataFrame.

pieces = {label: frame for label, frame in df.groupby('key')}

In [23]:
pieces

{'a':       data1     data2 key key2
 0 -1.252340 -0.089410   a  one
 1 -1.591322 -0.432497   a  two
 4  1.397283 -0.070584   a  one, 'b':       data1     data2 key key2
 2  0.680342  1.192380   b  one
 3  0.360603 -0.163459   b  two}

In [24]:
pieces['a']

Unnamed: 0,data1,data2,key,key2
0,-1.25234,-0.08941,a,one
1,-1.591322,-0.432497,a,two
4,1.397283,-0.070584,a,one


### Selecting a Column or Subset of Columns

In [25]:
# Selecting with a list ([['data2']]) keeps the aggregated result a DataFrame;
# selecting with a bare 'data2' gives a Series instead.
df.groupby(['key', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key,key2,Unnamed: 2_level_1
a,one,-0.079997
a,two,-0.432497
b,one,1.19238
b,two,-0.163459


In [26]:
df.groupby(['key', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.072471,-0.079997
a,two,-1.591322,-0.432497
b,one,0.680342,1.19238
b,two,0.360603,-0.163459


In [27]:
s_grouped = df.groupby(['key', 'key2'])['data2']

In [28]:
s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f4c1e65bc50>

In [29]:
s_grouped.mean()

key  key2
a    one    -0.079997
     two    -0.432497
b    one     1.192380
     two    -0.163459
Name: data2, dtype: float64

### Grouping with Dicts and Series

In [30]:
# 5x5 frame of random normals: rows labelled by name, columns labelled a-e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['joe', 'steve', 'wes', 'jim', 'travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [31]:
people

Unnamed: 0,a,b,c,d,e
joe,0.084323,-0.287388,-0.336083,0.595868,1.038707
steve,-1.141974,0.985471,1.11958,2.013118,-0.06006
wes,1.416431,0.803591,0.073749,0.062828,-0.0104
jim,0.443071,0.602317,0.278408,-0.501055,1.214699
travis,-0.926399,-0.175236,0.845821,-0.413699,0.35966


In [32]:
# Blank out columns b and c (positions 1-2) of the third row to introduce
# some missing values.
people.iloc[2:3, 1:3] = np.nan

In [33]:
people

Unnamed: 0,a,b,c,d,e
joe,0.084323,-0.287388,-0.336083,0.595868,1.038707
steve,-1.141974,0.985471,1.11958,2.013118,-0.06006
wes,1.416431,,,0.062828,-0.0104
jim,0.443071,0.602317,0.278408,-0.501055,1.214699
travis,-0.926399,-0.175236,0.845821,-0.413699,0.35966


In [34]:
# Column label -> group label. 'f' has no matching column in people (a-e).
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [35]:
# Group the columns of people by the colour each column label maps to.
# NOTE(review): groupby(..., axis=1) is deprecated as of pandas 2.1; on newer
# versions group the transposed frame instead (results then come back transposed).
by_columns = people.groupby(mapping, axis=1)

In [37]:
by_columns.sum()

Unnamed: 0,blue,red
joe,0.259786,0.835643
steve,3.132698,-0.216562
wes,0.062828,1.406031
jim,-0.222647,2.260088
travis,0.432122,-0.741974


In [38]:
map_series = pd.Series(mapping)

In [39]:
# groupby(..., axis=1) is deprecated (pandas 2.1+), so group the transposed
# frame by the mapping Series and transpose the counts back. The result has
# the same layout as before: one row per person, one column per colour.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
joe,2,3
steve,2,3
wes,1,2
jim,2,3
travis,2,3


### Grouping with Functions

In [40]:
# A function used as the group key is called once per index label; here the
# rows are grouped by the length of each name in the index.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.943825,0.31493,-0.057675,0.157641,2.243007
5,-1.141974,0.985471,1.11958,2.013118,-0.06006
6,-0.926399,-0.175236,0.845821,-0.413699,0.35966


In [41]:
# One label per row of people, used as a second grouping key.
key_list = ['one', 'one', 'one', 'two', 'one']

In [42]:
people.groupby([len, key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1.500754,-0.287388,-0.336083,0.658696,1.028308
3,two,0.443071,0.602317,0.278408,-0.501055,1.214699
5,one,-1.141974,0.985471,1.11958,2.013118,-0.06006
6,one,-0.926399,-0.175236,0.845821,-0.413699,0.35966


In [43]:
people

Unnamed: 0,a,b,c,d,e
joe,0.084323,-0.287388,-0.336083,0.595868,1.038707
steve,-1.141974,0.985471,1.11958,2.013118,-0.06006
wes,1.416431,,,0.062828,-0.0104
jim,0.443071,0.602317,0.278408,-0.501055,1.214699
travis,-0.926399,-0.175236,0.845821,-0.413699,0.35966


### Grouping by Index Levels

In [44]:
# Two-level column index: outer level 'cty', inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)

In [45]:
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

In [46]:
# Four rows of random normals under the two-level (cty, tenor) column index.
hier_df = pd.DataFrame(
    data=np.random.randn(4, 5),
    columns=columns,
)

In [47]:
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,2.286029,-0.822318,-1.652267,-0.211199,0.493685
1,1.973289,-0.275978,0.654526,-1.46617,-0.057662
2,0.062142,0.428799,1.726074,1.003019,1.035389
3,0.366552,1.548622,0.251311,-1.113539,0.640776
