In [1]:
import numpy as np
import pandas as pd

In [3]:
# create a data frame with two key columns (k1, k2) and two columns of
# normally distributed random values
# a seeded generator keeps the random columns identical on every
# Restart & Run All; the values will differ from any earlier unseeded run
rng = np.random.RandomState(42)
df1 = pd.DataFrame({'k1':list('XXYYZ'),
                    'k2':['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1':rng.randn(5),
                    'dataset2':rng.randn(5)})
df1

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.059563,-0.153346,X,alpha
1,-1.314812,-1.033353,X,beta
2,0.063592,0.521678,Y,alpha
3,-0.574193,0.530819,Y,beta
4,-1.070406,2.09992,Z,alpha


In [6]:
# split the dataset1 column into groups keyed by the k1 column
group1 = df1.groupby('k1')['dataset1']
# evaluating the group object only shows its repr, not the grouped data
group1

<pandas.core.groupby.SeriesGroupBy object at 0x00000168C8DA3588>

In [7]:
# mean of dataset1 within each k1 group
group1.mean()

k1
X   -0.127624
Y   -0.255301
Z   -1.070406
Name: dataset1, dtype: float64

In [8]:
# two parallel arrays of five labels each: cities first, months second
cities, months = (
    np.array(labels)
    for labels in (['NY', 'LA', 'LA', 'NY', 'NY'],
                   ['Jan', 'Feb', 'Jan', 'Feb', 'Jan'])
)

In [9]:
# average dataset1 for every (city, month) combination, using the two
# label arrays as the grouping keys
df1.groupby([cities, months])['dataset1'].mean()

LA  Feb   -1.314812
    Jan    0.063592
NY  Feb   -0.574193
    Jan   -0.005422
Name: dataset1, dtype: float64

In [12]:
# group both datasets by k1 and average them
# numeric_only=True skips the string column k2; newer pandas releases raise
# a TypeError when asked to average a non-numeric column instead of
# silently dropping it
df1.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.127624,-0.593349
Y,-0.255301,0.526248
Z,-1.070406,2.09992


In [13]:
# group by the (k1, k2) pair and average the remaining columns
# (dataset1 and dataset2) within each pair
df1.groupby(['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.059563,-0.153346
X,beta,-1.314812,-1.033353
Y,alpha,0.063592,0.521678
Y,beta,-0.574193,0.530819
Z,alpha,-1.070406,2.09992


In [15]:
# number of rows that fall into each k1 group
df1.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [18]:
# walk through the k1 groups one at a time
for name, group in df1.groupby('k1'):
    # a single print call: the label line, the group's rows, then a
    # newline string that leaves blank lines between groups
    print('The group name is {}'.format(name), group, '\n', sep='\n')

The group name is X
   dataset1  dataset2 k1     k2
0  1.059563 -0.153346  X  alpha
1 -1.314812 -1.033353  X   beta


The group name is Y
   dataset1  dataset2 k1     k2
2  0.063592  0.521678  Y  alpha
3 -0.574193  0.530819  Y   beta


The group name is Z
   dataset1  dataset2 k1     k2
4 -1.070406   2.09992  Z  alpha




In [20]:
# walk through the groups formed by every (k1, k2) pair; the group key
# arrives as a tuple and is unpacked in the loop header
for (k1, k2), group in df1.groupby(['k1', 'k2']):
    # a single print call: the key line, the group's rows, then a
    # newline string that leaves blank lines between groups
    print('Key 1 = {}, Key 2 = {}'.format(k1, k2), group, '\n', sep='\n')

Key 1 = X, Key 2 = alpha
   dataset1  dataset2 k1     k2
0  1.059563 -0.153346  X  alpha


Key 1 = X, Key 2 = beta
   dataset1  dataset2 k1    k2
1 -1.314812 -1.033353  X  beta


Key 1 = Y, Key 2 = alpha
   dataset1  dataset2 k1     k2
2  0.063592  0.521678  Y  alpha


Key 1 = Y, Key 2 = beta
   dataset1  dataset2 k1    k2
3 -0.574193  0.530819  Y  beta


Key 1 = Z, Key 2 = alpha
   dataset1  dataset2 k1     k2
4 -1.070406   2.09992  Z  alpha




### Group By Lists and Series

In [22]:
# build a 4x4 frame holding the integers 0-15, one row per animal and one
# column per letter W-Z
animals = pd.DataFrame(data = np.arange(16).reshape(-1, 4),
                       index = ['Dog', 'Cat', 'Bird', 'Mouse'],
                       columns = list('WXYZ'))
animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,4,5,6,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [24]:
# set the W and Y values of the row at position 1 to null
# .ix has been removed from pandas, so the row is picked by position through
# the index and the columns by label with .loc
# the two integer columns are cast to float first so that they can hold NaN
# without relying on an implicit dtype change during assignment
animals[['W', 'Y']] = animals[['W', 'Y']].astype(float)
animals.loc[animals.index[1:2], ['W', 'Y']] = np.nan
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [25]:
# map each of the labels W, X, Y, Z to a behavior category
behavior_map = dict(W='good', X='bad', Y='good', Z='bad')

In [28]:
# group the columns (axis = 1) by their behavior_map label, then sum the
# grouped columns within each index row
# NOTE(review): the axis argument of DataFrame.groupby is deprecated in
# recent pandas releases -- confirm the installed version before re-running;
# animal_col is reused by later cells, so any migration has to cover them too
animal_col = animals.groupby(behavior_map, axis = 1)
animal_col.sum()

Unnamed: 0,bad,good
Dog,4,2.0
Cat,12,
Bird,20,18.0
Mouse,28,26.0


In [30]:
# count the non-null values in each behavior group, row by row
animal_col.count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Bird,2,2
Mouse,2,2


In [34]:
# find the largest value within each behavior group, row by row
animal_col.max()

Unnamed: 0,bad,good
Dog,3,2.0
Cat,7,
Bird,11,10.0
Mouse,15,14.0


In [31]:
# group the rows by the length of their index label (len is called on each
# label) and sum every column within each length group
animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0,6,2,10
4,8,9,10,11
5,12,13,14,15
