In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Seed NumPy's global RNG so the random columns (and every aggregate derived
# from them in later cells) are identical on each Restart & Run All.
np.random.seed(42)

# Toy frame: two categorical key columns plus two columns of standard-normal draws.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.951055,0.603397
1,a,two,-0.061537,-0.033768
2,b,one,-0.645651,0.458786
3,a,two,-0.184415,-1.183701
4,a,one,-0.543681,0.131205


In [3]:
# Split the data1 values using the key1 column as the grouping key.
# The result is a SeriesGroupBy object; aggregations are applied to it later.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ffb7e5e6790>

In [4]:
# `grouped` is a SeriesGroupBy object; calling an aggregation such as mean()
# produces one value per key1 group.
grouped.mean()

key1
a    0.040356
b   -0.645651
Name: data1, dtype: float64

In [5]:
# Total of data1 within each key1 group.
grouped.sum()

key1
a    0.161423
b   -0.645651
Name: data1, dtype: float64

In [6]:
# Mean of data1 for every (key1, key2) combination; passing a list of key
# Series yields a result indexed by both keys.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)

In [7]:
# Series with a two-level (key1, key2) index holding each group's data1 mean.
means

key1  key2
a     one     0.203687
      two    -0.122976
b     one    -0.645651
Name: data1, dtype: float64

In [8]:
# Pivot the inner index level (key2) into columns; key pairs that have no rows
# (here b/two) show up as missing values.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.203687,-0.122976
b,-0.645651,


In [9]:
# Round trip: unstack() moves key2 into columns, stack() folds the columns back
# into the index. In the output below the empty b/two cell does not reappear.
means.unstack().stack()

key1  key2
a     one     0.203687
      two    -0.122976
b     one    -0.645651
dtype: float64

In [10]:
# Stand-alone label arrays (five entries each) to be used as grouping keys.
states = np.array(['kar', 'tn', 'ap', 'kar', 'tn'])
years = np.array([2005, 2006, 2007, 2006, 2007])

In [11]:
# Display the array of state labels.
states

array(['kar', 'tn', 'ap', 'kar', 'tn'], dtype='<U3')

In [12]:
# Display the array of year labels.
years

array([2005, 2006, 2007, 2006, 2007])

In [13]:
# Grouping keys do not have to be columns of the frame: arrays with one label
# per row work as well. Here data1 is averaged per (state, year) pair.
df['data1'].groupby(by=[states, years]).mean()

ap   2007   -0.645651
kar  2005    0.951055
     2006   -0.184415
tn   2006   -0.061537
     2007   -0.543681
Name: data1, dtype: float64

In [14]:
# Mean of every numeric column per key1 group. numeric_only=True skips the
# string column key2: pandas >= 2.0 no longer drops such columns silently and
# raises a TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.040356,-0.120717
b,-0.645651,0.458786


In [15]:
# Group on both key columns by name; the remaining columns (data1, data2) are
# averaged within each (key1, key2) group.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.203687,0.367301
a,two,-0.122976,-0.608734
b,one,-0.645651,0.458786


In [16]:
# size() reports the number of rows in each key1 group.
df.groupby('key1').size()

key1
a    4
b    1
dtype: int64

In [17]:
# size() counts every row in each (key1, key2) group. By default groupby drops
# rows whose *group key* is missing (dropna=True); this frame has no missing keys.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     2
b     one     1
dtype: int64

### Iterating over groups

In [18]:
# Iterating a GroupBy yields one (group label, sub-DataFrame) tuple per group.
for label_and_frame in df.groupby('key1'):
    print(label_and_frame)

('a',   key1 key2     data1     data2
0    a  one  0.951055  0.603397
1    a  two -0.061537 -0.033768
3    a  two -0.184415 -1.183701
4    a  one -0.543681  0.131205)
('b',   key1 key2     data1     data2
2    b  one -0.645651  0.458786)


In [19]:
# Unpack each (label, sub-DataFrame) pair directly in the loop header and print
# the label followed by its rows, one per line.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.951055  0.603397
1    a  two -0.061537 -0.033768
3    a  two -0.184415 -1.183701
4    a  one -0.543681  0.131205
b
  key1 key2     data1     data2
2    b  one -0.645651  0.458786


In [20]:
# With several grouping keys the group label is a tuple, which can be unpacked
# in the loop header. Each key and then the group's rows are printed on
# separate lines.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2, group, sep='\n')

a
one
  key1 key2     data1     data2
0    a  one  0.951055  0.603397
4    a  one -0.543681  0.131205
a
two
  key1 key2     data1     data2
1    a  two -0.061537 -0.033768
3    a  two -0.184415 -1.183701
b
one
  key1 key2     data1     data2
2    b  one -0.645651  0.458786


In [21]:
# Materialise the groups as a plain dict: group label -> sub-DataFrame.
pieces = {label: frame for label, frame in df.groupby('key1')}

In [22]:
# Look up the sub-DataFrame stored under group label 'a'.
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.951055,0.603397
1,a,two,-0.061537,-0.033768
3,a,two,-0.184415,-1.183701
4,a,one,-0.543681,0.131205


## Split-Apply-Combine