In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# Build a small demo frame: two categorical key columns (key1, key2) and two
# columns of standard-normal noise (data1, data2).
# A seeded generator is used so the random columns -- and every aggregate
# computed from them -- come out the same on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.053618,0.956994
1,a,two,-2.590769,-1.199469
2,b,one,-0.314394,0.13966
3,a,two,0.646784,-1.441944
4,a,one,1.061722,0.996215


In [4]:
# Split the data1 column into groups, using the key1 column as the labels.
# Displaying the result shows a SeriesGroupBy object, not computed values.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000244B3DB2908>

In [6]:
# `grouped` is a SeriesGroupBy object; calling an aggregation on it returns
# one value per key1 group -- here, the mean of data1.
grouped.mean()

key1
a   -0.207161
b   -0.314394
Name: data1, dtype: float64

In [7]:
# Sum of data1 within each key1 group.
grouped.sum()

key1
a   -0.828645
b   -0.314394
Name: data1, dtype: float64

In [9]:
# Group data1 by two keys at once and average each group; the result is a
# Series indexed by (key1, key2).
means = df['data1'].groupby(by=[df['key1'], df['key2']]).mean()

In [10]:
# Display the per-(key1, key2) means of data1.
means

key1  key2
a     one     0.557670
      two    -0.971992
b     one    -0.314394
Name: data1, dtype: float64

In [11]:
# Pivot the inner index level (key2) into columns. There are no rows with
# key1 == 'b' and key2 == 'two', so that cell is empty (NaN) in the output.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.55767,-0.971992
b,-0.314394,


In [13]:
# stack() undoes the unstack: key2 moves from the columns back into the index.
means.unstack().stack()

key1  key2
a     one     0.557670
      two    -0.971992
b     one    -0.314394
dtype: float64

In [18]:
# Stand-alone label arrays (five entries each) used as external group keys.
states = np.array(
    ['kar', 'tn', 'ap', 'kar', 'tn'],
)
years = np.array(
    [2005, 2006, 2007, 2006, 2007],
)

In [19]:
# Show the state labels.
states

array(['kar', 'tn', 'ap', 'kar', 'tn'], dtype='<U3')

In [20]:
# Show the year labels.
years

array([2005, 2006, 2007, 2006, 2007])

In [21]:
# The group keys here are the standalone `states` and `years` arrays rather
# than columns of df: data1 is averaged per (state, year) pair.
df['data1'].groupby([states, years]).mean()

ap   2007   -0.314394
kar  2005    0.053618
     2006    0.646784
tn   2006   -2.590769
     2007    1.061722
Name: data1, dtype: float64

In [22]:
# Group the whole frame by key1 and average the numeric columns (data1, data2).
# numeric_only=True states explicitly that the string column key2 is left out
# of the aggregation. Older pandas dropped such columns silently; pandas 2.0+
# raises a TypeError when asked to average strings.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.207161,-0.172051
b,-0.314394,0.13966


In [23]:
# Group by the (key1, key2) pair and average data1 and data2 within each pair.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.55767,0.976605
a,two,-0.971992,-1.320707
b,one,-0.314394,0.13966


In [24]:
# Number of rows in each key1 group.
df.groupby('key1').size()

key1
a    4
b    1
dtype: int64

In [26]:
# Number of rows for each (key1, key2) combination.
# Note: by default, rows whose group key is missing (NaN) are left out of the groups.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     2
b     one     1
dtype: int64

### Iterating over the groups

In [28]:
# Iterating a GroupBy yields one (group name, sub-frame) tuple per group;
# print each tuple unchanged.
for name_and_group in df.groupby('key1'):
    print(name_and_group)

('a',   key1 key2     data1     data2
0    a  one  0.053618  0.956994
1    a  two -2.590769 -1.199469
3    a  two  0.646784 -1.441944
4    a  one  1.061722  0.996215)
('b',   key1 key2     data1    data2
2    b  one -0.314394  0.13966)


In [30]:
# Unpack each (name, sub-frame) pair and print the group name followed by
# its rows, one item per line.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.053618  0.956994
1    a  two -2.590769 -1.199469
3    a  two  0.646784 -1.441944
4    a  one  1.061722  0.996215
b
  key1 key2     data1    data2
2    b  one -0.314394  0.13966


In [31]:
# With two grouping columns the group name is a (key1, key2) tuple; unpack
# it and print both key values followed by the group's rows, one per line.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2, group, sep='\n')

a
one
  key1 key2     data1     data2
0    a  one  0.053618  0.956994
4    a  one  1.061722  0.996215
a
two
  key1 key2     data1     data2
1    a  two -2.590769 -1.199469
3    a  two  0.646784 -1.441944
b
one
  key1 key2     data1    data2
2    b  one -0.314394  0.13966


In [35]:
# Materialise the groups as a plain dict mapping each key1 value to the
# DataFrame holding that group's rows.
pieces = {name: group for name, group in df.groupby('key1')}

In [36]:
# Look up the sub-frame for the group whose key1 value is 'a'.
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.053618,0.956994
1,a,two,-2.590769,-1.199469
3,a,two,0.646784,-1.441944
4,a,one,1.061722,0.996215
